## Data cleaning

In [1]:
import json
import pandas as pd
import pickle
import re

In [2]:
# function to cheack whether an object (data) contains attribute (x) or not
def check_var(data, x) :
    try :
        data[x]
        return True
    except :
        return False

In [3]:
from nltk.corpus import stopwords
import string
stopset = stopwords.words("english") + list(string.punctuation)

In [4]:
# function to turn json into dataframe
def json_to_df(all_file, hashtag) :
# we only keep text and hashtags
    df_list = []
    text_set = set()
    for file in all_file :
        with open(file, 'r', encoding = 'utf8') as f :
            all_data = json.load(f)
     
            for data in all_data :
                hashtag_list = [] # to collect list of hashtags
            
                # case 1: check whether it is a retweet data or not
                if (check_var(data, 'retweeted_status')) :
                
                    Text = data['retweeted_status']['full_text']
                    for t in data['retweeted_status']['entities']['hashtags'] :
                        hashtag_list.append(t['text'])                

                    # check whether it is retweet of quoted tweet or not
                    if (check_var(data['retweeted_status'], 'quoted_status')) :
                        Text = Text + ' || ' + data['retweeted_status']['quoted_status']['full_text']
                        for t in data['retweeted_status']['entities']['hashtags'] :
                            hashtag_list.append(t['text'])               
 
                # case 2: check whether it is a quote tweet data or not
                elif (check_var(data, 'quoted_status')) :
                    Text = data['full_text'] 
                    Text = Text + ' || ' + data['quoted_status']['full_text']
                    for t in data['entities']['hashtags'] :
                        hashtag_list.append(t['text'])
                    for t in data['quoted_status']['entities']['hashtags'] :
                        hashtag_list.append(t['text'])

                # if it does not belong to case 1 and case 2, it is an ordinary tweet data
                else :
                    Text = data['full_text']
                    for t in data['entities']['hashtags'] :
                        hashtag_list.append(t['text'])
            
                hashtag_list = [h.lower() for h in hashtag_list]
                hashtag_list = [h for h in hashtag_list if h != hashtag] # remove our hashtag from hashtag_list  
            
                Text = remove_emoji(Text) # remove emoji
                Text = re.sub("[-/]", " ", Text) # substitute dash and slash with space       
            
                text_split = Text.lower().split() # split string into list
            
                remove_list = []
                for t in text_split :
                    if hashtag in t : # remove all words containing our hashtag
                        remove_list.append(t)
                    if 'http' in t : # remove link
                        remove_list.append(t)
                    if '@' in t : # remove ...@...
                        remove_list.append(t)
                    if t == '\n' : # remove new line character
                        remove_list.append(t)
                text_split = [t for t in text_split if t not in remove_list]                          
                        
                for i in range(len(text_split)) :
                    t = text_split[i]
                    if t.startswith("'") or t.startswith('"') : # remove quotation mark
                        text_split[i] = t[1:] 
                    if t.endswith("'") or t.endswith('"') : 
                        text_split[i] = t[:len(t) - 1]         
                    text_split[i] = re.sub("[^a-z']", "", t)

                # if text contains only stop words, we will not keep it in our dataframe
                words = [t for t in text_split if t not in stopset]  
                if len(words) == 0 : 
                    continue 
            
                Text = ' '.join(text_split) # join the list back to string
            
                for h in hashtag_list :
                    Text = Text.replace('#' + h.lower(), h.lower()) # for other hashtags, we change them to words 
            
                if Text not in text_set : # check whether new data has already been kept in our list or not
                    df_list.append((Text, hashtag_list))  
                    text_set.add(Text)

    df = pd.DataFrame(df_list, columns = ['Text', 'Hashtag'])
    df['Classification'] = [hashtag] * df.shape[0]
    df = df[['Classification', 'Text', 'Hashtag']]
                    
    return df                

In [5]:
def remove_emoji(string) :
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags = re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [6]:
file1 = 'christmas-out-23K.json'
file2 = 'christmas-out-15K.json'
christmas_df = json_to_df([file1, file2], 'christmas')

In [7]:
christmas_df

Unnamed: 0,Classification,Text,Hashtag
0,christmas,tco tsrcaibt is the perfect place for the holi...,"[coldpresssoap, bodyscrub, soap, soapmaking, e..."
1,christmas,follow amp retweet for the chance to win a jo...,"[competition, win]"
2,christmas,it's winitwednesday follow amp rt amp to win t...,"[winitwednesday, win, astreetcatnamedbob, comp..."
3,christmas,there will be obstacles there will be doubters...,"[cryptocurrency, binary, motivation, investor,..."
4,christmas,do you know how i plan to celebrate this year ...,"[durgapuja, diwali]"
...,...,...,...
18780,christmas,thanks so much also have a carol concert toni...,"[concert, virtualevents, biggive2020, christma..."
18781,christmas,make positive use of this yuletide season be a...,"[december, cybersecurity, google, tuesdaymotiv..."
18782,christmas,calling love this vintage phone and for more ...,"[calling, love, vintage, phone, unique]"
18783,christmas,new earl the squirrel coffee mug universal stu...,[]


In [8]:
file1 = 'covid-out-23K.json'
file2 = 'covid-out-15K.json'
covid_df = json_to_df([file1, file2], 'covid')

In [9]:
covid_df

Unnamed: 0,Classification,Text,Hashtag
0,covid,when i wonder how long well be in lockdown i t...,[]
1,covid,so if a cure for has been found in just a few ...,[cancer]
2,covid,us deaths plus source coronavirus resource c...,[]
3,covid,why schools closing and bars restaurants and g...,"[nyc, schools]"
4,covid,my great aunt died this evening of at the age ...,[covid_19]
...,...,...,...
17933,covid,here at tco eciaduveq we've got some new stays...,"[staysafe, shopping, holidayshopping, shop, ac..."
17934,covid,apple iwatch repair visit our site tco swiudcz...,"[melbournemobilephonereapirs, mmpr, covid19, m..."
17935,covid,to keep moving forward we must look back in f...,"[culture, reconnection, onlineevent, teamboost..."
17936,covid,has disproportionately grown the education ga...,[ocfgrowingminds]


In [10]:
file1 = 'spacex-out-35K.json'
file2 = 'spacex-out-20K.json'
spacex_df = json_to_df([file1, file2], 'spacex')

In [11]:
spacex_df

Unnamed: 0,Classification,Text,Hashtag
0,spacex,the return of crew 's booster b after changing...,"[crew2, spacexfleet, crew1, launchamerica]"
1,spacex,weather for nov st at pm est pm pst nov nd a...,"[starlink, falcon9]"
2,spacex,falcon landing by harv nft cryptoart tco jdw...,"[nft, cryptoart]"
3,spacex,moonship crew section moonship tco ajhwmihpzm,[moonship]
4,spacex,looks like the left leg's crush core wasn't th...,[spacexfleet]
...,...,...,...
12467,spacex,starlink will get a much needed feature usabil...,[usability]
12468,spacex,no clouds in the sky degrees awesome time at ...,"[kennedyspacecenter, falcon9, rocket]"
12469,spacex,nasa astronaut is sleeping in the cockpit of c...,[nasa]
12470,spacex,space the final frontier tco yfpcygabo lagoon...,[]


In [12]:
spacex_df = spacex_df.sample(n = 12000) 
christmas_df = christmas_df.sample(n = 12000) 
covid_df = covid_df.sample(n = 12000) 

In [13]:
spacex_df

Unnamed: 0,Classification,Text,Hashtag
7428,spacex,the crew is on track to lift off from cape can...,[crew1]
11373,spacex,tco hzpifdto brand new paperback in the mark n...,"[paperback, book, sciencefiction, sfgiants, ha..."
6727,spacex,this is amazing the amount of energy used for ...,"[energy, spacelaunchlive, launchamerica]"
11951,spacex,our tools are widely used in the aerospace and...,"[aviation, aerospace, boeing]"
8278,spacex,leak check is good gooooaaaalllllll launchamer...,"[launchamerica, nasa, crew1]"
...,...,...,...
6346,spacex,reentry burn on the falcon first stage nasa cr...,"[nasa, crew1]"
1358,spacex,the run into tesla's addition next month may s...,"[tesla, tsla, elonmusk, teslamotors, teslamode..."
8959,spacex,me building a makeshift telescope with x zoom ...,"[nasasocial, nasa]"
7913,spacex,minutes to launch nasa tco flsbsmzl polls ar...,"[nasa, nasa]"


In [14]:
christmas_df

Unnamed: 0,Classification,Text,Hashtag
12503,christmas,live pm est hi december vibes shreddin tha slo...,"[steep, snowday]"
8819,christmas,charming miniature door wreath for vour dollho...,[]
11638,christmas,in the midst of the snow storm and winter in h...,"[tuesdaytune, tistheseason, forgiving, tagsfor..."
864,christmas,usborne books hosted a book fair benefitting c...,"[usbornebooks, casa, books]"
13382,christmas,fantastic bbc tv schedulefor subscribers tco v...,[bbc]
...,...,...,...
8964,christmas,who will have the balls to send out the first ...,"[holidaycard, greetingcards, holidaygreetingca..."
14094,christmas,graboids love tco edqbhb,[]
10480,christmas,gift peter rabbit savings jar spending money j...,"[peter, etsy, gift]"
16616,christmas,nothing wrong with this movie tco mmqfkmlx tco...,[]


In [15]:
covid_df

Unnamed: 0,Classification,Text,Hashtag
9890,covid,hello i am a professional logo designer if you...,"[eijazkhan, bitcoin, blackpink, blockchain, fi..."
839,covid,at the old lab shared office space we were alw...,"[covidpiggybacking, coviduk, comorbidity, asym..."
13282,covid,disaster relief loan funding program for busin...,"[relief, businesses, relieffunding]"
15000,covid,why is one of your hospice directors still emp...,[coronavirus]
13204,covid,is your eligibility towards permanentresidence...,"[permanentresidence, covid19, pnp]"
...,...,...,...
5224,covid,love and kindness are never wasted beyondright...,"[beyondrights, charity, nonprofit, donate, lov..."
4964,covid,people on my shit list for lying denying reali...,[]
4339,covid,is there a connection between air pollution an...,"[airpollution, pollution, covid19, coronavirus..."
251,covid,icymi the loser whos barricaded himself in the...,[]


## Pickle the data for Convenience 

In [16]:
with open("covid.txt", "wb") as handle : # write dataframe into text file
  pickle.dump(covid_df, handle)
with open("christmas.txt", "wb") as handle : # write dataframe into text file
  pickle.dump(christmas_df, handle)
with open("spacex.txt", "wb") as handle : # write dataframe into text file
  pickle.dump(spacex_df, handle)