## Import libraries

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup 
import os
import re
import spacy
import en_core_web_sm
from  spacy.lang.en.stop_words import STOP_WORDS
import pickle
nlp = spacy.load('en_core_web_sm')



## Obtain list of personalities

In [1]:
occupations = {'Leader' : ['Barack Obama', 'Joe Biden', 'Elizabeth Warren', 'Pope Francis', 'Donald Trump'],
             
            'Athlete' : ['LeBron James', 'John Cena', 'Kevin Durant', 'Ronda Rousey', 'Anthony Joshua'],
             
            'Entertainer' : ['Kevin Hart', 'Emma Watson', 'Neil Patrick Harris',
                 'Harry Styles.', 'Dwayne Johnson'],
             
            'TV Personality' : ['Ellen DeGeneres', 'jimmy fallon','Oprah Winfrey', "Conan O'Brien",
              'Gordon Ramsay', 'daniel tosh'],
             
            'Entrepreneur' : ['Jeff Weiner', 'Bill Gates', 'Elon Musk', 'Kylie Jenner', 'Tim Cook'],
             
            'Artist' : ['Lady Gaga', 'Wiz Khalifa', 'Louis Tomlinson',
          'Alicia Keys', 'Mariah Carey']}

personalities = []
for person in occupations.values():
    personalities.extend(person)


## Load Data

In [5]:
# access directory with .xlsx files
data_dir = os.getcwd()+'/data/'

df = pd.DataFrame()
for file in os.listdir(data_dir):    
    if file[-4:] != '.csv':
        df_temp = pd.read_excel(data_dir+file)
        if df_temp.shape[0] >500:
            df = df.append(df_temp)

for name in personalities:
    if name not in df['Username'].value_counts().index:
        print(name)


Joe Biden
Donald Trump
daniel tosh
Elon Musk


In [6]:
# load elom musk and daniel tosh files
elon_musk = pd.read_excel(data_dir+'Elon Musk.xlsx')
daniel_tosh = pd.read_excel(data_dir+'Daniel Tosh.xlsx')

In [7]:
# add the elon musk and daniel tosh files to the dataframe 
df = df.append(elon_musk)
df = df.append(daniel_tosh)

# keep the first 6 colomns
df = df.iloc[:,:6]


In [8]:
# load the Joe Biden data
joe_biden = pd.read_csv('data/Joe Biden.csv')
joe_biden['Username'] = ['Joe Biden' for _ in range(len(joe_biden))]
joe_biden['User handle'] = ['@JoeBiden' for _ in range(len(joe_biden))]
joe_biden = joe_biden[['Username', 'User handle', 'timestamp', 'tweet', 'retweets','likes']]
joe_biden.columns = df.columns

# change joe biden columns to other df columns
joe_biden.head()

Unnamed: 0,Username,User handle,Date of posting,Text,Retweet count,Like count
0,Joe Biden,@JoeBiden,2007-10-24 22:45,Tune in 11:30 ET tomorrow for a live webcast o...,5,11
1,Joe Biden,@JoeBiden,2007-12-29 15:35,"Iowans, there's a good chance there's a Biden ...",16,22
2,Joe Biden,@JoeBiden,2012-04-09 09:42,We're excited to announce that @JoeBiden is be...,82,20
3,Joe Biden,@JoeBiden,2012-04-09 09:43,Campaign staff will run this account to keep y...,76,51
4,Joe Biden,@JoeBiden,2012-04-09 13:11,News for you this morning: VP Biden will speak...,54,5


In [10]:
# load the Donald Trump data
donald_trump =pd.read_csv('data/Donald Trump.csv')
donald_trump['Username'] = ['Donald Trump' for _ in range(len(donald_trump))]
donald_trump['User handle'] = ['@realDonaldTrump' for _ in range(len(donald_trump))]
donald_trump = donald_trump[['Username', 'User handle', 'Date', 'Tweet_Text', 'Retweets','twt_favourites_IS_THIS_LIKE_QUESTION_MARK']]
donald_trump.columns = df.columns 
# change donald trump columns to other df columns
donald_trump.head()

Unnamed: 0,Username,User handle,Date of posting,Text,Retweet count,Like count
0,Donald Trump,@realDonaldTrump,16-11-11,Today we express our deepest gratitude to all ...,41112,127213
1,Donald Trump,@realDonaldTrump,16-11-11,Busy day planned in New York. Will soon be mak...,28654,141527
2,Donald Trump,@realDonaldTrump,16-11-11,Love the fact that the small groups of protest...,50039,183729
3,Donald Trump,@realDonaldTrump,16-11-11,Just had a very open and successful presidenti...,67010,214001
4,Donald Trump,@realDonaldTrump,16-11-11,A fantastic day in D.C. Met with President Oba...,36688,178499


In [11]:
# add the Joe Biden and Donald Trump data to the data frame
df = df.append(joe_biden)
df = df.append(donald_trump)
df = df.reset_index(drop=True)

## Data Preprocessing

In [14]:
# create function that will identify occupation given the name of the celebrity
def identify_occupation(name):
    for occupation in occupations.keys():
        if name in occupations[occupation]:
            return occupation

In [15]:
# apply identify_occupation function for every celebrity in the dataset
df['Occupation'] = df['Username'].apply(lambda x: identify_occupation(x))
df.head()

Unnamed: 0,Username,User handle,Date of posting,Text,Retweet count,Like count,Occupation
0,Alicia Keys,@aliciakeys,Fri Feb 12 03:16:07 +0000 2021,The maestro! The musical magician! The one and...,170.0,1973.0,Artist
1,Alicia Keys,@aliciakeys,Wed Feb 10 21:31:09 +0000 2021,"Your glow is about to be on 100,000!!! ✨As we ...",101.0,1171.0,Artist
2,Alicia Keys,@aliciakeys,Wed Feb 10 21:25:58 +0000 2021,🥰✨💜💫 @keyssoulcare https://t.co/MXpCgDdBdt,53.0,777.0,Artist
3,Alicia Keys,@aliciakeys,Wed Feb 10 01:32:56 +0000 2021,Woke up in such a good vibe.⁣ Gen was funky &a...,267.0,3659.0,Artist
4,Alicia Keys,@aliciakeys,Mon Feb 08 01:41:28 +0000 2021,One of my favorite small businesses is @unionl...,144.0,816.0,Artist


In [16]:
# create function that removes links from text
def remove_link(text):
    return re.sub(r"http\S+", " ", text)

# apply remove_linkfunction
df['Text (EPA)'] = df['Text'].apply(lambda x: remove_link(str(x)))
df['Text (Model)'] = df['Text'].apply(lambda x: remove_link(str(x)))


In [17]:
# create and apply function that removes handles from the dataset
def remove_handle(text):
    return re.sub(r"@", " ", text)
df['Text (EPA)'] = df['Text (EPA)'].apply(lambda x: remove_handle(str(x)))

In [18]:
# create and apply function that removes hashtags from the dataset
def remove_hashtag(text):
    return re.sub(r"#[\w]*", " ", text)
df['Text (EPA)'] = df['Text (EPA)'].apply(lambda x: remove_hashtag(str(x)))

In [19]:
# create and apply funciton that empties text that are retweets
def remove_retweet(text):
    if 'RT' in text:
        return ""
    else:
        return text

df['Text (EPA)'] = df['Text (EPA)'].apply(lambda x: remove_retweet(str(x)))    
df['Text (Model)'] = df['Text (Model)'].apply(lambda x: remove_retweet(str(x)))    


In [20]:
# create and apply function that removes symbols and numbers from the text
def remove_numbers_symbols(text):
     # lower casing
    text_lower = text.lower()
    
    # tokenization
    text_nlp = nlp(text_lower)
    
    text_clean = " ".join([word.text for word in text_nlp if not word.is_punct])
    text_clean = re.sub('\n', ' ', text_clean)
    
    return text_clean

df['Text (EPA)'] = df['Text (EPA)'].apply(lambda x: remove_numbers_symbols(str(x)))    

In [21]:
# create and apply function that removes stop words from text
def remove_stopwords(text):
    text_nlp = nlp(text)
    
    # Remove stop words
    text_nlp_clean = " ".join([word.text for word in text_nlp if word.is_stop == False])
    
    return text_nlp_clean

df['Text (EPA)'] = df['Text (EPA)'].apply(lambda x: remove_stopwords(str(x)))    


In [22]:
# create and apply function that removes emojis from text
def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

df['Text (Model)'] = df['Text (Model)'].apply(lambda x: remove_emoji(str(x)))
df['Text (EPA)'] = df['Text (EPA)'].apply(lambda x: remove_emoji(str(x)))


In [23]:
# load pickle file that contains characters to omit from text
pickle_in = open('char_to_remove.pickle', 'rb')
bad_char = pickle.load(pickle_in)
bad_char_str = "".join(bad_char)
bad_char_str

'\n{|}~\xa0¡£©\xad®·¿ßàáãäåçèéêìíïðñóôöùúüāğıōœşʉʻ̱ωабвгдезиклмнопрстуьяёӕԍԏԡեבהוחטכמנקשתأابةتجحدرزسشصضعغفلمنهوىيُِّ٪\u06dd۪ۢกขคงชดตทนบปพภมยรลวษสหอะัาำิีุูเโๆ่้\u200a\u200b–—‘’“”•…‼\u2060\u2063\u2066\u2069€⃣™→⇢⌛⌨⏪⏰⏱⏳'

In [24]:
# create and apply function that returns lower cased text
def lower_and_acii(text):
    text_lower = text.lower()
    
    return text_lower

df['Text (Model)'] = df['Text (Model)'].apply(lambda x: lower_and_acii(x))

In [25]:
# create and apply function that removes unwanted characters
def remove_bad_char(text):
    text_clean = text
    for char in bad_char:
        # remvoe unwanted characters
        text_clean = text_clean.replace(char, "")
    text_clean = text_clean.replace("  ", " ")
    return text_clean
df['Text (Model)'] = df['Text (Model)'].apply(lambda x: remove_bad_char(x))

In [26]:
# create and apply function that remove words that are less than 3 characters long
def remove_short_text(text):
    text_long_words = " ".join([word for word in text.split()])
    # tokenization
    tokens = nlp(text_long_words)
    
    list_words = []
    for word in tokens:
        if len(word.text) >3:
            list_words.append(word.text)
    text_long_words = " ".join(list_words)  
    return text_long_words

df['Text (EPA)'] = df['Text (EPA)'].apply(lambda x: remove_short_text(str(x)))


In [27]:
# create a new column that counts the number of words in each string
df['Word Count'] = df['Text (Model)'].apply(lambda x: len(x.split()))

In [28]:
# remove tweets that are not more than 3 words long
no_short_text = df['Word Count'] >3
df = df[no_short_text]

In [30]:
# preview of the dataset
df.head(5)

Unnamed: 0,Username,User handle,Date of posting,Text,Retweet count,Like count,Occupation,Text (EPA),Text (Model),Word Count
85535,Donald Trump,@realDonaldTrump,15-07-16,.@KarlRove wasted $400 million + and didn۪t wi...,543.0,886.0,Leader,karlrove wasted million didn۪t racea total los...,.@karlrove wasted $400 million + and didnt win...,13
85536,Donald Trump,@realDonaldTrump,15-07-16,.@FoxNews You shouldn۪t have @KarlRove on the ...,415.0,717.0,Leader,foxnews shouldn۪t karlrove airhe۪s clown zero ...,.@foxnews you shouldnt have @karlrove on the a...,14
85537,Donald Trump,@realDonaldTrump,15-07-16,.@BradSteinle Great talking to you and your pa...,310.0,710.0,Leader,bradsteinle great talking parentsfantastic peo...,.@bradsteinle great talking to you and your pa...,17
85538,Donald Trump,@realDonaldTrump,15-07-16,".@andydean2014 Thank you, you were great. You ...",89.0,356.0,Leader,andydean2014 thank great defend anytime amazing,".@andydean2014 thank you, you were great. you ...",13
85539,Donald Trump,@realDonaldTrump,15-07-16,"""@joshdronzek @realDonaldTrump @Macys GO TRUMP""",113.0,338.0,Leader,joshdronzek realdonaldtrump macys trump,"""@joshdronzek @realdonaldtrump @macys go trump""",5
85540,Donald Trump,@realDonaldTrump,15-07-16,I hope the boycott of @Macys continues forever...,1156.0,2423.0,Leader,hope boycott macys continues forever people cu...,i hope the boycott of @macys continues forever...,25
85541,Donald Trump,@realDonaldTrump,15-07-16,I loved firing goofball atheist Penn @pennjill...,431.0,953.0,Leader,loved firing goofball atheist penn pennjillett...,i loved firing goofball atheist penn @pennjill...,22
85542,Donald Trump,@realDonaldTrump,15-07-16,I hear @pennjillette show on Broadway is terri...,1086.0,1175.0,Leader,hear pennjillette broadway terrible surprised ...,i hear @pennjillette show on broadway is terri...,22
85543,Donald Trump,@realDonaldTrump,15-07-16,Irrelevant clown @KarlRove sweats and shakes n...,930.0,1494.0,Leader,irrelevant clown karlrove sweats shakes nervou...,irrelevant clown @karlrove sweats and shakes n...,24
85544,Donald Trump,@realDonaldTrump,15-07-16,"""@HoustonWelder: Donald Trump is one of the se...",1738.0,1800.0,Leader,houstonwelder donald trump sexiest planet woma...,"""@houstonwelder: donald trump is one of the se...",27


In [29]:
# dimensions of the preprocessed dataset
df.shape

(36343, 10)

## Export Preprocessed Data Frame

In [50]:
# create and export a pickle file storing the preprocessed data frame 
pickle_out = open('df.pickle', 'wb')
pickle.dump(df, pickle_out)
pickle_out.close()