In [9]:
import json 
import pandas as pd
import tweepy as tw
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
import os
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize, wordpunct_tokenize, RegexpTokenizer
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import TweetTokenizer
import nltk
import re
from sklearn.feature_extraction import stop_words
stopwords = stop_words.ENGLISH_STOP_WORDS
nltk.download('vader_lexicon')

class TextCleaner():
    '''
    Text preprocessing helper for tweets.

    An instance holds the regex fragments and abbreviation dictionaries used for
    cleaning; clean_tweets applies them to a single piece of text.
    '''
    def __init__(self):
        r'''
        Build the substitution tables used by clean_tweets.

        Removed groups: 
            r"[!?$%()*+,-./:;<=>\^_`{|}~]"
        '''
        # Regex fragments that are OR-ed together and applied to each whitespace-separated
        # word: retweet markers, URLs, the HTML-escaped ampersand and leading
        # @mentions / #hashtags.
        # The ampersand fragment used to be r'&amp; ' (trailing space); since it is applied
        # to single words, which never contain a space, it could not match and '&amp;'
        # ended up as the token 'amp' after step 5.
        self.re_substitution_groups = [r'^RT', r'^rt', r'http\S+', r'&amp;', r'^[@#]\w+']
        # Chat-speak abbreviations, expanded token by token (keys are lowercase).
        self.text_abbrevs = { 'lol': 'laughing out loud', 'bfn': 'bye for now', 'cuz': 'because',
                            'afk': 'away from keyboard', 'nvm': 'never mind', 'iirc': 'if i recall correctly',
                            'ttyl': 'talk to you later', 'imho': 'in my honest opinion', 'brb': 'be right back',
                            "fyi": "for your information" }
        # Contractions, expanded token by token.
        # NOTE(review): the "'s" entry only matches a token that is exactly "'s"; a word
        # such as "trump's" is not a key, so it passes through unchanged.
        self.grammar_abbrevs = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                             "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                             "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                             "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                             "mustn't":"must not", "'s":"s"}


    def clean_tweets(self, df_tweet_text, last_clean_step=6):
        '''
        Clean the text of one tweet, with the ability to vary the last step of cleaning.

        INPUT:
            df_tweet_text: the tweet; it is converted with str() before cleaning.
            last_clean_step: integer 0-6. Steps are cumulative, so step N runs
                steps 1..N in order (0 only normalises whitespace):
                1. lowercase
                2. expand text abbreviations (self.text_abbrevs)
                3. expand grammar abbreviations (self.grammar_abbrevs)
                4. blank out the patterns in self.re_substitution_groups, word by word
                5. replace every non-word character with a space
                6. remove stop words (uses the module-level `stopwords` collection,
                   looked up when the method is called)
        OUTPUT:
            The cleaned text with words separated by single spaces.
        RAISES:
            ValueError if last_clean_step is not one of 0..6.

        Run on one tweet at a time, for example:
        cleaner = TextCleaner()
        df['clean_tweets'] = df['full_text'].apply(lambda x: cleaner.clean_tweets(x, 5))
        '''
        # Previously an unknown step matched no branch and surfaced as an
        # UnboundLocalError at the end of the method; fail early and clearly instead.
        if last_clean_step not in range(7):
            raise ValueError(
                'last_clean_step must be an integer from 0 to 6, got {!r}'.format(last_clean_step))

        text = str(df_tweet_text)

        if last_clean_step >= 1:
            text = text.lower()

        if last_clean_step >= 2:
            text = ' '.join(self.text_abbrevs.get(word, word) for word in text.split())

        if last_clean_step >= 3:
            text = ' '.join(self.grammar_abbrevs.get(word, word) for word in text.split())

        if last_clean_step >= 4:
            # Built per call so that edits to re_substitution_groups take effect.
            removal_pattern = re.compile('|'.join(self.re_substitution_groups))
            text = ' '.join(removal_pattern.sub(' ', word) for word in text.split())

        if last_clean_step >= 5:
            text = re.sub(r'\W', ' ', text)

        if last_clean_step >= 6:
            text = ' '.join(word for word in text.split() if word not in stopwords)

        # Collapse the runs of spaces left behind by the substitutions above.
        return ' '.join(text.split())

# TF-IDF featuriser used by the training cells further down: tokenises with
# my_tokenizer, uses scikit-learn's built-in English stop-word list and is
# capped at max_features=5000.
# NOTE(review): my_tokenizer is defined in a later cell (In [6]) and X is only
# built in later cells, so this cell depends on those having been executed
# first; on a fresh top-to-bottom run both names are undefined here.
vectorizer = TfidfVectorizer(
    tokenizer=my_tokenizer,
    stop_words='english',
    max_features=5000)

# Dense document-term matrix for whatever X currently holds, wrapped in a DataFrame.
mat = vectorizer.fit_transform(X).toarray()
X_full=pd.DataFrame(data=mat)

def cross_val(X,y,over_=True,estimator=None,n_splits=5,random_state=None):
    """Shuffled K-fold cross-validation reporting precision, recall and F1.

    Args:
        X: feature table; rows are selected with ``X.iloc``.
        y: target values aligned with X; rows are selected with ``y.iloc``.
        over_: when truthy, the training part of every fold is oversampled with
            SMOTE (``sampling_strategy='not majority'``) before fitting. The
            test part is never resampled.
        estimator: object with ``fit``/``predict``. When None (the default) the
            notebook-level ``model`` variable is used, as before.
        n_splits: number of folds (default 5, the previous hard-coded value).
        random_state: forwarded to both KFold and SMOTE; None keeps the
            previous unseeded behaviour.

    Returns:
        tuple: (mean precision, mean recall, mean F1) across the folds.

    Side effects:
        Fits the estimator in place once per fold, prints the per-fold
        precision and recall, and prints a summary line at the end.
    """
    # `model` is only looked up when no estimator is passed in.
    clf = model if estimator is None else estimator

    rec_scores=[]
    prec_scores=[]
    f1_scores=[]

    kf = KFold(n_splits=n_splits,shuffle=True,random_state=random_state)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        if over_:
            over = SMOTE(sampling_strategy='not majority',random_state=random_state)
            X_train,y_train=over.fit_resample(X_train,y_train)
            # Re-wrap so the resampled data carries the same column labels as X_test.
            X_train=pd.DataFrame(columns=X_test.columns,data=X_train)

        clf.fit(X_train,y_train)
        preds=clf.predict(X_test)

        # Score each fold once and reuse the values for both the log line and the averages.
        fold_prec=precision_score(y_test,preds)
        fold_rec=recall_score(y_test,preds)
        fold_f1=f1_score(y_test,preds)

        prec_scores.append(fold_prec)
        rec_scores.append(fold_rec)
        f1_scores.append(fold_f1)
        print (fold_prec,fold_rec)

    prec_res=np.mean(prec_scores)
    rec_res=np.mean(rec_scores)
    f1_res=np.mean(f1_scores)
    print ('Precision = {:.2f}, Recall = {:.2f}, F1 score = {:.2f}'.format(prec_res,rec_res,f1_res))
    return (prec_res,rec_res,f1_res)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Trevor\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Loading the labelled hashtags

In [3]:
# First half of the hashtag annotations: two parallel mappings, one holding the
# hashtag ('#presidentialcandidate') and one holding its label ('0').
# Files are opened with `with` so the handles are closed after parsing.
with open("C:/Users/Trevor/Downloads/first_half_annotated.json") as first_half_file:
    annots = json.load(first_half_file)
hashes=list(annots['#presidentialcandidate'].values())
values=list(annots['0'].values())
annots1=dict(zip(hashes, values))

# Second half is loaded as-is and passed to from_dict the same way as annots1.
with open("C:/Users/Trevor/Downloads/second_half_hashtags_annotated.json") as second_half_file:
    annots2 = json.load(second_half_file)

# One row per hashtag (index); the label ends up in column 0.
annots1_df=pd.DataFrame.from_dict(annots1, orient='index')
annots2_df=pd.DataFrame.from_dict(annots2, orient='index')
annots=pd.concat([annots1_df,annots2_df])

# Label convention used below: 1 -> pro_trump, -1 -> pro_biden, 0 -> neutral.
pro_trump=list(annots[annots[0]==1].index)
pro_biden=list(annots[annots[0]==-1].index)
neutral=list(annots[annots[0]==0].index)

def clean_hash(x):
    """Return a new list where every tag has all '#' characters removed and a single '#' prefixed."""
    normalised = []
    for tag in x:
        bare_tag = tag.replace('#', '')
        normalised.append('#' + bare_tag)
    return normalised

# Give every hashtag in the three label groups exactly one leading '#'.
pro_trump, pro_biden, neutral = (
    clean_hash(tags) for tags in (pro_trump, pro_biden, neutral)
)



Scraping tweets containing the hashtags

In [None]:
import os
import tweepy as tw
import pandas as pd

# Twitter API credentials are read from the environment so they never have to be
# typed into the notebook; each falls back to '' (the previous hard-coded value).
consumer_key= os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_secret= os.environ.get('TWITTER_CONSUMER_SECRET', '')
access_token= os.environ.get('TWITTER_ACCESS_TOKEN', '')
access_token_secret= os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')

auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)

import random
# Query the partisan hashtags (neutral ones are not searched) in random order.
tmp_lst=pro_trump.copy() + pro_biden.copy()
random.shuffle(tmp_lst)

# Accumulator for every tweet text collected in this run. It was previously never
# initialised in this cell, so a fresh kernel failed with NameError.
all_tweets=[]

for hashtag in tmp_lst:
    try:
        search = tw.Cursor(api.search,
                    q=hashtag,
                    lang="en",
                    tweet_mode="extended").items(100)
        tweets=[]
        for status in search:
            try:
                # For a retweet, keep the text of the original tweet.
                tweets.append(status.retweeted_status.full_text)
            except AttributeError:
                # Not a retweet: use the status' own text.
                tweets.append(status.full_text)
        # Only added once the whole query succeeded, so a failed query contributes nothing.
        all_tweets+=tweets
    except Exception as exc:
        # Best effort: one failing hashtag must not abort the scrape, but say so
        # instead of swallowing the error silently.
        print('Skipping {}: {!r}'.format(hashtag, exc))

    # Rewritten after every hashtag so partial results survive an interruption.
    pd.Series(all_tweets, dtype=object).to_csv('scraped_tweets.csv',index=False)

In [6]:

def my_tokenizer(doc):
    """Lowercase a document and split it into word tokens.

    Args:
        doc: string
    Returns:
        list: tokens produced by a RegexpTokenizer built from the pattern r'\\w+'
    """
    word_splitter = RegexpTokenizer(r'\w+')
    lowered = doc.lower()
    return word_splitter.tokenize(lowered)

# Instance of the TextCleaner class defined at the top of the notebook.
# NOTE(review): none of the cells visible here call it -- the training cells
# clean X with inline lambdas instead -- confirm whether it is still needed.
cleaner = TextCleaner()

Combining the scraped tweets with the original dataset, and dropping duplicates

In [2]:

# Scraped tweets; column '0' holds the tweet text.
df=pd.read_csv('C:/Users/Trevor/Downloads/scraped_tweets.csv')
df=df.drop_duplicates().reset_index(drop=True)

# Collapse newlines and repeated whitespace (str.split() with no argument splits on
# any whitespace, newlines included) so identical tweets compare equal.
df['0']=df['0'].apply(lambda x: ' '.join(x.split()))

df2=pd.read_json('C:/Users/Trevor/Downloads/concatenated_abridged.jsonl.gz',compression='gzip',lines=True)

# For a retweet use the original tweet's full text, otherwise the row's own
# 'full_text'. The check is explicit (instead of a bare try/except around
# per-row .iloc lookups) so unrelated errors are no longer hidden.
if 'retweeted_status' in df2.columns:
    retweeted_column=df2['retweeted_status']
else:
    retweeted_column=[None]*len(df2)

more_tweets=[]
for retweeted, own_text in zip(retweeted_column, df2['full_text']):
    if isinstance(retweeted, dict) and 'full_text' in retweeted:
        more_tweets.append(retweeted['full_text'])
    else:
        more_tweets.append(own_text)

df2['tweet']=more_tweets
df2['tweet']=df2['tweet'].apply(lambda x: ' '.join(x.split()))

# Stack both sources into one Series and drop tweets that appear more than once.
# The index is not reset here; later cells select rows by position.
df_tweets=pd.concat([df2['tweet'],df['0']])

df_tweets=df_tweets.drop_duplicates()



Scoring each tweet (biden/trump) using the hashtags

In [4]:
# For every tweet, count how many hashtags of each group occur in it as a
# substring. Each entry is a (pro_trump, neutral, pro_biden) tuple, in tweet order.
tag_groups = (pro_trump, neutral, pro_biden)
scores = [
    tuple(sum(1 for tag in group if tag in tweet) for group in tag_groups)
    for tweet in df_tweets
]

Training the Biden classifier model


In [5]:
# One row per tweet; columns follow the order of the tuples in `scores`.
all_scores = pd.DataFrame(np.vstack(scores), columns=['pro_trump','neutral','pro_biden'])

# Net hashtag count in each direction, then a 0/1 flag for a strictly positive net count.
all_scores['biden_score'] = all_scores['pro_biden'] - all_scores['pro_trump']
all_scores['trump_score'] = all_scores['pro_trump'] - all_scores['pro_biden']
all_scores['trump'] = (all_scores['trump_score'] > 0).astype('int64')
all_scores['biden'] = (all_scores['biden_score'] > 0).astype('int64')

# Keep only tweets that contain at least one partisan hashtag.
has_candidate_tag = (all_scores['pro_trump'] > 0) | (all_scores['pro_biden'] > 0)
labelled = all_scores[has_candidate_tag]

# all_scores rows are in the same order as df_tweets, so its index values are
# used as positions into df_tweets.
X = df_tweets.iloc[labelled.index].reset_index()[0]

# Keep only the text that precedes the first 'http' in each tweet.
X = X.apply(lambda text: text.partition('http')[0])

y = labelled['biden'].reset_index(drop=True)

In [7]:

# Drop every whitespace-separated token containing '#' and lowercase what is left,
# so the hashtags that produced the labels are not available as features.
X=X.apply(lambda x: ' '.join([i.lower() for i in x.split() if '#' not in i]))

# Strip everything that is neither a word character nor whitespace.
X=X.apply(lambda x: re.sub(r'[^\w\s]','',x))

# Imported under an alias: a plain `from nltk.corpus import stopwords` would rebind
# the notebook-level `stopwords` set that TextCleaner.clean_tweets reads at step 6.
from nltk.corpus import stopwords as nltk_stopwords
nltk.download('stopwords')
stop_words = set(nltk_stopwords.words('english'))
# Tokens are already lowercase at this point, so no second .lower() is needed.
X=X.apply(lambda x: ' '.join([i for i in x.split() if i not in stop_words]))

# Remove tweets that ended up empty; the mask is taken before X is filtered so
# that y and X stay aligned.
non_empty = X != ''
y=y[non_empty]
X=X[non_empty]

X.to_csv('biden_data.csv',index=False)
y.to_csv('biden_target.csv',index=False)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Trevor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Read the text column strictly as strings: with the default NA handling a cleaned
# tweet that is literally "nan" or "null" would be parsed as a missing value,
# which the vectorizer cannot process.
X=pd.read_csv('biden_data.csv', dtype=str, keep_default_na=False)['0']
y=pd.read_csv('biden_target.csv').iloc[:,0]

# Fit the TF-IDF vocabulary on X and build the dense feature table in one pass.
mat = vectorizer.fit_transform(X).toarray()
X_full=pd.DataFrame(data=mat)

from sklearn.naive_bayes import MultinomialNB

model=MultinomialNB()
model.fit(X_full,y)

import pickle

# NOTE(review): only the classifier is pickled here; the fitted vectorizer is not
# saved in this cell -- confirm it is persisted elsewhere if the model is to be
# reused outside this notebook.
filename = 'NB_biden.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

Training the Trump classifier model

In [None]:
# Same tweet selection and text cleaning as the Biden cells above; only the target
# column ('trump') and the output file names differ.
has_candidate_tag = (all_scores['pro_trump']>0) | (all_scores['pro_biden']>0)

# all_scores rows are in the same order as df_tweets, so its index values are
# used as positions into df_tweets.
X=df_tweets.iloc[all_scores[has_candidate_tag].index]

X=X.reset_index()[0]

# Keep only the text that precedes the first 'http' in each tweet.
X=X.apply(lambda x: x.split('http')[0])

y=all_scores[has_candidate_tag]['trump'].reset_index(drop=True)

# Drop every token containing '#' and lowercase the rest, so the hashtags that
# produced the labels are not available as features.
X=X.apply(lambda x: ' '.join([i.lower() for i in x.split() if '#' not in i]))

# Strip everything that is neither a word character nor whitespace.
X=X.apply(lambda x: re.sub(r'[^\w\s]','',x))

# Imported under an alias: a plain `from nltk.corpus import stopwords` would rebind
# the notebook-level `stopwords` set that TextCleaner.clean_tweets reads at step 6.
from nltk.corpus import stopwords as nltk_stopwords
nltk.download('stopwords')
stop_words = set(nltk_stopwords.words('english'))
# Tokens are already lowercase at this point, so no second .lower() is needed.
X=X.apply(lambda x: ' '.join([i for i in x.split() if i not in stop_words]))

# Remove tweets that ended up empty; the mask is taken before X is filtered so
# that y and X stay aligned.
non_empty = X != ''
y=y[non_empty]
X=X[non_empty]

X.to_csv('trump_data.csv',index=False)
y.to_csv('trump_target.csv',index=False)

# Read the text column strictly as strings: with the default NA handling a cleaned
# tweet that is literally "nan" or "null" would be parsed as a missing value,
# which the vectorizer cannot process.
X=pd.read_csv('trump_data.csv', dtype=str, keep_default_na=False)['0']
y=pd.read_csv('trump_target.csv').iloc[:,0]

# Fit the TF-IDF vocabulary on X and build the dense feature table in one pass.
mat = vectorizer.fit_transform(X).toarray()
X_full=pd.DataFrame(data=mat)

from sklearn.naive_bayes import MultinomialNB

model=MultinomialNB()
model.fit(X_full,y)

import pickle

# NOTE(review): only the classifier is pickled here; the fitted vectorizer is not
# saved in this cell -- confirm it is persisted elsewhere if the model is to be
# reused outside this notebook.
filename = 'NB_trump.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)