Created on July 23rd
Authors: Yuan-Chi Yang, Angel Xie

In [None]:
import pandas as pd
import nltk
import re

# Importing the data and performing some checks

The data consists of all the tweets classified as the 'p' class by the best-performing classifiers to date.

In [None]:
# Load the streamed tweets. tweet_id is read as a string, and
# keep_default_na=False stops pandas from turning its default NA markers
# (e.g. empty fields) into NaN.
df = pd.read_csv(
    './political-tweets-streaming.csv',
    header=0,
    keep_default_na=False,
    dtype={'tweet_id': str},
)

In [None]:
# Number of rows loaded from the CSV.
len(df)

In [None]:
# Column names available in the loaded frame.
df.columns

### Check Duplicates

In [None]:
# Count rows whose tweet_id occurs more than once; keep=False flags every
# copy, not only the repeats after the first.
df.duplicated(subset = ['tweet_id'], keep=False).sum()

### Include 'text_remove_stopwords' column

In [None]:
def loadStopWords(FILENAME):
    """Read a stop-word list from a text file, one entry per line.

    Every line is stripped of surrounding whitespace and kept in file order
    (blank lines become empty strings and duplicates are preserved). The
    number of entries read is printed as a quick sanity check.

    Args:
        FILENAME: Path of the stop-word file.

    Returns:
        list[str]: The stripped lines, in file order.
    """
    # The context manager closes the handle even if reading fails; the
    # previous version left the file open.
    with open(FILENAME) as infile:
        stopword_list = [line.strip() for line in infile]
    print(len(stopword_list))
    return stopword_list

In [None]:
def hashtag_funx(hashtag):
    """Break a hashtag into space-separated words.

    The substitutions run in order:
      1. every non-letter (including '#' and digits) becomes a space;
      2. a space is inserted before each capital followed by lowercase letters;
      3. a space is inserted before each run of two or more capitals;
      4. a single capital letter standing alone between whitespace (or at the
         start/end of the string) is replaced by a space;
      5. runs of whitespace are collapsed to one space.

    The result is not stripped, so it may start or end with a space.
    """
    def _prefix_space(match):
        # Re-emit the matched word with a separating space in front of it.
        return ' ' + match.group()

    substitutions = (
        (r'([^A-Za-z])', ' '),
        (r'([A-Z][a-z]{1,})', _prefix_space),
        (r'([A-Z]{2,})', _prefix_space),
        (r'^([A-Z]{1,1}\s+)|(\s+[A-Z]{1,1}\s+)|(\s+[A-Z]{1,1})$', ' '),
        (r'\s{2,}', ' '),
    )
    cleaned = hashtag
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def processing_text_remove_stopwords(tweet_text,stop_words):
    """Normalise a raw tweet and drop stop words and very short tokens.

    Steps, in order: replace '&amp;' with 'and', strip URLs and @mentions,
    expand hashtags through ``hashtag_funx``, turn every character other than
    ASCII letters, '_' and '-' into a space, lower-case, collapse whitespace,
    then keep only tokens of at least three characters that are not in
    ``stop_words``.

    Args:
        tweet_text: The raw tweet text.
        stop_words: Container of lower-case stop words (a set gives fast
            membership tests).

    Returns:
        str: The surviving tokens joined by single spaces. If no token
        survives but the cleaned text contains "medicaid for all", that
        phrase is returned instead of an empty string.
    """
    # All patterns are raw strings so that '\(' and '\S' reach the regex
    # engine without relying on Python's invalid-escape fallback.
    # replace the HTML entity '&amp;' with 'and'
    tweet_text = re.sub(r'&amp;', "and", tweet_text)
    # remove urls
    tweet_text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', tweet_text)
    # remove usernames (everything from '@' up to the next whitespace)
    tweet_text = re.sub(r'(@[\S]+)', '', tweet_text)
    # extract the meaningful terms in each hashtag
    tweet_text = re.sub(r'(#[\S]+)', lambda x: hashtag_funx(x.group()), tweet_text)
    # replace every character that is not an ASCII letter, '_' or '-'
    tweet_text = re.sub("[^a-zA-Z_-]", " ", tweet_text)
    tweet_text = tweet_text.lower()
    tweet_text = re.sub(r'\s{2,}', " ", tweet_text)
    # keep the cleaned text for the phrase fallback below
    tweet_text_save = tweet_text
    tokens = [t for t in tweet_text.split() if t not in stop_words and len(t) >= 3]
    if not tokens and "medicaid for all" in tweet_text_save:
        tokens = ["medicaid for all"]
    return ' '.join(tokens)

In [None]:
# Stored as a set so the per-token membership test during text cleaning is
# a hash lookup.
stopwords = set(loadStopWords('./stopwords.txt'))

In [None]:
# Cleaned, stop-word-free version of every tweet.
df['text_remove_stopwords'] = df['unprocessed_text'].apply(
    lambda raw_tweet: processing_text_remove_stopwords(raw_tweet, stopwords)
)

## Extract and Remove Near-duplicates

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
def remove_duplicates(df,col_title):
    """Drop near-duplicate rows by comparing neighbours after sorting.

    The frame is sorted by ``col_title`` so that similar strings end up next
    to each other. Each row is compared with the row directly before it in
    sorted order (whether or not that row was itself flagged). A row is
    flagged as a near-duplicate when the two lengths differ by less than 1/8
    of the longer one and ``fuzz.token_set_ratio`` exceeds 97. Flagged rows
    are written to ``nearduplicates.csv`` in the working directory and left
    out of the result.

    Args:
        df: DataFrame containing the text column.
        col_title: Name of the column to compare; its values must support
            ``len()``.

    Returns:
        The sorted DataFrame without the flagged rows. An empty input yields
        an empty result instead of raising IndexError.
    """
    df = df.sort_values(by=[col_title], ascending=True)
    # Pull the column out once; per-row df.iloc[i][col] lookups are slow.
    texts = df[col_title].tolist()

    dup_positions = []
    # Start at 1: the first row has no predecessor and is never removed.
    for pos in range(1, len(texts)):
        str_prev = texts[pos - 1]
        str_curr = texts[pos]
        longest = max(len(str_prev), len(str_curr))
        # The cheap length test runs first so the fuzzy match is skipped for
        # clearly different strings.
        if (abs(len(str_curr) - len(str_prev)) < longest / 8
                and fuzz.token_set_ratio(str_curr, str_prev) > 97):
            dup_positions.append(pos)

    # Positional selection keeps this correct even if index labels repeat.
    df.iloc[dup_positions].to_csv("nearduplicates.csv")
    flagged = set(dup_positions)
    keep_positions = [pos for pos in range(len(texts)) if pos not in flagged]
    return df.iloc[keep_positions]

## Find the most frequent words in text

In [None]:
# Tokenise every cleaned tweet once and reuse the tokens for all n-gram sizes.
tokenized_tweets = [tweet.split() for tweet in df['text_remove_stopwords'].to_list()]

# Bigrams/trigrams are built per tweet, so they never span two tweets.
unigrams = [token for tokens in tokenized_tweets for token in tokens]
bigrams = [pair for tokens in tokenized_tweets for pair in nltk.bigrams(tokens)]
trigrams = [triple for tokens in tokenized_tweets for triple in nltk.trigrams(tokens)]

# Frequency distributions over all tweets.
uni_fd = nltk.FreqDist(unigrams)
big_fd = nltk.FreqDist(bigrams)
trig_fd = nltk.FreqDist(trigrams)

### Find the number of tweets containing each of the top 40 terms (uni,bi,trigrams)

In [None]:
def wordInNumTweets(s):
    """Count the tweets whose cleaned text contains the term ``s``.

    Reads the module-level ``df`` and searches its 'text_remove_stopwords'
    column. The term must not be directly preceded or followed by an ASCII
    letter, so it only matches as a whole word or phrase. Each tweet counts
    at most once, however often the term occurs in it.

    Args:
        s: The literal word or phrase to look for.

    Returns:
        int: Number of tweets containing the term.
    """
    # re.escape keeps the term literal even if it contains regex
    # metacharacters; compiling once avoids rebuilding the pattern per tweet.
    pattern = re.compile(rf'(^|[^a-zA-Z]){re.escape(s)}([^a-zA-Z]|$)')
    return sum(
        1 for text in df['text_remove_stopwords']
        if pattern.search(text) is not None
    )

### Output results to the file 'ubt40.csv'

In [None]:
num = 40


def _top_terms(fd, limit):
    """Return (terms, counts, tweet_counts) for the most frequent keys of ``fd``.

    At most ``limit`` entries are returned, ordered by descending frequency
    (ties keep the distribution's own order). Tuple keys (bigrams, trigrams)
    are joined with single spaces. ``tweet_counts`` holds, for each term, the
    number of tweets containing it as reported by ``wordInNumTweets``.
    """
    # Slicing instead of indexing 0..limit-1 avoids an IndexError when the
    # distribution holds fewer than ``limit`` distinct entries.
    ranked = sorted(fd.items(), key=lambda item: item[1], reverse=True)[:limit]
    terms = [key if isinstance(key, str) else ' '.join(key) for key, _ in ranked]
    counts = [count for _, count in ranked]
    tweet_counts = [wordInNumTweets(term) for term in terms]
    return terms, counts, tweet_counts


#uni
df_term1, df_count1, df_numtweets1 = _top_terms(uni_fd, num)
#bi
df_term2, df_count2, df_numtweets2 = _top_terms(big_fd, num)
#tri
df_term3, df_count3, df_numtweets3 = _top_terms(trig_fd, num)

df_ubt={'uni_term':df_term1,'uni_count':df_count1,'uni_numtw':df_numtweets1,'bi_term':df_term2,'bi_count':df_count2,'bi_numtw':df_numtweets2,'tri_term':df_term3,'tri_count':df_count3,'tri_numtw':df_numtweets3}
# Wrapping each list in a Series lets columns of unequal length be padded
# with NaN instead of making the DataFrame constructor fail.
df_ubt=pd.DataFrame({name: pd.Series(values) for name, values in df_ubt.items()})
df_ubt.to_csv('ubt40.csv')

## Add sentiment scores to all tweets

In [None]:
from textblob import TextBlob
def addpolarityscores(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    return TextBlob(text).sentiment.polarity
def addsubjectivityscores(text):
    """Return the TextBlob sentiment subjectivity of ``text``."""
    return TextBlob(text).sentiment.subjectivity

In [None]:
# Score the cleaned text of every tweet; the scoring functions take exactly
# one argument, so they are passed to apply directly.
cleaned_text = df['text_remove_stopwords']
df['polarity'] = cleaned_text.apply(addpolarityscores)
df['subjectivity'] = cleaned_text.apply(addsubjectivityscores)

## Add a 1/0 label for the presence/absence of each term of interest; each column is named after the first term of its group

In [None]:
def highfreqword(text,terms):
    """Return 1 if any of ``terms`` occurs in ``text`` as a whole phrase, else 0.

    A term matches only when it is not directly preceded or followed by an
    ASCII letter, so 'aca' matches 'the aca' but not 'vacation'.

    Args:
        text: The text to search.
        terms: Iterable of literal words or phrases.

    Returns:
        int: 1 on the first matching term, 0 if none match (including when
        ``terms`` is empty).
    """
    for term in terms:
        # re.escape keeps the term literal even if it contains regex
        # metacharacters such as '.' or '+'.
        pattern = rf'(^|[^a-zA-Z]){re.escape(term)}([^a-zA-Z]|$)'
        if re.search(pattern, text) is not None:
            return 1
    return 0

In [None]:
# Each inner list is one group of spelling variants; the first entry names
# the label column.
term_ls = [
    ['cut social security', 'cutting social security', 'cuts social security', 'social security cuts'],
    ['mental health'],
    ['middle class'],
    ['affordable care act', 'aca'],
    ['tax cut', 'tax cuts'],
    ['food stamps'],
    ['low income'],
    ['planned parenthood'],
    ['minimum wage'],
    ['illegal immigrants'],
]
for term_group in term_ls:
    label_name = term_group[0]
    # 1 if the tweet contains any variant of the group, otherwise 0.
    df[label_name] = df['text_remove_stopwords'].apply(
        lambda cleaned: highfreqword(cleaned, term_group)
    )
df.to_csv('df_labels_scores.csv')

### Load the new dataframe 'df_labels_scores.csv' with added labels and scores

In [None]:
import pandas as pd
import nltk
import re
# Reload the labelled and scored tweets written by the labelling step.
df = pd.read_csv(
    './df_labels_scores.csv',
    header=0,
    keep_default_na=False,
    dtype={'tweet_id': str},
)
# Keep the first seven characters of the timestamp as the month key
# (presumably 'YYYY-MM' -- confirm against the 'time' column format).
df['yr_month'] = df['time'].apply(lambda timestamp: timestamp[:7])


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import PercentFormatter

### Polarity Distribution Plots

In [None]:
figure, axes = plt.subplots(3, 4, tight_layout=True, sharey="all", figsize=(15, 15))
titles = [
    'affordable care act', 'food stamps', 'cut social security', 'tax cut',
    'middle class', 'low income', 'mental health', 'minimum wage',
    'illegal immigrants', 'planned parenthood',
]
binls = [-1, -0.75, -0.5, -0.25, 0, 0.25, 0.5, 0.75, 1]
axes = axes.ravel()
# One histogram per term; zip stops after the ten titles, so the last two
# axes of the 3x4 grid stay empty.
for ax, title in zip(axes, titles):
    subset = df[df[title] == 1]
    scores = subset['polarity']
    n_rows = len(subset)
    # Equal weights of 1/n make the bar heights fractions of the subset.
    ax.hist(scores, weights=np.ones(n_rows) / n_rows, bins=binls)
    median_label = "median " + str(round(np.median(scores), 3))
    mean_label = "mean " + str(round(np.mean(scores), 3))
    ax.axvline(scores.median(), linestyle='dashed', color='r', label=median_label)
    ax.axvline(scores.mean(), linestyle='dashed', color='y', label=mean_label)
    ax.legend(loc='upper right')
    ax.set_title('Polarity:' + title)
    # Show the fractional heights as percentages.
    ax.yaxis.set_major_formatter(PercentFormatter(1))
plt.show()

### Subjectivity Distribution Plots

In [None]:
figure, axes = plt.subplots(3, 4, tight_layout=True, sharey="all", figsize=(15, 15))
titles = [
    'affordable care act', 'food stamps', 'cut social security', 'tax cut',
    'middle class', 'low income', 'mental health', 'minimum wage',
    'illegal immigrants', 'planned parenthood',
]
binls = [0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1]
axes = axes.ravel()
# One histogram per term; zip stops after the ten titles, so the last two
# axes of the 3x4 grid stay empty.
for ax, title in zip(axes, titles):
    subset = df[df[title] == 1]
    scores = subset['subjectivity']
    n_rows = len(subset)
    # Equal weights of 1/n make the bar heights fractions of the subset.
    ax.hist(scores, weights=np.ones(n_rows) / n_rows, bins=binls)
    median_label = "median " + str(round(np.median(scores), 3))
    mean_label = "mean " + str(round(np.mean(scores), 3))
    ax.axvline(scores.median(), linestyle='dashed', color='r', label=median_label)
    ax.axvline(scores.mean(), linestyle='dashed', color='y', label=mean_label)
    ax.legend(loc='upper right')
    ax.set_title('Subjectivity:' + title)
    # Show the fractional heights as percentages.
    ax.yaxis.set_major_formatter(PercentFormatter(1))
plt.show()

### Generate a Table with:
##### Term_Name,Year_Month,Mean_Scores,Monthly Standard Deviations, and Monthly Number of Occurrences

In [None]:
import numpy as np
titles = [
    'affordable care act', 'food stamps', 'cut social security', 'tax cut',
    'middle class', 'low income', 'mental health', 'minimum wage',
    'illegal immigrants', 'planned parenthood',
]
# Parallel lists: one entry per (term, month) pair.
terms, times, occs = [], [], []
pol_means, subj_means = [], []
pol_err, subj_err = [], []
for title in titles:
    # Tweets labelled as containing this term.
    df1 = df[df[title] == 1]
    x1 = list(df1.groupby(['yr_month']).groups.keys())
    for xx1 in x1:
        subset = df1[df1['yr_month'] == xx1]
        polarity = subset['polarity']
        subjectivity = subset['subjectivity']
        terms.append(title)
        times.append(xx1)
        occs.append(len(subset))
        pol_means.append(polarity.mean())
        pol_err.append(np.std(polarity))
        subj_means.append(subjectivity.mean())
        subj_err.append(np.std(subjectivity))
table = pd.DataFrame({
    'term': terms,
    'yr_month': times,
    'num_of_occurrences': occs,
    'polarity_mean': pol_means,
    'polarity_st_dev': pol_err,
    'subjectivity_mean': subj_means,
    'subjectivity_st_dev': subj_err,
})
table.to_csv('times_series_table.csv')


### Time Series plots of Polarity Trend for all ten terms

In [None]:
#plt.errorbar(x1,y1,yerr=se1,marker='s',mfc='red',mec='blue',ms=3,mew=5)
figure, axes = plt.subplots(3, 4, tight_layout=True, sharey="all", figsize=(20, 20))
titles = [
    'affordable care act', 'food stamps', 'cut social security', 'tax cut',
    'middle class', 'low income', 'mental health', 'minimum wage',
    'illegal immigrants', 'planned parenthood',
]
axes = axes.ravel()
for ax, title in zip(axes, titles):
    subset = table[table['term'] == title]
    # Two-line tick labels built from characters 2-3 and 5-6 of yr_month
    # (presumably year and month of a 'YYYY-MM' key -- confirm).
    x_labels = subset['yr_month'].apply(
        lambda month_key: str(month_key)[2:4] + '\n' + str(month_key)[5:7]
    )
    # Monthly mean with the monthly standard deviation as error bar.
    ax.errorbar(
        x_labels,
        subset['polarity_mean'],
        yerr=subset['polarity_st_dev'],
        marker='s',
        ms=7,
    )
    ax.set_title('Polarity Trend of:' + title)
plt.show()

### Time Series plots of Subjectivity Trend for all ten terms

In [None]:
#plt.errorbar(x1,y1,yerr=se1,marker='s',mfc='red',mec='blue',ms=3,mew=5)
figure, axes = plt.subplots(3, 4, tight_layout=True, sharey="all", figsize=(20, 20))
titles = [
    'affordable care act', 'food stamps', 'cut social security', 'tax cut',
    'middle class', 'low income', 'mental health', 'minimum wage',
    'illegal immigrants', 'planned parenthood',
]
axes = axes.ravel()
for ax, title in zip(axes, titles):
    subset = table[table['term'] == title]
    # Two-line tick labels built from characters 2-3 and 5-6 of yr_month
    # (presumably year and month of a 'YYYY-MM' key -- confirm).
    x_labels = subset['yr_month'].apply(
        lambda month_key: str(month_key)[2:4] + '\n' + str(month_key)[5:7]
    )
    # Monthly mean with the monthly standard deviation as error bar.
    ax.errorbar(
        x_labels,
        subset['subjectivity_mean'],
        yerr=subset['subjectivity_st_dev'],
        marker='s',
        ms=7,
    )
    ax.set_title('Subjectivity Trend of:' + title)
plt.show()

### Graph Monthly Occurrences for all ten terms

In [None]:
#plt.errorbar(x1,y1,yerr=se1,marker='s',mfc='red',mec='blue',ms=3,mew=5)
figure, axes = plt.subplots(3, 4, tight_layout=True, sharey="all", figsize=(20, 20))
titles = [
    'affordable care act', 'food stamps', 'cut social security', 'tax cut',
    'middle class', 'low income', 'mental health', 'minimum wage',
    'illegal immigrants', 'planned parenthood',
]
axes = axes.ravel()
for ax, title in zip(axes, titles):
    subset = table[table['term'] == title]
    # Two-line tick labels built from characters 2-3 and 5-6 of yr_month
    # (presumably year and month of a 'YYYY-MM' key -- confirm).
    x_labels = subset['yr_month'].apply(
        lambda month_key: str(month_key)[2:4] + '\n' + str(month_key)[5:7]
    )
    # Number of matching tweets per month.
    ax.plot(x_labels, subset['num_of_occurrences'], marker='s', ms=7)
    ax.set_title('Occurrences of:' + title)
plt.show()
