Created on 4/2/2020
Author: Yuan-Chi Yang

Objective: template for content analysis on political feedback, please feel free to modify and play around

In [40]:
import pandas as pd
import nltk
import re

# Importing the data and perform some checks

The data is from "C:\Users\yyang60\PostDoct-Emory\medicaid-project\medicaid-classifier\labeling-data\whole_dataset\BERT_20200228\political-tweets-streaming.csv"

It consists of all the tweets classified as the 'p' class by the best performing classfiers to date.

In [41]:
df = pd.read_csv('./political-tweets-streaming.csv',header = 0, keep_default_na=False,dtype={'tweet_id':str})

In [42]:
len(df)

383969

In [43]:
df.columns

Index(['tweet_id', 'unprocessed_text', 'class'], dtype='object')

### Check Duplicates
No Duplicated

In [44]:
df.duplicated(subset = ['tweet_id'], keep=False).sum()

0

### Include 'text_remove_stopwords' column

In [45]:
def loadStopWords(FILENAME):
    stopword_list = []
    infile = open(FILENAME)
    for line in infile:
        stopword_list.append(line.strip())
    print(len(stopword_list))
    return stopword_list

In [46]:
def processing_text_remove_stopwords(tweet_text,stop_words):
    tweet_text = re.sub(r'&amp;', "and", tweet_text)
    tweet_text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '',
                        tweet_text)
    #tweet_text = re.sub('(@[A-Za-z0-9\_]+)', '', tweet_text)
    tweet_text = re.sub('(@[\S]+)', '', tweet_text)#added Angel
    ####the following section separate words following each hashtag
    list_hashtag1=re.findall('(#[\S]+)',tweet_text)
    list_hashtag2=[]
    for p in list_hashtag1:
        p=re.sub('(#)','',p)#string
        p1=re.findall('([A-Z]{2,})',p)#list
        for p2 in p1:
            list_hashtag2.append(p2)
        p=re.sub('([A-Z]{2,})',' ',p)
        p1=re.findall('([A-Z][a-z]{1,})',p)
        for p2 in p1:
            list_hashtag2.append(p2)
        p=re.sub('([A-Z][a-z]{1,})',' ',p)
        p=re.sub('([A-Z])',' ',p)
        p=re.sub('([^A-Za-z])',' ',p)
        p1=p.split()
        for p2 in p1:
            list_hashtag2.append(p2)
    tweet_text=re.sub('(#[\S]+)','',tweet_text)
    
    tweet_text = re.sub("[^a-zA-Z_-]", " ", tweet_text)
    tweet_text = tweet_text.lower()
    tweet_text = re.sub(r'\s{2,}', " ", tweet_text)
    list_hashtag= [h for h in list_hashtag2 if (not h in stop_words and len(h)>3)]
    tweet_text = [t for t in tweet_text.split() if (not t in stop_words and len(t)>3)]
    tweet_text.extend(list_hashtag)
    return ' '.join(tweet_text)

In [47]:
stopwords = set(loadStopWords('./stopwords.txt'))

261


In [48]:
df['text_remove_stopwords'] = df['unprocessed_text'].apply(lambda x: processing_text_remove_stopwords(x,stopwords))

## Find the most frequent words in text

In [49]:
unigrams = ' '.join(df['text_remove_stopwords'].to_list()).split()
bigrams = []
for tweet in df['text_remove_stopwords'].to_list():
    bigrams += list(nltk.bigrams(tweet.split()))
    
trigrams = []
for tweet in df['text_remove_stopwords'].to_list():
    trigrams += list(nltk.trigrams(tweet.split()))
uni_fd = nltk.FreqDist(unigrams)
big_fd = nltk.FreqDist(bigrams)
trig_fd = nltk.FreqDist(trigrams)


In [70]:
num = 15
print('unigrams:\n')
fd = uni_fd
fd_list = [(x,fd[x]) for x in fd]
fd_list.sort(key = lambda x: x[1], reverse = True)
for i in range(0,num):
    print(fd_list[i][0], fd_list[i][1])
    
print('--------------------------------------------------------\n')
    
print('bigrams:\n')
fd = big_fd
fd_list = [(x,fd[x]) for x in fd]
fd_list.sort(key = lambda x: x[1], reverse = True)
for i in range(0,num):
    x, y= fd_list[i][0]
    term = x + ' '+ y
    print(term, fd_list[i][1])

print('--------------------------------------------------------\n')
print('trigrams:\n')

fd = trig_fd
fd_list = [(x,fd[x]) for x in fd]
fd_list.sort(key = lambda x: x[1], reverse = True)
for i in range(0,num):
    x, y, z= fd_list[i][0]
    term = x + ' '+ y + ' ' + z
    print(term, fd_list[i][1])

unigrams:

people 66954
social 64162
security 59281
care 51287
health 41356
would 40135
insurance 39921
trump 39077
healthcare 38641
expansion 38198
like 35822
state 29422
cuts 28059
need 26424
want 26282
--------------------------------------------------------

bigrams:

social security 56353
food stamps 18234
health care 16445
health insurance 8658
private insurance 5960
cuts social 4551
take away 4155
work requirements 3638
middle class 3531
trump budget 3182
Medicare Medicaid 3103
mental health 3037
budget cuts 3009
pre-existing conditions 2964
insurance companies 2753
--------------------------------------------------------

trigrams:

cuts social security 4383
cutting social security 1805
welfare food stamps 1738
billion social security 1732
like social security 1466
trillion billion billion 1304
billion billion social 1173
social security cuts 1139
social security food 1107
social security public 1086
food stamps housing 1059
social security budget 1031
security food stamps 965


## Finding the interesting terms

In [71]:
def highfreqword(text,term):
    pattern = rf'(^|[^a-zA-Z]){term}([^a-zA-Z]|$)' #rf is for using a variable inside
    #pattern = rf'{term}'
    if(re.search(pattern,text)!=None):
        return 1
    else:
        return 0

In [113]:
bigrams_ls=['social security','healthcare','insurance','food stamps','health care','health insurance','private insurance','cuts social','take away','work requirements','middle class','trump budget','pre-existing conditions','insurance companies','expansion','budget cuts']
for i in range(16):
    coln=bigrams_ls[i]
    df[coln] = df['text_remove_stopwords'].apply(lambda x:highfreqword(x,coln))

In [114]:
df.sample(n=5)
df_temp=df[df['social security'].apply(lambda x:x==1)]
len(df_temp)

54964

In [115]:
df_temp1=df_temp[df['pre-existing conditions'].apply(lambda x:x==1)]
len(df_temp1)

  df_temp1=df_temp[df['pre-existing conditions'].apply(lambda x:x==1)]


506

In [54]:
#term2 = 'trump budget'
#pattern = rf'(^|[^a-zA-Z]){term2}([^a-zA-Z]|$)' #rf is for using a variable inside
#df_temp2 = df[df['text_remove_stopwords'].apply(lambda x: re.search(pattern,x)!=None)]
#print('# of tweets:',len(df_temp2))


In [55]:
#df_temp12 =df_temp1.append(df_temp2)
#print('# of duplicated tweets:',df_temp12.duplicated(subset = ['tweet_id'], keep=False).sum())

# of duplicated tweets: 0


In [116]:
num = 30
temp = df_temp1.sample(n=num)
for i in range(0,num):
    print(temp['unprocessed_text'].iloc[i],'\n\n')
    #print(temp['text_remove_stopwords'].iloc[i],'\n\n')
    print('---------------------------------------------------------------------------------')

@lakesideSMM @Prise88 @ByronYork you do realize they are going to dismantle aca, cut medicare, medicaid and #Moscowmitch wants to take away social security altogether that we paid in that is our money not the gov't money.  and they will do away with pre-existing conditions.  130 million americans can't afford it 


---------------------------------------------------------------------------------
@VP @realDonaldTrump @IngrahamAngle LIES LIES LIES!!!

Donald Trump wants to take healthcare away from millions!  Protections off pre-existing conditions, gut Medicare, Medicaid and Social Security!! 

Top Diplomat Testifies: THERE WAS A QUID PRO QUO!!!! 


---------------------------------------------------------------------------------
@mommamia1217 6 months to primaries impeachment is a nonissue now.. focus on getting the message out about cuts to Medicaid and Medicare . Pre-existing conditions , social security. The Dems in power have chosen to lets us down. Absolutely no voice or message. 

In [118]:
temp.to_csv('ss_pec_sample.csv')

