In [2]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import regex as re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from textblob import TextBlob

In [3]:
data = pd.read_csv('/Users/derekmcadam/GA/projects/project_5/client_project_5/cleandata/new_tweets_noDuplicates.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,date,text,state,query_term
0,0,2020-04-24 21:59:42+00:00,The economy will collapse in the long term if ...,DC,business closures
1,1,2020-04-23 21:28:12+00:00,Classic counsel on cover letters! Covid-19 and...,DC,business closures
2,2,2020-04-17 01:01:43+00:00,I'm guessing that would violate the mayor's no...,DC,business closures
3,3,2020-04-16 18:19:35+00:00,"Ok, why did this take so long...and yet mandat...",DC,business closures
4,4,2020-04-16 01:53:41+00:00,Explain exactly how government-forced business...,DC,business closures


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59930 entries, 0 to 59929
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  59930 non-null  int64 
 1   date        59930 non-null  object
 2   text        59930 non-null  object
 3   state       59930 non-null  object
 4   query_term  59930 non-null  object
dtypes: int64(1), object(4)
memory usage: 2.3+ MB


In [6]:
data.isna().sum()

Unnamed: 0    0
date          0
text          0
state         0
query_term    0
dtype: int64

In [7]:
data.drop(columns=['Unnamed: 0'], inplace=True)

In [8]:
data.head()

Unnamed: 0,date,text,state,query_term
0,2020-04-24 21:59:42+00:00,The economy will collapse in the long term if ...,DC,business closures
1,2020-04-23 21:28:12+00:00,Classic counsel on cover letters! Covid-19 and...,DC,business closures
2,2020-04-17 01:01:43+00:00,I'm guessing that would violate the mayor's no...,DC,business closures
3,2020-04-16 18:19:35+00:00,"Ok, why did this take so long...and yet mandat...",DC,business closures
4,2020-04-16 01:53:41+00:00,Explain exactly how government-forced business...,DC,business closures


In [9]:
# Adapted from DC_Flex week 5
def create_lemmas(text):
    '''return lowercased, lemmatizeed list of words as a string from a document passed in '''
   
    lemmer = WordNetLemmatizer()
    return ' '.join([lemmer.lemmatize(word) for word in text.split()])

In [10]:
#  remoiving emojis and other unicode from text 
# adatped from https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python

def remove_emoji(data):
    '''
    Removes unicode text and replace with nothing
    '''
    
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emojis
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

In [11]:
def clean_text(text):
    '''
    Remove Emoji, html tags, URLS and punctuation -- returns lemmatized text.
    '''
    text = remove_emoji(text)
    # remove HTML tags and URLs
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'^https?:\/\/.*[\r\n]*','',text)
    
    # keep only text without punctuation
    text = re.sub(r'[^\w\s]','',text)

    lemmas = create_lemmas(text)
    
    return lemmas

In [12]:
data['clean_tweets_lemmas'] = data['text'].apply(lambda x: clean_text(x))

In [13]:
data.head()

Unnamed: 0,date,text,state,query_term,clean_tweets_lemmas
0,2020-04-24 21:59:42+00:00,The economy will collapse in the long term if ...,DC,business closures,The economy will collapse in the long term if ...
1,2020-04-23 21:28:12+00:00,Classic counsel on cover letters! Covid-19 and...,DC,business closures,Classic counsel on cover letter Covid19 and bu...
2,2020-04-17 01:01:43+00:00,I'm guessing that would violate the mayor's no...,DC,business closures,Im guessing that would violate the mayor nones...
3,2020-04-16 18:19:35+00:00,"Ok, why did this take so long...and yet mandat...",DC,business closures,Ok why did this take so longand yet mandatory ...
4,2020-04-16 01:53:41+00:00,Explain exactly how government-forced business...,DC,business closures,Explain exactly how governmentforced business ...


In [14]:
def sentiment_polarity(text):
    tweet = TextBlob(text)
    return tweet.polarity 

In [15]:
data['polarity'] = data['clean_tweets_lemmas'].apply(lambda x: sentiment_polarity(x))

In [16]:
def sentiment_subjectivity(text):
    tweet = TextBlob(text)
    return tweet.subjectivity 

In [17]:
data['subjectivity'] = data['clean_tweets_lemmas'].apply(lambda x: sentiment_subjectivity(x))

In [21]:
TextBlob.sentiment

<textblob.decorators.cached_property at 0x7fa230ad43d0>

In [17]:
data.head()

Unnamed: 0,date,text,state,query_term,clean_tweets_lemmas,polarity,subjectivity
0,2020-04-24 21:59:42+00:00,The economy will collapse in the long term if ...,DC,business closures,The economy will collapse in the long term if ...,-0.003571,0.334524
1,2020-04-23 21:28:12+00:00,Classic counsel on cover letters! Covid-19 and...,DC,business closures,Classic counsel on cover letter Covid19 and bu...,0.118056,0.451389
2,2020-04-17 01:01:43+00:00,I'm guessing that would violate the mayor's no...,DC,business closures,Im guessing that would violate the mayor nones...,0.0,0.0
3,2020-04-16 18:19:35+00:00,"Ok, why did this take so long...and yet mandat...",DC,business closures,Ok why did this take so longand yet mandatory ...,0.233333,0.683333
4,2020-04-16 01:53:41+00:00,Explain exactly how government-forced business...,DC,business closures,Explain exactly how governmentforced business ...,0.25,0.25


In [33]:
data.state.replace({
'Alabama': 'AL',
'Alaska': 'AK',
'Arizona':'AZ',
'Arkansas':'AR',
'California':'CA',
'Colorado':'CO',
'Connecticut':'CT',
'Delaware':'DE',
'Florida':'FL',
'Georgia':'GA',
'Hawaii':'HI',
'Idaho':'ID',
'Illinois':	'IL',
'Indiana':'IN',
'Iowa':'IA',
'Kansas':'KS',
'Kentucky':	'KY',
'Louisiana':'LA',
'Maine':'ME',
'Maryland':'MD',
'Massachusetts':'MA',
'Michigan':	'MI',
'Minnesota':'MN',
'Mississippi':'MS',
'Missouri':	'MO',
'Montana':'MT',
'Nebraska':'NE',
'Nevada':'NV',
'New Hampshire':'NH',
'New Jersey':'NJ',
'New Mexico':'NM',
'New York':'NY',
'North Carolina':'NC',
'North Dakota':'ND',
'Ohio':'OH',
'Oklahoma':'OK',
'Oregon':'OR',
'Pennsylvania':'PA',
'Rhode Island':	'RI',
'South Carolina':'SC',
'South Dakota':'SD',
'Tennessee':'TN',
'Texas':'TX',
'Utah':'UT',
'Vermont':'VT',
'Virginia':	'VA',
'Washington':'WA',
'West Virginia':'WV',
'Wisconsin':'WI',
'Wyoming':'WY',
}, inplace=True)

In [34]:
data.state.unique()

array(['DC', 'MA', 'MS', 'NY', 'WY', 'CT', 'DE', 'GA', 'MO', 'OR', 'PA',
       'RI', 'AL', 'AK', 'AZ', 'CA', 'CO', 'FL', 'ID', 'IN', 'KY', 'LA',
       'MD', 'MI', 'MN', 'NV', 'NM', 'OH', 'OK', 'SC', 'TX', 'UT', 'VA',
       'WI', 'AR', 'HI', 'IL', 'IA', 'KS', 'ME', 'NH', 'NC', 'SD', 'TN',
       'VT', 'WA', 'NE', 'NJ', 'MT', 'WV'], dtype=object)

In [40]:
data.head()

Unnamed: 0,date,text,state,query_term,clean_tweets_lemmas,polarity,subjectivity
0,2020-04-24 21:59:42+00:00,The economy will collapse in the long term if ...,DC,business closures,The economy will collapse in the long term if ...,-0.003571,0.334524
1,2020-04-23 21:28:12+00:00,Classic counsel on cover letters! Covid-19 and...,DC,business closures,Classic counsel on cover letter Covid19 and bu...,0.118056,0.451389
2,2020-04-17 01:01:43+00:00,I'm guessing that would violate the mayor's no...,DC,business closures,Im guessing that would violate the mayor nones...,0.0,0.0
3,2020-04-16 18:19:35+00:00,"Ok, why did this take so long...and yet mandat...",DC,business closures,Ok why did this take so longand yet mandatory ...,0.233333,0.683333
4,2020-04-16 01:53:41+00:00,Explain exactly how government-forced business...,DC,business closures,Explain exactly how governmentforced business ...,0.25,0.25


In [66]:
state_avg_sentiment_score = data.groupby('state')[['polarity', 'subjectivity']].mean()
state_avg_sentiment_score

Unnamed: 0_level_0,polarity,subjectivity
state,Unnamed: 1_level_1,Unnamed: 2_level_1
AK,0.067973,0.360932
AL,0.059283,0.340674
AR,0.051017,0.363592
AZ,0.059012,0.34436
CA,0.066916,0.347274
CO,0.057687,0.34345
CT,0.077335,0.363663
DC,0.072997,0.353666
DE,0.077937,0.359825
FL,0.056697,0.330574


In [67]:
state_avg_sentiment_score.to_csv('data/state_avg_sentiment_score.csv')

In [64]:
state_avg_subjectivity_score = data.groupby('state')['subjectivity'].mean()
state_avg_subjectivity_score

state
AK    0.360932
AL    0.340674
AR    0.363592
AZ    0.344360
CA    0.347274
CO    0.343450
CT    0.363663
DC    0.353666
DE    0.359825
FL    0.330574
GA    0.360597
HI    0.306500
IA    0.359890
ID    0.369897
IL    0.341589
IN    0.340085
KS    0.332817
KY    0.340552
LA    0.341518
MA    0.360112
MD    0.343050
ME    0.345876
MI    0.353546
MN    0.357171
MO    0.352723
MS    0.361340
MT    0.348590
NC    0.335306
NE    0.380241
NH    0.370515
NJ    0.336262
NM    0.349981
NV    0.362088
NY    0.369856
OH    0.333250
OK    0.352148
OR    0.368321
PA    0.361259
RI    0.357930
SC    0.346674
SD    0.330223
TN    0.363188
TX    0.345447
UT    0.356774
VA    0.339492
VT    0.347679
WA    0.366462
WI    0.405570
WV    0.650000
WY    0.354852
Name: subjectivity, dtype: float64

In [65]:
state_avg_subjectivity_score.to_csv('data/state_avg_subjectivity_score.csv')

In [39]:
#data.to_csv('sentiment-scored-data-abbrev.csv', index=False)

In [24]:
cvec = CountVectorizer(stop_words='english', 
                       strip_accents='unicode', 
                       preprocessor=create_lemmas)

In [23]:
# Adapted From https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

def get_top_n_words(corpus, n=None):
    '''
    Vectorize a corpus, sum the frequency, and return a sorted list
    '''
    
    # instanciate and fit countvectorizer on corpus
    cvec = CountVectorizer(stop_words='english').fit(corpus)
    
    # Transform corpus
    bag_of_words = cvec.transform(corpus)
    
    # sum words in bag of words
    sum_words = bag_of_words.sum(axis=0) 
    
    # tally word sum based on word
    words_freq = [(word, sum_words[0, x]) for word, x in cvec.vocabulary_.items()]
    
    # Sort words_freq
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [24]:
# Adapted From https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

def get_top_n_words(corpus, n=None):
    '''
    Vectorize a corpus, sum the frequency, and return a sorted list
    '''
    
    # instanciate and fit countvectorizer on corpus
    cvec = CountVectorizer(stop_words='english', ngram_range=(2,2)).fit(corpus)
    
    # Transform corpus
    bag_of_words = cvec.transform(corpus)
    
    # sum words in bag of words
    sum_words = bag_of_words.sum(axis=0) 
    
    # tally word sum based on word
    words_freq = [(word, sum_words[0, x]) for word, x in cvec.vocabulary_.items()]
    
    # Sort words_freq
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [25]:
get_top_n_words(data.text)

121),
 ('100 000', 120),
 ('19 response', 120),
 ('post covid', 120),
 ('com coronavirus', 119),
 ('healthcare workers', 119),
 ('000 deaths', 118),
 ('contact tracing', 118),
 ('social distance', 117),
 ('mask social', 117),
 ('small business', 116),
 ('time social', 116),
 ('utm_medium social', 116),
 ('04 30', 115),
 ('coronavirus lockdown', 115),
 ('maintain social', 114),
 ('com article', 113),
 ('dr fauci', 113),
 ('die covid', 113),
 ('distancing masks', 112),
 ('new normal', 111),
 ('doesn mean', 109),
 ('social amp', 108),
 ('people dying', 107),
 ('don care', 107),
 ('dying covid', 107),
 ('2020 03', 106),
 ('new jersey', 106),
 ('distancing amp', 105),
 ('died coronavirus', 105),
 ('high risk', 105),
 ('60 000', 105),
 ('murder hornets', 105),
 ('coronavirus vaccine', 104),
 ('health officials', 104),
 ('gov whitmer', 104),
 ('coronavirus patients', 103),
 ('19 relief', 103),
 ('facebook com', 102),
 ('19 virus', 102),
 ('cases coronavirus', 101),
 ('tested covid', 99),
 ('c

In [26]:
get_top_n_words(data.text)

121),
 ('100 000', 120),
 ('19 response', 120),
 ('post covid', 120),
 ('com coronavirus', 119),
 ('healthcare workers', 119),
 ('000 deaths', 118),
 ('contact tracing', 118),
 ('social distance', 117),
 ('mask social', 117),
 ('small business', 116),
 ('time social', 116),
 ('utm_medium social', 116),
 ('04 30', 115),
 ('coronavirus lockdown', 115),
 ('maintain social', 114),
 ('com article', 113),
 ('dr fauci', 113),
 ('die covid', 113),
 ('distancing masks', 112),
 ('new normal', 111),
 ('doesn mean', 109),
 ('social amp', 108),
 ('people dying', 107),
 ('don care', 107),
 ('dying covid', 107),
 ('2020 03', 106),
 ('new jersey', 106),
 ('distancing amp', 105),
 ('died coronavirus', 105),
 ('high risk', 105),
 ('60 000', 105),
 ('murder hornets', 105),
 ('coronavirus vaccine', 104),
 ('health officials', 104),
 ('gov whitmer', 104),
 ('coronavirus patients', 103),
 ('19 relief', 103),
 ('facebook com', 102),
 ('19 virus', 102),
 ('cases coronavirus', 101),
 ('tested covid', 99),
 ('c

In [27]:
# Adapted From https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

def get_top_n_words(corpus, n=None):
    '''
    Vectorize a corpus, sum the frequency, and return a sorted list
    '''
    
    # instanciate and fit countvectorizer on corpus
    cvec = CountVectorizer(stop_words='english', ngram_range=(3,3)).fit(corpus)
    
    # Transform corpus
    bag_of_words = cvec.transform(corpus)
    
    # sum words in bag of words
    sum_words = bag_of_words.sum(axis=0) 
    
    # tally word sum based on word
    words_freq = [(word, sum_words[0, x]) for word, x in cvec.vocabulary_.items()]
    
    # Sort words_freq
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [28]:
get_top_n_words(data.text)

19 people', 32),
 ('deaths http dlvr', 32),
 ('health covid 19', 32),
 ('free covid 19', 32),
 ('social distancing seriously', 32),
 ('wearing mask social', 32),
 ('self quarantine weeks', 32),
 ('dr anthony fauci', 31),
 ('cdc gov coronavirus', 31),
 ('carolina https www', 31),
 ('news covid 19', 31),
 ('soc_trk tw yahoonews', 31),
 ('deaths weeks ago', 31),
 ('com article news', 31),
 ('risk covid 19', 31),
 ('covid 19 need', 31),
 ('place social distancing', 31),
 ('cloth face coverings', 31),
 ('19 social distancing', 31),
 ('covid 19 health', 31),
 ('support social distancing', 31),
 ('social distancing order', 31),
 ('social distancing place', 31),
 ('break social distancing', 31),
 ('breaking news newswars', 31),
 ('news newswars share', 31),
 ('newswars share links', 31),
 ('https hubs ly', 30),
 ('new york https', 30),
 ('gov coronavirus 2019', 30),
 ('recovered covid 19', 30),
 ('covid 19 impact', 30),
 ('https www cnbc', 30),
 ('www cnbc com', 30),
 ('cnbc com 2020', 30),
 (

In [29]:
# Adapted From https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

def get_top_n_words(corpus, n=None):
    '''
    Vectorize a corpus, sum the frequency, and return a sorted list
    '''
    
    # instanciate and fit countvectorizer on corpus
    cvec = TfidfVectorizer(stop_words='english', ngram_range=(1,1)).fit(corpus)
    
    # Transform corpus
    bag_of_words = cvec.transform(corpus)
    
    # sum words in bag of words
    sum_words = bag_of_words.sum(axis=0) 
    
    # tally word sum based on word
    words_freq = [(word, sum_words[0, x]) for word, x in cvec.vocabulary_.items()]
    
    # Sort words_freq
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [30]:
get_top_n_words(data.text)

571376),
 ('especially', 74.48533831987471),
 ('seriously', 74.08900030379536),
 ('ways', 73.95261544941525),
 ('probably', 73.88521887750603),
 ('fear', 73.63004564140849),
 ('friend', 73.5605461448404),
 ('control', 73.416246124195),
 ('left', 73.3314884362739),
 ('election', 73.18234450481022),
 ('different', 72.85217434439322),
 ('including', 72.77499553515227),
 ('cnn', 72.75583630463657),
 ('ok', 72.73226285883372),
 ('join', 72.72370778252626),
 ('hey', 72.67502144056581),
 ('resources', 72.51234871565828),
 ('seeing', 72.51003423348823),
 ('pretty', 72.38091536790255),
 ('second', 71.95655682888854),
 ('healthy', 71.88851759465571),
 ('gop', 71.88272826666584),
 ('try', 71.8778370319431),
 ('15', 71.86777063536537),
 ('kill', 71.25415493097645),
 ('members', 71.17291613867822),
 ('travel', 71.0876567117131),
 ('course', 70.98993335751763),
 ('fact', 70.91631727544856),
 ('sign', 70.91084041976512),
 ('feet', 70.70180413195432),
 ('hear', 70.67270696491258),
 ('50', 70.583360957

In [31]:
def get_top_n_words(corpus, n=None):
    '''
    Vectorize a corpus, sum the frequency, and return a sorted list
    '''
    
    # instanciate and fit countvectorizer on corpus
    cvec = TfidfVectorizer(stop_words='english', ngram_range=(2,2)).fit(corpus)
    
    # Transform corpus
    bag_of_words = cvec.transform(corpus)
    
    # sum words in bag of words
    sum_words = bag_of_words.sum(axis=0) 
    
    # tally word sum based on word
    words_freq = [(word, sum_words[0, x]) for word, x in cvec.vocabulary_.items()]
    
    # Sort words_freq
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [32]:
get_top_n_words(data.text)

844007365),
 ('don believe', 10.523238295351838),
 ('000 new', 10.516930417759538),
 ('coronavirus isn', 10.507883470360097),
 ('https fxn', 10.480453726920496),
 ('fxn ws', 10.480453726920496),
 ('home coronavirus', 10.468314920323106),
 ('www youtube', 10.465935645034971),
 ('jared kushner', 10.428370479877952),
 ('hand washing', 10.41817295760127),
 ('covid thing', 10.399811044441705),
 ('distancing isn', 10.388622624703142),
 ('right wing', 10.387111890895365),
 ('supply chain', 10.386451236895699),
 ('impacted covid', 10.383533539406438),
 ('tinyurl com', 10.377915957752657),
 ('south dakota', 10.376968731132301),
 ('pandemic http', 10.365725980090211),
 ('19 updates', 10.341139166071736),
 ('coronavirus originated', 10.32257039722551),
 ('coronavirus case', 10.319773872027895),
 ('https etsy', 10.30083954622732),
 ('https news', 10.291790929093015),
 ('ve heard', 10.287415417747214),
 ('don worry', 10.2630205707004),
 ('cnn com', 10.242879715510604),
 ('daily coronavirus', 10.224

In [33]:
def get_top_n_words(corpus, n=None):
    '''
    Vectorize a corpus, sum the frequency, and return a sorted list
    '''
    
    # instanciate and fit countvectorizer on corpus
    cvec = TfidfVectorizer(stop_words='english', ngram_range=(3,3)).fit(corpus)
    
    # Transform corpus
    bag_of_words = cvec.transform(corpus)
    
    # sum words in bag of words
    sum_words = bag_of_words.sum(axis=0) 
    
    # tally word sum based on word
    words_freq = [(word, sum_words[0, x]) for word, x in cvec.vocabulary_.items()]
    
    # Sort words_freq
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [34]:
get_top_n_words(data.text)

5.22604236998172),
 ('social distancing important', 5.211452319554938),
 ('lifting shelter place', 5.205465433125191),
 ('respect social distancing', 5.197576795514005),
 ('encourage social distancing', 5.191591500111459),
 ('age social distancing', 5.177061002072233),
 ('covid 19 briefing', 5.173367767635253),
 ('daily covid 19', 5.171406523106979),
 ('county health department', 5.161745124123077),
 ('day covid 19', 5.159651606717197),
 ('coronavirus emergency declaration', 5.141309313169058),
 ('washingtontimes com news', 5.138284122621953),
 ('2020 05 08', 5.137575807864128),
 ('costs covid 19', 5.130606254560671),
 ('social distancing think', 5.128929078729413),
 ('covid 19 trump', 5.117106610487626),
 ('nursing homes covid', 5.1153637675172865),
 ('coronavirus cases deaths', 5.107196654510391),
 ('people stay home', 5.105807405570502),
 ('social distancing precautions', 5.105440735967446),
 ('stayactive socialdistancing stayhealthy', 5.098897584431104),
 ('gov gretchen whitmer', 5

In [None]:
def sentiment(text):
    