## Analysis of Covid Tweets with NLP

Question: How seriously are Twitter users from each country taking Covid-19?

In [1]:
import pandas as pd
import numpy as np
import us
import re
import nltk
import sklearn
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
data = pd.read_csv("covid19_tweets.csv")

In [3]:
data.iloc[16]

user_name                                              ChennaiCityNow
user_location                                                     NaN
user_description    Individual tweeting about significant happenin...
user_created                                      2009-04-26 09:38:11
user_followers                                                   3987
user_friends                                                       53
user_favourites                                                   749
user_verified                                                   False
date                                              2020-07-25 12:26:44
text                July 25 #COVID19 update\n#TamilNadu - 6988\nDi...
hashtags                          ['COVID19', 'TamilNadu', 'chennai']
source                                             Twitter for iPhone
is_retweet                                                      False
Name: 16, dtype: object

In [4]:
data_1 = data[['user_location', 'user_description', 'user_verified', 'date', 'text', 'hashtags', 'is_retweet']]

In [5]:
data_1.user_location = data_1.user_location.str.split(',').str[-1].astype(str).str.strip().apply(us.states.lookup).astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [6]:
data_1

Unnamed: 0,user_location,user_description,user_verified,date,text,hashtags,is_retweet
0,,wednesday addams as a disney princess keepin i...,False,2020-07-25 12:27:21,If I smelled the scent of hand sanitizers toda...,,False
1,New York,"Husband, Father, Columnist & Commentator. Auth...",True,2020-07-25 12:27:17,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,False
2,Kentucky,#Christian #Catholic #Conservative #Reagan #Re...,False,2020-07-25 12:27:14,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],False
3,,#Browns #Indians #ClevelandProud #[]_[] #Cavs ...,False,2020-07-25 12:27:10,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],False
4,,🖊️Official Twitter handle of Department of Inf...,False,2020-07-25 12:27:08,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",False
...,...,...,...,...,...,...,...
179103,,Animal Scientist|| Muslim|| Real Madrid/Chelsea,False,2020-08-29 19:44:21,Thanks @IamOhmai for nominating me for the @WH...,['WearAMask'],False
179104,,When your cat has more baking soda than Ninja ...,False,2020-08-29 19:44:16,2020! The year of insanity! Lol! #COVID19 http...,['COVID19'],False
179105,,⚒️ The Architects of Free Trade ⚒️ Really Did ...,False,2020-08-29 19:44:15,@CTVNews A powerful painting by Juan Lucena. I...,,False
179106,,"Global UX UI Visual Designer. StoryTeller, Mus...",False,2020-08-29 19:44:14,"More than 1,200 students test positive for #CO...",['COVID19'],False


In [7]:
data_1.groupby(['user_location']).count()

Unnamed: 0_level_0,user_description,user_verified,date,text,hashtags,is_retweet
user_location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alabama,179,181,181,181,114,181
Alaska,65,65,65,65,50,65
Arizona,474,491,491,491,331,491
Arkansas,175,178,178,178,135,178
California,3879,3998,3998,3998,2819,3998
Colorado,452,461,461,461,316,461
Connecticut,266,270,270,270,190,270
Delaware,39,40,40,40,31,40
District of Columbia,1449,1456,1456,1456,1034,1456
Florida,1526,1590,1590,1590,1115,1590


In [8]:
data_2 = data_1[data_1['user_location'] != 'None']

In [9]:
data_2

Unnamed: 0,user_location,user_description,user_verified,date,text,hashtags,is_retweet
1,New York,"Husband, Father, Columnist & Commentator. Auth...",True,2020-07-25 12:27:17,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,False
2,Kentucky,#Christian #Catholic #Conservative #Reagan #Re...,False,2020-07-25 12:27:14,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],False
6,Florida,Workplace tips and advice served up in a frien...,False,2020-07-25 12:27:03,How #COVID19 Will Change Work in General (and ...,"['COVID19', 'Recruiting']",False
27,New York,"These days, I expose colonizers & exploits @ t...",False,2020-07-25 12:26:26,I can imagine the same people profiting off th...,['COVID19'],False
32,Florida,We beautify data to learn and gain insight fro...,False,2020-07-25 12:26:17,"An update on the total #covid19 cases, recover...","['covid19', 'Africa']",False
...,...,...,...,...,...,...,...
179093,Connecticut,Norwalk Public Library is a public library loc...,False,2020-08-29 19:44:48,What are those #library #cats doing now? #COV...,"['library', 'cats', 'COVID19', 'pandemic', 'co...",False
179095,Michigan,Host of the Morning Wake Up w/Dave Akerly @132...,False,2020-08-29 19:44:42,#COVID19 Update: 23 new cases today in the Tri...,['COVID19'],False
179096,California,"Creative Director/AD/CW, Advertising & Brandin...",False,2020-08-29 19:44:40,We were really bummed we couldn’t cop one of t...,,False
179100,New Jersey,@njherald reporter || chasing crime and coveri...,False,2020-08-29 19:44:27,Wallkill school nurse adds COVID-19 monitoring...,"['nurses', 'COVID19', 'coronavirus', 'schools']",False


In [10]:
train = data_2.sample(frac = .02)
test = data_2.drop(train.index)
train_new = pd.read_csv("covid19_tweets_train_new.csv")
train_new

Unnamed: 0,Index,user_location,user_description,user_verified,date,text,is_serious,hashtags,is_retweet
0,44450,Pennsylvania,Center of emphasis @CHOP_Research informing ch...,False,2020-07-31 19:19:58,Our Scientific Director Meredith Matone told @...,1.0,['COVID19'],False
1,11375,New York,"A child of God,wife,mother,sister,aunt,godmoth...",False,2020-07-25 04:36:37,Give $2000/month to every American #moneyforth...,0.0,"['moneyforthepeople', 'covid19']",False
2,128634,Pennsylvania,,False,2020-08-14 04:26:43,More people died of the seasonal flu in EVERY ...,0.0,['Covid_19'],False
3,54732,Georgia,Advanced Practice Registered Nurse working in ...,False,2020-08-01 17:50:53,Thanks to @macmillanpages for having me on her...,1.0,['COVID19'],False
4,65317,Indiana,"Local, state and national news from the WFIU n...",True,2020-08-02 17:57:00,"As of today at noon, the @StateHealthIN says 6...",1.0,,False
...,...,...,...,...,...,...,...,...,...
518,40860,Texas,it has electrolytes,False,2020-07-29 16:15:08,@abc13houston @DrJAshton @ABC Dear #DrAnthonyF...,0.0,['DrAnthonyFauci'],False
519,69717,Illinois,Building a revolutionary workers party dedicat...,False,2020-08-04 06:29:25,What set off my #bullshitdetector on early #Bo...,0.0,"['bullshitdetector', 'BourgeoisPress', 'COVID19']",False
520,29486,New Jersey,"Connecting People, Connecting Communities - ...",False,2020-07-27 03:50:44,"#COVID19 Dashboard (by State) with Confirmed, ...",1.0,['COVID19'],False
521,53091,Louisiana,Chief BeenBlocked \n#facts and #reason. #noGod...,False,2020-08-01 18:41:04,@RVAwonk @SnowflakeSnark #COVID19 looking for ...,0.0,['COVID19'],False


In [11]:
stop_words = set(stopwords.words('english')) 
porter = PorterStemmer()

In [12]:
train_new.text = train_new.text.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [13]:
train_new

Unnamed: 0,Index,user_location,user_description,user_verified,date,text,is_serious,hashtags,is_retweet
0,44450,Pennsylvania,Center of emphasis @CHOP_Research informing ch...,False,2020-07-31 19:19:58,Our Scientific Director Meredith Matone told @...,1.0,['COVID19'],False
1,11375,New York,"A child of God,wife,mother,sister,aunt,godmoth...",False,2020-07-25 04:36:37,Give $2000/month every American #moneyforthepe...,0.0,"['moneyforthepeople', 'covid19']",False
2,128634,Pennsylvania,,False,2020-08-14 04:26:43,More people died seasonal flu EVERY YEAR Obama...,0.0,['Covid_19'],False
3,54732,Georgia,Advanced Practice Registered Nurse working in ...,False,2020-08-01 17:50:53,Thanks @macmillanpages show talk nurse's persp...,1.0,['COVID19'],False
4,65317,Indiana,"Local, state and national news from the WFIU n...",True,2020-08-02 17:57:00,"As today noon, @StateHealthIN says 67,857 peop...",1.0,,False
...,...,...,...,...,...,...,...,...,...
518,40860,Texas,it has electrolytes,False,2020-07-29 16:15:08,@abc13houston @DrJAshton @ABC Dear #DrAnthonyF...,0.0,['DrAnthonyFauci'],False
519,69717,Illinois,Building a revolutionary workers party dedicat...,False,2020-08-04 06:29:25,What set #bullshitdetector early #BourgeoisPre...,0.0,"['bullshitdetector', 'BourgeoisPress', 'COVID19']",False
520,29486,New Jersey,"Connecting People, Connecting Communities - ...",False,2020-07-27 03:50:44,"#COVID19 Dashboard (by State) Confirmed, Death...",1.0,['COVID19'],False
521,53091,Louisiana,Chief BeenBlocked \n#facts and #reason. #noGod...,False,2020-08-01 18:41:04,@RVAwonk @SnowflakeSnark #COVID19 looking wind...,0.0,['COVID19'],False


In [33]:
def stem_sent(sentence):
    tokens = sentence.split()
    stemmed_tokens = [porter.stem(i) for i in tokens]
    return ' '.join(stemmed_tokens)

train_new.text = train_new.text.apply(stem_sent).str.replace('[^\w\s]','').str.replace('\d+', '').apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [34]:
train_new

Unnamed: 0,Index,user_location,user_description,user_verified,date,text,is_serious,hashtags,is_retweet
0,44450,Pennsylvania,Center of emphasis @CHOP_Research informing ch...,False,2020-07-31 19:19:58,scientif director meredith maton told busi wit...,1.0,['COVID19'],False
1,11375,New York,"A child of God,wife,mother,sister,aunt,godmoth...",False,2020-07-25 04:36:37,give month everi american moneyforthepeopl cov...,0.0,"['moneyforthepeople', 'covid19']",False
2,128634,Pennsylvania,,False,2020-08-14 04:26:43,peopl die season flu everi year obama presid d...,0.0,['Covid_19'],False
3,54732,Georgia,Advanced Practice Registered Nurse working in ...,False,2020-08-01 17:50:53,thank macmillanpag show talk nur perspect covi...,1.0,['COVID19'],False
4,65317,Indiana,"Local, state and national news from the WFIU n...",True,2020-08-02 17:57:00,As today noon statehealthin say peopl indiana ...,1.0,,False
...,...,...,...,...,...,...,...,...,...
518,40860,Texas,it has electrolytes,False,2020-07-29 16:15:08,abchouston drjashton abc dear dranthonyfauci t...,0.0,['DrAnthonyFauci'],False
519,69717,Illinois,Building a revolutionary workers party dedicat...,False,2020-08-04 06:29:25,set bullshitdetector earli bourgeoispress repo...,0.0,"['bullshitdetector', 'BourgeoisPress', 'COVID19']",False
520,29486,New Jersey,"Connecting People, Connecting Communities - ...",False,2020-07-27 03:50:44,covid dashboard bi state confirm death death c...,1.0,['COVID19'],False
521,53091,Louisiana,Chief BeenBlocked \n#facts and #reason. #noGod...,False,2020-08-01 18:41:04,rvawonk snowflakesnark covid look window get r...,0.0,['COVID19'],False


In [35]:
corpus = train_new[['user_location', 'text']]
corpus.index = corpus['user_location']
corpus

Unnamed: 0_level_0,user_location,text
user_location,Unnamed: 1_level_1,Unnamed: 2_level_1
Pennsylvania,Pennsylvania,scientif director meredith maton told busi wit...
New York,New York,give month everi american moneyforthepeopl cov...
Pennsylvania,Pennsylvania,peopl die season flu everi year obama presid d...
Georgia,Georgia,thank macmillanpag show talk nur perspect covi...
Indiana,Indiana,As today noon statehealthin say peopl indiana ...
...,...,...
Texas,Texas,abchouston drjashton abc dear dranthonyfauci t...
Illinois,Illinois,set bullshitdetector earli bourgeoispress repo...
New Jersey,New Jersey,covid dashboard bi state confirm death death c...
Louisiana,Louisiana,rvawonk snowflakesnark covid look window get r...


In [36]:
cv = CountVectorizer()
data_tdm = cv.fit_transform(corpus.text)
data_tdm_1 = pd.DataFrame(data_tdm.toarray(), columns = cv.get_feature_names())
data_tdm_1.index = corpus.index

In [37]:
data_tdm_1

Unnamed: 0_level_0,aap,ab,abati,abc,abchouston,abl,absolut,abu,academi,accel,...,youd,young,youtub,yr,zer,zip,zombi,②positivetest,③case,④top
user_location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Pennsylvania,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
New York,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Pennsylvania,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Georgia,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Indiana,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Texas,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Illinois,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
New Jersey,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Louisiana,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Exploratory Data Analysis

### Top Words

In [38]:
data_tdm_agg = data_tdm_1.groupby(['user_location']).sum()

In [39]:
data_tdm_agg.sum().sort_values(ascending = False).head(20)

covid         343
new            37
test           35
peopl          34
amp            32
coronaviru     30
case           29
mask           27
pandem         26
us             25
death          24
thi            21
work           21
like           21
go             20
say            20
we             20
report         19
need           19
die            19
dtype: int64

### Vocabulary

In [40]:
data_tdm_trans = data_tdm_agg.transpose()

In [41]:
data_td_null = data_tdm_agg.replace(0, np.nan)
data_td_null
vocab = data_td_null.count(axis = 1).sort_values(ascending = False)
vocab # the more tweets the more unique words - could sample words from each state to compare vocab

user_location
California                  677
New York                    535
Texas                       339
Florida                     316
Illinois                    213
District of Columbia        193
Oregon                      177
Georgia                     158
Arizona                     153
Pennsylvania                149
Ohio                        133
Washington                  132
Massachusetts               127
Maryland                    114
Colorado                    101
Oklahoma                     98
Minnesota                    91
Wisconsin                    88
Louisiana                    84
Tennessee                    84
Nevada                       80
North Carolina               78
South Carolina               72
Michigan                     69
Connecticut                  66
Alabama                      65
Indiana                      64
New Jersey                   56
Virginia                     53
Arkansas                     48
Hawaii                    

### Length of Tweets

In [42]:
corpus_new = corpus.drop(['user_location'], axis = 1)
tweet_count = corpus_new.groupby(['user_location']).count()
total_words = pd.DataFrame(data_td_null.sum(axis = 1))

In [43]:
total_words

Unnamed: 0_level_0,0
user_location,Unnamed: 1_level_1
Alabama,73.0
Arizona,169.0
Arkansas,50.0
California,952.0
Colorado,114.0
Connecticut,71.0
Delaware,8.0
District of Columbia,230.0
Florida,363.0
Georgia,189.0


In [44]:
tweet_length = (total_words[0]/tweet_count['text']).sort_values(ascending = False)

In [45]:
tweet_length

user_location
Missouri                    13.000000
Oregon                      12.687500
Michigan                    12.666667
Ohio                        12.583333
West Virginia               12.500000
Arkansas                    12.500000
North Carolina              12.142857
Louisiana                   12.125000
Hawaii                      12.000000
Utah                        12.000000
Northern Mariana Islands    12.000000
Washington                  11.916667
Connecticut                 11.833333
New Jersey                  11.800000
Wisconsin                   11.750000
Indiana                     11.666667
Pennsylvania                11.666667
Maine                       11.500000
District of Columbia        11.500000
Nebraska                    11.500000
California                  11.469880
Colorado                    11.400000
New York                    11.365079
Florida                     11.343750
Arizona                     11.266667
Massachusetts               11.23076

## Training Naive Bayes Model

In [63]:
labels = pd.DataFrame(train_new['is_serious'])
labels.index = data_tdm_1.index

In [90]:
train_df = pd.concat([labels, data_tdm_1], axis=1).dropna()

In [91]:
clf = MultinomialNB()
train_col = train_df.drop('is_serious', axis = 1)
train_labels = train_df['is_serious']
clf.fit(train_col, train_labels)

MultinomialNB()

In [105]:
predictions = pd.DataFrame(clf.predict(train_col), columns = ['predicted'])
predictions.index = train_labels.index
train_labels = pd.DataFrame(train_labels)
p_t = pd.concat([train_labels, predictions], axis=1)

In [110]:
p_t['correct'] = np.where(p_t['is_serious'] == p_t['predicted'], True, False)

In [112]:
p_t['correct'].mean()

0.9980842911877394