In [2]:
#Data
import requests
import csv

#EDA and Visualizations
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#modeling
from nltk.tokenize import TweetTokenizer, word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, confusion_matrix, mean_squared_error
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import psutil

import datetime



In [3]:
google = pd.read_csv('/covid_searches.csv')
google

Unnamed: 0,Category: All categories
Week,Symptoms of COVID-19: (United States)
2020-05-03,35
2020-05-10,34
2020-05-17,32
2020-05-24,30
...,...
2021-12-12,34
2021-12-19,56
2021-12-26,78
2022-01-02,81


In [4]:
google.reset_index(inplace=True)

google.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 2 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   index                     90 non-null     object
 1   Category: All categories  90 non-null     object
dtypes: object(2)
memory usage: 1.5+ KB


In [5]:
google.drop(index=google.index[0], 
        axis=0, 
        inplace=True)

In [6]:
google['date'] = google['index']
google['relevance'] = google['Category: All categories']

google.drop(columns='index', inplace=True)
google.drop(columns='Category: All categories', inplace=True)

google

Unnamed: 0,date,relevance
1,2020-05-03,35
2,2020-05-10,34
3,2020-05-17,32
4,2020-05-24,30
5,2020-05-31,27
...,...,...
85,2021-12-12,34
86,2021-12-19,56
87,2021-12-26,78
88,2022-01-02,81


In [7]:
google['date'] = pd.to_datetime(google['date'])

In [8]:
google['relevance'] = google['relevance'].astype(int)

In [9]:
google['outbreak'] = [x for x in google['relevance'] > 50]

In [10]:
google.head(21)

Unnamed: 0,date,relevance,outbreak
1,2020-05-03,35,False
2,2020-05-10,34,False
3,2020-05-17,32,False
4,2020-05-24,30,False
5,2020-05-31,27,False
6,2020-06-07,32,False
7,2020-06-14,50,False
8,2020-06-21,74,True
9,2020-06-28,88,True
10,2020-07-05,93,True


In [None]:
google.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   index      89 non-null     int64         
 1   date       89 non-null     datetime64[ns]
 2   relevance  89 non-null     int64         
 3   outbreak   89 non-null     bool          
dtypes: bool(1), datetime64[ns](1), int64(2)
memory usage: 2.3 KB


### CDC Data

In [None]:
CDC = pd.read_csv('/USA_covid_stats.csv')
CDC

Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,created_at,consent_cases,consent_deaths
0,03/11/2021,KS,297229,241035.0,56194.0,0,0.0,4851,,,0,0.0,03/12/2021 03:20:13 PM,Agree,
1,06/11/2021,TX,2965966,,,1463,355.0,51158,,,17,0.0,06/13/2021 12:00:00 AM,Not agree,Not agree
2,01/02/2022,AS,11,,,0,0.0,0,,,0,0.0,01/03/2022 03:18:16 PM,,
3,08/22/2020,AR,56199,,,547,0.0,674,,,11,0.0,08/23/2020 02:15:28 PM,Not agree,Not agree
4,07/17/2020,MP,37,37.0,0.0,1,0.0,2,2.0,0.0,0,0.0,07/19/2020 12:00:00 AM,Agree,Agree
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43135,05/28/2020,IA,18585,,,228,0.0,506,,,14,0.0,05/29/2020 02:19:55 PM,Not agree,Not agree
43136,06/07/2020,SD,5438,,,71,0.0,65,64.0,1.0,0,1.0,06/08/2020 02:55:08 PM,,Agree
43137,04/30/2021,SD,122660,,,128,17.0,1967,1601.0,366.0,5,1.0,05/01/2021 01:43:22 PM,,Agree
43138,03/10/2021,SD,113962,,,209,37.0,1904,1546.0,358.0,3,0.0,03/11/2021 03:36:21 PM,,Agree


In [None]:
CDC['date'] = CDC['submission_date']

CDC.drop(columns='submission_date', axis=0, inplace=True)

In [None]:
col = ['state', 'conf_cases', 'prob_cases', 'pnew_case', 'conf_death', 'prob_death', 'pnew_death', 'created_at', 'consent_cases', 'consent_deaths']

CDC.drop(columns=col, inplace=True)

In [None]:
CDC.dropna(how='all', inplace=True)

In [None]:
CDC['date'] = pd.to_datetime(CDC['date']).dt.normalize()

In [None]:
CDC.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73649 entries, 0 to 73648
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tot_cases  73649 non-null  object 
 1   new_case   73648 non-null  float64
 2   tot_death  73648 non-null  float64
 3   new_death  73648 non-null  float64
 4   date       73648 non-null  object 
dtypes: float64(3), object(2)
memory usage: 2.8+ MB


In [None]:
CDC.isna().sum()

tot_cases    0
new_case     1
tot_death    1
new_death    1
date         1
dtype: int64

In [None]:
CDC

Unnamed: 0,tot_cases,new_case,tot_death,new_death,date
0,297229,0,4851,0,2021-03-11
1,2965966,1463,51158,17,2021-06-11
2,11,0,0,0,2022-01-02
3,56199,547,674,11,2020-08-22
4,37,1,2,0,2020-07-17
...,...,...,...,...,...
43135,18585,228,506,14,2020-05-28
43136,5438,71,65,0,2020-06-07
43137,122660,128,1967,5,2021-04-30
43138,113962,209,1904,3,2021-03-10


### Twitter Data

In [11]:
tweets = pd.read_csv('/covid_tweets.csv')
tweets.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ᏉᎥ☻լꂅϮ,astroworld,wednesday addams as a disney princess keepin i...,2017-05-26 05:46:42,624.0,950.0,18775.0,False,2020-07-25 12:27:21,If I smelled the scent of hand sanitizers toda...,,Twitter for iPhone,False
1,Tom Basile 🇺🇸,"New York, NY","Husband, Father, Columnist & Commentator. Auth...",2009-04-16 20:06:23,2253.0,1677.0,24.0,True,2020-07-25 12:27:17,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,Twitter for Android,False
2,Time4fisticuffs,"Pewee Valley, KY",#Christian #Catholic #Conservative #Reagan #Re...,2009-02-28 18:57:41,9275.0,9525.0,7254.0,False,2020-07-25 12:27:14,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],Twitter for Android,False
3,ethel mertz,Stuck in the Middle,#Browns #Indians #ClevelandProud #[]_[] #Cavs ...,2019-03-07 01:45:06,197.0,987.0,1488.0,False,2020-07-25 12:27:10,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],Twitter for iPhone,False
4,DIPR-J&K,Jammu and Kashmir,🖊️Official Twitter handle of Department of Inf...,2017-02-12 06:45:15,101009.0,168.0,101.0,False,2020-07-25 12:27:08,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android,False


In [None]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179108 entries, 0 to 179107
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user_name         179108 non-null  object
 1   user_location     142337 non-null  object
 2   user_description  168822 non-null  object
 3   user_created      179108 non-null  object
 4   user_followers    179108 non-null  int64 
 5   user_friends      179108 non-null  int64 
 6   user_favourites   179108 non-null  int64 
 7   user_verified     179108 non-null  bool  
 8   date              179108 non-null  object
 9   text              179108 non-null  object
 10  hashtags          127774 non-null  object
 11  source            179031 non-null  object
 12  is_retweet        179108 non-null  bool  
dtypes: bool(2), int64(3), object(8)
memory usage: 15.4+ MB


In [12]:
column = ['user_name','user_location','user_description','user_created','user_followers','user_favourites','user_verified','hashtags','is_retweet', 'source', 'user_friends']

tweets.drop(columns=column, inplace=True)

In [13]:
tweets['date'] = pd.DatetimeIndex(tweets['date']).normalize()

In [14]:
tweets = tweets.groupby(['text', pd.Grouper(key='date', freq='W-SUN')]).sum().reset_index().sort_values('date')

In [None]:
tweets

Unnamed: 0,text,date
89439,Half of all tributes on the memorial site come...,2020-08-02
150017,There is absolutely no reason to make a rule o...,2020-08-02
56559,A huge thank you to our officers who have ensu...,2020-08-02
56562,A huge thanks to Monty Devchand from Laxmi Cas...,2020-08-02
56574,A key part of the Tax Reliefs granted by the S...,2020-08-02
...,...,...
153129,"This week on #MtMRadio,we each bring the other...",2020-09-06
50147,@lebronsonroids @CapeCodAngel3 Because; #COVID...,2020-09-06
153122,This week in #AbstractScience: a #Dengue virus...,2020-09-06
50053,@kyledcheney #SouthCarolina with #AbsenteeBall...,2020-09-06


In [None]:
tweets.date.value_counts()

2020-08-23    44930
2020-08-16    35624
2020-08-09    34725
2020-08-02    32113
2020-08-30    19042
2020-09-06    12446
Name: date, dtype: int64

In [None]:
tweets.isna().sum()

date    0
text    0
dtype: int64

## Combining Dataframes

In [15]:
tash = tweets.merge(google, how='inner', on='date')

In [16]:
tash.drop(columns='index', inplace=True)

KeyError: ignored

In [17]:
tash

Unnamed: 0,text,date,relevance,outbreak
0,!!! #TWEETofTheDay AND #SCIENCEofTheDay : #Co...,2020-07-26,66,True
1,"Amid the ongoing pandemic, managers need to ta...",2020-07-26,66,True
2,Amid the deteriorating #COVID19 situation in #...,2020-07-26,66,True
3,"Amid the #COVID19 outbreak, many of you are no...",2020-07-26,66,True
4,"Amid the #COVID19 crisis, #HR professionals ar...",2020-07-26,66,True
...,...,...,...,...
71045,"1384 new #COVID19 cases detected, 10 die in #O...",2020-08-09,53,True
71046,Looking forward to a day of holiday on Friday ...,2020-08-09,53,True
71047,"Save the date! Global virtual conference on ""R...",2020-08-09,53,True
71048,1384 new #COVID19 positive cases have been rep...,2020-08-09,53,True


# Functions

In [18]:
def tokenize(tweet):
    tknzr = TweetTokenizer(strip_handles=True, reduce_len=True, 
                           preserve_case=False)
    return tknzr.tokenize(tweet)

In [38]:
def classifiers(x_train, y_train):
    nb  = MultinomialNB()
    nb.fit(x_train, y_train)
    log = LogisticRegression()
    log.fit(x_train, y_train)
    forest = RandomForestClassifier(n_estimators=100, max_depth=5)
    forest.fit(x_train, y_train)
    gradboost = GradientBoostingClassifier(random_state=123, max_depth=5, 
                                          learning_rate = 0.01)
    gradboost.fit(x_train, y_train)
    adaboost = AdaBoostClassifier(n_estimators=100)
    adaboost.fit(x_train, y_train)
    svm = SVC(kernel='linear', probability=True)
    svm.fit(x_train, y_train)
    return [nb, log, forest, gradboost, adaboost, svm]


In [39]:
def classifier_performance(vectorizer, train_data, test_data, y_test):
    accuracy_df = []
    x_train = vectorizer.fit_transform(train_data)
    x_test = vectorizer.transform(test_data)
    
    classifier_list = classifiers(x_train, y_train)
    for i in classifier_list:
        preds = i.predict(x_test)
        accuracy = accuracy_score(y_test, preds)
        accuracy_df.append(accuracy*100)
    accuracy_df = pd.DataFrame(accuracy_df)
    classifiers_key = ['Naive Bayes', 'Random Forest', 'Gradient Boost',
                      'AdaBoost', 'Support Vector Machine']
    accuracy_df['Model'] = classifiers_key
    accuracy_df.rename(columns={0: 'Accuracy'}, inplace=True)
    fin_accuracy_df = accuracy_df[['Model', 'Accuracy']]
    return fin_accuracy_df

In [40]:
df = tash.sample(frac=0.25)

In [41]:
data = df['text']
labels = df['relevance']
train_data, test_data, y_train, y_test = train_test_split(data, labels, test_size=0.25)
tfidfvec = TfidfVectorizer(stop_words='english', tokenizer=tokenize)
tfidfvec2 = TfidfVectorizer(stop_words='english', tokenizer=tokenize, ngram_range=(1,2))
tfidfvec3 = TfidfVectorizer(stop_words='english', tokenizer=tokenize, ngram_range=(1,3))
countvec = CountVectorizer(stop_words='english', tokenizer=tokenize)
countvec2 = CountVectorizer(stop_words='english', tokenizer=tokenize, ngram_range=(1,2))
countvec3 = CountVectorizer(stop_words='english', tokenizer=tokenize, ngram_range=(1,3))

In [None]:
#train_data.to_csv('tweets_train.csv', index=False)
#test_data.to_csv('tweets_test.csv', index=False)
#y_train.to_csv('tweets_ytrain.csv', index=False)
#y_test.to_csv('tweets_ytest.csv', index=False)

## Count Vectorization

#### Unigram

In [42]:
classifier_performance(countvec, train_data, test_data, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


ValueError: ignored

In [46]:
x_train = countvec.fit_transform(train_data)
x_test = countvec.transform(test_data)

logreg = LogisticRegression()
logreg.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [47]:
pred = logreg.predict(x_test)

accuracy_score(y_test, pred)

0.6068453051114614

In [48]:
nb  = MultinomialNB()
nb.fit(x_train, y_train)
forest = RandomForestClassifier(n_estimators=100, max_depth=5)
forest.fit(x_train, y_train)
gradboost = GradientBoostingClassifier(random_state=123, max_depth=5, learning_rate = 0.01)
gradboost.fit(x_train, y_train)
adaboost = AdaBoostClassifier(n_estimators=100)
adaboost.fit(x_train, y_train)
svm = SVC(kernel='linear', probability=True)
svm.fit(x_train, y_train)

SVC(kernel='linear', probability=True)

In [49]:
predz = nb.predict(x_test)

accuracy_score(y_test, predz)

0.628687232605269

In [50]:
preds = forest.predict(x_test)

accuracy_score(y_test, preds)

0.5879306462508445

In [51]:
predd = gradboost.predict(x_test)

accuracy_score(y_test, predd)

0.598964197252871

In [52]:
pred1 = adaboost.predict(x_test)

accuracy_score(y_test, pred1)

0.5951362305786985

In [53]:
preddy = svm.predict(x_test)

accuracy_score(y_test, preddy)

0.5832019815356901

#### Bigram

In [None]:
classifier_performance(countvec2, train_data, test_data, y_test)

#### Trigram

In [None]:
classifier_performance(countvec3, train_data, test_data, y_test)

## TF-IDF Vectorization

#### Unigram

In [None]:
classifier_performance(tfidfvec, train_data, test_data, y_test)

#### Bigram

In [None]:
classifier_performance(tfidfvec2, train_data, test_data, y_test)

#### Trigram

In [None]:
classifier_performance(tfidfvec3, train_data, test_data, y_test)