In [36]:
#Data
import requests
import csv

#EDA and Visualizations
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#modeling
from nltk.tokenize import TweetTokenizer, word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, confusion_matrix, mean_squared_error
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import psutil

import datetime



### CDC Data

In [2]:
CDC = pd.read_csv('Data/USA_covid_stats.csv')
CDC

Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,created_at,consent_cases,consent_deaths
0,03/11/2021,KS,297229,241035.0,56194.0,0,0.0,4851,,,0,0.0,03/12/2021 03:20:13 PM,Agree,
1,06/11/2021,TX,2965966,,,1463,355.0,51158,,,17,0.0,06/13/2021 12:00:00 AM,Not agree,Not agree
2,01/02/2022,AS,11,,,0,0.0,0,,,0,0.0,01/03/2022 03:18:16 PM,,
3,08/22/2020,AR,56199,,,547,0.0,674,,,11,0.0,08/23/2020 02:15:28 PM,Not agree,Not agree
4,07/17/2020,MP,37,37.0,0.0,1,0.0,2,2.0,0.0,0,0.0,07/19/2020 12:00:00 AM,Agree,Agree
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43135,05/28/2020,IA,18585,,,228,0.0,506,,,14,0.0,05/29/2020 02:19:55 PM,Not agree,Not agree
43136,06/07/2020,SD,5438,,,71,0.0,65,64.0,1.0,0,1.0,06/08/2020 02:55:08 PM,,Agree
43137,04/30/2021,SD,122660,,,128,17.0,1967,1601.0,366.0,5,1.0,05/01/2021 01:43:22 PM,,Agree
43138,03/10/2021,SD,113962,,,209,37.0,1904,1546.0,358.0,3,0.0,03/11/2021 03:36:21 PM,,Agree


In [3]:
CDC['date'] = CDC['created_at']

CDC.drop(columns='created_at', axis=0, inplace=True)

In [4]:
CDC.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43140 entries, 0 to 43139
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   submission_date  43140 non-null  object 
 1   state            43140 non-null  object 
 2   tot_cases        43140 non-null  int64  
 3   conf_cases       23254 non-null  float64
 4   prob_cases       23182 non-null  float64
 5   new_case         43140 non-null  int64  
 6   pnew_case        39181 non-null  float64
 7   tot_death        43140 non-null  int64  
 8   conf_death       22968 non-null  float64
 9   prob_death       22968 non-null  float64
 10  new_death        43140 non-null  int64  
 11  pnew_death       39120 non-null  float64
 12  consent_cases    35945 non-null  object 
 13  consent_deaths   36669 non-null  object 
 14  date             43140 non-null  object 
dtypes: float64(6), int64(4), object(5)
memory usage: 4.9+ MB


In [5]:
CDC.isna().sum()

submission_date        0
state                  0
tot_cases              0
conf_cases         19886
prob_cases         19958
new_case               0
pnew_case           3959
tot_death              0
conf_death         20172
prob_death         20172
new_death              0
pnew_death          4020
consent_cases       7195
consent_deaths      6471
date                   0
dtype: int64

### Twitter Data

In [6]:
tweets = pd.read_csv('Data/covid_tweets.csv')
tweets.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ᏉᎥ☻լꂅϮ,astroworld,wednesday addams as a disney princess keepin i...,2017-05-26 05:46:42,624,950,18775,False,2020-07-25 12:27:21,If I smelled the scent of hand sanitizers toda...,,Twitter for iPhone,False
1,Tom Basile 🇺🇸,"New York, NY","Husband, Father, Columnist & Commentator. Auth...",2009-04-16 20:06:23,2253,1677,24,True,2020-07-25 12:27:17,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,Twitter for Android,False
2,Time4fisticuffs,"Pewee Valley, KY",#Christian #Catholic #Conservative #Reagan #Re...,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],Twitter for Android,False
3,ethel mertz,Stuck in the Middle,#Browns #Indians #ClevelandProud #[]_[] #Cavs ...,2019-03-07 01:45:06,197,987,1488,False,2020-07-25 12:27:10,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],Twitter for iPhone,False
4,DIPR-J&K,Jammu and Kashmir,🖊️Official Twitter handle of Department of Inf...,2017-02-12 06:45:15,101009,168,101,False,2020-07-25 12:27:08,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android,False


In [7]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179108 entries, 0 to 179107
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user_name         179108 non-null  object
 1   user_location     142337 non-null  object
 2   user_description  168822 non-null  object
 3   user_created      179108 non-null  object
 4   user_followers    179108 non-null  int64 
 5   user_friends      179108 non-null  int64 
 6   user_favourites   179108 non-null  int64 
 7   user_verified     179108 non-null  bool  
 8   date              179108 non-null  object
 9   text              179108 non-null  object
 10  hashtags          127774 non-null  object
 11  source            179031 non-null  object
 12  is_retweet        179108 non-null  bool  
dtypes: bool(2), int64(3), object(8)
memory usage: 15.4+ MB


In [8]:
tweets.hashtags.fillna('None', inplace=True)

In [9]:
tweets.dropna(subset=['source'], inplace=True)

In [10]:
tweets.user_description.fillna('None', inplace=True)

In [11]:
tweets.user_location.fillna('None', inplace=True)

In [12]:
tweets.user_location.value_counts()

None                                    36746
India                                    3741
United States                            2455
New Delhi, India                         1721
Mumbai, India                            1401
                                        ...  
CHICAGO, IL                                 1
Center of the Universe                      1
Entre Murcia, Madrid y León (España)        1
London +                                    1
Umkomaas, South Africa                      1
Name: user_location, Length: 26919, dtype: int64

In [13]:
tweets.isna().sum()

user_name           0
user_location       0
user_description    0
user_created        0
user_followers      0
user_friends        0
user_favourites     0
user_verified       0
date                0
text                0
hashtags            0
source              0
is_retweet          0
dtype: int64

# Functions

In [14]:
def tokenize(tweet):
    tknzr = TweetTokenizer(strip_handles=True, reduce_len=True, 
                           preserve_case=False)
    return tknzr.tokenize(tweet)

In [44]:
def classifiers(x_train, y_train):
    nb  = MultinomialNB()
    nb.fit(x_train, y_train)
    log = LogisticRegression()
    log.fit(x_train, y_train)
    forest = RandomForestClassifier(n_estimators=100, max_depth=5)
    forest.fit(x_train, y_train)
    gradboost = GradientBoostingClassifier(random_state=123, max_depth=5, 
                                          learning_rate = 0.01)
    gradboost.fit(x_train, y_train)
    adaboost = AdaBoostClassifier(n_estimators=100)
    adaboost.fit(x_train, y_train)
    svm = SVC(kernel='linear', probability=True)
    svm.fit(x_train, y_train)
    return [nb, log, forest, gradboost, adaboost, svm]


In [45]:
def classifier_performance(vectorizer, train_data, test_data, y_test):
    accuracy_df = []
    x_train = vectorizer.fit_transform(train_data)
    x_test = vectorizer.transform(test_data)
    
    classifier_list = classifiers(x_train, y_train)
    for i in classifier_list:
        preds = i.predict(x_test)
        accuracy = accuracy_score(y_test, preds)
        accuracy_df.append(accuracy*100)
    accuracy_df = pd.DataFrame(accuracy_df)
    classifiers_key = ['Naive Bayes', 'Logistic Regression', 'Random Forest', 'Gradient Boost',
                      'AdaBoost', 'Support Vector Machine']
    accuracy_df['Model'] = classifiers_key
    accuracy_df.rename(columns={0: 'Accuracy'}, inplace=True)
    fin_accuracy_df = accuracy_df[['Model', 'Accuracy']]
    return fin_accuracy_df

In [46]:
df = tweets.sample(frac=0.01)

In [47]:
data = df['text']
labels = df['date']
train_data, test_data, y_train, y_test = train_test_split(data, labels, test_size=0.5)
tfidfvec = TfidfVectorizer(stop_words='english', tokenizer=tokenize)
tfidfvec2 = TfidfVectorizer(stop_words='english', tokenizer=tokenize, ngram_range=(1,2))
tfidfvec3 = TfidfVectorizer(stop_words='english', tokenizer=tokenize, ngram_range=(1,3))
countvec = CountVectorizer(stop_words='english', tokenizer=tokenize)
countvec2 = CountVectorizer(stop_words='english', tokenizer=tokenize, ngram_range=(1,2))
countvec3 = CountVectorizer(stop_words='english', tokenizer=tokenize, ngram_range=(1,3))

In [18]:
#train_data.to_csv('tweets_train.csv', index=False)
#test_data.to_csv('tweets_test.csv', index=False)
#y_train.to_csv('tweets_ytrain.csv', index=False)
#y_test.to_csv('tweets_ytest.csv', index=False)

## Count Vectorization

#### Unigram

In [48]:
classifier_performance(countvec, train_data, test_data, y_test)

Unnamed: 0,Model,Accuracy
0,Naive Bayes,0.0
1,Logistic Regression,0.0
2,Random Forest,0.0
3,Gradient Boost,0.0
4,AdaBoost,0.0
5,Support Vector Machine,0.0


#### Bigram

In [None]:
classifier_performance(countvec2, train_data, test_data, y_test)

#### Trigram

In [None]:
classifier_performance(countvec3, train_data, test_data, y_test)

## TF-IDF Vectorization

#### Unigram

In [None]:
classifier_performance(tfidfvec, train_data, test_data, y_test)

#### Bigram

In [None]:
classifier_performance(tfidfvec2, train_data, test_data, y_test)

#### Trigram

In [None]:
classifier_performance(tfidfvec3, train_data, test_data, y_test)

In [None]:
x_train =  countvec.fit_transform(train_data)
x_test = countvec.transform(test_data)

In [None]:
nb  = MultinomialNB()
nb.fit(x_train, y_train)

MemoryError: Unable to allocate 109. GiB for an array with shape (134273, 108590) and data type int64