## Contents:

- **[Import Libraries](#Import-Libraries)**. 

- **[Loading Data](#Loading-Data)**.

- **[Getting Word2vec](#Getting-Word2vec)**. 
    
- **[Data Cleaning & EDA](#Data-Cleaning-Exploratory-Data-Analysis)**.  

- **[Model Preparation](#Model-Preparation)**. 

 - **[Modeling](#Modeling)**.  
   - **[Baseline Model](#Baseline-Model)**. 
   - **[DBSCAN](#DBSCAN)**.  
   - **[K-Means](#K-Means)**.  
   
- **[Model Evaluation](#Model-Evaluation)**.  

- **[Conclusions and Recommendations](#Conclusions-and-Recommendations)**.  

- **[References](#References)**.

### Importing Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import gensim
import re
from gensim.models.word2vec import Word2Vec 
from nltk.tokenize import RegexpTokenizer 
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.cluster import KMeansClusterer
import nltk

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN , KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split

#from sklearn.datasets import make_blobs
#from sklearn.linear_model import LogisticRegression
#from sklearn.decomposition import PCA

RANDOM_STATE = 7777
%matplotlib inline


### Loading Data

In [4]:
df = pd.read_csv("../adam/datasets/scrape_5.12.csv")

df.head()

Unnamed: 0,tweet_id,username,text,tweet_date,search_term,city,lat,long,radius,query_start
0,1234258409408602118,e.p.c.,"Did the first of several shopping runs, though...",2020-03-01 23:25:10,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01
1,1234253374725459968,@geminiwoe,Looks like adios muchachos for the US #COVID #...,2020-03-01 23:05:10,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01
2,1234241890700218370,enigma4ever 🌹🆘 🌊 🕊️🍑👩‍⚕️💉😷,Humour..snl...on #COVID #coronavirususahttps:/...,2020-03-01 22:19:32,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01
3,1234238588474331136,° ° °,had to go to three diff convenient stores to f...,2020-03-01 22:06:25,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01
4,1234238537068883968,enigma4ever 🌹🆘 🌊 🕊️🍑👩‍⚕️💉😷,"#COVID #Coronovirius in nursing home setting ,...",2020-03-01 22:06:12,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01


In [5]:
df.drop_duplicates(inplace = True)

In [6]:
df.shape

(114044, 10)

### Getting Word2vec

In [50]:
df['text'] = df['text'].astype(str).str.lower()
df['token_text'] = df['text'].str.replace('([^ a-zA-Z0-9])', '').str.replace('http\S+|www.\S+', '', case=False).replace('coronavirus', 'covid19')

stop = stopwords.words('english')
df['stop_text'] = df['token_text'].apply(lambda x: [item for item in str(x).split() 
                                                    if item not in stop])

In [None]:
#df['text'].head()

In [9]:
#df['stop_text'].head()

In [10]:
#df['token_text'].head()

In [11]:
sent = [row for row in df['stop_text']]

In [12]:
tweet_w2v = Word2Vec(size=1000, min_count=100, window=10)

In [13]:
tweet_w2v.build_vocab(sent)

In [14]:
tweet_w2v.most_similar('positive')

[('quiet', 0.14115533232688904),
 ('precautions', 0.11180473864078522),
 ('quarantinelife', 0.1069624051451683),
 ('city', 0.10363449156284332),
 ('ya', 0.09774534404277802),
 ('liar', 0.09590863436460495),
 ('manhattan', 0.09519168734550476),
 ('holy', 0.09087195992469788),
 ('pain', 0.0882231667637825),
 ('stopped', 0.08818729221820831)]

### Data Cleaning & EDA.

In [16]:
df.isnull().sum()

tweet_id       0
username       2
text           0
tweet_date     0
search_term    0
city           0
lat            0
long           0
radius         0
query_start    0
token_text     0
stop_text      0
dtype: int64

In [17]:
#df['username']

In [18]:
df = df.dropna() #droping null rows

In [19]:
df.isnull().sum().sum()

0

In [20]:
df.shape

(114042, 12)

### Model Preparation

In [51]:
X = tweet_w2v[tweet_w2v.wv.vocab]

In [40]:
#Train-test-split
X_train, X_test = train_test_split(X, random_state = RANDOM_STATE)

In [36]:
# #scale our data 
# ss = StandardScaler()

# #fit and transform Data
# X_train_ss = ss.fit_transform(X_train)
# X_test_ss = ss.transform(X_test)

### Modeling

#### Baseline Model

#### K-Means 

In [22]:
##### K Means Clustering with NLTK Library

In [41]:
NUM_CLUSTERS=3
kclusterer = KMeansClusterer(NUM_CLUSTERS, 
                             distance=nltk.cluster.util.cosine_distance, 
                             repeats=25)
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
#print (assigned_clusters)

In [30]:
words = list(tweet_w2v.wv.vocab)
for i, word in enumerate(words):
    print (word + ":" + str(assigned_clusters[i]))

first:1
several:1
shopping:0
runs:2
though:1
ended:2
walking:2
without:1
buying:1
anything:1
rate:1
people:1
coughing:0
well:0
covid19:0
thursday:0
cover:1
fucking:2
cough:2
looks:0
like:2
us:2
covid:0
coronavirus:1
coronavirususa:0
go:0
three:2
stores:0
find:0
real:2
lower:2
non:0
stuff:2
buy:2
fine:0
wow:1
19:2
panic:1
begins:0
manhattan:0
nursing:2
home:1
setting:1
50:1
symptoms:1
problems:0
officials:1
keep:2
nurse:2
protect:1
especially:0
immune:1
issues:0
includes:0
wearing:0
mask:1
hand:1
face:0
washing:1
alcohol:1
products:1
getting:1
flu:0
shot:2
trump:2
saying:1
hoax:0
instead:1
public:0
health:2
something:0
may:0
maga:1
pandemic:1
un:0
agency:2
warns:0
criminal:1
kills:2
heres:2
hoping:0
china:1
clean:1
energy:1
outbreak:0
since:1
got:0
serious:0
air:0
quality:0
problem:2
v:0
high:1
cases:0
patients:0
w:1
respiratory:2
disease:1
likely:0
vs:1
ones:2
wo:2
conditions:0
la:2
published:1
john:2
history:0
spanish:0
influenza:0
unicef:1
work:0
improve:1
weak:2
systems:0
help:2
par

moving:0
control:1
hiring:2
contact:2
street:2
roosevelt:2
ave:1
north:1
university:0
staff:0
tech:0
unemployment:2
website:0
site:1
tips:2
steps:2
tag:0
required:2
reading:0
press:2
conference:0
7:0
pm:0
actor:2
sus:1
stayhealthy:0
quedateencasa:0
hydroxychloroquine:0
fun:2
development:0
taste:0
normal:1
able:0
smell:2
line:1
wish:2
washington:2
square:1
alive:0
loving:0
enjoying:2
creative:2
memories:1
beautiful:1
incredible:0
responders:2
lmfao:2
prevention:2
sites:1
near:1
grocery:2
drug:2
surprised:1
phone:0
tired:1
restrictions:2
license:0
smart:2
hate:0
industry:2
jobs:0
beyond:0
pence:1
praying:1
mortality:2
allowed:2
oh:2
handle:2
positives:2
covid19nyc:0
leads:1
bills:2
awareness:2
remember:1
certain:2
coronavirusupdates:1
africa:2
paying:0
fast:0
primary:0
calm:0
actions:1
toll:2
georgia:1
stayathome:0
gotta:0
solidarity:1
plans:2
stimulus:2
economic:1
spoke:2
focus:0
scary:0
share:0
truly:0
ventilator:0
anyway:1
hundreds:0
protected:2
review:1
ongoing:2
basically:0
everywhe

In [None]:
##### K Means Clustering with Scikit-learn Library

In [43]:
#Fit a k-means clustering model
km = KMeans(n_clusters= NUM_CLUSTERS,
            random_state=2020)
#fitting model
km.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=2020, tol=0.0001, verbose=0)

In [47]:
km.labels_
km.cluster_centers_

array([[-1.2920758e-05,  5.7685106e-06,  2.0106123e-05, ...,
         2.4315186e-05,  2.2652225e-06, -1.0361520e-05],
       [ 3.0182728e-05,  1.9339932e-06, -2.5886088e-06, ...,
        -1.3100220e-05, -3.9500082e-06,  5.0791928e-06],
       [-1.4837901e-06, -8.9019723e-06,  6.4154437e-06, ...,
        -6.8201816e-06,  1.4116585e-05,  2.0139655e-06]], dtype=float32)

In [49]:
#sum of squared errors for each cluster.
print("Inertia: ", km.inertia_)

#Silhouette Score
print("Silhouette Score: ", silhouette_score(X, km.labels_))

Inertia:  0.20688046382865316
Silhouette Score:  0.0012302315


In [None]:
# # instantiate DummyClassifier
# dummy = DummyClassifier(strategy="most_frequent")
# dummy.fit(X_train, y_train)

# # score on test
# print('Test Score:', dummy.score(X_test, y_test))

# # score on train
# print('Train Score:', dummy.score(X_train, y_train))

# # score on cross val
# print('Cross Val Score:', cross_val_score(dummy, X, y, cv =5).mean())

In [None]:
# # Instantiate PCA & logisticRegression
# pca = PCA(random_state=RANDOM_STATE)
# lr_pca = LogisticRegression(random_state=RANDOM_STATE)

# # Get PC's by applying transformer on data
# Z_train = pca.fit_transform(X_train_ss)
# Z_test  = pca.transform(X_test_ss)

# #fitting model
# lr_pca.fit(Z_train, y_train)

# print('lr_pca Score:',lr_pca.score(Z_test, y_test))

In [52]:
#scaling 
# ss = StandardScaler()
# X_scaled = ss.fit_transform(df)

#### DBSCAN

In [53]:
dbscan = DBSCAN(eps = .5,
               min_samples= [5, 10, 20]
               ) 

dbscan.fit(X);

TypeError: '>' not supported between instances of 'list' and 'float'

In [None]:
print("Silhoutte Score : ", silhouette_score(X_scaled, dbscan.labels_))

In [None]:
knn_regressor

In [None]:
# import numpy as np
# from sklearn.base import BaseEstimator

# class KNNRegressor(BaseEstimator):
#     def __init__(self, k):
#         self.k = k
    
#     def __repr__(self):
#         return f"KNNRegressor {{k = {self.k}}}"
    
#     def euclidian_distance(self, a, b):
#         return np.sqrt(np.sum((a - b)**2))
    
#     def get_k_nearest_neigbhors_indices(self, distances):
#         # create a list of tuples with (distance, index) - HINT: try enumerate
#         distances_and_indices = [(distance, index) for index, distance in enumerate(distances)]

#         # sort distances with indices using .sort()
#         distances_and_indices.sort()

#         # select first k distances
#         k_distances_and_indices = distances_and_indices[:self.k]

#         # return a list of just the indices from first k distances
#         return [index for distance, index in k_distances_and_indices]

#     def predict_row(self, test_row):
#         # Use the `euclidian_distance` function create a list of distances 
#         # between the `test_row` and each row in `train_data`
#         distances = [self.euclidian_distance(row, test_row) for row in self.X_train_]

#         # call the `get_k_nearest_neigbhors_indices` function with the distances from the previous step and `k`
#         k_distances_and_indices = self.get_k_nearest_neigbhors_indices(distances)

#         # use the `k_distances_and_indices` from the previous step to index the `train_target`
#         k_nearest_neigbhor_targets = self.y_train_[k_distances_and_indices]

#         # return the mean of `k_nearest_neigbhor_targets` from the previous step
#         return np.mean(k_nearest_neigbhor_targets)
    
#     def r2_score(self, y_true, y_pred):
#         ### Sum of Squared Errors from Resisuals
#         residuals = y_true - y_pred
#         ss_residual = np.sum(residuals ** 2)

#         ### Sum of Squared Errors from baseline model (predicting mean)
#         mean = np.mean(y_true)
#         ss_total = np.sum((y_true - mean) ** 2)

#         if ss_total == 0: # can't divide by 0
#             print("Can't divide by 0")
#             return 0.0

#         return 1 - (ss_residual / ss_total)
    
#     def fit(self, X_train, y_train):
#         self.X_train_ = X_train
#         self.y_train_ = y_train
#         return self
    
#     def predict(self, X_test):
#         preds = [self.predict_row(test_row) for test_row in X_test]
#         return np.array(preds)
    
#     def score(self, X, y_true):
#         y_preds = self.predict(X)
#         return self.r2_score(y_true, y_preds)

In [None]:
knn = KNNRegressor(k = 5)
knn.fit(ss_X_train, y_train)

In [None]:
# train score
print("Train score: {:.2%}".format(knn.score(ss_X_train, y_train)))

# test score
print("Train score: {:.2%}".format(knn.score(ss_X_test, y_test)))

In [None]:
# cross val score
cross_val_score(knn, X, y, cv = 5).mean()

### Model Evaluation

### Conclusions and Recommendations

### References