## Contents:

- **[Import Libraries](#Import-Libraries)**.  
    
- **[Data Cleaning & EDA](#Data-Cleaning-Exploratory-Data-Analysis)**.  

- **[Model Preparation](#Model-Preparation)**. 

 - **[Modeling](#Modeling)**.  
   - **[Baseline Model](#Baseline-Model)**. 
   - **[DBSCAN](#DBSCAN)**.  
   - **[K-Means](#K-Means)**.  
   
- **[Model Evaluation](#Model-Evaluation)**.  

- **[Conclusions and Recommendations](#Conclusions-and-Recommendations)**.  

- **[References](#References)**.

### Importing Libraries

In [6]:
import warnings
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import gensim
import re
from gensim.models.word2vec import Word2Vec 
from nltk.tokenize import RegexpTokenizer 
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN , KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split

#from sklearn.datasets import make_blobs
#from sklearn.linear_model import LogisticRegression
#from sklearn.decomposition import PCA

RANDOM_STATE = 7777
%matplotlib inline


In [None]:
### Loading Data

In [15]:
df = pd.read_csv("../adam/datasets/scrape_5.11.csv")

df.head()

Unnamed: 0,tweet_id,username,text,tweet_date,search_term,city,lat,long,radius,query_start
0,1228415289269858310,Bearjew1964,Disease modelers gaze into their computers to ...,2020-02-14 20:26:42,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01
1,1228345808954765313,Carlos Salazar,I know we have a limited attention span and al...,2020-02-14 15:50:37,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01
2,1228149699145871360,Hefe O-Ren Ishii,As one Princess cruise is quarantined for COVI...,2020-02-14 02:51:20,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01
3,1228008158586720256,josh,how about we call it by it’s new name: COVID-1...,2020-02-13 17:28:54,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01
4,1227684262859636738,Helen Ong,"Scorpio: To me, the #COVID_19 19, as virulent ...",2020-02-12 20:01:52,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01


In [16]:
df.drop_duplicates(inplace = True)

In [17]:
df.shape

(96507, 10)

In [19]:
df['text'] = df['text'].astype(str).str.lower()
df['token_text'] = df['text'].str.replace('([^ a-zA-Z0-9])', '').str.replace('http\S+|www.\S+', '', case=False).replace('coronavirus', 'covid19')

stop = stopwords.words('english')
df['stop_text'] = df['token_text'].apply(lambda x: [item for item in str(x).split() 
                                                    if item not in stop])

In [20]:
sent = [row for row in df['stop_text']]

In [21]:
tweet_w2v = Word2Vec(size=1000, min_count=100, window=1)

In [22]:
tweet_w2v.build_vocab(sent)

In [23]:
tweet_w2v.most_similar('positive')

[('photo', 0.09064854681491852),
 ('caused', 0.08920827507972717),
 ('meanwhile', 0.08866623789072037),
 ('empty', 0.08706696331501007),
 ('leadership', 0.08595381677150726),
 ('ran', 0.08412136137485504),
 ('case', 0.08400179445743561),
 ('keeping', 0.0833788737654686),
 ('man', 0.08250793814659119),
 ('selfish', 0.08079575002193451)]

In [18]:
### Data Cleaning & EDA.

In [25]:
df.isnull().sum()

tweet_id       0
username       1
text           0
tweet_date     0
search_term    0
city           0
lat            0
long           0
radius         0
query_start    0
token_text     0
stop_text      0
dtype: int64

In [31]:
#df['username']

In [28]:
df = df.dropna() #droping null rows

In [29]:
df.isnull().sum().sum()

0

In [30]:
df.shape

(96506, 12)

### Model Preparation

In [None]:
X = 
y = 

In [None]:
#Train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = RANDOM_STATE)

In [None]:
#scale our data 
ss = StandardScaler()

#fit and transform Data
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)

### Modeling

#### Baseline Model

In [None]:
# # instantiate DummyClassifier
# dummy = DummyClassifier(strategy="most_frequent")
# dummy.fit(X_train, y_train)

# # score on test
# print('Test Score:', dummy.score(X_test, y_test))

# # score on train
# print('Train Score:', dummy.score(X_train, y_train))

# # score on cross val
# print('Cross Val Score:', cross_val_score(dummy, X, y, cv =5).mean())

In [None]:
# # Instantiate PCA & logisticRegression
# pca = PCA(random_state=RANDOM_STATE)
# lr_pca = LogisticRegression(random_state=RANDOM_STATE)

# # Get PC's by applying transformer on data
# Z_train = pca.fit_transform(X_train_ss)
# Z_test  = pca.transform(X_test_ss)

# #fitting model
# lr_pca.fit(Z_train, y_train)

# print('lr_pca Score:',lr_pca.score(Z_test, y_test))

In [None]:
#scaling 
ss = StandardScaler()
X_scaled = ss.fit_transform(df)

#### DBSCAN

In [None]:
dbscan = DBSCAN(eps = [.5, .65, .8],
               min_samples= [5, 10, 20]
               ) 

dbscan.fit(X_scaled);

In [None]:
print("Silhoutte Score : ", silhouette_score(X_scaled, dbscan.labels_))

#### K-Means 

In [None]:
#Fit a k-means clustering model
km = KMeans(n_clusters= [3,5,10,15], 
            n_init= [10, 20, 30],
            max_iter= [300, 400, 500],
            random_state=2020)
#fitting model
km.fit(X_scaled)

In [None]:
#sum of squared errors for each cluster.
print("Inertia: ", km.inertia_)

#Silhouette Score
print("Silhouette Score: ", silhouette_score(X_scaled, km.labels_))

In [None]:
knn_regressor

In [None]:
# import numpy as np
# from sklearn.base import BaseEstimator

# class KNNRegressor(BaseEstimator):
#     def __init__(self, k):
#         self.k = k
    
#     def __repr__(self):
#         return f"KNNRegressor {{k = {self.k}}}"
    
#     def euclidian_distance(self, a, b):
#         return np.sqrt(np.sum((a - b)**2))
    
#     def get_k_nearest_neigbhors_indices(self, distances):
#         # create a list of tuples with (distance, index) - HINT: try enumerate
#         distances_and_indices = [(distance, index) for index, distance in enumerate(distances)]

#         # sort distances with indices using .sort()
#         distances_and_indices.sort()

#         # select first k distances
#         k_distances_and_indices = distances_and_indices[:self.k]

#         # return a list of just the indices from first k distances
#         return [index for distance, index in k_distances_and_indices]

#     def predict_row(self, test_row):
#         # Use the `euclidian_distance` function create a list of distances 
#         # between the `test_row` and each row in `train_data`
#         distances = [self.euclidian_distance(row, test_row) for row in self.X_train_]

#         # call the `get_k_nearest_neigbhors_indices` function with the distances from the previous step and `k`
#         k_distances_and_indices = self.get_k_nearest_neigbhors_indices(distances)

#         # use the `k_distances_and_indices` from the previous step to index the `train_target`
#         k_nearest_neigbhor_targets = self.y_train_[k_distances_and_indices]

#         # return the mean of `k_nearest_neigbhor_targets` from the previous step
#         return np.mean(k_nearest_neigbhor_targets)
    
#     def r2_score(self, y_true, y_pred):
#         ### Sum of Squared Errors from Resisuals
#         residuals = y_true - y_pred
#         ss_residual = np.sum(residuals ** 2)

#         ### Sum of Squared Errors from baseline model (predicting mean)
#         mean = np.mean(y_true)
#         ss_total = np.sum((y_true - mean) ** 2)

#         if ss_total == 0: # can't divide by 0
#             print("Can't divide by 0")
#             return 0.0

#         return 1 - (ss_residual / ss_total)
    
#     def fit(self, X_train, y_train):
#         self.X_train_ = X_train
#         self.y_train_ = y_train
#         return self
    
#     def predict(self, X_test):
#         preds = [self.predict_row(test_row) for test_row in X_test]
#         return np.array(preds)
    
#     def score(self, X, y_true):
#         y_preds = self.predict(X)
#         return self.r2_score(y_true, y_preds)

In [None]:
knn = KNNRegressor(k = 5)
knn.fit(ss_X_train, y_train)

In [None]:
# train score
print("Train score: {:.2%}".format(knn.score(ss_X_train, y_train)))

# test score
print("Train score: {:.2%}".format(knn.score(ss_X_test, y_test)))

In [None]:
# cross val score
cross_val_score(knn, X, y, cv = 5).mean()

### Model Evaluation

### Conclusions and Recommendations

### References