In [1]:
import numpy as np
import pandas as pd
import time
import gensim
import re

from numba import jit, cuda
from gensim.models.word2vec import Word2Vec 
from nltk.tokenize import RegexpTokenizer 
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN

  from pandas import Panel


## Data Cleansing

In [2]:
def stop_word_remover(df):
    df['text'] = df['text'].astype(str).str.lower()
    df['token_text'] = df['text'].str.replace('([^ a-zA-Z0-9])', '').str.replace('http\S+|www.\S+', '', case=False).replace('coronavirus', 'covid19')

    stop = stopwords.words('english')
    df['stop_text'] = df['token_text'].apply(lambda x: [item for item in str(x).split() 
                                                    if item not in stop])
    return df

In [3]:
df1 = pd.read_csv('../adam/datasets/scrape_5.12.csv')
pd.set_option('display.max_colwidth', -1)

  


In [4]:
df1.shape

(121482, 10)

In [5]:
df1.drop_duplicates(inplace=True)
df1.shape

(114044, 10)

In [6]:
df1 = stop_word_remover(df1)

In [7]:
df2 = pd.read_csv('../adam/datasets/test_5.14.csv')
pd.set_option('display.max_colwidth', -1)

  


In [8]:
df2.shape

(221214, 10)

In [9]:
df2.drop_duplicates(inplace=True)
df2.shape

(201794, 10)

In [10]:
df2 = stop_word_remover(df2)

In [11]:
df = pd.concat([df1,df2], ignore_index = True, verify_integrity = True)
df.shape

(315838, 12)

In [12]:
df['text'] = df['text'].astype(str).str.lower()
df['token_text'] = df['text'].str.replace('([^ a-zA-Z0-9])', '').str.replace('http\S+|www.\S+', '', case=False).replace('coronavirus', 'covid19')

stop = stopwords.words('english')
df['stop_text'] = df['token_text'].apply(lambda x: [item for item in str(x).split() 
                                                    if item not in stop])



#df['split_text'] = df['token_text'].astype(str).str.lower().str.split()

In [13]:
df['text'].head()

0    did the first of several shopping runs, though i ended up walking out of cvs without buying anything. if the rate of people coughing at me is any indication, we’ll all have covid-19 by thursday. \n\ncover your fucking mouths when you cough.
1    looks like adios muchachos for the us #covid #covid19us #coronavirushttps://twitter.com/nytimes/status/1234171621696557057 …                                                                                                                    
2    humour..snl...on #covid #coronavirususahttps://mashable.com/video/snl-coronavirus-democratic-candidates-cold-open/ …                                                                                                                            
3    had to go to three diff convenient stores to find real milk (not soy) and they were all out of the lower priced non organic stuff .. i buy organic so i was fine with it but wow. covid 19 panic buying begins in manhattan.                    
4    #covid #cor

In [14]:
df['stop_text'].head()

0    [first, several, shopping, runs, though, ended, walking, cvs, without, buying, anything, rate, people, coughing, indication, well, covid19, thursday, cover, fucking, mouths, cough]
1    [looks, like, adios, muchachos, us, covid, covid19us, coronavirus]                                                                                                                  
2    [humoursnlon, covid, coronavirususa]                                                                                                                                                
3    [go, three, diff, convenient, stores, find, real, milk, soy, lower, priced, non, organic, stuff, buy, organic, fine, wow, covid, 19, panic, buying, begins, manhattan]              
4    [covid, coronovirius, nursing, home, setting, kirkland, 50, symptoms, 3120]                                                                                                         
Name: stop_text, dtype: object

In [15]:
df['token_text'].head()

0    did the first of several shopping runs though i ended up walking out of cvs without buying anything if the rate of people coughing at me is any indication well all have covid19 by thursday cover your fucking mouths when you cough
1    looks like adios muchachos for the us covid covid19us coronavirus                                                                                                                                                                    
2    humoursnlon covid coronavirususa                                                                                                                                                                                                     
3    had to go to three diff convenient stores to find real milk not soy and they were all out of the lower priced non organic stuff  i buy organic so i was fine with it but wow covid 19 panic buying begins in manhattan               
4    covid coronovirius in nursing home setting  kirkland mo

## Word Vectorizer

In [16]:
sent = [row for row in df1['stop_text']]

In [17]:
tweet_w2v = Word2Vec(size=1000, min_count=500, window=10)

In [18]:
tweet_w2v.build_vocab(sent)

In [19]:
tweet_w2v.most_similar('covid')

  """Entry point for launching an IPython kernel.


[('right', 0.09585417807102203),
 ('twitter', 0.08774752914905548),
 ('president', 0.07988424599170685),
 ('waiting', 0.07965778559446335),
 ('chinese', 0.07624496519565582),
 ('safety', 0.07341757416725159),
 ('able', 0.07280883193016052),
 ('helping', 0.06696482747793198),
 ('united', 0.06436911970376968),
 ('community', 0.06277808547019958)]

## Tweet Vectorizer

In [20]:
# Functions found in this cell were created by instructor Noah Christiansen
# He is great
# Define vectorization function
#word2vecmodel = tweet_w2v
def vectorize_corpus(keyword_list):    
    # Instantiate counter for number of words in keyword_list that exists
    n_words = 0
    # Create template for cumulative corpus vector sum
    corpus_vec_sum = np.zeros((1,1000))                 
    # Scan through each word in list
    for word in keyword_list:
        if word in tweet_w2v.wv.vocab:                    
            word_vec = tweet_w2v.wv.word_vec(word)
            #print(word_vec)
            n_words +=1                                
            corpus_vec_sum = corpus_vec_sum + word_vec 
    # Compute average vector by taking cumulative vector sum and dividing it by number of words traced
    corpus_avg_vec = corpus_vec_sum/n_words
    # Squeeze this N-dimensional nested array object into a 1-D array to streamline future processing
    corpus_avg_vec = np.squeeze(corpus_avg_vec)
    return(corpus_avg_vec)

#defining cosine similarity function
def cos_sim(vector_1, vector_2):
    dp = np.dot(vector_1, vector_2)
    magnitude_v1 = np.sqrt(np.dot(vector_1,vector_1))
    magnitude_v2 = np.sqrt(np.dot(vector_2,vector_2))
    return(dp/(magnitude_v1*magnitude_v2))


In [21]:
vect_tweets = [vectorize_corpus(tweet) for tweet in df['stop_text']]
vect_df = pd.DataFrame(vect_tweets)



In [22]:
print(df.shape)
print(vect_df.shape)

(315838, 12)
(315838, 1000)


In [23]:
join_df = df.join(vect_df)
join_df.dropna(inplace = True)
join_df.shape

(293497, 1012)

In [24]:
clean_vec_df= join_df.iloc[:, [i for i in range(13,1012)]]
clean_og_df = join_df.iloc[:, [i for i in range(0,12)]]
clean_vec_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,990,991,992,993,994,995,996,997,998,999
0,-2.5e-05,3.7e-05,-2.5e-05,-8.6e-05,-7.3e-05,2.3e-05,-3e-06,-0.000106,0.000112,4.4e-05,...,-2.3e-05,-9.7e-05,4.1e-05,-8.2e-05,-8.1e-05,-5.2e-05,4.1e-05,-1.4e-05,0.000114,-1.8e-05
1,-0.000196,7.7e-05,-0.000188,-8e-05,0.000143,0.00016,-1.8e-05,-7.6e-05,-1.9e-05,-0.000159,...,0.000148,-4.9e-05,1.7e-05,-7.8e-05,-0.000169,3.5e-05,0.000131,6.5e-05,3.4e-05,8e-06
2,-0.000392,7.2e-05,6.4e-05,2.5e-05,3.8e-05,0.000198,-0.000349,4.4e-05,-0.000143,3.9e-05,...,-4.7e-05,0.000241,-0.000384,-4.6e-05,-1.6e-05,0.000207,0.000439,0.000362,4.2e-05,-0.000324
3,-0.000135,-9.2e-05,-0.000213,3.4e-05,0.000149,0.000215,0.000113,3.2e-05,-9.7e-05,0.000107,...,-1.2e-05,0.00019,-6.9e-05,7.4e-05,4e-06,0.000115,3.6e-05,5.7e-05,-6.2e-05,-5.2e-05
4,7.2e-05,-1.4e-05,-0.000211,5.2e-05,7.6e-05,0.000123,-0.00016,4.9e-05,-0.000215,-3.6e-05,...,8.8e-05,-0.00012,5.1e-05,0.000176,0.000139,0.00033,-0.000108,0.00017,8.8e-05,-0.000103


In [25]:
clean_og_df.head()

Unnamed: 0,tweet_id,username,text,tweet_date,search_term,city,lat,long,radius,query_start,token_text,stop_text
0,1234258409408602118,e.p.c.,"did the first of several shopping runs, though i ended up walking out of cvs without buying anything. if the rate of people coughing at me is any indication, we’ll all have covid-19 by thursday. \n\ncover your fucking mouths when you cough.",2020-03-01 23:25:10,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01,did the first of several shopping runs though i ended up walking out of cvs without buying anything if the rate of people coughing at me is any indication well all have covid19 by thursday cover your fucking mouths when you cough,"[first, several, shopping, runs, though, ended, walking, cvs, without, buying, anything, rate, people, coughing, indication, well, covid19, thursday, cover, fucking, mouths, cough]"
1,1234253374725459968,@geminiwoe,looks like adios muchachos for the us #covid #covid19us #coronavirushttps://twitter.com/nytimes/status/1234171621696557057 …,2020-03-01 23:05:10,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01,looks like adios muchachos for the us covid covid19us coronavirus,"[looks, like, adios, muchachos, us, covid, covid19us, coronavirus]"
2,1234241890700218370,enigma4ever 🌹🆘 🌊 🕊️🍑👩‍⚕️💉😷,humour..snl...on #covid #coronavirususahttps://mashable.com/video/snl-coronavirus-democratic-candidates-cold-open/ …,2020-03-01 22:19:32,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01,humoursnlon covid coronavirususa,"[humoursnlon, covid, coronavirususa]"
3,1234238588474331136,° ° °,had to go to three diff convenient stores to find real milk (not soy) and they were all out of the lower priced non organic stuff .. i buy organic so i was fine with it but wow. covid 19 panic buying begins in manhattan.,2020-03-01 22:06:25,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01,had to go to three diff convenient stores to find real milk not soy and they were all out of the lower priced non organic stuff i buy organic so i was fine with it but wow covid 19 panic buying begins in manhattan,"[go, three, diff, convenient, stores, find, real, milk, soy, lower, priced, non, organic, stuff, buy, organic, fine, wow, covid, 19, panic, buying, begins, manhattan]"
4,1234238537068883968,enigma4ever 🌹🆘 🌊 🕊️🍑👩‍⚕️💉😷,"#covid #coronovirius in nursing home setting , #kirkland more than 50 with symptoms ..3/1/20https://www.statnews.com/2020/02/29/new-covid-19-death-raises-concerns-about-virus-spread-in-nursing-homes/ …",2020-03-01 22:06:12,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01,covid coronovirius in nursing home setting kirkland more than 50 with symptoms 3120,"[covid, coronovirius, nursing, home, setting, kirkland, 50, symptoms, 3120]"


In [27]:
dbscan = DBSCAN(metric='cosine', eps=0.4, min_samples=30, n_jobs = -1) # you can change these parameters, given just for example 
cluster_labels = dbscan.fit_predict(clean_vec_df) # where X - is your matrix, where each row corresponds to one document (line) from the docs, you need to cluster 

In [28]:
clean_og_df['cluster'] = cluster_labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [29]:
clean_og_df['cluster'].value_counts()

 0    232177
-1    61049 
 2    84    
 5    49    
 1    39    
 4    35    
 6    34    
 3    30    
Name: cluster, dtype: int64

In [30]:
clean_og_df.head(5)

Unnamed: 0,tweet_id,username,text,tweet_date,search_term,city,lat,long,radius,query_start,token_text,stop_text,cluster
0,1234258409408602118,e.p.c.,"did the first of several shopping runs, though i ended up walking out of cvs without buying anything. if the rate of people coughing at me is any indication, we’ll all have covid-19 by thursday. \n\ncover your fucking mouths when you cough.",2020-03-01 23:25:10,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01,did the first of several shopping runs though i ended up walking out of cvs without buying anything if the rate of people coughing at me is any indication well all have covid19 by thursday cover your fucking mouths when you cough,"[first, several, shopping, runs, though, ended, walking, cvs, without, buying, anything, rate, people, coughing, indication, well, covid19, thursday, cover, fucking, mouths, cough]",-1
1,1234253374725459968,@geminiwoe,looks like adios muchachos for the us #covid #covid19us #coronavirushttps://twitter.com/nytimes/status/1234171621696557057 …,2020-03-01 23:05:10,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01,looks like adios muchachos for the us covid covid19us coronavirus,"[looks, like, adios, muchachos, us, covid, covid19us, coronavirus]",0
2,1234241890700218370,enigma4ever 🌹🆘 🌊 🕊️🍑👩‍⚕️💉😷,humour..snl...on #covid #coronavirususahttps://mashable.com/video/snl-coronavirus-democratic-candidates-cold-open/ …,2020-03-01 22:19:32,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01,humoursnlon covid coronavirususa,"[humoursnlon, covid, coronavirususa]",0
3,1234238588474331136,° ° °,had to go to three diff convenient stores to find real milk (not soy) and they were all out of the lower priced non organic stuff .. i buy organic so i was fine with it but wow. covid 19 panic buying begins in manhattan.,2020-03-01 22:06:25,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01,had to go to three diff convenient stores to find real milk not soy and they were all out of the lower priced non organic stuff i buy organic so i was fine with it but wow covid 19 panic buying begins in manhattan,"[go, three, diff, convenient, stores, find, real, milk, soy, lower, priced, non, organic, stuff, buy, organic, fine, wow, covid, 19, panic, buying, begins, manhattan]",0
4,1234238537068883968,enigma4ever 🌹🆘 🌊 🕊️🍑👩‍⚕️💉😷,"#covid #coronovirius in nursing home setting , #kirkland more than 50 with symptoms ..3/1/20https://www.statnews.com/2020/02/29/new-covid-19-death-raises-concerns-about-virus-spread-in-nursing-homes/ …",2020-03-01 22:06:12,COVID,Brooklyn,40.650002,-73.949997,10mi,2020-02-01,covid coronovirius in nursing home setting kirkland more than 50 with symptoms 3120,"[covid, coronovirius, nursing, home, setting, kirkland, 50, symptoms, 3120]",0


In [33]:
mask = clean_og_df['cluster'] == 6
clean_og_df[mask].head(50)

Unnamed: 0,tweet_id,username,text,tweet_date,search_term,city,lat,long,radius,query_start,token_text,stop_text,cluster
238055,1259532432518402051,William Hoang,@smarcus online video sucks for edu. we working on a saas elearning platform with ai + remote features to solve that.\n\nhave paying customers currently and looking to raise our seed round.\n\ncap table is clean. love to connect with you on the topic to share our story and demo.,2020-05-10 17:15:07,round,Hoboken,40.744,-74.0324,10mi,2020-02-01,smarcus online video sucks for edu we working on a saas elearning platform with ai remote features to solve thathave paying customers currently and looking to raise our seed roundcap table is clean love to connect with you on the topic to share our story and demo,"[smarcus, online, video, sucks, edu, working, saas, elearning, platform, ai, remote, features, solve, thathave, paying, customers, currently, looking, raise, seed, roundcap, table, clean, love, connect, topic, share, story, demo]",6
238127,1259492524525334529,William Hoang,@blakeir online video sucks for edu. we working on a saas elearning platform with ai + remote features to solve that.\n\nhave paying customers currently and looking to raise our seed round.\n\ncap table is clean. love to connect with you on the topic to share our story and demo.,2020-05-10 14:36:32,round,Hoboken,40.744,-74.0324,10mi,2020-02-01,blakeir online video sucks for edu we working on a saas elearning platform with ai remote features to solve thathave paying customers currently and looking to raise our seed roundcap table is clean love to connect with you on the topic to share our story and demo,"[blakeir, online, video, sucks, edu, working, saas, elearning, platform, ai, remote, features, solve, thathave, paying, customers, currently, looking, raise, seed, roundcap, table, clean, love, connect, topic, share, story, demo]",6
238130,1259491712084426753,William Hoang,@byronling1 online video sucks for edu. we working on a saas elearning platform with ai + remote features to solve that.\n\nhave paying customers currently and looking to raise our seed round.\n\ncap table is clean. love to connect with you on the topic to share our story and demo.,2020-05-10 14:33:19,round,Hoboken,40.744,-74.0324,10mi,2020-02-01,byronling1 online video sucks for edu we working on a saas elearning platform with ai remote features to solve thathave paying customers currently and looking to raise our seed roundcap table is clean love to connect with you on the topic to share our story and demo,"[byronling1, online, video, sucks, edu, working, saas, elearning, platform, ai, remote, features, solve, thathave, paying, customers, currently, looking, raise, seed, roundcap, table, clean, love, connect, topic, share, story, demo]",6
238131,1259491649220198400,William Hoang,@rickheitzmann online video sucks for edu. we working on a saas elearning platform with ai + remote features to solve that.\n\nhave paying customers currently and looking to raise our seed round.\n\ncap table is clean. love to connect with you on the topic to share our story and demo,2020-05-10 14:33:04,round,Hoboken,40.744,-74.0324,10mi,2020-02-01,rickheitzmann online video sucks for edu we working on a saas elearning platform with ai remote features to solve thathave paying customers currently and looking to raise our seed roundcap table is clean love to connect with you on the topic to share our story and demo,"[rickheitzmann, online, video, sucks, edu, working, saas, elearning, platform, ai, remote, features, solve, thathave, paying, customers, currently, looking, raise, seed, roundcap, table, clean, love, connect, topic, share, story, demo]",6
238132,1259491611458908161,William Hoang,@alanjpatricof online video sucks for edu. we working on a saas elearning platform with ai + remote features to solve that.\n\nhave paying customers currently and looking to raise our seed round.\n\ncap table is clean. love to connect with you on the topic to share our story and demo,2020-05-10 14:32:55,round,Hoboken,40.744,-74.0324,10mi,2020-02-01,alanjpatricof online video sucks for edu we working on a saas elearning platform with ai remote features to solve thathave paying customers currently and looking to raise our seed roundcap table is clean love to connect with you on the topic to share our story and demo,"[alanjpatricof, online, video, sucks, edu, working, saas, elearning, platform, ai, remote, features, solve, thathave, paying, customers, currently, looking, raise, seed, roundcap, table, clean, love, connect, topic, share, story, demo]",6
238133,1259491568043655171,William Hoang,@hlmorgan online video sucks for edu. we working on a saas elearning platform with ai + remote features to solve that.\n\nhave paying customers currently and looking to raise our seed round.\n\ncap table is clean. love to connect with you on the topic to share our story and demo.,2020-05-10 14:32:44,round,Hoboken,40.744,-74.0324,10mi,2020-02-01,hlmorgan online video sucks for edu we working on a saas elearning platform with ai remote features to solve thathave paying customers currently and looking to raise our seed roundcap table is clean love to connect with you on the topic to share our story and demo,"[hlmorgan, online, video, sucks, edu, working, saas, elearning, platform, ai, remote, features, solve, thathave, paying, customers, currently, looking, raise, seed, roundcap, table, clean, love, connect, topic, share, story, demo]",6
238135,1259491028761001987,William Hoang,@mlifschitz32 online video sucks for edu. we working on a saas elearning platform with ai + remote features to solve that.\n\nhave paying customers currently and looking to raise our seed round.\n\ncap table is clean. love to connect with you on the topic to share our story and demo.,2020-05-10 14:30:36,round,Hoboken,40.744,-74.0324,10mi,2020-02-01,mlifschitz32 online video sucks for edu we working on a saas elearning platform with ai remote features to solve thathave paying customers currently and looking to raise our seed roundcap table is clean love to connect with you on the topic to share our story and demo,"[mlifschitz32, online, video, sucks, edu, working, saas, elearning, platform, ai, remote, features, solve, thathave, paying, customers, currently, looking, raise, seed, roundcap, table, clean, love, connect, topic, share, story, demo]",6
238137,1259490835974033409,William Hoang,@samirkaji online video sucks for edu. we working on a saas elearning platform with ai + remote features to solve that.\n\nhave paying customers currently and looking to raise our seed round.\n\ncap table is clean. love to connect with you on the topic to share our story and demo.,2020-05-10 14:29:50,round,Hoboken,40.744,-74.0324,10mi,2020-02-01,samirkaji online video sucks for edu we working on a saas elearning platform with ai remote features to solve thathave paying customers currently and looking to raise our seed roundcap table is clean love to connect with you on the topic to share our story and demo,"[samirkaji, online, video, sucks, edu, working, saas, elearning, platform, ai, remote, features, solve, thathave, paying, customers, currently, looking, raise, seed, roundcap, table, clean, love, connect, topic, share, story, demo]",6
238138,1259490514023395329,William Hoang,@jaykapoornyc online video sucks for edu. we working on a saas elearning platform with ai + remote features to solve that.\n\nhave paying customers currently and looking to raise our seed round.\n\ncap table is clean. love to connect with you on the topic to share our story and demo.,2020-05-10 14:28:33,round,Hoboken,40.744,-74.0324,10mi,2020-02-01,jaykapoornyc online video sucks for edu we working on a saas elearning platform with ai remote features to solve thathave paying customers currently and looking to raise our seed roundcap table is clean love to connect with you on the topic to share our story and demo,"[jaykapoornyc, online, video, sucks, edu, working, saas, elearning, platform, ai, remote, features, solve, thathave, paying, customers, currently, looking, raise, seed, roundcap, table, clean, love, connect, topic, share, story, demo]",6
238142,1259487665172463617,William Hoang,@heyreiwang online video sucks for edu. we working on a saas elearning platform with ai + remote features to solve that.\n\nhave paying customers currently and looking to raise our seed round.\n\ncap table is clean. love to connect with you on the topic to share our story and demo.,2020-05-10 14:17:14,round,Hoboken,40.744,-74.0324,10mi,2020-02-01,heyreiwang online video sucks for edu we working on a saas elearning platform with ai remote features to solve thathave paying customers currently and looking to raise our seed roundcap table is clean love to connect with you on the topic to share our story and demo,"[heyreiwang, online, video, sucks, edu, working, saas, elearning, platform, ai, remote, features, solve, thathave, paying, customers, currently, looking, raise, seed, roundcap, table, clean, love, connect, topic, share, story, demo]",6


In [38]:
clean_og_df.to_csv(r'./datasets/full_dataset.csv')