In [1]:
import numpy as np
import pandas as pd
import time
import gensim
import re

from numba import jit, cuda
from gensim.models.word2vec import Word2Vec 
from nltk.tokenize import RegexpTokenizer 
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN

  from pandas import Panel


## Data Cleansing

In [2]:
def stop_word_remover(df):
    df['text'] = df['text'].astype(str).str.lower()
    df['token_text'] = df['text'].str.replace('([^ a-zA-Z0-9])', '').str.replace('http\S+|www.\S+', '', case=False).replace('coronavirus', 'covid19')

    stop = stopwords.words('english')
    df['stop_text'] = df['token_text'].apply(lambda x: [item for item in str(x).split() 
                                                    if item not in stop])
    return df

In [3]:
df1 = pd.read_csv('../adam/datasets/scrape_5.12.csv')
pd.set_option('display.max_colwidth', -1)

  


In [4]:
df1.shape

(121482, 10)

In [5]:
df1.drop_duplicates(inplace=True)
df1.shape

(114044, 10)

In [6]:
df1 = stop_word_remover(df1)

In [7]:
df2 = pd.read_csv('../adam/datasets/test_5.14.csv')
pd.set_option('display.max_colwidth', -1)

  


In [8]:
df2.shape

(221214, 10)

In [9]:
df2.drop_duplicates(inplace=True)
df2.shape

(201794, 10)

In [10]:
df2 = stop_word_remover(df2)

In [11]:
df = pd.concat([df1,df2], ignore_index = True, verify_integrity = True)
df.shape

(315838, 12)

In [12]:
df['text'] = df['text'].astype(str).str.lower()
df['token_text'] = df['text'].str.replace('([^ a-zA-Z0-9])', '').str.replace('http\S+|www.\S+', '', case=False).replace('coronavirus', 'covid19')

stop = stopwords.words('english')
df['stop_text'] = df['token_text'].apply(lambda x: [item for item in str(x).split() 
                                                    if item not in stop])



#df['split_text'] = df['token_text'].astype(str).str.lower().str.split()

In [13]:
df['text'].head()

0    did the first of several shopping runs, though i ended up walking out of cvs without buying anything. if the rate of people coughing at me is any indication, we’ll all have covid-19 by thursday. \n\ncover your fucking mouths when you cough.
1    looks like adios muchachos for the us #covid #covid19us #coronavirushttps://twitter.com/nytimes/status/1234171621696557057 …                                                                                                                    
2    humour..snl...on #covid #coronavirususahttps://mashable.com/video/snl-coronavirus-democratic-candidates-cold-open/ …                                                                                                                            
3    had to go to three diff convenient stores to find real milk (not soy) and they were all out of the lower priced non organic stuff .. i buy organic so i was fine with it but wow. covid 19 panic buying begins in manhattan.                    
4    #covid #cor

In [14]:
df['stop_text'].head()

0    [first, several, shopping, runs, though, ended, walking, cvs, without, buying, anything, rate, people, coughing, indication, well, covid19, thursday, cover, fucking, mouths, cough]
1    [looks, like, adios, muchachos, us, covid, covid19us, coronavirus]                                                                                                                  
2    [humoursnlon, covid, coronavirususa]                                                                                                                                                
3    [go, three, diff, convenient, stores, find, real, milk, soy, lower, priced, non, organic, stuff, buy, organic, fine, wow, covid, 19, panic, buying, begins, manhattan]              
4    [covid, coronovirius, nursing, home, setting, kirkland, 50, symptoms, 3120]                                                                                                         
Name: stop_text, dtype: object

In [15]:
df['token_text'].head()

0    did the first of several shopping runs though i ended up walking out of cvs without buying anything if the rate of people coughing at me is any indication well all have covid19 by thursday cover your fucking mouths when you cough
1    looks like adios muchachos for the us covid covid19us coronavirus                                                                                                                                                                    
2    humoursnlon covid coronavirususa                                                                                                                                                                                                     
3    had to go to three diff convenient stores to find real milk not soy and they were all out of the lower priced non organic stuff  i buy organic so i was fine with it but wow covid 19 panic buying begins in manhattan               
4    covid coronovirius in nursing home setting  kirkland mo

## Word Vectorizer

In [16]:
sent = [row for row in df1['stop_text']]

In [17]:
tweet_w2v = Word2Vec(size=1000, min_count=500, window=10)

In [18]:
tweet_w2v.build_vocab(sent)

In [19]:
tweet_w2v.most_similar('covid')

  """Entry point for launching an IPython kernel.


[('right', 0.09585417807102203),
 ('twitter', 0.08774752914905548),
 ('president', 0.07988424599170685),
 ('waiting', 0.07965778559446335),
 ('chinese', 0.07624496519565582),
 ('safety', 0.07341757416725159),
 ('able', 0.07280883193016052),
 ('helping', 0.06696482747793198),
 ('united', 0.06436911970376968),
 ('community', 0.06277808547019958)]

## Tweet Vectorizer

In [20]:
# Functions found in this cell were created by instructor Noah Christiansen
# He is great
# Define vectorization function
#word2vecmodel = tweet_w2v
def vectorize_corpus(keyword_list):    
    # Instantiate counter for number of words in keyword_list that exists
    n_words = 0
    # Create template for cumulative corpus vector sum
    corpus_vec_sum = np.zeros((1,1000))                 
    # Scan through each word in list
    for word in keyword_list:
        if word in tweet_w2v.wv.vocab:                    
            word_vec = tweet_w2v.wv.word_vec(word)
            #print(word_vec)
            n_words +=1                                
            corpus_vec_sum = corpus_vec_sum + word_vec 
    # Compute average vector by taking cumulative vector sum and dividing it by number of words traced
    corpus_avg_vec = corpus_vec_sum/n_words
    # Squeeze this N-dimensional nested array object into a 1-D array to streamline future processing
    corpus_avg_vec = np.squeeze(corpus_avg_vec)
    return(corpus_avg_vec)

#defining cosine similarity function
def cos_sim(vector_1, vector_2):
    dp = np.dot(vector_1, vector_2)
    magnitude_v1 = np.sqrt(np.dot(vector_1,vector_1))
    magnitude_v2 = np.sqrt(np.dot(vector_2,vector_2))
    return(dp/(magnitude_v1*magnitude_v2))


In [None]:
vect_tweets = [vectorize_corpus(tweet) for tweet in df['stop_text']]
vect_df = pd.DataFrame(vect_tweets)



In [None]:
print(df.shape)
print(vect_df.shape)

In [None]:
join_df = df.join(vect_df)
join_df.dropna(inplace = True)
join_df.shape

In [None]:
clean_vec_df= join_df.iloc[:, [i for i in range(13,1012)]]
clean_og_df = join_df.iloc[:, [i for i in range(0,12)]]
clean_vec_df.head()

In [None]:
clean_og_df.head()

In [None]:
dbscan = DBSCAN(metric='cosine', eps=0.4, min_samples=30, n_jobs = -1) # you can change these parameters, given just for example 
cluster_labels = dbscan.fit_predict(data) # where X - is your matrix, where each row corresponds to one document (line) from the docs, you need to cluster 

In [None]:
clean_og_df['cluster'] = cluster_labels

In [None]:
clean_og_df['cluster'].value_counts()

In [None]:
clean_og_df.head(5)

In [None]:
mask = clean_og_df['cluster'] == 0
clean_og_df[mask].head(50)