In [1]:
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.manifold import TSNE
from nltk.tokenize import word_tokenize
np.random.seed(10)

In [2]:
# Load the dataset (reddit_vm.csv) from the local ./data directory
df = pd.read_csv("./data/reddit_vm.csv")

In [3]:
df.head()

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
0,Health Canada approves AstraZeneca COVID-19 va...,7,lt74vw,https://www.canadaforums.ca/2021/02/health-can...,0,1614400000.0,,2021-02-27 06:33:45
1,COVID-19 in Canada: 'Vaccination passports' a ...,2,lsh0ij,https://www.canadaforums.ca/2021/02/covid-19-i...,1,1614316000.0,,2021-02-26 07:11:07
2,Coronavirus variants could fuel Canada's third...,6,lohlle,https://www.canadaforums.ca/2021/02/coronaviru...,0,1613887000.0,,2021-02-21 07:50:08
3,Canadian government to extend COVID-19 emergen...,1,lnptv8,https://www.canadaforums.ca/2021/02/canadian-g...,0,1613796000.0,,2021-02-20 06:35:13
4,Canada: Pfizer is 'extremely committed' to mee...,6,lkslm6,https://www.canadaforums.ca/2021/02/canada-pfi...,0,1613468000.0,,2021-02-16 11:36:28


In [4]:
df.tail()

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
1419,Comment,1,ejackaa,,0,1553486000.0,I didn't say thimerosal is mercury. I said thi...,2019-03-25 05:50:41
1420,Comment,2,ejacj98,,0,1553486000.0,"The ""myth"" you're debunking is in regards to t...",2019-03-25 05:50:20
1421,Comment,2,ejabpdx,,0,1553485000.0,You'll have to read it again because I didn't ...,2019-03-25 05:40:03
1422,Comment,0,ej9xuaf,,0,1553475000.0,"What do you mean by ""your OP"". I am fairly new...",2019-03-25 02:45:21
1423,Comment,1,ej9x2qr,,0,1553474000.0,"When they say there's no thimerasol, they mean...",2019-03-25 02:35:47


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1424 entries, 0 to 1423
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   title      1424 non-null   object 
 1   score      1424 non-null   int64  
 2   id         1424 non-null   object 
 3   url        444 non-null    object 
 4   comms_num  1424 non-null   int64  
 5   created    1424 non-null   float64
 6   body       1059 non-null   object 
 7   timestamp  1424 non-null   object 
dtypes: float64(1), int64(2), object(5)
memory usage: 89.1+ KB


In [6]:
df.isna().sum()

title          0
score          0
id             0
url          980
comms_num      0
created        0
body         365
timestamp      0
dtype: int64

In [7]:
# Build the corpus: every non-null title, kept in DataFrame row order
title_text = list(df['title'].dropna())
title_text

['Health Canada approves AstraZeneca COVID-19 vaccine',
 "COVID-19 in Canada: 'Vaccination passports' a near certainty says bio-ethicist",
 "Coronavirus variants could fuel Canada's third wave",
 'Canadian government to extend COVID-19 emergency benefits',
 "Canada: Pfizer is 'extremely committed' to meeting vaccine delivery targets",
 'Canada: Oxford-AstraZeneca vaccine approval expected this week',
 'Comment',
 'COVID-19: Músicos que han recibido la vacuna',
 'Now Casting COVID-19 Vaccine Volunteers, Freezer Truck Drivers, and Others!',
 'Beer after corona vaccination',
 'Waiting for vaccine',
 'A great article: myths vs facts of the Covid vaccine',
 "Vietnam's Covid-19 vaccine, Nanocovax effective on variants: university 'Vietnam is currently working on four Covid-19 vaccines produced by Nanogen, the Institute of Vaccines and Medical Biologicals..'",
 'Pertussis',
 'Sobre las vacunas para el COVID19, compilación de textos científicos y opinión personal.',
 'If someone tells you the 

### Data Cleaning

Remove punctuation and other symbols from the text and convert everything to lowercase.

In [8]:
# function to clean
def clean_data(text):
    """Lower-case each string in ``text`` and delete punctuation/symbol characters.

    Parameters
    ----------
    text : iterable of str
        The raw strings to clean (e.g. post titles).

    Returns
    -------
    list of str
        One cleaned string per input string, in the same order.

    Notes
    -----
    Symbols are deleted rather than replaced by a space, so ``"covid-19"``
    becomes ``"covid19"`` and ``"a/b"`` becomes ``"ab"``.
    """
    # The double quote and the backslash are written as explicit escapes.
    # A bare "" inside a double-quoted literal ends the string and starts a
    # new one (adjacent literals are concatenated), which silently leaves the
    # double quote out of the set; a bare \[ is an invalid escape sequence.
    symbols = ",.?!'-\"~():/+|\\[]=%;*"

    # One translation table deletes every symbol in a single pass per string,
    # and each string is lower-cased once instead of once per symbol.
    removal_table = str.maketrans("", "", symbols)

    return [t.lower().translate(removal_table) for t in text]

In [9]:
# Clean every title in the corpus, then display how many cleaned titles there are
cleaned_title = clean_data(title_text)
len(cleaned_title)

1424

In [10]:
# Number of cleaned titles that are exactly the string 'comment'
comment_count = cleaned_title.count('comment')

In [11]:
# Drop every title that is exactly 'comment' (rows titled "Comment" before
# cleaning). Rebuilding the list instead of calling list.remove() in a loop
# keeps this cell safe to re-run: remove() raises ValueError once no
# 'comment' entries are left, and every remove() call rescans the list.
cleaned_title = [title for title in cleaned_title if title != 'comment']

len(cleaned_title)

444

### Tokenize


In [12]:
# function to tokenize
def tokenize(text):
    """Join the strings in ``text`` with single spaces and tokenize the result.

    ``text`` is an iterable of strings. They are concatenated into one string
    before being passed to ``word_tokenize``, so the result is one flat
    sequence of tokens and the boundaries between the input strings are lost.
    """
    return word_tokenize(' '.join(text))
                                  

In [13]:
# Tokenize the cleaned titles and preview the first ten tokens
tokenized_title = tokenize(cleaned_title)
tokenized_title[:10]

['health',
 'canada',
 'approves',
 'astrazeneca',
 'covid19',
 'vaccine',
 'covid19',
 'in',
 'canada',
 'vaccination']

### Vectorization

Convert the text into a count vector in which each element holds the number of times a particular word occurs.
The vector's length equals the size of the vocabulary -- here, every distinct word that appears in our corpus.

In [15]:
# function to return a count vectorized representation of text as a dictionary
def count_vectorize(text, vocab=None):
    """Return a count-vectorized representation of ``text`` as a dictionary.

    Parameters
    ----------
    text : iterable of hashable
        The tokens to count.
    vocab : iterable of hashable, optional
        A fixed vocabulary to count against. Every vocabulary word appears in
        the result (with a count of 0 if it never occurs in ``text``), and
        tokens that are not in the vocabulary are ignored. When ``vocab`` is
        ``None`` or empty, the vocabulary is the set of distinct tokens in
        ``text``.

    Returns
    -------
    dict
        Maps each vocabulary word to its number of occurrences in ``text``.
        Keys follow ``vocab`` order when a vocabulary is given, otherwise the
        order in which words first appear in ``text``, so the result is the
        same on every run.
    """
    use_fixed_vocab = bool(vocab)

    # With a fixed vocabulary, start every word at 0; otherwise grow the
    # dictionary as new words are encountered.
    counts = dict.fromkeys(vocab, 0) if use_fixed_vocab else {}

    for token in text:
        if token in counts:
            counts[token] += 1
        elif not use_fixed_vocab:
            counts[token] = 1
        # A token outside a fixed vocabulary is skipped rather than raising
        # KeyError.

    return counts
    

In [17]:
vectorized_title = count_vectorize(tokenized_title)

# Show only the ten most frequent words; printing the whole dictionary
# writes one entry per vocabulary word into the cell output.
sorted(vectorized_title.items(), key=lambda item: item[1], reverse=True)[:10]



### TF-IDF Vectorization
TF-IDF is a more advanced form of vectorization that weights each term in a document by how distinctive it is to that document. This allows us to summarize the contents of a document using a few key words.

If a word is used often in many other documents, it is not distinctive, and is therefore probably not very useful for figuring out how this document differs from the others.

Conversely, if a word is used many times in one document but rarely in the other documents in the corpus, it is likely a good indicator that the word is important to the document in question.

In [None]:
# Function to calculate the TF