In [1]:
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.manifold import TSNE
from nltk.tokenize import word_tokenize
np.random.seed(10)

In [48]:
# Read the CSV at ./data/reddit_vm.csv into a DataFrame
df = pd.read_csv(filepath_or_buffer="./data/reddit_vm.csv")

In [49]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
0,Health Canada approves AstraZeneca COVID-19 va...,7,lt74vw,https://www.canadaforums.ca/2021/02/health-can...,0,1614400000.0,,2021-02-27 06:33:45
1,COVID-19 in Canada: 'Vaccination passports' a ...,2,lsh0ij,https://www.canadaforums.ca/2021/02/covid-19-i...,1,1614316000.0,,2021-02-26 07:11:07
2,Coronavirus variants could fuel Canada's third...,6,lohlle,https://www.canadaforums.ca/2021/02/coronaviru...,0,1613887000.0,,2021-02-21 07:50:08
3,Canadian government to extend COVID-19 emergen...,1,lnptv8,https://www.canadaforums.ca/2021/02/canadian-g...,0,1613796000.0,,2021-02-20 06:35:13
4,Canada: Pfizer is 'extremely committed' to mee...,6,lkslm6,https://www.canadaforums.ca/2021/02/canada-pfi...,0,1613468000.0,,2021-02-16 11:36:28


In [50]:
# Preview the last five rows
df.tail(n=5)

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
1419,Comment,1,ejackaa,,0,1553486000.0,I didn't say thimerosal is mercury. I said thi...,2019-03-25 05:50:41
1420,Comment,2,ejacj98,,0,1553486000.0,"The ""myth"" you're debunking is in regards to t...",2019-03-25 05:50:20
1421,Comment,2,ejabpdx,,0,1553485000.0,You'll have to read it again because I didn't ...,2019-03-25 05:40:03
1422,Comment,0,ej9xuaf,,0,1553475000.0,"What do you mean by ""your OP"". I am fairly new...",2019-03-25 02:45:21
1423,Comment,1,ej9x2qr,,0,1553474000.0,"When they say there's no thimerasol, they mean...",2019-03-25 02:35:47


In [51]:
# Print a summary of the DataFrame: columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1424 entries, 0 to 1423
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   title      1424 non-null   object 
 1   score      1424 non-null   int64  
 2   id         1424 non-null   object 
 3   url        444 non-null    object 
 4   comms_num  1424 non-null   int64  
 5   created    1424 non-null   float64
 6   body       1059 non-null   object 
 7   timestamp  1424 non-null   object 
dtypes: float64(1), int64(2), object(5)
memory usage: 89.1+ KB


In [52]:
# Count the missing values in each column
df.isnull().sum()

title          0
score          0
id             0
url          980
comms_num      0
created        0
body         365
timestamp      0
dtype: int64

In [53]:
# Build the corpus: every non-null title, kept in row order
title_text = list(df['title'].dropna())
title_text

['Health Canada approves AstraZeneca COVID-19 vaccine',
 "COVID-19 in Canada: 'Vaccination passports' a near certainty says bio-ethicist",
 "Coronavirus variants could fuel Canada's third wave",
 'Canadian government to extend COVID-19 emergency benefits',
 "Canada: Pfizer is 'extremely committed' to meeting vaccine delivery targets",
 'Canada: Oxford-AstraZeneca vaccine approval expected this week',
 'Comment',
 'COVID-19: Músicos que han recibido la vacuna',
 'Now Casting COVID-19 Vaccine Volunteers, Freezer Truck Drivers, and Others!',
 'Beer after corona vaccination',
 'Waiting for vaccine',
 'A great article: myths vs facts of the Covid vaccine',
 "Vietnam's Covid-19 vaccine, Nanocovax effective on variants: university 'Vietnam is currently working on four Covid-19 vaccines produced by Nanogen, the Institute of Vaccines and Medical Biologicals..'",
 'Pertussis',
 'Sobre las vacunas para el COVID19, compilación de textos científicos y opinión personal.',
 'If someone tells you the 

### Data Cleaning

Remove punctuation symbols from the text and convert everything to lower case.

In [54]:
# function to clean corpus
def clean_data(text, symbols=",.?!'-\"~():/+|\\[]=%;*"):
    """Strip punctuation symbols from each string and lower-case it.

    Parameters
    ----------
    text : iterable of str
        The strings to clean (e.g. the title corpus).
    symbols : str, optional
        Every character in this string is deleted from each input string.
        The default is the notebook's punctuation set.

    Returns
    -------
    list of str
        One cleaned, lower-cased string per input string, in the same order.
    """
    # NOTE: the double quote and the backslash in the default `symbols` are
    # escaped on purpose. Writing `""` inside a double-quoted literal ends the
    # string and starts a new one (implicit concatenation), which silently
    # leaves the quote character out of the set; a bare `\[` is an invalid
    # escape sequence.

    # A single translation table deletes all symbols in one pass per string.
    strip_table = str.maketrans("", "", symbols)

    # Lower-casing once per string is enough; none of the symbols are letters.
    return [t.translate(strip_table).lower() for t in text]

In [55]:
# Run the cleaner over the corpus and show how many titles came back
cleaned_title = clean_data(text=title_text)
len(cleaned_title)

1424

In [56]:
# removing titles that only say 'comment'
# Filter with a comprehension instead of calling list.remove() inside a loop
# over the same list: mutating a list while iterating it makes the loop stop
# early (after len/2 iterations), so the result depends on the list length
# rather than on how many 'comment' entries exist, and remove() raises
# ValueError as soon as no 'comment' is left.
cleaned_title = [title for title in cleaned_title if title != 'comment']
len(cleaned_title)

712

In [47]:
# Display the cleaned titles for a visual check
cleaned_title

['health canada approves astrazeneca covid19 vaccine',
 'covid19 in canada vaccination passports a near certainty says bioethicist',
 'coronavirus variants could fuel canadas third wave',
 'canadian government to extend covid19 emergency benefits',
 'canada pfizer is extremely committed to meeting vaccine delivery targets',
 'canada oxfordastrazeneca vaccine approval expected this week',
 'covid19 músicos que han recibido la vacuna',
 'now casting covid19 vaccine volunteers freezer truck drivers and others',
 'beer after corona vaccination',
 'waiting for vaccine',
 'a great article myths vs facts of the covid vaccine',
 'vietnams covid19 vaccine nanocovax effective on variants university vietnam is currently working on four covid19 vaccines produced by nanogen the institute of vaccines and medical biologicals',
 'pertussis',
 'sobre las vacunas para el covid19 compilación de textos científicos y opinión personal',
 'if someone tells you the vaccine contains a microchip ask them what w