# This notebook is the continuation of the Reddit-Scraping.ipynb

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import time
import requests
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.base import TransformerMixin
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
import datetime as dt

In [2]:
# read from reddit.csv - refer to Scraping-Part-1.ipynb
final_df = pd.read_csv('./reddit.csv')

In [3]:
final_df.shape

(4319, 9)

In [4]:
# display first 5 rows
final_df.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,Knock-knock,Who's there? \n**A parrot!** \nA parrot who...,Jokes,1552069053,motsanciens,0,0,True,2019-03-08
1,How did the dentist suddenly become a brain su...,A slip of the hand.,Jokes,1552069079,roastedtoperfection,0,4,True,2019-03-08
2,I hate build a bear. I took my chihauhua there...,AND the stuffed animal they gave me keeps bark...,Jokes,1552069382,RikorperationYT,0,2,True,2019-03-08
3,An English Teacher And The Pope Was Sitting Ne...,"He was reading a challenging book, and was ver...",Jokes,1552069428,GangstaKev,2,3,True,2019-03-08
4,"I looked left, then I looked right. I looked l...",Then I pulled out... she wasn’t pleased.,Jokes,1552069459,Windwaker85,0,2,True,2019-03-08


In [5]:
final_df.dtypes

title           object
selftext        object
subreddit       object
created_utc      int64
author          object
num_comments     int64
score            int64
is_self           bool
timestamp       object
dtype: object

In [6]:
# check for nulls in final_df
final_df.isnull().sum()

title             0
selftext        118
subreddit         0
created_utc       0
author            0
num_comments      0
score             0
is_self           0
timestamp         0
dtype: int64

In [7]:
# drop nulls with inplace true
final_df.dropna(inplace=True)

In [8]:
# check for nulls after dropna
final_df['selftext'].isnull().sum()

0

In [9]:
# check for the values in selftext using value counts
final_df['selftext'].value_counts(ascending = False).head()

[removed]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     667
[deleted]                                                             

In [10]:
# create a new df 'final_df_2' exclude '[removed]' and '[deleted]'
final_df_2 = final_df[(final_df.selftext != '[removed]') & (final_df.selftext != '[deleted]')].copy(deep = True)


In [11]:
# check for the values in selftext using value counts after deleting above
final_df_2['selftext'].value_counts(ascending = False).head()

I don’t have 2020 vision\n\nThis is the only day you can upvote this\n\nEDIT: Thank you sm for r/all ! Happy New Years!                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [12]:
# display first 5 rows
final_df_2.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,Knock-knock,Who's there? \n**A parrot!** \nA parrot who...,Jokes,1552069053,motsanciens,0,0,True,2019-03-08
1,How did the dentist suddenly become a brain su...,A slip of the hand.,Jokes,1552069079,roastedtoperfection,0,4,True,2019-03-08
2,I hate build a bear. I took my chihauhua there...,AND the stuffed animal they gave me keeps bark...,Jokes,1552069382,RikorperationYT,0,2,True,2019-03-08
3,An English Teacher And The Pope Was Sitting Ne...,"He was reading a challenging book, and was ver...",Jokes,1552069428,GangstaKev,2,3,True,2019-03-08
4,"I looked left, then I looked right. I looked l...",Then I pulled out... she wasn’t pleased.,Jokes,1552069459,Windwaker85,0,2,True,2019-03-08


In [13]:
final_df_2['subreddit'].head()

0    Jokes
1    Jokes
2    Jokes
3    Jokes
4    Jokes
Name: subreddit, dtype: object

In [14]:
# value counts for subreddit
final_df_2['subreddit'].value_counts()

Jokes          2129
datascience    1394
Name: subreddit, dtype: int64

In [15]:
# check for duplicates
final_df_2.duplicated().sum()

0

In [16]:
# create a new column 'is_datascience', if a subreddit is related to datascience, keep 1, otherwise 0
final_df_2['is_datascience'] = final_df_2['subreddit'].map({'Jokes':0, 'datascience':1})

In [17]:
final_df_2['is_datascience'].sample(10)

1       0
3100    1
2115    0
2907    1
3976    1
1059    0
2988    1
1426    0
3930    1
288     0
Name: is_datascience, dtype: int64

In [18]:
# create a new column 'title_text' by merging title and selftext
final_df_2['title_text'] = final_df_2['title'] + ' ' + final_df_2['selftext']
final_df_2['title_text']

0       Knock-knock Who's there?   \n**A parrot!**  \n...
1       How did the dentist suddenly become a brain su...
2       I hate build a bear. I took my chihauhua there...
3       An English Teacher And The Pope Was Sitting Ne...
4       I looked left, then I looked right. I looked l...
5       A man goes the cinema to see the first Harry P...
6       Hey, want a book full of jokes? Here's a copy ...
7       I just heard that the first manned mission of ...
8       I’m 6 foot 3 inches Those are two different me...
9       I was going to make a sexual harassment joke ....
10      How many vegans does it take to eat a cheese a...
11      The difference between being hungry or horny i...
12                I'm not racist, BUT... ...I like trains
13      Did you hear how loud that hooker's fart was? ...
14      What’s the difference between a hippo and a zi...
15      I'm not sure how I feel about this rash on my ...
16      Why did the lottery winner want to stay homele...
17      Why ar

In [19]:
# copy the cleaned data to a csv - reddit_EDA.csv
final_df_2.to_csv('./reddit_EDA.csv', index = False)