# Preprocessing for NLP

In [4]:
# importing libraries for text pre-processing
import pandas as pd
import nltk
import string 
from nltk.stem import WordNetLemmatizer

In [5]:
# Reading in my cleaned reddits csv
reddits = pd.read_csv('./data/reddits.csv')

In [6]:
# Checking out the data
reddits.head()

Unnamed: 0,subreddit,type,author,title,body,selftext,created_utc
0,gimlet,comment,CubeLifeDeskJob,_,She didn't mention this when I asked her. She...,_,1553281124
1,gimlet,comment,CubeLifeDeskJob,_,"I mean, I was, but not for the sole purpose of...",_,1553280963
2,gimlet,comment,heimmrich,_,Hardly. The best talent around in podcast has ...,_,1553280896
3,gimlet,comment,heimmrich,_,She can't do season 2 because gimlet owns the ...,_,1553280716
4,gimlet,comment,Subalpine,_,Search Party. really excellent and critically ...,_,1553280571


In [7]:
# Removing the punctuation from my columns I plan on using
reddits['author'] = reddits['author'].str.replace('[^\w\s]','')
reddits['title'] = reddits['title'].str.replace('[^\w\s]','')
reddits['body'] = reddits['body'].str.replace('[^\w\s]','')
reddits['selftext'] = reddits['selftext'].str.replace('[^\w\s]','')

In [8]:
# Making sure it worked
reddits.head()

Unnamed: 0,subreddit,type,author,title,body,selftext,created_utc
0,gimlet,comment,CubeLifeDeskJob,_,She didnt mention this when I asked her She i...,_,1553281124
1,gimlet,comment,CubeLifeDeskJob,_,I mean I was but not for the sole purpose of b...,_,1553280963
2,gimlet,comment,heimmrich,_,Hardly The best talent around in podcast has b...,_,1553280896
3,gimlet,comment,heimmrich,_,She cant do season 2 because gimlet owns the r...,_,1553280716
4,gimlet,comment,Subalpine,_,Search Party really excellent and critically a...,_,1553280571


In [9]:
# Making everything lower case
reddits['author'] = reddits['author'].str.lower() 
reddits['title'] = reddits['title'].str.lower() 
reddits['body'] = reddits['body'].str.lower() 
reddits['selftext'] = reddits['selftext'].str.lower() 

In [10]:
# Checking out my dataframe
reddits.head()

Unnamed: 0,subreddit,type,author,title,body,selftext,created_utc
0,gimlet,comment,cubelifedeskjob,_,she didnt mention this when i asked her she i...,_,1553281124
1,gimlet,comment,cubelifedeskjob,_,i mean i was but not for the sole purpose of b...,_,1553280963
2,gimlet,comment,heimmrich,_,hardly the best talent around in podcast has b...,_,1553280896
3,gimlet,comment,heimmrich,_,she cant do season 2 because gimlet owns the r...,_,1553280716
4,gimlet,comment,subalpine,_,search party really excellent and critically a...,_,1553280571


In [11]:
# Making a copy of reddits so I can have one dataframe with authors and one without 
reddits2 = reddits.copy()

In [12]:
# Checking reddits2
reddits2.head()

Unnamed: 0,subreddit,type,author,title,body,selftext,created_utc
0,gimlet,comment,cubelifedeskjob,_,she didnt mention this when i asked her she i...,_,1553281124
1,gimlet,comment,cubelifedeskjob,_,i mean i was but not for the sole purpose of b...,_,1553280963
2,gimlet,comment,heimmrich,_,hardly the best talent around in podcast has b...,_,1553280896
3,gimlet,comment,heimmrich,_,she cant do season 2 because gimlet owns the r...,_,1553280716
4,gimlet,comment,subalpine,_,search party really excellent and critically a...,_,1553280571


In [13]:
# Making a words column, and reddits2 will have author, title, body, and selftext
reddits2['words'] = reddits2['author'] + ' ' + reddits2['title'] + ' ' + reddits2['body'] + ' ' + reddits2['selftext']

In [14]:
# Making a words column for reddits, which will have everything above other than the author
reddits['words'] = reddits['title'] + ' ' + reddits['body'] + ' ' + reddits['selftext']

In [15]:
# Looking at reddits
reddits.head()

Unnamed: 0,subreddit,type,author,title,body,selftext,created_utc,words
0,gimlet,comment,cubelifedeskjob,_,she didnt mention this when i asked her she i...,_,1553281124,_ she didnt mention this when i asked her she...
1,gimlet,comment,cubelifedeskjob,_,i mean i was but not for the sole purpose of b...,_,1553280963,_ i mean i was but not for the sole purpose of...
2,gimlet,comment,heimmrich,_,hardly the best talent around in podcast has b...,_,1553280896,_ hardly the best talent around in podcast has...
3,gimlet,comment,heimmrich,_,she cant do season 2 because gimlet owns the r...,_,1553280716,_ she cant do season 2 because gimlet owns the...
4,gimlet,comment,subalpine,_,search party really excellent and critically a...,_,1553280571,_ search party really excellent and critically...


In [16]:
# Looking at reddits2
reddits2.head()

Unnamed: 0,subreddit,type,author,title,body,selftext,created_utc,words
0,gimlet,comment,cubelifedeskjob,_,she didnt mention this when i asked her she i...,_,1553281124,cubelifedeskjob _ she didnt mention this when ...
1,gimlet,comment,cubelifedeskjob,_,i mean i was but not for the sole purpose of b...,_,1553280963,cubelifedeskjob _ i mean i was but not for the...
2,gimlet,comment,heimmrich,_,hardly the best talent around in podcast has b...,_,1553280896,heimmrich _ hardly the best talent around in p...
3,gimlet,comment,heimmrich,_,she cant do season 2 because gimlet owns the r...,_,1553280716,heimmrich _ she cant do season 2 because gimle...
4,gimlet,comment,subalpine,_,search party really excellent and critically a...,_,1553280571,subalpine _ search party really excellent and ...


In [17]:
# Dropping the columns I combined or don't need
reddits.drop(columns=['author', 'title', 'body', 'selftext'], inplace=True)
reddits2.drop(columns=['author', 'title', 'body', 'selftext'], inplace=True)

In [18]:
# Checking out reddits
reddits.head()

Unnamed: 0,subreddit,type,created_utc,words
0,gimlet,comment,1553281124,_ she didnt mention this when i asked her she...
1,gimlet,comment,1553280963,_ i mean i was but not for the sole purpose of...
2,gimlet,comment,1553280896,_ hardly the best talent around in podcast has...
3,gimlet,comment,1553280716,_ she cant do season 2 because gimlet owns the...
4,gimlet,comment,1553280571,_ search party really excellent and critically...


In [19]:
# Checking out reddits 2
reddits2.head()

Unnamed: 0,subreddit,type,created_utc,words
0,gimlet,comment,1553281124,cubelifedeskjob _ she didnt mention this when ...
1,gimlet,comment,1553280963,cubelifedeskjob _ i mean i was but not for the...
2,gimlet,comment,1553280896,heimmrich _ hardly the best talent around in p...
3,gimlet,comment,1553280716,heimmrich _ she cant do season 2 because gimle...
4,gimlet,comment,1553280571,subalpine _ search party really excellent and ...


In [20]:
# Replacing the underscore I used for nulls with a space
reddits['words'] = reddits['words'].str.replace('_', ' ')
reddits2['words'] = reddits2['words'].str.replace('_', ' ')

In [21]:
# Looking at reddits
reddits.head()

Unnamed: 0,subreddit,type,created_utc,words
0,gimlet,comment,1553281124,she didnt mention this when i asked her she...
1,gimlet,comment,1553280963,i mean i was but not for the sole purpose of...
2,gimlet,comment,1553280896,hardly the best talent around in podcast has...
3,gimlet,comment,1553280716,she cant do season 2 because gimlet owns the...
4,gimlet,comment,1553280571,search party really excellent and critically...


In [22]:
# Making Maximum Fun my positive class
reddits['subreddit'].replace('maximumfun', 1, inplace=True)
reddits2['subreddit'].replace('maximumfun', 1, inplace=True)

In [23]:
# Making Gimlet my negative class
reddits['subreddit'].replace('gimlet', 0, inplace=True)
reddits2['subreddit'].replace('gimlet', 0, inplace=True)

In [33]:
# I tested out lemmatization and it did not quite give me the results I expected. This is something I would
# like to experiment further with this
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize(words):
    items = []
    lemmas = [lemmatizer.lemmatize(x) for x in (words)]
    items.append(lemmas)
    return items
lemmatize(reddits['words'][:5])

[['  she didnt mention this when i asked her  she implied there is nothing preventing her  but even if thats true she could very easily make a similar show with a different name   ',
  '  i mean i was but not for the sole purpose of being an ass  i think she believes her own bs that its all gimlets fault etc  i wanted her to see what the majority of reasonable people see when she tweets about gimlet   \n\n\ninstead she purposefully ignored my point and tried to drum up sympathy by saying someone photoshopped me shot dead in a chair as if i was making some kind of threat i can only imagine the type of games she played with her previous employer   ',
  '  hardly the best talent around in podcast has been doing radio for years now goldstein and starlee are both good examples so even as podcasts expand quickly theres a lot of lag on new talent because it takes a lot of time to hone a craft like this  ',
  '  she cant do season 2 because gimlet owns the rights to the show  ',
  '  search pa

In [28]:
# Looking at the info for reddits
reddits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28012 entries, 0 to 28011
Data columns (total 4 columns):
subreddit      28012 non-null int64
type           28012 non-null object
created_utc    28012 non-null int64
words          28012 non-null object
dtypes: int64(2), object(2)
memory usage: 875.5+ KB


In [29]:
# Looking at the info for reddits2
reddits2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28012 entries, 0 to 28011
Data columns (total 4 columns):
subreddit      28012 non-null int64
type           28012 non-null object
created_utc    28012 non-null int64
words          28012 non-null object
dtypes: int64(2), object(2)
memory usage: 875.5+ KB


In [213]:
# Reading my dataframes into two files, one with authors and one without
# I will use my data without authors to test my models, but then look at my best models and compare with
# authors included
reddits.to_csv('./data/reddits_preprocessed.csv', index=False)
reddits2.to_csv('./data/reddits_preprocessed_authors.csv', index=False)