# 02. Assemble and Clean Data for Classification Project

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import regex as re
from nltk.corpus import stopwords 
np.random.seed(42)


## A. Assembly Process

In [2]:


# Each debate subreddit has a submissions table and a matching comments table.

# r/DebateEvolution
df_a = pd.read_csv(filepath_or_buffer='./DebateEvolution.csv')
df_a_com = pd.read_csv(filepath_or_buffer='./DebateEvolutionComments.csv')

# r/Creation
df_b = pd.read_csv(filepath_or_buffer='./Creation.csv')
df_b_com = pd.read_csv(filepath_or_buffer='./CreationComments.csv')

# r/prochoice
df_c = pd.read_csv(filepath_or_buffer='./Prochoice.csv')
df_c_com = pd.read_csv(filepath_or_buffer='./ProchoiceComments.csv')

# r/prolife
df_d = pd.read_csv(filepath_or_buffer='./Prolife.csv')
df_d_com = pd.read_csv(filepath_or_buffer='./ProlifeComments.csv')

# General table (submissions only, no comments file)
df_e = pd.read_csv(filepath_or_buffer='./general.csv')


In [3]:
def combine(df, df_com):
    """Stack a submissions table and a comments table into one text table.

    Parameters
    ----------
    df : pandas.DataFrame
        Submissions; must contain the columns 'title', 'score', 'selftext'
        and 'timestamp'.
    df_com : pandas.DataFrame
        Comments; must contain the columns 'body', 'score' and 'timestamp'.

    Returns
    -------
    pandas.DataFrame
        Columns 'body', 'score', 'timestamp'. For submissions, 'body' is the
        title followed by a space and the selftext; comments have no title,
        so their 'body' is a space followed by the comment text. Rows whose
        raw body is '[removed]' and rows with a null 'score' or 'timestamp'
        are dropped. The index is the one produced by the concatenation
        (not reset after filtering).
    """
    # Combine comment table and submission table
    submissions = df[['title', 'score', 'selftext', 'timestamp']].rename(
        columns={'selftext': 'body'}
    )
    comments = df_com[['body', 'score', 'timestamp']]
    result = pd.concat([submissions, comments], ignore_index=True)

    # Fill in missing values with empty strings (comments have no title)
    result['body'] = result['body'].fillna("")
    result['title'] = result['title'].fillna("")

    # Drop any rows for which the body text was removed by Reddit.
    # This must happen BEFORE the title is prepended: afterwards the body is
    # at least " [removed]" and an equality test against '[removed]' would
    # never match.
    result = result.loc[result['body'].str.strip() != '[removed]']

    # Combine titles with body text and keep only the output columns
    result = result.assign(body=result['title'] + " " + result['body'])
    result = result[['body', 'score', 'timestamp']]

    # Drop any rows that have any null values; return an independent copy so
    # callers can add columns to the result without touching a slice.
    return result.dropna().copy()

In [4]:
# Merge each subreddit's submissions table with its comments table
debate_evolution, creation, prochoice, prolife = (
    combine(submissions, comments)
    for submissions, comments in (
        (df_a, df_a_com),
        (df_b, df_b_com),
        (df_c, df_c_com),
        (df_d, df_d_com),
    )
)

In [5]:
# Pair up opposing subreddits: within each pair the first is labelled 0 and
# the second 1, then the two are stacked into a single table.

# Evolution debate: r/DebateEvolution -> 0, r/Creation -> 1
debate_evolution['class'], creation['class'] = 0, 1
evol_debate = pd.concat(objs=[debate_evolution, creation], ignore_index=True)

# Abortion debate: r/prochoice -> 0, r/prolife -> 1
prochoice['class'], prolife['class'] = 0, 1
abort_debate = pd.concat(objs=[prochoice, prolife], ignore_index=True)



In [6]:
# Keep only the columns used below. The explicit .copy() makes df_e an
# independent frame, so the column assignments in the following cells write
# to df_e itself rather than to a slice of the originally loaded table
# (the pattern that triggers pandas' SettingWithCopyWarning).
df_e = df_e[['title', 'subreddit', 'num_comments', 'created_utc']].copy()

In [7]:
# Gap between each post's creation time and the most recent creation time in
# the table (same units as created_utc).
latest_created_utc = df_e['created_utc'].max()
df_e['elapsed'] = latest_created_utc - df_e['created_utc']

In [8]:
def score(x, thresh=None):
    """Return 1 if a comment count is at or above the threshold, else 0.

    Parameters
    ----------
    x : number
        Comment count for one post.
    thresh : number, optional
        Cut-off value. When omitted, the median of df_e['num_comments'] is
        computed on the spot (the original behaviour); pass it explicitly
        when scoring many values to avoid recomputing the median each time.

    Returns
    -------
    int
        1 when x >= thresh, otherwise 0.
    """
    if thresh is None:
        thresh = df_e['num_comments'].median()
    return 1 if x >= thresh else 0


# Compute the median once instead of once per row inside apply()
median_comments = df_e['num_comments'].median()
df_e['class'] = df_e['num_comments'].apply(lambda n: score(n, median_comments))

In [9]:
# Median comment count across posts; `score` uses this same median as the
# cut-off between class 0 and class 1.
df_e['num_comments'].median()

169.0

In [10]:
# Summary statistics for `elapsed`, i.e. each row's distance from the largest
# `created_utc` in the table (units follow created_utc; presumably seconds,
# to be confirmed against the data source).
df_e['elapsed'].describe()

count    7.000000e+03
mean     5.668239e+05
std      3.483431e+05
min      0.000000e+00
25%      2.637520e+05
50%      5.666920e+05
75%      8.688822e+05
max      1.133614e+06
Name: elapsed, dtype: float64

In [11]:
# Credit to Harsha G. for thinking of engineering a title length feature!

# Number of whitespace-separated tokens in each title. `.str.len()` measures
# the list produced by `.str.split()` and yields NaN for a missing title,
# whereas `.map(len)` would raise a TypeError on it.
df_e['title_length'] = df_e['title'].str.split().str.len()

In [12]:
# Thanks to Ben for suggesting this feature engineering in class

# 1 when the title contains a literal question mark, 0 otherwise
df_e['feature_question'] = (
    df_e['title']
    .str.contains("?", regex=False)
    .map({True: 1, False: 0})
)

df_e["feature_question"].value_counts()

0    5790
1    1210
Name: feature_question, dtype: int64

In [13]:

# 1 when the title contains a literal exclamation mark, 0 otherwise
df_e['feature_exclaim'] = (
    df_e['title']
    .str.contains("!", regex=False)
    .map({True: 1, False: 0})
)

df_e["feature_exclaim"].value_counts()

0    6521
1     479
Name: feature_exclaim, dtype: int64

## B. Cleaning the Text

In [14]:
# Function adapted from General Assembly Notebook for NLP

# English stop words, built from NLTK on first use and reused afterwards so
# the set is not rebuilt for every row the function is applied to.
_ENGLISH_STOPWORDS = None


def review_to_words(raw_review, stops=None):
    """Reduce raw text to a space-separated string of lowercase words.

    Parameters
    ----------
    raw_review : str
        Text to clean.
    stops : collection of str, optional
        Words to discard. Defaults to NLTK's English stop-word list, which
        is converted to a set once and cached at module level.

    Returns
    -------
    str
        The remaining words joined by single spaces (possibly empty).
    """
    global _ENGLISH_STOPWORDS

    if stops is None:
        if _ENGLISH_STOPWORDS is None:
            # Membership tests on a set are much faster than on a list
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stops = _ENGLISH_STOPWORDS

    # 1. Replace every non-letter character with a space
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review)

    # 2. Convert to lower case and split into individual words
    words = letters_only.lower().split()

    # 3. Remove stop words and rejoin
    return " ".join(w for w in words if w not in stops)

In [15]:
# Replace each title with its cleaned form (letters only, lowercase, stop words removed)
df_e['title'] = df_e['title'].map(review_to_words)



In [16]:
# Clean the evolution-debate text with the same routine
evol_debate['body'] = evol_debate['body'].map(review_to_words)

In [17]:
# Clean the abortion-debate text with the same routine
abort_debate['body'] = abort_debate['body'].map(review_to_words)

In [18]:
# Discard every message whose cleaned text contains "removed": the classifier
# should not learn from how heavily Reddit's automated troll removal is
# applied in each subreddit.

evol_debate = evol_debate.loc[
    lambda frame: ~frame['body'].str.contains("removed")
]
abort_debate = abort_debate.loc[
    lambda frame: ~frame['body'].str.contains("removed")
]



In [19]:
# Persist the three prepared tables
df_e.to_pickle(path='./general_pickle')
evol_debate.to_pickle(path='./evol_pickle')
abort_debate.to_pickle(path='./abort_pickle')


In [20]:
# Export the cleaned text of class 0 (prochoice) from the abortion debate
abort_debate.loc[abort_debate['class'] == 0, 'body'].to_csv('./prochoice_text.csv')

In [21]:
# Export the cleaned text of class 1 (prolife) from the abortion debate
abort_debate.loc[abort_debate['class'] == 1, 'body'].to_csv('./prolife_text.csv')

In [22]:
# Export the cleaned text of class 0 (DebateEvolution) from the evolution debate
evol_debate.loc[evol_debate['class'] == 0, 'body'].to_csv('./evolution_text.csv')

In [23]:
# Export the cleaned text of class 1 (Creation) from the evolution debate
evol_debate.loc[evol_debate['class'] == 1, 'body'].to_csv('./creation_text.csv')