#  Pre-processing
---

In [22]:
import pandas as pd
import numpy as np

In [3]:
ask_df = pd.read_csv('../data/ask_historians.csv')
hist_df = pd.read_csv('../data/history.csv')

### Cleaning and Removing Nulls

In [4]:
reddit_df= pd.concat([ask_df, hist_df])

In [5]:
reddit_df.isnull().mean()

subreddit       0.000000
author          0.000000
locked          0.000000
num_comments    0.000000
selftext        0.182866
title           0.000000
timestamp       0.000000
dtype: float64

NaNs are justs with no selftext so no reason to drop and better to fill with empty string

In [6]:
reddit_df.fillna('', inplace=True)

In [7]:
# Creating a column that includes the full text of the post; both the body and title for a self.text posts.

reddit_df['full_text']= reddit_df.eval('title + selftext')

In [8]:
reddit_df.isnull().sum()

subreddit       0
author          0
locked          0
num_comments    0
selftext        0
title           0
timestamp       0
full_text       0
dtype: int64

In [9]:
reddit_df.dtypes

subreddit       object
author          object
locked           int64
num_comments     int64
selftext        object
title           object
timestamp       object
full_text       object
dtype: object

In [10]:
# Adapted from class code, cleans and lemmatizes the text.

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import regex as re
def lem_str(text):
    
    lemmatizer = WordNetLemmatizer()
    
    # Remove non-letters and trailing \n + some number.
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    text = text.replace(r'[\\\n0-9]', ' ')
    
    # Convert to lower case, split/tokenize into individual words.
    words = text.lower().split()
    
    stops = set(stopwords.words('english'))
    
    # Remove stop words.
    meaningful_words = [w for w in words if not w in stops]
    
    lem = [lemmatizer.lemmatize(i) for i in meaningful_words]
    
    return" ".join(lem)

In [11]:
reddit_df['full_text_clean']= reddit_df['full_text'].map(lem_str)

In [12]:
reddit_df.shape

(3992, 9)

In [27]:
lengths_of_posts= np.array([len(post.split()) for post in reddit_df['full_text_clean']])
lengths_of_posts.mean()

47.70390781563126

**Average length of posts across all subreddits:** $47.7$ words per post

Defining my postive and negative class.  AskHistorians is positive and History is negative.

In [12]:
reddit_df['subreddit']= reddit_df['subreddit'].map({'AskHistorians': 1,
                           'history': 0})

In [13]:
reddit_df.head()

Unnamed: 0,subreddit,author,locked,num_comments,selftext,title,timestamp,full_text,full_text_clean
0,1,AutoModerator,0,84,"[Previous](/r/AskHistorians/search?q=title%3A""...",Sunday Digest | Interesting &amp; Overlooked P...,2019-07-07 14:04:52,Sunday Digest | Interesting &amp; Overlooked P...,sunday digest interesting amp overlooked post ...
1,1,AutoModerator,0,1,[Previous weeks!](/r/AskHistorians/search?sort...,"Short Answers to Simple Questions | July 10, 2019",2019-07-10 14:05:16,"Short Answers to Simple Questions | July 10, 2...",short answer simple question july 10 2019 prev...
2,1,tiikerinsilma,0,26,I'm asking this partially because the atrociti...,"(WW2) Did Japan have genocidal plans for Asia,...",2019-07-10 09:09:07,"(WW2) Did Japan have genocidal plans for Asia,...",ww2 japan genocidal plan asia war asking parti...
3,1,Mr_Quinn,0,10,,"In 1627 the last aurochs, or wild cow, died in...",2019-07-10 14:00:39,"In 1627 the last aurochs, or wild cow, died in...",1627 last aurochs wild cow died jaktor w fores...
4,1,Erezen,0,14,"Moreover, how was the movie received in South ...","""The Gods Must Be Crazy"" is a beloved South Af...",2019-07-09 20:36:09,"""The Gods Must Be Crazy"" is a beloved South Af...",god must crazy beloved south african movie rel...


In [14]:
reddit_df['subreddit'].value_counts()

1    1996
0    1996
Name: subreddit, dtype: int64

In [15]:
reddit_df.to_csv('../data/reddit_df.csv', index=False)