# Import data and libraries

In [1]:
import pandas as pd
import re

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

Read and concat datasets

In [2]:
# Read the two CSV files available for each subreddit.
gainit = pd.read_csv('../datasets/gainit_raw.csv')
gainit2 = pd.read_csv('../datasets/gainit_raw2.csv')
loseit = pd.read_csv('../datasets/loseit_raw.csv')
loseit2 = pd.read_csv('../datasets/loseit_raw2.csv')

# Stack each pair of files into a single frame with a fresh 0..n-1 index.
gainit, loseit = (
    pd.concat(parts, ignore_index=True)
    for parts in ([gainit, gainit2], [loseit, loseit2])
)

# Data Cleaning

## Cleaning before train-test split

Keep only columns that we're interested in (`subreddit`, `selftext` and `title`)

In [3]:
# Columns retained for modelling; everything else is dropped.
keep_cols = ['subreddit', 'selftext', 'title']

# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas 2.0 no longer accepts. Rebinding instead of `inplace=True` keeps the
# cell safe to re-run and preserves the original column order.
gainit = gainit.drop(columns=gainit.columns.difference(keep_cols))
loseit = loseit.drop(columns=loseit.columns.difference(keep_cols))

Drop duplicates

In [4]:
# Keep only the first row for each distinct `selftext` value, per subreddit.
gainit = gainit.drop_duplicates(subset='selftext')
loseit = loseit.drop_duplicates(subset='selftext')

Create a dataset with combined data

In [5]:
# Stack both subreddits into one frame; ignore_index=True renumbers rows from 0.
full = pd.concat([gainit, loseit], ignore_index=True)

Remove null values for `selftext`

In [6]:
# Count the rows whose `selftext` is missing.
full['selftext'].isnull().sum()

1

In [7]:
# Keep only rows that have a post body. The explicit .copy() makes `full` an
# independent frame rather than a slice of the previous one, so the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
# The original row labels are kept (no index reset).
full = full.loc[full['selftext'].notnull()].copy()

Create a `combined` feature by combining `title` and `selftext` so that we capture both sets of data points

In [8]:
# Element-wise string concatenation: title, a single space, then the post body.
# NOTE(review): a missing `title` would make `combined` NaN for that row —
# confirm titles are never null in the raw data.
full['combined'] = full['title'] + " " + full['selftext']

In [9]:
# (rows, columns) of the combined frame after cleaning.
full.shape

(1441, 4)

Map `y` targets to `1` and `0`

In [10]:
# Binary target encoding: loseit -> 0, gainit -> 1.
# Series.map returns NaN for values missing from the dict, so run this cell
# only once per kernel session (a second run would see 0/1, not the names).
label_map = {'loseit': 0, 'gainit': 1}
full['subreddit'] = full['subreddit'].map(label_map)

Create separate datasets for `gainit` and `loseit` for EDA purposes in the next workbook

In [None]:
# Select the combined text of each class with a single .loc lookup
# (1 = gainit, 0 = loseit, per the mapping applied to `subreddit`).
gainit_eda = full.loc[full['subreddit'] == 1, 'combined']
loseit_eda = full.loc[full['subreddit'] == 0, 'combined']

## Train-test split

In [11]:
# Features are the combined title + body text; the target is the encoded subreddit.
X, y = full['combined'], full['subreddit']

Check baseline score for model to beat (50%)

In [12]:
# Share of each class in the target; the majority share is the baseline accuracy.
y.value_counts(normalize=True)

1    0.500347
0    0.499653
Name: subreddit, dtype: float64

In order to measure the effectiveness of our classifier model, we want to beat the baseline score which is 50%.

We do a train-test split to hold out "unseen" test data on which our model can be evaluated.

In [13]:
# Hold out 33% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split reproducible.
splits = train_test_split(
    X, y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = splits

In [14]:
# Number of documents in the training split.
X_train.shape

(965,)

In [15]:
# Number of documents in the held-out test split.
X_test.shape

(476,)

## Clean words in comments

Create a function that takes in a string and cleans it. For our model, I compared the Lemmatizer against the Porter Stemmer, and the Lemmatizer gave a slightly better accuracy score.

In [16]:
# Compiled once so every call reuses the same pattern object.
_URL_PATTERN = re.compile(
    r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', flags=re.MULTILINE)

# Filled on first use so the lemmatizer and stop-word set are built once
# instead of on every call.
_CLEANING_CACHE = {}


def _cleaning_resources():
    """Return the shared (lemmatizer, stop-word set), creating them on first use."""
    if not _CLEANING_CACHE:
        _CLEANING_CACHE['lemmatizer'] = WordNetLemmatizer()
        _CLEANING_CACHE['stops'] = frozenset(stopwords.words('english'))
    return _CLEANING_CACHE['lemmatizer'], _CLEANING_CACHE['stops']


def comments_to_words(raw_comment):
    """Normalise one raw post into a space-separated string of lemmas.

    Steps: strip URLs, replace every non-letter character with a space,
    lower-case, split on whitespace, remove English stop-words, lemmatize,
    then remove any lemma that is itself a stop-word.

    Parameters
    ----------
    raw_comment : str
        The raw text of a single post.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (empty string if nothing
        survives the cleaning).
    """
    lemmatizer, stops = _cleaning_resources()

    # Remove URL from content
    review_text = _URL_PATTERN.sub('', raw_comment)

    # Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # Convert to lower case, split into individual words.
    words = letters_only.lower().split()

    # Remove stop-words BEFORE lemmatizing: the WordNet lemmatizer treats
    # words as nouns by default and turns e.g. "was" into "wa" and "has" into
    # "ha", which would no longer match the stop-word list afterwards.
    content_words = [w for w in words if w not in stops]

    # Lemmatize words
    lemmatized = [lemmatizer.lemmatize(w) for w in content_words]

    # Second pass: drop lemmas that have collapsed into a stop-word.
    meaningful_words = [w for w in lemmatized if w not in stops]

    return " ".join(meaningful_words)

Loop through comments and call on cleaning function

In [17]:
# Run the cleaning function over every document of the train and test splits.
clean_train_comments = [comments_to_words(doc) for doc in X_train]
clean_test_comments = [comments_to_words(doc) for doc in X_test]

# Same cleaning applied to the per-subreddit EDA series.
gainit_eda_comments = [comments_to_words(doc) for doc in gainit_eda]
loseit_eda_comments = [comments_to_words(doc) for doc in loseit_eda]

Vectorize both datasets for word cloud EDA later

In [18]:
# scikit-learn's built-in English stop-words plus the two subreddit names.
# Converted to a list because recent scikit-learn releases validate
# `stop_words` and reject a frozenset; sorted only to make the order stable.
eda_stop_words = sorted(text.ENGLISH_STOP_WORDS.union(['gainit', 'loseit']))


def _term_count_frame(vectorizer, docs):
    """Fit `vectorizer` on `docs` and return (sparse counts, dense DataFrame).

    The DataFrame has one row per document and one column per vocabulary term.
    """
    counts = vectorizer.fit_transform(docs)
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when present and fall back to the old name on older installs.
    get_names = (getattr(vectorizer, 'get_feature_names_out', None)
                 or vectorizer.get_feature_names)
    # toarray() yields a plain ndarray (todense() returns the legacy np.matrix).
    return counts, pd.DataFrame(counts.toarray(), columns=get_names())


# NOTE(review): the raw `gainit_eda` / `loseit_eda` series are vectorized here,
# not the cleaned `gainit_eda_comments` / `loseit_eda_comments` lists built in
# the previous cell — confirm which input the word clouds should use.
cvec = CountVectorizer(stop_words=eda_stop_words)
gainit_cvec, gainit_cvec_eda = _term_count_frame(cvec, gainit_eda)

cvec2 = CountVectorizer(stop_words=eda_stop_words)
loseit_cvec, loseit_cvec_eda = _term_count_frame(cvec2, loseit_eda)

Convert to dataframes and export datasets

In [19]:
# Wrap the cleaned text lists in single-column frames so they can be written as CSV.
df_clean_train_comments = pd.DataFrame({'comments': clean_train_comments})
df_clean_test_comments = pd.DataFrame({'comments': clean_test_comments})

# (object, destination) pairs, written in this order without the index column.
exports = [
    (df_clean_train_comments, '../datasets/X_train_clean.csv'),
    (df_clean_test_comments, '../datasets/X_test_clean.csv'),
    (y_train, '../datasets/y_train.csv'),
    (y_test, '../datasets/y_test.csv'),
    (full, '../datasets/full.csv'),
    (gainit_cvec_eda, '../datasets/gainit_cvec_eda.csv'),
    (loseit_cvec_eda, '../datasets/loseit_cvec_eda.csv'),
    (gainit_eda, '../datasets/gainit_eda.csv'),
    (loseit_eda, '../datasets/loseit_eda.csv'),
]
for dataset, csv_path in exports:
    dataset.to_csv(csv_path, index=False)