In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

%matplotlib inline

# importing the packages i'll likely use

In [2]:
reddit_posts_df = pd.read_csv('../data/cleansed_data.csv').drop(columns='Unnamed: 0')

# reading in my data and dropping the unnamed axis

In [3]:
reddit_posts_df.head()

# checking the head of my dataframe

Unnamed: 0,text,title,target
0,"From maternity photos to bathroom selfies, sho...","Daily Bump Picture Thread - December 19, 2018",1
1,Grab Mr. DeMille and show us dem close ups! Re...,Weekly Ultrasound and Announcement Thread - De...,1
2,thisismissingtext,Every day since my bump appeared,1
3,thisismissingtext,Was requested to share this here,1
4,I’m in the waiting room right now for my ultra...,Finding out the gender of my baby!,1


In [4]:
features = ['text', 'title']
X = reddit_posts_df[features]
y = reddit_posts_df.target

# creating a features list and setting my X equal to it
# setting my Y equal to my target column

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42,
                                                    test_size=0.50,
                                                    stratify=y)

# instantiating my train test split
# setting a random state for reproducibility and stratifying my Y because the class is slightly unbalanced

In [6]:
#cv_text = CountVectorizer(stop_words='english', strip_accents = 'ascii')
#cv_title = CountVectorizer(stop_words='english', strip_accents = 'ascii')

# instantiating a CountVectorizer on both my text and title features, removing english stop words and stripping ascii accents

In [7]:
#cv_text = CountVectorizer(stop_words='english', strip_accents = 'ascii', min_df=.20, max_df= .95)
#cv_title = CountVectorizer(stop_words='english', strip_accents = 'ascii')

# instantiating a CountVectorizer on both my text and title features, removing english stop words and stripping ascii accents
# the assumption was that post texts would have more noise so i set min_df and max_df to help clean it up some

In [8]:
#cv_text = CountVectorizer(stop_words='english', strip_accents = 'ascii', min_df=.20, max_df= .95)
#cv_title = CountVectorizer(stop_words='english', strip_accents = 'ascii', ngram_range=(1, 2))

# instantiating a CountVectorizer on both my text and title features, removing english stop words and stripping ascii accents
# the assumption was that post texts would have more noise so i set min_df and max_df to help clean it up some
# the 2nd assumption was that titles were likely more informative & setting an n-gram range of (1, 2) would provide more helpful context

In [9]:
cv_text = CountVectorizer(stop_words='english', strip_accents = 'ascii', ngram_range=(1, 6), min_df=.03)
cv_title = CountVectorizer(stop_words='english', strip_accents = 'ascii', ngram_range=(1, 3), min_df=.01)

# instantiating a CountVectorizer on both my text and title features, removing english stop words and stripping ascii accents
# the assumption was that post texts with an ngram range of (1, 6) would both clean up noise and be more helpful than those with a range of (1, 1)
# i still set a min_df to help clean up noise though it was gentler than previous ones
# the assumption was that titles are likely more informative so setting an n-gram range of (1, 4) might provide even more helpful context
# i also thought setting a gentle min_df would help clean up any further noise

In [10]:
X_train_text = cv_text.fit_transform(X_train.text)
X_train_title = cv_title.fit_transform(X_train.title)

X_test_text = cv_text.transform(X_test.text)
X_test_title = cv_title.transform(X_test.title)

# this will give me two diff matrices one for text and one for title, i'll need to re-combine them before modeling

In [11]:
X_train_text_df = pd.DataFrame(X_train_text.todense(), columns=[x+'_text' for x in cv_text.get_feature_names()])
X_train_text_df.shape

# creating a dataframe with my train post text and checking the shape

(909, 336)

In [12]:
X_train_title_df = pd.DataFrame(X_train_title.todense(), columns=[y+'_title' for y in cv_title.get_feature_names()])
X_train_title_df.shape

# creating a dataframe with my train post titles and checking the shape

(909, 57)

In [13]:
X_test_text_df = pd.DataFrame(X_test_text.todense(), columns=[x+'_text' for x in cv_text.get_feature_names()])
X_test_text_df.shape

# creating a dataframe with my test post text and checking the shape

(909, 336)

In [14]:
X_test_title_df = pd.DataFrame(X_test_title.todense(), columns=[y+'_title' for y in cv_title.get_feature_names()])
X_test_title_df.shape

# creating a dataframe with my test post titles and checking the shape

(909, 57)

In [15]:
vecced_train_reddit_posts = pd.concat([X_train_text_df, X_train_title_df], axis=1)
vecced_test_reddit_posts = pd.concat([X_test_text_df, X_test_title_df], axis=1)

# concatenating my train text and titles back together as well as my test text and titles

In [16]:
vecced_train_reddit_posts.shape

# checking the shape of my newly concatenated train dataframe

(909, 393)

In [17]:
vecced_test_reddit_posts.shape

# checking the shape of my newly concatenated train dataframe

(909, 393)

In [18]:
vecced_train_reddit_posts.isnull().sum().sum()

# double-checking to make sure there are no null values

0

In [19]:
vecced_test_reddit_posts.isnull().sum().sum()

# double-checking to make sure there are no null values

0

# LOGISTIC REGRESSION MODELS

In [20]:
def run_the_lr_models(model):
    
    if model == 'lr_1':
        
        lr_1_params = {
            'penalty': ['l1'],
            'C': [1, 1.5, 2, 2.5],
            'class_weight': ['balanced'],
            'warm_start': [True, False],
            'random_state': [42],
            'solver': ['liblinear']}
        
        M = GridSearchCV(LogisticRegression(),
                        lr_1_params,
                        cv = 5,
                        verbose = 1,
                        n_jobs = -1)
        
    elif model == 'lr_2':
        
        lr_2_params = {
            'penalty': ['l2'],
            'C': [1, 1.5, 2, 2.5],
            'class_weight': ['balanced'],
            'warm_start': [True, False],            
            'random_state': [42],
            'solver': ['lbfgs', 'liblinear']}
        
        M = GridSearchCV(LogisticRegression(),
                        lr_2_params,
                        cv = 5,
                        verbose = 1,
                        n_jobs = -1)

    else:
        print('There is an error.')
        
    M.fit(vecced_train_reddit_posts.values, y_train)
     
    print(f'Train score = {M.score(vecced_train_reddit_posts.values, y_train)}')
    print(f'Test score = {M.score(vecced_test_reddit_posts.values, y_test)}')
    
    predictions = M.predict(vecced_test_reddit_posts.values)
    print('--------')
    print(confusion_matrix(y_test, predictions))
    print(f'Best params = {M.best_params_}')

In [21]:
run_the_lr_models('lr_1')

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Train score = 0.9658965896589659
Test score = 0.9372937293729373
--------
[[368  50]
 [  7 484]]
Best params = {'C': 1, 'class_weight': 'balanced', 'penalty': 'l1', 'random_state': 42, 'solver': 'liblinear', 'warm_start': True}


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    3.9s finished


In [22]:
run_the_lr_models('lr_2')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Train score = 0.9823982398239824
Test score = 0.9306930693069307
--------
[[370  48]
 [ 15 476]]
Best params = {'C': 1.5, 'class_weight': 'balanced', 'penalty': 'l2', 'random_state': 42, 'solver': 'liblinear', 'warm_start': True}


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    3.3s finished


### Explaining the 3 cells above

I defined a function to: 
- GridSearch the best hyperparameters for two Logistic Regresssion Models
- Fit the models
- Print the train and test scores for the models
- Make predictions and print them in a confusion matrix
- Print out the best parameters

So in one cell I call <i>run_the_lr_models('lr_1')</i>  which runs the first logistic regression model and in the next cell I call <i>run_the_lr_models('lr_2')</i>  which runs the second model. The difference between the two models are the penalty. Some hyperparemeters only work with an L1 penalty while some only work with the L2 so I separated them based on that.

# DECISION TREE MODELS

In [23]:
def run_the_dt_models(model):
    
    if model == 'dt_1':
        
        dt_params = {
            'criterion': ['gini'],
            'max_depth': [4, 24, 54],
            'min_samples_split': [5, 7, 11, 14],
            'max_features': [None, 'log2', 'auto', .40, .50, .70],
            'random_state': [42]}
        
        M = GridSearchCV(DecisionTreeClassifier(),
                        dt_params,
                        cv = 5,
                        verbose = 1,
                        n_jobs = -1)
   
    elif model == 'dt_2':
        
        dt_none_params = {
            'criterion': ['entropy'],
            'max_depth': [4, 24, 54],
            'min_samples_split': [5, 7, 11, 14],
            'max_features': [None, 'log2', 'auto', .40, .50, .70],
            'random_state': [42]}
        
        M = GridSearchCV(DecisionTreeClassifier(),
                        dt_none_params,
                        cv = 5,
                        verbose = 1,
                        n_jobs = -1)

    else:
        print('There is an error.')
        
    M.fit(vecced_train_reddit_posts.values, y_train)
     
    print(f'Train score = {M.score(vecced_train_reddit_posts.values, y_train)}')
    print(f'Test score = {M.score(vecced_test_reddit_posts.values, y_test)}')
    
    predictions = M.predict(vecced_test_reddit_posts.values)
    print('--------')
    print(confusion_matrix(y_test, predictions))
    print(f'Best params = {M.best_params_}')

In [24]:
run_the_dt_models('dt_1')

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed:    3.4s


Train score = 0.9746974697469747
Test score = 0.8921892189218922
--------
[[356  62]
 [ 36 455]]
Best params = {'criterion': 'gini', 'max_depth': 54, 'max_features': None, 'min_samples_split': 5, 'random_state': 42}


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    8.0s finished


In [25]:
run_the_dt_models('dt_2')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Done 172 tasks      | elapsed:    3.6s


Train score = 0.9658965896589659
Test score = 0.8976897689768977
--------
[[353  65]
 [ 28 463]]
Best params = {'criterion': 'entropy', 'max_depth': 54, 'max_features': 0.4, 'min_samples_split': 11, 'random_state': 42}


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    8.0s finished


### Explaining the 3 cells above

I defined a function to: 
- GridSearch the best hyperparameters for two Decision Tree Models
- Fit the models
- Print the train and test scores for the models
- Make predictions and print them in a confusion matrix
- Print out the best parameters

So in one cell I call <i> run_the_dt_models('dt_1') </i>  which runs the first decision tree model and in the next cell I call <i>run_the_dt_models('dt_2')</i>  which runs the second model. The difference between the two models are the criterion, I wanted one model that focused on gini and the other on entropy.

# RANDOM FOREST MODELS

In [26]:
def run_the_rf_models(model):
    
    if model == 'rf_1':
        
        rf_params = {
            'n_estimators': [15, 24, 30],
            'criterion': ['gini'],
            'max_depth': [None, 5, 13, 21],
            'bootstrap': [True, False],
            'min_samples_split': [5, 7, 15, 25],
            'max_features': [None, 'log2', 'auto', .10, .25, .50],
            'warm_start': [True],
            'random_state': [42]}
        
        M = GridSearchCV(RandomForestClassifier(),
                        rf_params,
                        cv = 5,
                        verbose = 1,
                        n_jobs = -1)
        
    elif model == 'rf_2':
        
        rf_none_params = {
            'n_estimators': [15, 24, 30],
            'criterion': ['entropy'],
            'max_depth': [None, 5, 13, 21],
            'bootstrap': [True, False],
            'min_samples_split': [5, 7, 15, 25],
            'max_features': [None, 'log2', 'auto',  .10, .25, .50],
            'warm_start': [True],
            'random_state': [42]}
        
        M = GridSearchCV(RandomForestClassifier(),
                        rf_none_params,
                        cv = 5,
                        verbose = 1,
                        n_jobs = -1)
        
    else:
        print('There is an error.')
        
    M.fit(vecced_train_reddit_posts.values, y_train)
     
    print(f'Train score = {M.score(vecced_train_reddit_posts.values, y_train)}')
    print(f'Test score = {M.score(vecced_test_reddit_posts.values, y_test)}')
    
    predictions = M.predict(vecced_test_reddit_posts.values)
    print('--------')
    print(confusion_matrix(y_test, predictions))
    print(f'Best params = {M.best_params_}')

In [27]:
run_the_rf_models('rf_1')

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done 451 tasks      | elapsed:   54.2s
[Parallel(n_jobs=-1)]: Done 1126 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 2126 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 2776 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed:  8.0min finished


Train score = 0.9812981298129813
Test score = 0.935093509350935
--------
[[368  50]
 [  9 482]]
Best params = {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'min_samples_split': 25, 'n_estimators': 24, 'random_state': 42, 'warm_start': True}


In [28]:
run_the_rf_models('rf_2')

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done 451 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 947 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1397 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 1996 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 2738 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed:  7.9min finished


Train score = 0.9834983498349835
Test score = 0.933993399339934
--------
[[374  44]
 [ 16 475]]
Best params = {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 'log2', 'min_samples_split': 5, 'n_estimators': 24, 'random_state': 42, 'warm_start': True}


### Explaining the 3 cells above

I defined a function to: 
- GridSearch the best hyperparameters for two Random Forest Models
- Fit the models
- Print the train and test scores for the models
- Make predictions and print them in a confusion matrix
- Print out the best parameters

So in one cell I call <i>run_the_rf_models('rf_1')</i>  which runs the first random forest model and in the next cell I call <i>run_the_rf_models('rf_2')</i>  which runs the second model. The difference between the two models are the criterion, I wanted one model that focused on gini and the other on entropy.

# MULTINOMIALNB MODELS

In [29]:
def run_the_mn_models(model):
    
    if model == 'mn_1':
        
        mn_params = {
            'fit_prior': [True],
            'alpha': [0, 0.5, 1]}
        
        M = GridSearchCV(MultinomialNB(),
                        mn_params,
                        cv = 5,
                        verbose = 1,
                        n_jobs = -1)
        
    elif model == 'mn_2':
        
         mn_params = {
            'fit_prior': [False],
            'alpha': [0, 0.5, 1]}
        
         M = GridSearchCV(MultinomialNB(),
                        mn_params,
                        cv = 5,
                        verbose = 1,
                        n_jobs = -1)
        
    else:
        print('There is an error.')
        
    M.fit(vecced_train_reddit_posts.values, y_train)
     
    print(f'Train score = {M.score(vecced_train_reddit_posts.values, y_train)}')
    print(f'Test score = {M.score(vecced_test_reddit_posts.values, y_test)}')
    
    predictions = M.predict(vecced_test_reddit_posts.values)
    print('--------')
    print(confusion_matrix(y_test, predictions))
    print(f'Best params = {M.best_params_}')

In [30]:
run_the_mn_models('mn_1')

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Train score = 0.9262926292629263
Test score = 0.9240924092409241
--------
[[399  19]
 [ 50 441]]
Best params = {'alpha': 0, 'fit_prior': True}


[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    0.4s finished
  'setting alpha = %.1e' % _ALPHA_MIN)


In [31]:
run_the_mn_models('mn_2')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 3 candidates, totalling 15 fits
Train score = 0.9240924092409241
Test score = 0.9218921892189219
--------
[[401  17]
 [ 54 437]]
Best params = {'alpha': 0, 'fit_prior': False}


[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    0.4s finished
  'setting alpha = %.1e' % _ALPHA_MIN)


### Explaining the 3 cells above

I defined a function to: 
- GridSearch the best hyperparameters for two Multinomial NB Models
- Fit the models
- Print the train and test scores for the models
- Make predictions and print them in a confusion matrix
- Print out the best parameters

So in one cell I call <i>run_the_mn_models('mn_1')</i>  which runs the first multinomial nb model and in the next cell I call <i>run_the_mn_models('mn_2')</i>  which runs the second model. The difference between the two models are fit_prior, with one model having it set equal to True and the other to False.