Given that interpretation has been conducted with the count vectorizer and logistic regression, this notebook will focus solely on optimizing predictive performance by incorporating multiple classification models and combining the outputs of these models together. 

In [1]:
# importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

  from numpy.core.umath_tests import inner1d


In [2]:
train_text = pd.read_csv('./datasets/cleaned_donald_chappo_text.csv', index_col=0)
train_text.dropna(inplace=True)
train_text.head()

Unnamed: 0,author,body,date,score,subreddit
0,nixfu,no completed deals not doing business he w...,2018-11-30,1,The_Donald
1,Crypulous,trump has been amazing on a lot of things but...,2018-11-30,1,The_Donald
2,Dueler312,actually fox news did show it,2018-11-30,1,The_Donald
3,soberlight,the excuse about being concerned about their r...,2018-11-30,1,The_Donald
4,enterthewalrus,well arizona did not get martha mcsally but it...,2018-11-30,1,The_Donald


In [3]:
train_text.shape

(57710, 5)

In [4]:
X = train_text['body']
y = train_text['subreddit'].map(lambda x: 1 if x == 'The_Donald' else 0)

In [5]:
# baseline score is approximately 51%
y.value_counts(normalize=True)

1    0.511159
0    0.488841
Name: subreddit, dtype: float64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .05, stratify=y)

---

In [7]:
#building a function that builds a pipeline with the specified vectorizer and model for efficient grid searching

def vect_model(vectorizer, model):    
    vect_dict = {
        'count' : CountVectorizer(), 
        'hash' : HashingVectorizer(non_negative=True),
        'td' : TfidfVectorizer()
    }

    model_dict = {
        'log_reg' : LogisticRegression(), 
        'rand_forest' : RandomForestClassifier(),
        'bayes' : MultinomialNB(), 
        'xgb' : XGBClassifier()
    }

    steps = [
        ('vectorize', vect_dict[vectorizer]),
        ('model', model_dict[model])
    ]

    return Pipeline(steps = steps)

Testing different vectorizors with Naive Bayes

In [8]:
bayes_count_model = vect_model('count', 'bayes')

In [9]:
pipe_params = {
    'vectorize__stop_words': ['english', None],
    'vectorize__ngram_range': [(1,1), (1,2)],
    'vectorize__max_features' : [35000, 40000]
}

grid = GridSearchCV(bayes_count_model, pipe_params, n_jobs=-2, verbose=1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-2)]: Done  24 out of  24 | elapsed:   19.4s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorize', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preproc...zer=None, vocabulary=None)), ('model', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-2,
       param_grid={'vectorize__stop_words': ['english', None], 'vectorize__ngram_range': [(1, 1), (1, 2)], 'vectorize__max_features': [35000, 40000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [10]:
grid.best_params_

{'vectorize__max_features': 35000,
 'vectorize__ngram_range': (1, 1),
 'vectorize__stop_words': None}

In [11]:
grid.best_estimator_.score(X_train, y_train)

0.8067452210710637

In [12]:
grid.best_estimator_.score(X_test, y_test)

0.7297297297297297

In [13]:
bayes_model = grid.best_estimator_

In [14]:
bayes_td_model = vect_model('td', 'bayes')

In [15]:
pipe_params = {
    'vectorize__stop_words': ['english', None],
    'vectorize__ngram_range': [(1,1), (1,2)],
    'vectorize__max_features' : [35000, 40000]
}

grid = GridSearchCV(bayes_td_model, pipe_params, n_jobs=-2, verbose=1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-2)]: Done  24 out of  24 | elapsed:   19.7s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorize', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...e,
        vocabulary=None)), ('model', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-2,
       param_grid={'vectorize__stop_words': ['english', None], 'vectorize__ngram_range': [(1, 1), (1, 2)], 'vectorize__max_features': [35000, 40000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [16]:
grid.best_params_

{'vectorize__max_features': 40000,
 'vectorize__ngram_range': (1, 2),
 'vectorize__stop_words': 'english'}

In [17]:
grid.best_estimator_.score(X_train, y_train)

0.8313877134101854

In [18]:
grid.best_estimator_.score(X_test, y_test)

0.7245322245322245

In [19]:
if grid.best_estimator_.score(X_test, y_test) > bayes_model.score(X_test, y_test):
    bayes_model = grid.best_estimator_
    print('model changed')

The score is identical, so the count vercorizer model is kept as native bayes model

---

Testing different vectorizors with Logistic Regression

In [20]:
log_count_model = vect_model('count', 'log_reg')

In [21]:
pipe_params = {
    'vectorize__stop_words': ['english', None],
    'vectorize__ngram_range': [(1,1), (1,2), (1,3)],
    'vectorize__max_features' : [40000],
    'model__penalty' : ['l1', 'l2'],
    'model__C' : [1, .1]
}

grid = GridSearchCV(log_count_model, pipe_params, n_jobs=-2, verbose=1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:   48.0s
[Parallel(n_jobs=-2)]: Done  72 out of  72 | elapsed:  1.7min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorize', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-2,
       param_grid={'vectorize__stop_words': ['english', None], 'vectorize__ngram_range': [(1, 1), (1, 2), (1, 3)], 'vectorize__max_features': [40000], 'model__penalty': ['l1', 'l2'], 'model__C': [1, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [22]:
grid.best_params_

{'model__C': 0.1,
 'model__penalty': 'l2',
 'vectorize__max_features': 40000,
 'vectorize__ngram_range': (1, 2),
 'vectorize__stop_words': None}

In [23]:
grid.best_estimator_.score(X_train, y_train)

0.8218845760980592

In [24]:
grid.best_estimator_.score(X_test, y_test)

0.7165627165627165

In [25]:
log_reg_model = grid.best_estimator_

In [26]:
log_td_model = vect_model('td', 'log_reg')

In [27]:
pipe_params = {
    'vectorize__stop_words': ['english', None],
    'vectorize__ngram_range': [(1,1), (1,2)],
    'vectorize__max_features' : [35000, 40000],
}

grid = GridSearchCV(log_td_model, pipe_params, n_jobs=-2, verbose=1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-2)]: Done  24 out of  24 | elapsed:   22.0s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorize', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-2,
       param_grid={'vectorize__stop_words': ['english', None], 'vectorize__ngram_range': [(1, 1), (1, 2)], 'vectorize__max_features': [35000, 40000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [28]:
grid.best_params_

{'vectorize__max_features': 40000,
 'vectorize__ngram_range': (1, 2),
 'vectorize__stop_words': None}

In [29]:
grid.best_estimator_.score(X_train, y_train)

0.8272654311980154

In [30]:
grid.best_estimator_.score(X_test, y_test)

0.7300762300762301

In [31]:
if grid.best_estimator_.score(X_test, y_test) > log_reg_model.score(X_test, y_test):
    log_reg_model = grid.best_estimator_
    print('model changed')

model changed


The count vectorizer outperformed the TFIDF vectorizer for logistic regression

---

Testing different vectorizors with XGBoost

In [32]:
import warnings
warnings.filterwarnings("ignore")

In [33]:
xgb_count_model = vect_model('count', 'xgb')

In [34]:
pipe_params = {
    'vectorize__stop_words': [None],
    'vectorize__ngram_range': [(1,1), (1,2)],
    'vectorize__max_features' : [35000, 40000],
    'model__n_estimators' : [300],
    'model__max_depth' : [10]
}

grid = GridSearchCV(xgb_count_model, pipe_params, n_jobs=-2, verbose=1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-2)]: Done  12 out of  12 | elapsed:  7.5min remaining:    0.0s
[Parallel(n_jobs=-2)]: Done  12 out of  12 | elapsed:  7.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorize', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        ...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid=True, n_jobs=-2,
       param_grid={'vectorize__stop_words': [None], 'vectorize__ngram_range': [(1, 1), (1, 2)], 'vectorize__max_features': [35000, 40000], 'model__n_estimators': [300], 'model__max_depth': [10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [35]:
grid.best_params_

{'model__max_depth': 10,
 'model__n_estimators': 300,
 'vectorize__max_features': 35000,
 'vectorize__ngram_range': (1, 2),
 'vectorize__stop_words': None}

In [36]:
grid.best_estimator_.score(X_train, y_train)

0.7552166934189406

In [37]:
grid.best_estimator_.score(X_test, y_test)

0.6794871794871795

In [38]:
xgb_model = grid.best_estimator_

In [39]:
xgb_td_model = vect_model('td', 'xgb')

In [40]:
pipe_params = {
    'vectorize__stop_words': ['english'],
    'vectorize__ngram_range': [(1,1),(1,2)],
    'vectorize__max_features' : [35000, 40000],
    'model__n_estimators' : [300],
    'model__max_depth' : [10]
}

grid = GridSearchCV(xgb_td_model, pipe_params, n_jobs=-2, verbose=1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-2)]: Done  12 out of  12 | elapsed:  7.6min remaining:    0.0s
[Parallel(n_jobs=-2)]: Done  12 out of  12 | elapsed:  7.6min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorize', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid=True, n_jobs=-2,
       param_grid={'vectorize__stop_words': ['english'], 'vectorize__ngram_range': [(1, 1), (1, 2)], 'vectorize__max_features': [35000, 40000], 'model__n_estimators': [300], 'model__max_depth': [10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [41]:
grid.best_params_

{'model__max_depth': 10,
 'model__n_estimators': 300,
 'vectorize__max_features': 40000,
 'vectorize__ngram_range': (1, 2),
 'vectorize__stop_words': 'english'}

In [42]:
grid.best_estimator_.score(X_train, y_train)

0.7419195972566759

In [43]:
grid.best_estimator_.score(X_test, y_test)

0.6677061677061678

In [44]:
if grid.best_estimator_.score(X_test, y_test) > xgb_model.score(X_test, y_test):
    xgb_model = grid.best_estimator_
    print('model changed')

The count vectorizer outperformed the TFIDF vectorizer for XGBoost

---

Testing different models with Random Forest

In [45]:
rf_count_model = vect_model('count', 'rand_forest')

In [46]:
pipe_params = {
    'vectorize__stop_words': [None],
    'vectorize__ngram_range': [(1,1),(1,2)],
    'vectorize__max_features' : [35000, 40000],
     'model__n_estimators' : [405],
     'model__max_depth': [25]
}

grid = GridSearchCV(rf_count_model, pipe_params, verbose=1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  4.0min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorize', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vectorize__stop_words': [None], 'vectorize__ngram_range': [(1, 1), (1, 2)], 'vectorize__max_features': [35000, 40000], 'model__n_estimators': [405], 'model__max_depth': [25]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [47]:
grid.best_params_

{'model__max_depth': 25,
 'model__n_estimators': 405,
 'vectorize__max_features': 35000,
 'vectorize__ngram_range': (1, 1),
 'vectorize__stop_words': None}

In [48]:
grid.best_estimator_.score(X_train, y_train)

0.7026302349336058

In [49]:
grid.best_estimator_.score(X_test, y_test)

0.6507276507276507

In [50]:
rf_model = grid.best_estimator_

In [51]:
rf_td_model = vect_model('td', 'rand_forest')

In [52]:
pipe_params = {
    'vectorize__stop_words': ['english'],
    'vectorize__ngram_range': [(1,1), (1,2)],
    'vectorize__max_features' : [35000, 40000],
    'model__n_estimators' : [410],
    'model__max_depth': [27]
}

grid = GridSearchCV(rf_td_model, pipe_params, n_jobs=-2, verbose=1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-2)]: Done  12 out of  12 | elapsed:   57.7s remaining:    0.0s
[Parallel(n_jobs=-2)]: Done  12 out of  12 | elapsed:   57.7s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorize', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-2,
       param_grid={'vectorize__stop_words': ['english'], 'vectorize__ngram_range': [(1, 1), (1, 2)], 'vectorize__max_features': [35000, 40000], 'model__n_estimators': [410], 'model__max_depth': [27]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [53]:
grid.best_params_

{'model__max_depth': 27,
 'model__n_estimators': 410,
 'vectorize__max_features': 40000,
 'vectorize__ngram_range': (1, 1),
 'vectorize__stop_words': 'english'}

In [54]:
grid.best_estimator_.score(X_train, y_train)

0.6989274770173647

In [55]:
grid.best_estimator_.score(X_test, y_test)

0.6431046431046431

In [56]:
if grid.best_estimator_.score(X_test, y_test) > rf_model.score(X_test, y_test):
    rf_model = grid.best_estimator_
    print('model changed')

The count vectorizer outperformed the TFIDF vectorizer for Random Forest

---

While these accuracies are not unexpected given the similiarity of our subreddits, perhaps they can be improved by looking at the outputs of all our models together. 

In [81]:
# Creating a dataframe that takes the predictions from each model as new training features and counts the votes
predictions_df = pd.DataFrame(index=X_test.index, columns=['bayes', 'log_reg', 'xgb', 'rand_forest'])
predictions_df['bayes'] = bayes_model.predict(X_test)
predictions_df['log_reg'] = log_reg_model.predict(X_test)
predictions_df['xgb'] = xgb_model.predict(X_test)
predictions_df['rand_forest'] = rf_model.predict(X_test)

predictions_df['votes'] = predictions_df.apply(np.sum, axis=1)

predictions_df.head()

Unnamed: 0,bayes,log_reg,xgb,rand_forest,votes
45186,0,0,1,1,2
42746,0,0,0,0,0
17869,1,1,1,1,4
28277,1,1,1,1,4
45028,1,0,1,1,3


In [82]:
# Counting any comment where at least 3 models voted postitve as a positive
y_pred = predictions_df.votes.map(lambda x: 1 if x >= 3 else 0)
accuracy_score(y_test, y_pred)

0.7331947331947332

Slight improvement, but certainly not significant enough to be worth the additional effort. Going forward I would use the Naive Bayes model.

---

For the sake of curiosity we can also try a stacked-model and train a logistic regression on the model outputs and see if this improves accuarcy

In [83]:
# First we need to train the model on the training set predictions
train_predictions_df = pd.DataFrame(index=X_train.index, columns=['bayes', 'log_reg', 'xgb', 'rand_forest'])
train_predictions_df['bayes'] = bayes_model.predict(X_train)
train_predictions_df['log_reg'] = log_reg_model.predict(X_train)
train_predictions_df['xgb'] = xgb_model.predict(X_train)
train_predictions_df['rand_forest'] = rf_model.predict(X_train)

train_predictions_df.head()

Unnamed: 0,bayes,log_reg,xgb,rand_forest
14081,1,1,1,1
50414,1,1,1,1
843,1,1,1,1
10757,0,1,0,0
25646,1,0,0,1


In [85]:
god_log_reg = LogisticRegression()
god_log_reg.fit(train_predictions_df, y_train)

god_log_reg.score(train_predictions_df, y_train)

0.8271195097037793

In [86]:
god_log_reg.score(test_predictions_df.drop(['votes'], axis=1), y_test)

0.7321552321552321

No boost in performance. 