# ADA BOOST CLASSIFIER
I used a decision tree with a boost classifier. I had to try various settings in my parameter grid before finding an acceptable fit. My baseline score is %50. I made a concious choice to keep my classes balanced. This model seemed to fit the data most accurately. It scored an accuracy of about 68 percent which seems reasonable with volume of data.

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix
import re
import pickle



In [4]:
combined = pd.read_pickle('../data/combined.pkl')
combined.columns

Index(['author', 'author_cakeday', 'author_flair_background_color',
       'author_flair_css_class', 'author_flair_richtext',
       'author_flair_template_id', 'author_flair_text',
       'author_flair_text_color', 'author_flair_type', 'author_fullname',
       'author_id', 'body', 'created_utc', 'distinguished', 'edited', 'id',
       'link_id', 'no_follow', 'parent_id', 'permalink', 'retrieved_on',
       'rte_mode', 'score', 'send_replies', 'stickied', 'subreddit',
       'subreddit_id'],
      dtype='object')

In [None]:
X_train = pd.read_pickle('../data/X_train.pkl')
X_test = pd.read_pickle('../data/X_test.pkl')
y_train = pd.read_pickle('../data/y_train.pkl')
y_test = pd.read_pickle('../data/y_test.pkl')

# Setting up my Pipeline
The tfidf vec is breaking the words into single tokens and bigrams.

In [10]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english',ngram_range=(1,2))),
    ('ada',AdaBoostClassifier()),
    
])


# Setting up my param grid

In [11]:
param_grid =  {
    'tfidf__min_df': np.arange(1,4,2),
    'tfidf__max_df': [.95, .98, 1.0],
    'ada__n_estimators':[10,20,30,60,70,80]
    
}

In [12]:
gs = GridSearchCV(pipe, param_grid=param_grid,verbose=1)

In [13]:
gs.fit(X_train,y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed: 139.8min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
 ...m='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'tfidf__min_df': array([1, 3]), 'tfidf__max_df': [0.95, 0.98, 1.0], 'ada__n_estimators': [10, 20, 30, 60, 70, 80]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=1)

In [2]:
gs.score(X_train,y_train)

In [15]:
gs.score(X_test,y_test)

0.6425261982252393

In [16]:
X_test.shape

(114607,)

In [24]:
combined['subreddit'].value_counts()/combined.shape[0]

Libertarian            0.527031
LateStageCapitalism    0.472969
Name: subreddit, dtype: float64

In [18]:
pred_df = pd.DataFrame(gs.predict_proba(X_test))

In [19]:
pred_df[(pred_df[1]<.45) | (pred_df[1]>.55)]

Unnamed: 0,0,1
726,0.387035,0.612965
1506,0.389576,0.610424
1890,0.389576,0.610424
2085,0.387265,0.612735
3765,0.376598,0.623402
4636,0.391835,0.608165
5158,0.394652,0.605348
5831,0.391785,0.608215
5845,0.389576,0.610424
6933,0.389576,0.610424


In [20]:
gs.best_score_

0.6412715863791724

In [21]:
gs.best_estimator_

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
...m='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=80, random_state=None))])

382021

In [22]:
gs.predict(["Worth noting the evolution of Trump's Syria policy is very similar to what happened with Afghanistan. In both places, Trump instinctively wanted to wind down U.S. military presence -- foreign policy aides convinced him not only to maintain status quo, but expand the mission."])

array(['Libertarian'], dtype='<U19')

In [35]:
with open('../assets/ada_boost.pkl','wb+') as f:
    pickle.dump(gs,f)