In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline
import time

### Dataframes

I have three testing data frames from different reddits.  I can pull in each of them and tune them with the different models by changing which csv file I bring in.

In [114]:
df = pd.read_csv('./data/space_data.csv')
#df = pd.read_csv('./data/combined_data.csv')
#df = pd.read_csv('./data/entertainment.csv')

In [115]:
df.shape


(1351, 2)

## Train Test Split

It is important that we split the data before we start training our model or create our vectorization.  When applying the model it will have no concept of the word from the original model so the data needs to be split and fit only on the training data.

In [116]:
X = df.text
y = df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

In [117]:
X_train.shape

(1013,)

In [118]:
X_test.shape

(338,)

## Count Vectorization

In [119]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words


In [120]:
cvec = CountVectorizer()


### Random Forest model

In [121]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [128]:
%%time
rf = RandomForestClassifier()
rf_params = {
    'cvec__stop_words'   : ['english', None], 
    'cvec__max_features' : [ 50,100,300],
    'cvec__ngram_range'  : [(1, 1), (1,2)],
    'rf__n_estimators'   : [12,14],
    'rf__max_depth'      : [12, 14]
    
}
pipe1 = Pipeline([
    ('cvec', cvec),
    ('rf',   rf)
])

gs = GridSearchCV(pipe1, param_grid= rf_params)
gs.fit(X_train, y_train)
print(gs.best_score_)

0.9032576505429417
CPU times: user 46.9 s, sys: 224 ms, total: 47.1 s
Wall time: 47.2 s


In [129]:
gs.score(X_test, y_test)

0.8964497041420119

In [130]:
gs.best_params_

{'cvec__max_features': 100,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'rf__max_depth': 14,
 'rf__n_estimators': 14}

Best model Books : Features 100,  ngram (1,2), stopwords english, max depth 13 estimators 16

cross val .941 , test .950  Baseline Accuracy Books .621

Space: F

### Extra Tree

In [51]:
et = ExtraTreesClassifier()
et_params = {
    'cvec__stop_words'       : ['english', None], 
    'cvec__max_features'     : [ 100, 300, 500, ],
    'cvec__ngram_range'      : [(1, 1), (1,2)],
    'et__n_estimators'       : [6,8, 10],
    'et__max_depth'          : [16, 18, 20]
}


In [48]:
%%time
pipe2 = Pipeline([
    ('cvec', cvec),
    ('et',   et)
])

gs = GridSearchCV(pipe2, param_grid= et_params)
gs.fit(X_train, y_train)
print(gs.best_score_)

0.9085439229843562
CPU times: user 49 s, sys: 179 ms, total: 49.2 s
Wall time: 49.2 s


In [52]:
gs.score(X_test, y_test)

0.8812949640287769

In [53]:
gs.best_params_

{'cvec__max_features': 300,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'et__max_depth': 18,
 'et__n_estimators': 10}

### Naive Bayes

In [54]:
from sklearn.naive_bayes import MultinomialNB


In [59]:
nb = MultinomialNB()
nb_params = {
    'cvec__stop_words'   : ['english', None], 
    'cvec__max_features' : [300, 500, 700],
    'cvec__ngram_range'  : [(1, 1), (1,2)]
}


In [60]:

pipe3 = Pipeline([
    ('cvec', cvec),
    ('nb',   nb)
])

gs = GridSearchCV(pipe3, param_grid= nb_params)
gs.fit(X_train, y_train)
print(gs.best_score_)

0.9494584837545126


In [57]:
gs.score(X_test, y_test)

0.935251798561151

In [58]:
gs.best_params_

{'cvec__max_features': 500,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': None}

Books model features 500, ngram 1,1 , stopwords none
cross val .949 test .935

### K Nearest Neighbors

In [61]:
from sklearn.neighbors import KNeighborsClassifier

In [67]:
knn = KNeighborsClassifier()
knn_params = {
    'cvec__stop_words'   : ['english', None], 
    'cvec__max_features' : [ 900, 1000, 1100],
    'cvec__ngram_range'  : [(1, 1), (1,2)],
    'knn__n_neighbors'   : [3,5,7]
}

In [68]:
pipe4 = Pipeline([
    ('cvec', cvec),
    ('knn',   knn)
])

gs = GridSearchCV(pipe4, param_grid= knn_params)
gs.fit(X_train, y_train)
print(gs.best_score_)


0.855595667870036


In [69]:
gs.score(X_test, y_test)

0.8776978417266187

In [70]:
gs.best_params_

{'cvec__max_features': 1000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'knn__n_neighbors': 5}

Books data for knn max features 100, ngram 1,1, stopwords english, knn =5
accuracy bad at .856 cross val and .878 test group


### Logistic Regression

In [71]:
from sklearn.linear_model import LogisticRegression


In [72]:
lr = LogisticRegression()
lr_params = {
    'cvec__stop_words'   : ['english', None], 
    'cvec__max_features' : [200,300, 500],
    'cvec__ngram_range'  : [(1, 1), (1,2)],
    'lr__penalty'        : ['l1', 'l2']
}

In [73]:
pipe5 = Pipeline([
    ('cvec', cvec),
    ('lr',   lr)
])

gs = GridSearchCV(pipe5, param_grid= lr_params)
gs.fit(X_train, y_train)
print(gs.best_score_)


0.9458483754512635


In [74]:
gs.score(X_test, y_test)

0.9496402877697842

In [75]:
gs.best_params_

{'cvec__max_features': 300,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english',
 'lr__penalty': 'l1'}

In [None]:
features 300, ngram 1,2 stopwords english, lr penalty l1
cross val .946, test .950 accuracy

### SVM Model

In [76]:
from sklearn import svm, linear_model, datasets
from sklearn.model_selection import cross_val_score

In [106]:
sv_m = svm.SVC() 
sv_m_params =  {
    'cvec__stop_words'   : ['english', None], 
    'cvec__max_features' : [ 500, 700, 1000],
    'cvec__ngram_range'  : [(1, 1), (1,2)],
    "sv_m__C"            : [.5, 1, 10],
    "sv_m__kernel"       : ['rbf', 'linear', 'poly']
}



In [107]:
pipe6 = Pipeline([
    ('cvec', cvec),
    ('sv_m',   sv_m)
])

gs = GridSearchCV(pipe6, param_grid= sv_m_params)
gs.fit(X_train, y_train)
print(gs.best_score_)

0.9253910950661853


In [108]:
gs.score(X_test, y_test)

0.9496402877697842

In [109]:
gs.best_params_

{'cvec__max_features': 700,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'sv_m__C': 0.5,
 'sv_m__kernel': 'linear'}

In [None]:
boooks features 700, ngram 1,1 stopwords english c = .5 linear
cross val .925 test .950

### Adaboost Model

In [85]:
from sklearn.ensemble import AdaBoostClassifier

In [110]:
ada = AdaBoostClassifier()
ada_params = {
    'cvec__stop_words'   : ['english', None], 
    'cvec__max_features' : [700, 1000, 1200],
    'cvec__ngram_range'  : [(1, 1)],
    'ada__n_estimators'  : [ 40, 50, 55],
    'ada__learning_rate' : [.7, .8, .9],
    
}


In [111]:
pipe7 = Pipeline([
    ('cvec', cvec),
    ('ada',   ada)
])

gs = GridSearchCV(pipe7, param_grid= ada_params)
gs.fit(X_train, y_train)
print(gs.best_score_)

0.9542719614921781


In [112]:
gs.best_params_

{'ada__learning_rate': 0.8,
 'ada__n_estimators': 50,
 'cvec__max_features': 1000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': None}

In [113]:
gs.score(X_test, y_test)

0.935251798561151

In [None]:
Books features 1000, ngram 1,1 stopwords none, learning rate .8 nestimators 50
crosval .954, test .935

### Voting Classifier

In [104]:
from sklearn.ensemble import VotingClassifier
vote = VotingClassifier()

TypeError: __init__() missing 1 required positional argument: 'estimators'

In [103]:
pipe7 = Pipeline([
    ('cvec', cvec),
    ('ada',   ada)
])

ValueError: could not convert string to float: 'Shows that could have been. Before "The Big Bang Theory", the animated pilot for "Welcome to Eltingville: Bring Me the Head of Boba Fett".Years before "The Big Bang Theory", Evan Dorkin made this animated pilot for Adult Swim, but alas it was not picked up. It\'s still a good geeky watch.\n\nLong live the Eltingville Club!\n\n[Welcome to Eltingville](https://www.youtube.com/watch?v=Lyus5kz8wB0)'

In [102]:
vote.score(X_test, y_test)

NotFittedError: This VotingClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.