In [139]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline
import time

### Dataframes

I have three testing data frames from different reddits.  I can pull in each of them and tune them with the different models by changing which csv file I bring in.

In [140]:
df = pd.read_csv('./data/space_data.csv')
#df = pd.read_csv('./data/combined_data.csv)
#df = pd.read_csv()

In [141]:
df.shape


(1352, 2)

## Train Test Split

It is important that we split the data before we start training our model or create our vectorization.  When applying the model it will have no concept of the word from the original model so the data needs to be split and fit only on the training data.

In [142]:
X = df.text
y = df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

In [143]:
X_train.shape

(1014,)

In [144]:
X_test.shape

(338,)

## Count Vectorization

In [145]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words


In [146]:
cvec = CountVectorizer()


### Random Forest model

In [147]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [148]:
rf = RandomForestClassifier()
rf_params = {
    'cvec__stop_words'   : ['english', None], 
    'cvec__max_features' : [ 500, 700, 1000],
    'cvec__ngram_range'  : [(1, 1), (1,2)],
    'rf__n_estimators'   : [16, 18, 20],
    'rf__max_depth'      : [12, 14, 16]
    
}
pipe1 = Pipeline([
    ('cvec', cvec),
    ('rf',   rf)
])

gs = GridSearchCV(pipe1, param_grid= rf_params)
gs.fit(X_train, y_train)
print(gs.best_score_)

0.8954635108481263


In [149]:
gs.score(X_test, y_test)

0.8905325443786982

In [150]:
gs.best_params_

{'cvec__max_features': 700,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'rf__max_depth': 14,
 'rf__n_estimators': 18}

### Extra Tree

In [151]:
et = ExtraTreesClassifier()
et_params = {
    'cvec__stop_words'       : ['english', None], 
    'cvec__max_features'     : [ 500, 700, 1000],
    'cvec__ngram_range'      : [(1, 1), (1,2)],
    'et__min_samples_split'  : [2,4,6]
}


In [179]:
%%time
pipe2 = Pipeline([
    ('cvec', cvec),
    ('et',   et)
])

gs = GridSearchCV(pipe2, param_grid= et_params)
gs.fit(X_train, y_train)
print(gs.best_score_)

0.8984220907297831
CPU times: user 16.4 s, sys: 79.3 ms, total: 16.5 s
Wall time: 16.5 s


In [153]:
gs.score(X_test, y_test)

0.9201183431952663

In [154]:
gs.best_params_

{'cvec__max_features': 500,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english',
 'et__min_samples_split': 6}

### Naive Bayes

In [155]:
from sklearn.naive_bayes import MultinomialNB


In [156]:
nb = MultinomialNB()
nb_params = {
    'cvec__stop_words'   : ['english', None], 
    'cvec__max_features' : [200,300, 500, 800],
    'cvec__ngram_range'  : [(1, 1), (1,2)]
}


In [157]:

pipe3 = Pipeline([
    ('cvec', cvec),
    ('nb',   nb)
])

gs = GridSearchCV(pipe3, param_grid= nb_params)
gs.fit(X_train, y_train)
print(gs.best_score_)

0.863905325443787


In [158]:
gs.score(X_test, y_test)

0.8994082840236687

In [159]:
gs.best_params_

{'cvec__max_features': 800,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': None}

### K Nearest Neighbors

In [160]:
from sklearn.neighbors import KNeighborsClassifier

In [161]:
knn = KNeighborsClassifier()
knn_params = {
    'cvec__stop_words'   : ['english', None], 
    'cvec__max_features' : [500, 700, 1000],
    'cvec__ngram_range'  : [(1, 1), (1,2)],
    'knn__n_neighbors'   : [3,5,7]
}

In [162]:
pipe4 = Pipeline([
    ('cvec', cvec),
    ('knn',   knn)
])

gs = GridSearchCV(pipe4, param_grid= knn_params)
gs.fit(X_train, y_train)
print(gs.best_score_)


0.8254437869822485


In [163]:
gs.score(X_test, y_test)

0.8017751479289941

In [164]:
gs.best_params_

{'cvec__max_features': 500,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'knn__n_neighbors': 3}

### Logistic Regression

In [165]:
from sklearn.linear_model import LogisticRegression


In [166]:
lr = LogisticRegression()
lr_params = {
    'cvec__stop_words'   : ['english', None], 
    'cvec__max_features' : [200,300, 500, 800],
    'cvec__ngram_range'  : [(1, 1), (1,2)],
    'lr__penalty'        : ['l1', 'l2']
}

In [167]:
pipe5 = Pipeline([
    ('cvec', cvec),
    ('lr',   lr)
])

gs = GridSearchCV(pipe5, param_grid= lr_params)
gs.fit(X_train, y_train)
print(gs.best_score_)


0.903353057199211


In [168]:
gs.score(X_test, y_test)

0.9142011834319527

In [98]:
gs.best_params_

{'cvec__max_features': 300,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english',
 'lr__penalty': 'l2'}

### SVM Model

In [169]:
from sklearn import svm, linear_model, datasets
from sklearn.model_selection import cross_val_score

In [170]:
sv_m = svm.SVC() 
sv_m_params =  {
    'cvec__stop_words'   : ['english', None], 
    'cvec__max_features' : [ 500, 700, 1000],
    'cvec__ngram_range'  : [(1, 1), (1,2)],
    "sv_m__C"            : [.01,.1,.5],
    "sv_m__kernel"       : ['rbf', 'linear', 'poly']
}



In [171]:
pipe6 = Pipeline([
    ('cvec', cvec),
    ('sv_m',   sv_m)
])

gs = GridSearchCV(pipe6, param_grid= sv_m_params)
gs.fit(X_train, y_train)
print(gs.best_score_)

0.8846153846153846


In [172]:
gs.score(X_test, y_test)

0.9142011834319527

In [173]:
gs.best_params_

{'cvec__max_features': 700,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'sv_m__C': 0.5,
 'sv_m__kernel': 'linear'}

### Adaboost Model

In [174]:
from sklearn.ensemble import AdaBoostClassifier

In [175]:
ada = AdaBoostClassifier()
ada_params = {
    'cvec__stop_words'   : ['english', None], 
    'cvec__max_features' : [ 500, 700, 1000],
    'cvec__ngram_range'  : [(1, 1), (1,2)],
    'ada__n_estimators' : [ 40, 50, 55],
    'ada__learning_rate': [.7, .8, .9],
}


In [176]:
pipe7 = Pipeline([
    ('cvec', cvec),
    ('ada',   ada)
])

gs = GridSearchCV(pipe7, param_grid= ada_params)
gs.fit(X_train, y_train)
print(gs.best_score_)

0.9072978303747534


In [177]:
gs.best_params_

{'ada__learning_rate': 0.8,
 'ada__n_estimators': 40,
 'cvec__max_features': 1000,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english'}

In [178]:
gs.score(X_test, y_test)

0.9319526627218935

### Voting Classifier

In [155]:
from sklearn.ensemble import VotingClassifier

In [None]:
vote = VotingClassifier([
     ('rf', RandomForestClassifier()),
     ('lr', LogisticRegression()),
     ('knn', KNeighborsClassifier()),
     ('sv_m', svm.SVC()),
     ('nb', NaiveBayes()),
])