In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline
import time

### Dataframes

I have three testing data frames from different reddits.  I can pull in each of them and tune them with the different models by changing which csv file I bring in.

In [2]:
#df = pd.read_csv('./data/space_data.csv')
#df = pd.read_csv('./data/combined_data.csv')
df = pd.read_csv('./data/entertainment.csv')

In [3]:
df.shape


(1109, 3)

In [4]:
# also threw in a random target to see how the models would compare
#rand = np.random.randint(2, size = 1351)
#df['target'] = rand
#rand.sum()

## Train Test Split

It is important that we split the data before we start training our model or create our vectorization.  When applying the model it will have no concept of the word from the original model so the data needs to be split and fit only on the training data.

In [5]:
X = df.text
y = df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

In [6]:
X_train.shape

(831,)

In [7]:
X_test.shape

(278,)

## Count Vectorization

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words


In [9]:
cvec = CountVectorizer()


### Random Forest model

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [11]:
%%time
rf = RandomForestClassifier()
rf_params = {
    'cvec__stop_words'   : ['english', None], 
    'cvec__max_features' : [ 500, 700, 900],
    'cvec__ngram_range'  : [(1, 1), (1,2)],
    'rf__n_estimators'   : [ 20,22, 24],
    'rf__max_depth'      : [ 18,  20, 22]
    
}
pipe1 = Pipeline([
    ('cvec', cvec),
    ('rf',   rf)
])

gs = GridSearchCV(pipe1, param_grid= rf_params)
gs.fit(X_train, y_train)
print(gs.best_score_)

0.9350180505415162
CPU times: user 52.7 s, sys: 224 ms, total: 52.9 s
Wall time: 52.9 s


In [12]:
gs.score(X_test, y_test)

0.9280575539568345

In [13]:
gs.best_params_

{'cvec__max_features': 500,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'rf__max_depth': 22,
 'rf__n_estimators': 22}

#### Best model Books : 
Features 100,  ngram (1,2), stopwords english, max depth 12 estimators 16

cross val .941 , test .950  Baseline Accuracy Books .621

#### Space:
Features 300, ngram 1,1 stopwords english, max depth 20, estimators 22

cross val .908, test .902  Baseline Accuracy .500

#### Parties: 
Features 500 ngram 1,1 stopwords none depth 22 estimators 20

cross val .709 Test .680  Baseline Accuracy .526

#### Random
Did not try and fit best models for random
cross val .508 Test .512  Baseline accuracy .516

### Extra Tree

In [51]:
et = ExtraTreesClassifier()
et_params = {
    'cvec__stop_words'       : ['english', None], 
    'cvec__max_features'     : [25,50, 75,],
    'cvec__ngram_range'      : [(1, 1), (1,2)],
    'et__n_estimators'       : [6,8,10], 
    'et__max_depth'          : [ 20, 22, 24]
}


In [52]:
%%time
pipe2 = Pipeline([
    ('cvec', cvec),
    ('et',   et)
])

gs = GridSearchCV(pipe2, param_grid= et_params)
gs.fit(X_train, y_train)
print(gs.best_score_)

0.927797833935018
CPU times: user 33.9 s, sys: 192 ms, total: 34.1 s
Wall time: 34.1 s


In [53]:
gs.score(X_test, y_test)

0.9172661870503597

In [54]:
gs.best_params_

{'cvec__max_features': 50,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'et__max_depth': 22,
 'et__n_estimators': 10}

#### Books Model 
features 50  ngram 1,1 stopwords english depth 22 estimators 10
cross val .928 test .917
#### Space Model 
features 100, ngram 1,1 stopwords english, depth 20 estimators 14

cross val .901 test 896

#### Parties
features 300 ngram 1,2 stopwords none, depth 22l estimators 14

cross val .703 test .706

#### Random
cross val .520 . test .452

### Naive Bayes

In [18]:
from sklearn.naive_bayes import MultinomialNB


In [56]:
nb = MultinomialNB()
nb_params = {
    'cvec__stop_words'   : ['english', None], 
    'cvec__max_features' :  [4000, 4500, 5000],
    'cvec__ngram_range'  : [(1, 1), (1,2)],
    'nb__alpha'          : [.4,.5,.6]
}


In [57]:

pipe3 = Pipeline([
    ('cvec', cvec),
    ('nb',   nb)
])

gs = GridSearchCV(pipe3, param_grid= nb_params)
gs.fit(X_train, y_train)
print(gs.best_score_)

0.9578820697954272


In [58]:
gs.score(X_test, y_test)

0.960431654676259

In [59]:
gs.best_params_

{'cvec__max_features': 4500,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english',
 'nb__alpha': 0.5}

#### Books 
model features 4500, ngram 1,2 , stopwords english

cross val .958 test .960
#### Space model
features 3300, ngram 1,1 stopwords none alpha .3

cross val .902 test .899
#### Parties
features 4500 ngram 1,1 stopwords english alpha .7

cross val.690 test .678

#### Random
cross val .492 test .532

### K Nearest Neighbors

In [23]:
from sklearn.neighbors import KNeighborsClassifier

In [24]:
knn = KNeighborsClassifier()
knn_params = {
    'cvec__stop_words'   : ['english', None], 
    'cvec__max_features' : [500,700, 900],
    'cvec__ngram_range'  : [(1, 1), (1,2)],
    'knn__n_neighbors'   : [3,5,7]
}

In [25]:
pipe4 = Pipeline([
    ('cvec', cvec),
    ('knn',   knn)
])

gs = GridSearchCV(pipe4, param_grid= knn_params)
gs.fit(X_train, y_train)
print(gs.best_score_)


0.8531889290012034


In [26]:
gs.score(X_test, y_test)

0.9028776978417267

In [27]:
gs.best_params_

{'cvec__max_features': 700,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english',
 'knn__n_neighbors': 5}

#### Books 
 max features 100, ngram 1,1, stopwords english, knn =5
 
cross val .856  test group.878 

#### Space 
feature max features 700, ngram 1,1 stopwords english, knn =3 

cross val .820 test .808

#### Parties
features 500 ngram 1,2 stopwords english, knn = 5

cross val .637 Test .612

#### Random
cross val .523 test .517


### Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression


In [29]:
lr = LogisticRegression()
lr_params = {
    'cvec__stop_words'   : ['english', None], 
    'cvec__max_features' : [ 700, 900, 1100],
    'cvec__ngram_range'  : [(1, 1), (1,2)],
    'lr__penalty'        : ['l1', 'l2']
}

In [30]:
pipe5 = Pipeline([
    ('cvec', cvec),
    ('lr',   lr)
])

gs = GridSearchCV(pipe5, param_grid= lr_params)
gs.fit(X_train, y_train)
print(gs.best_score_)


0.9446450060168472


In [31]:
gs.score(X_test, y_test)

0.9496402877697842

In [32]:
gs.best_params_

{'cvec__max_features': 1100,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'lr__penalty': 'l1'}

#### Books 
features 300, ngram 1,2 stopwords english, lr penalty l1

cross val .946, test .950 accuracy

#### Space 
features 500 ngram 1,1 stopwords english lr penalty l2

crossval .905, test .908

#### Parties
features 900 ngram 1,1 stopwords none penalty l2

crossval .707, test.723

#### Random
crossval .500 test .503


### SVM Model

In [33]:
from sklearn import svm, linear_model, datasets
from sklearn.model_selection import cross_val_score

In [34]:
sv_m = svm.SVC() 
sv_m_params =  {
    'cvec__stop_words'   : ['english', None], 
    'cvec__max_features' : [ 2500, 3000],
    'cvec__ngram_range'  : [(1, 1), (1,2)],
    "sv_m__C"            : [.05,.1,.3 ],
    "sv_m__kernel"       : ['rbf', 'linear', 'poly']
}



In [35]:
pipe6 = Pipeline([
    ('cvec', cvec),
    ('sv_m',   sv_m)
])

gs = GridSearchCV(pipe6, param_grid= sv_m_params)
gs.fit(X_train, y_train)
print(gs.best_score_)

0.9253910950661853


In [36]:
gs.score(X_test, y_test)

0.9460431654676259

In [37]:
gs.best_params_

{'cvec__max_features': 2500,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english',
 'sv_m__C': 0.3,
 'sv_m__kernel': 'linear'}

#### Books 
features 700, ngram 1,1 stopwords english c = .5 linear

cross val .925 test .950

#### Space 
features 100 ngram 1,2 stopwords english, c .5 linear

cross val .897, test .888

#### Parties
features 2500 ngram 1,2 stopwords none c .1 linear

cross val .729  test .707

#### Random
Cross val .504 test .503

### Adaboost Model

In [38]:
from sklearn.ensemble import AdaBoostClassifier

In [39]:
ada = AdaBoostClassifier()
ada_params = {
    'cvec__stop_words'   : ['english', None], 
    'cvec__max_features' : [100,300,500],
    'cvec__ngram_range'  : [(1, 1)],
    'ada__n_estimators'  : [50,60, 85 ],
    'ada__learning_rate' : [.3, .4, .5],
    
}


In [40]:
pipe7 = Pipeline([
    ('cvec', cvec),
    ('ada',   ada)
])

gs = GridSearchCV(pipe7, param_grid= ada_params)
gs.fit(X_train, y_train)
print(gs.best_score_)

0.9554753309265944


In [41]:
gs.best_params_

{'ada__learning_rate': 0.4,
 'ada__n_estimators': 60,
 'cvec__max_features': 300,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english'}

In [42]:
gs.score(X_test, y_test)

0.9460431654676259

#### Books 
features 1000, ngram 1,1 stopwords none, learning rate .8 nestimators 50

crossval .954, test .935

#### Space 
feature 500 ngram 1,1 stopwords english learning rate .4 estmators 85

crossval .911 test .929

#### Parties
features 300 ngram 1,1 stopwords none learning rate .4 estimators 60

cross val .690 test .685

#### Random 
cross val .513 test .484