In [132]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

In [133]:
# Load the combined dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('./data/combined_data.csv')

In [134]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape


(1675, 2)

## Train Test Split

It is important to split the data before fitting the vectorizer or training any model. At prediction time the model will encounter words it never saw during training, so the vocabulary must be learned from the training data only; fitting the vectorizer on the full dataset would leak information from the test set into training.

In [135]:
# Features are the raw title text; labels come from the target column.
X = df['title']
y = df['target']

# Stratify on the label so both splits keep the same class balance,
# and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
)

## Count Vectorization

In [136]:
from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): nothing visible in this notebook uses `stop_words`. The public
# `sklearn.feature_extraction.stop_words` module was deprecated in scikit-learn 0.22
# and removed in 0.24, so this import fails on current releases -- confirm it can be dropped.
from sklearn.feature_extraction import stop_words


In [137]:
# Learn the vocabulary from the training titles only: English stop words are
# removed and terms appearing in fewer than 3 training documents are dropped.
# `fit` returns the vectorizer itself, so construction and fitting can be chained.
cvec = CountVectorizer(stop_words='english', min_df=3).fit(X_train)

In [138]:
# Replace the raw text in both splits with their count matrices.
# Gotcha: this overwrites X_train / X_test in place, so the cell is not
# idempotent -- re-run the train/test split cell before running it again.
X_train, X_test = cvec.transform(X_train), cvec.transform(X_test)

In [139]:
# (documents, vocabulary terms) of the vectorized training split.
X_train.shape

(1256, 1182)

In [140]:
# The test split must have the same number of columns as the training matrix.
X_test.shape

(419, 1182)

### Random Forest model

In [141]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [142]:
# Seed the forest so the grid search selects the same parameters and reports
# the same scores on every run (same seed as the train/test split).
rf = RandomForestClassifier(random_state=42)

# Small grid over ensemble size and maximum tree depth.
rf_params = {
    'n_estimators': [12, 14, 16],
    'max_depth': [10, 12, 14],
}

# Cross-validated search on the training split only; the test split stays untouched.
gs = GridSearchCV(rf, param_grid=rf_params)
gs.fit(X_train, y_train)
print(gs.best_score_)

0.7070063694267515


In [143]:
# Score the fitted random forest grid search on the held-out test split.
gs.score(X_test, y_test)

0.6873508353221957

In [144]:
# Parameter combination selected by the random forest grid search.
gs.best_params_

{'max_depth': 14, 'n_estimators': 12}

### Extra Tree

### KNN

In [131]:
# K-nearest-neighbours classifier; also referenced by the voting ensemble below.
from sklearn.neighbors import KNeighborsClassifier

### Logistic Regression

In [145]:
from sklearn.linear_model import LogisticRegression


In [146]:
# liblinear is named explicitly because the grid below includes 'l1':
# scikit-learn >= 0.22 defaults to the lbfgs solver, which does not support
# the 'l1' penalty and would make every 'l1' candidate fail to fit.
# liblinear was the default solver before 0.22, so older environments behave as before.
lr = LogisticRegression(solver='liblinear')

# Compare lasso-style and ridge-style regularisation.
lr_params = {
    'penalty': ['l1', 'l2'],
}

In [147]:
# Cross-validated search over the logistic regression penalties.
# `fit` returns the search object, so it can be chained onto the constructor.
gs = GridSearchCV(estimator=lr, param_grid=lr_params).fit(X_train, y_train)
print(gs.best_score_)

0.7014331210191083


In [148]:
# Score the fitted logistic regression grid search on the held-out test split.
gs.score(X_test, y_test)

0.6897374701670644

In [149]:
# Penalty selected by the logistic regression grid search.
gs.best_params_

{'penalty': 'l1'}

In [97]:
# Fit the base `lr` estimator on the full training split and show its coefficients
# (one weight per vocabulary term).
# NOTE(review): this is the estimator as configured above, not the model the
# grid search selected -- use gs.best_estimator_ to inspect the tuned one.
lr.fit(X_train, y_train).coef_

array([[-3.35397633e-02, -1.36306374e-02, -7.45805679e-01,
        -6.34241444e-01,  9.95420539e-01,  5.02195007e-01,
         7.16240509e-02,  6.03350865e-01, -5.04783850e-01,
         2.30628403e-03,  6.75254648e-02, -3.07896096e-01,
         7.72559400e-02, -9.05144693e-01, -3.03932213e-02,
         3.14280266e-01,  4.25434291e-02, -2.55110328e-01,
         2.03414006e-01,  5.99172212e-01,  7.20978804e-01,
         1.07557892e+00, -5.41791047e-01, -3.85762687e-01,
         1.84743702e-01, -2.22485509e-01, -2.46058508e-01,
        -6.17852469e-01, -3.16921572e-01,  2.98751193e-01,
        -3.39655944e-01, -7.06181805e-01, -6.14579069e-01,
         4.73015393e-01, -1.07639377e+00, -9.20176921e-01,
         1.00248208e-01,  7.05141885e-02,  1.39655502e-01,
        -1.93025324e-01,  1.73779433e-01,  1.39909421e+00,
        -1.26654070e+00,  1.73410657e-01,  3.57975280e-02,
        -2.79610831e-01,  5.97829491e-01,  1.83054372e-01,
        -4.25989784e-01, -5.59857101e-01,  4.27801057e-0

### Naive Bayes

### SVM Model

In [150]:
from sklearn import svm, linear_model, datasets
from sklearn.model_selection import cross_val_score

In [151]:
# Support vector classifier with default settings; the grid below tunes it.
sv_m = svm.SVC()

# Regularisation strengths and kernel families to compare.
sv_m_params = {
    "C": [0.01, 0.1, 0.5],
    "kernel": ['rbf', 'linear', 'poly'],
}



In [152]:
# Cross-validated search over the SVC grid, using the training split only.
gs = GridSearchCV(estimator=sv_m, param_grid=sv_m_params)
gs = gs.fit(X_train, y_train)  # fit returns the same search object
print(gs.best_score_)

0.7078025477707006


In [153]:
# Score the fitted SVC grid search on the held-out test split.
gs.score(X_test, y_test)

0.7136038186157518

In [154]:
# C / kernel combination selected by the SVC grid search.
gs.best_params_

{'C': 0.1, 'kernel': 'linear'}

### Adaboost Model

In [158]:
from sklearn.ensemble import AdaBoostClassifier

In [167]:
# Seed the booster so repeated runs select the same parameters and scores
# (same seed as the train/test split).
ada = AdaBoostClassifier(random_state=42)

# Grid over the number of boosting rounds and the shrinkage applied per round.
ada_params = {
    'n_estimators': [30, 40, 50, 55],
    'learning_rate': [.7, .8, .9],
}


In [168]:
# 3-fold cross-validated search over the AdaBoost grid.
gs = GridSearchCV(
    estimator=ada,
    param_grid=ada_params,
    cv=3,
).fit(X_train, y_train)
print(gs.best_score_)

0.6902866242038217


In [169]:
# Learning rate / estimator count selected by the AdaBoost grid search.
gs.best_params_

{'learning_rate': 0.8, 'n_estimators': 40}

In [170]:
# Score the fitted AdaBoost grid search on the held-out test split.
gs.score(X_test, y_test)

0.7016706443914081

### Voting Classifier

In [155]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Ensemble of one default-configured instance of each model family in this notebook.
# MultinomialNB replaces the undefined `NaiveBayes` name: scikit-learn has no class
# called NaiveBayes, and the multinomial variant is the one designed for the
# non-negative term counts produced by CountVectorizer.
# The default (hard) voting is kept, so SVC does not need probability estimates.
vote = VotingClassifier(estimators=[
    ('rf', RandomForestClassifier()),
    ('lr', LogisticRegression()),
    ('knn', KNeighborsClassifier()),
    ('sv_m', svm.SVC()),
    ('nb', MultinomialNB()),
])