In [None]:
%matplotlib inline

import matplotlib.pyplot as plt 
import matplotlib
import pickle
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.utils.validation import column_or_1d
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

## Precision and Recall recap:

Precision is the proportion of all the times a model makes a correct positive prediction out of all the times that the model makes a positive prediction.

$$
precision=\frac{TP}{TP+FP}
$$

Recall is the proportion of times that the model detected a true positive out of all the times a positive was actually present (true positives plus false negatives):

$$
recall=\frac{TP}{TP + FN}
$$

In this case, it doesn't matter much if the model makes a lot of false negatives, because the next step in the process is that the negatives will be seen by human classifiers. It would obviously still be good to minimise them, as human classifier time is at a premium. On the other hand, it matters very much that the model does not produce a lot of false positives, because these will never get seen by human classifiers and will be 'lost'.

In our case a true positive would be identifying an 'ok' class
A false positive would be incorrectly saying it is 'ok' when it is 'not-ok'

A true negative would be correctly identifying a 'not-ok'
A false negative would be incorrectly saying it is 'not-ok' when it is 'ok'

A model optimised for precision would be one that minimises false positives, which is what we are interested in in this case. A model optimised for recall would be one that minimises false negatives. We are less concerned with this.


In [None]:
# Load the pre-computed feature matrix and label vector.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project's own preprocessing step.
with open('../../data/transformed_data.pkl', 'rb') as f:
    transformed_data = pickle.load(f)

with open('../../data/targets.pkl', 'rb') as f:
    # column_or_1d ravels a column vector into the 1-D shape sklearn expects for y.
    targets = column_or_1d(pickle.load(f))

# Feature names in column order, followed by the label column.
column_names = ['start_date_unix', 'start_date_weekday', 'start_date_dayofyear', 'start_date_day',
                'start_date_week', 'start_date_month', 'start_date_hour', 'time_delta',
                'comment_why_you_came_strlength',
                'comment_why_you_came_capsratio', 'comment_where_for_help_strlength',
                'comment_where_for_help_capsratio', 'comment_further_comments_strlength',
                'comment_further_comments_capsratio', 'target']

# Features and target side by side. Previously this construction was commented
# out, so `df` was undefined and the column assignment raised NameError.
# assumes transformed_data is a dense 2-D array with 14 columns — TODO confirm
df = pd.DataFrame(np.c_[transformed_data, targets], columns=column_names)

In [None]:
# Single stratified shuffle split holding out 20% of the rows for testing;
# the fixed random_state keeps the partition reproducible across runs.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1337)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(transformed_data, targets))


In [None]:
# Materialise the train/test partitions selected by the stratified split.
train_X = transformed_data[train_index]
test_X = transformed_data[test_index]

# column_or_1d (imported in the first cell) guards against the label shape
# error by making sure the label vectors are 1-D.
train_y = column_or_1d(targets[train_index])
test_y = column_or_1d(targets[test_index])

# Partition sizes and the share of positive labels in each partition; the two
# proportions should be close because the split is stratified.
print('test_m =', len(test_index))
print('train_m =', len(train_index))
print('proportion of targets (test) =', sum(test_y) / len(test_y))
print('proportion of targets (train) =', sum(train_y) / len(train_y))

## Logistic regression model

In [None]:
# Baseline model: logistic regression with scikit-learn's default settings,
# fitted on the training partition.
log_reg = LogisticRegression()
log_reg.fit(X=train_X, y=train_y)

In [None]:
# Predictions on the same rows the model was fitted on.
train_pred = log_reg.predict(X=train_X)

In [None]:
# Per-class precision, recall and F1 for the training-set predictions.
class_report = classification_report(y_true=train_y, y_pred=train_pred)
print(class_report)

In [None]:
# Same report for the held-out test partition.
test_pred = log_reg.predict(X=test_X)
class_report = classification_report(y_true=test_y, y_pred=test_pred)
print(class_report)

In [None]:
# Confusion matrix of the test predictions (rows: true class, columns: predicted class).
confusion_matrix(y_true=test_y, y_pred=test_pred)

### Does cross validation make a difference?

In [None]:
# Cross-validate on the TRAINING partition. The test partition must stay
# untouched for the final evaluation; previously the folds were drawn from
# test_X / test_y.
scores = cross_val_score(log_reg, train_X, train_y,
                         scoring="f1", cv=20)

print("Mean f1 score:", round(scores.mean(), 2))
print("Std f1 score:", round(scores.std(), 2))

Nope — cross-validation does not make a noticeable difference.

## SVM

In [None]:
# Support vector classifier with an RBF kernel, fitted on the training partition.
svm = SVC(kernel="rbf")
svm.fit(X=train_X, y=train_y)

# Predict both partitions up front, then report them in train -> test order.
train_pred = svm.predict(X=train_X)
test_pred = svm.predict(X=test_X)

class_report = classification_report(y_true=train_y, y_pred=train_pred)
print(class_report)

class_report = classification_report(y_true=test_y, y_pred=test_pred)
print(class_report)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree; random_state is fixed so repeated runs give the same tree.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X=train_X, y=train_y)

# Predict both partitions up front, then report them in train -> test order.
train_pred = dtc.predict(X=train_X)
test_pred = dtc.predict(X=test_X)

class_report = classification_report(y_true=train_y, y_pred=train_pred)
print(class_report)

class_report = classification_report(y_true=test_y, y_pred=test_pred)
print(class_report)

# Raw counts behind the test report.
print(confusion_matrix(y_true=test_y, y_pred=test_pred))

A decision tree seems to perform much better on this dataset. We are beyond dice roll territory!

## What about an ensemble of all the models so far?

In [None]:
from sklearn.ensemble import VotingClassifier

# Majority-vote ensemble over the three models built so far
# (voting='hard' combines predicted labels, not probabilities).
ensemble_members = [('lr', log_reg), ('svm', svm), ('dtc', dtc)]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard')
voting_clf.fit(X=train_X, y=train_y)

# Predict both partitions up front, then report them in train -> test order.
train_pred = voting_clf.predict(X=train_X)
test_pred = voting_clf.predict(X=test_X)

class_report = classification_report(y_true=train_y, y_pred=train_pred)
print(class_report)

class_report = classification_report(y_true=test_y, y_pred=test_pred)
print(class_report)

# Raw counts behind the test report.
print(confusion_matrix(y_true=test_y, y_pred=test_pred))

This does not perform very well!

## Random Forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters; random_state fixed for reproducibility.
rf = RandomForestClassifier(random_state=42)
rf.fit(X=train_X, y=train_y)

# Predict both partitions up front, then report them in train -> test order.
train_pred = rf.predict(X=train_X)
test_pred = rf.predict(X=test_X)

class_report = classification_report(y_true=train_y, y_pred=train_pred)
print(class_report)

class_report = classification_report(y_true=test_y, y_pred=test_pred)
print(class_report)

# Raw counts behind the test report.
print(confusion_matrix(y_true=test_y, y_pred=test_pred))

In [None]:
# Pair each feature with its importance, most important first. The 'target'
# column is excluded: it is the label, not a model input, and leaving it in
# gave one more name than there are importances.
feature_names = df.columns.drop('target')
pd.DataFrame({'feature': feature_names,
              'importance': rf.feature_importances_}).sort_values('importance', ascending=False)

## Bagging classifier

In [None]:
from sklearn.ensemble import BaggingClassifier

# 500 bootstrap-sampled decision trees; oob_score=True additionally scores the
# ensemble on the rows left out of each bootstrap sample.
bc = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500,
    bootstrap=True, n_jobs=-1, oob_score=True, random_state=40)
bc.fit(train_X, train_y)

# A bare expression in the middle of a cell is not displayed, so print it.
print('OOB score:', bc.oob_score_)

train_pred = bc.predict(train_X)

class_report = classification_report(train_y, train_pred)
print(class_report)

test_pred = bc.predict(test_X)
class_report = classification_report(test_y, test_pred)
print(class_report)

# Raw counts behind the test report.
print(confusion_matrix(test_y, test_pred))

## Adaboost classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# 200 boosted decision stumps (max_depth=1) with a small learning rate.
# The `algorithm` argument is deliberately omitted: "SAMME.R" was the default
# in the scikit-learn releases that supported it, and it is deprecated and
# then removed in later releases, where passing it fails.
abc = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    learning_rate=0.1, random_state=42)
abc.fit(train_X, train_y)

train_pred = abc.predict(train_X)

class_report = classification_report(train_y, train_pred)
print(class_report)

test_pred = abc.predict(test_X)
class_report = classification_report(test_y, test_pred)
print(class_report)

# Raw counts behind the test report.
print(confusion_matrix(test_y, test_pred))

## Random parameter search on Random Forest

This is the best performing model out of the box, maybe a random parameter search will improve upon it?

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8)
    }

rf = RandomForestClassifier(random_state=42)
rnd_search = RandomizedSearchCV(rf, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(train_X, train_y)

In [None]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
# Full search results as a table. cv_results_ is read from the search object
# itself: `cvres` is already that dict and has no `cv_results_` attribute.
pd.DataFrame(rnd_search.cv_results_)