In [None]:
%matplotlib inline

import matplotlib.pyplot as plt 
import matplotlib
import pickle
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.utils.validation import column_or_1d
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

## Precision and Recall recap:

Precision is the proportion of the model's positive predictions that are correct, i.e. the number of correct positive predictions out of all the times that the model makes a positive prediction.

$$
precision=\frac{TP}{TP+FP}
$$

Recall is the proportion of actual positives that the model detects, i.e. the number of true positives out of all the cases that really are positive:

$$
recall=\frac{TP}{TP + FN}
$$

In this case, it doesn't matter much if the model makes a lot of false negatives, as the next step in the process is that the negatives will be seen by human classifiers. It would still be good to minimise these, however, as human classifier time is at a premium. On the other hand, it matters very much that the model does not produce a lot of false positives, because these will never get seen by human classifiers and will be 'lost'.

In our case a true positive would be identifying an 'ok' class
A false positive would be incorrectly saying it is 'ok' when it is 'not-ok'

A true negative would be correctly identifying a 'not-ok'
A false negative would be incorrectly saying it is 'not-ok' when it is 'ok'

A model optimised for precision would be one that minimises false positives, which is what we are interested in in this case. A model optimised for recall would be one that minimises false negatives. We are less concerned with this.


In [None]:
# Load the pre-computed feature matrix and the matching target labels.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load these files if they come from a trusted source.
# The `with` statement closes each file on exit, so no explicit close() is needed.
with open('../../data/transformed_data.pkl', 'rb') as f:
    transformed_data = pickle.load(f)

with open('../../data/targets.pkl', 'rb') as f:
    # column_or_1d flattens a column vector of labels into a 1-D array.
    targets = column_or_1d(pickle.load(f))


#df = pd.DataFrame(np.c_[transformed_data, targets])

#df.columns = ['start_date_unix', 'start_date_weekday', 'start_date_dayofyear', 'start_date_day', 
#                'start_date_week', 'start_date_month', 'start_date_hour','time_delta',
#                'comment_why_you_came_strlength',
#                'comment_why_you_came_capsratio', 'comment_where_for_help_strlength',
#                'comment_where_for_help_capsratio','comment_further_comments_strlength',
#                'comment_further_comments_capsratio','target']

In [None]:
# Single stratified 80/20 shuffle split with a fixed seed for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1337)

# With n_splits=1 the generator yields exactly one (train, test) pair of index
# arrays, so take it directly instead of looping over it.
train_index, test_index = next(split.split(transformed_data, targets))


In [None]:
# Materialise the train/test partitions from the stratified split indices.
train_X = transformed_data[train_index]
train_y = targets[train_index]
test_X = transformed_data[test_index]
test_y = targets[test_index]

# Try to solve label shape error: make sure the label arrays are 1-D.
from sklearn.utils import column_or_1d
train_y = column_or_1d(train_y)
test_y = column_or_1d(test_y)

# Report the partition sizes and the share of positive targets in each one.
# The two proportions should be almost identical because the split is stratified.
print('test_m =', len(test_index))
print('train_m =', len(train_index))
print('proportion of targets (test) =', test_y.mean())
print('proportion of targets (train) =', train_y.mean())

## Adaboost classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Boost 200 depth-1 decision trees (stumps) with a small learning rate.
stump = DecisionTreeClassifier(max_depth=1)
abc = AdaBoostClassifier(
    stump,
    n_estimators=200,
    learning_rate=0.1,
    algorithm="SAMME.R",
    random_state=42,
)
abc.fit(train_X, train_y)

# First report: performance on the data the model was fitted on.
train_pred = abc.predict(train_X)
print(classification_report(train_y, train_pred))

# Second report: performance on the held-out test partition.
test_pred = abc.predict(test_X)
class_report = classification_report(test_y, test_pred)
print(class_report)

# Confusion matrix for the test partition.
print(confusion_matrix(test_y, test_pred))

## Optimising the AdaBoost for high precision

Since we are mostly interested in having a very precise model, and less worried about recall, here I tweak it.

In [None]:
# Evaluate the fitted AdaBoost model's decision function on a single training
# example: the first row of train_X is reshaped into a one-row 2-D array
# because decision_function is called on a batch of samples.

abc.decision_function(train_X[0,:].reshape(1,-1))

In [None]:
from sklearn.model_selection import cross_val_predict

# Decision-function scores for every training sample obtained with 3-fold
# cross-validation, so the scores used to pick a threshold below come from
# cross_val_predict rather than from `abc` scoring its own training data.
train_y_scores = cross_val_predict(abc, train_X, train_y, cv=3, method="decision_function")

In [None]:
# Display the cross-validated decision scores for the training samples.
train_y_scores

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision and recall of the training labels against the cross-validated
# scores, evaluated across the candidate decision thresholds.
precisions, recalls, thresholds = precision_recall_curve(train_y, train_y_scores)

In [None]:
# Precision and recall as functions of the decision threshold. The last entry
# of `precisions` / `recalls` is dropped so both line up with `thresholds`.
fig, ax = plt.subplots()
ax.plot(thresholds, precisions[:-1], "b--", label="Precision")
ax.plot(thresholds, recalls[:-1], "g-", label="Recall")
ax.set_xlabel("Threshold")
ax.legend(loc="upper left")
ax.set_ylim([0, 1])
plt.show()

In [None]:
# Precision plotted directly against recall (the precision-recall curve).
fig, ax = plt.subplots()
ax.plot(recalls, precisions, "r-")
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
plt.show()

In [None]:
# Boolean predictions: True where the cross-validated decision score exceeds 0.06.
# NOTE(review): 0.06 is presumably read off the threshold plot to reach roughly
# 90% precision (suggested by the `_90` suffix) - confirm, and consider naming it.
train_y_pred_90 = (train_y_scores > 0.06)

In [None]:
# Display the thresholded (boolean) predictions for the training samples.
train_y_pred_90

In [None]:
from sklearn.metrics import precision_score, recall_score

# Precision and recall of the thresholded predictions on the training labels,
# rounded to three decimals for display.
train_precision = precision_score(train_y, train_y_pred_90)
train_recall = recall_score(train_y, train_y_pred_90)

print('precision:', round(train_precision, 3))
print('recall:', round(train_recall, 3))

Try with the test set

In [None]:
# Score the held-out test set with the model that was fitted on the training
# data. cross_val_predict must not be used here: it clones the estimator and
# refits it on folds of the test set, so it would evaluate models trained on
# test data instead of the trained `abc`.
test_y_scores = abc.decision_function(test_X)

# Apply the same decision threshold that was applied to the training scores.
test_y_pred_90 = (test_y_scores > 0.06)

print('precision:', round(precision_score(test_y, test_y_pred_90), 3))
print('recall:', round(recall_score(test_y, test_y_pred_90), 3))

Check the classification report:

In [None]:
# Per-class precision/recall/F1 for the thresholded test-set predictions.
print(classification_report(test_y, test_y_pred_90))

In [None]:
# Confusion matrix of the true test labels against the thresholded predictions.
print(confusion_matrix(test_y, test_y_pred_90))

So the model often thinks that surveys are 'not ok' (and therefore will go on to human classifiers) when in fact they are 'ok' (this is fine because a human will later classify them), but the model very rarely thinks a survey is 'ok' when it is not ok. This is the ideal balance because it will prevent us from accidentally removing surveys from the human pool prematurely.

## Optimising parameters with RandomizedSearch/GridSearch

Which variables can we tune in the RandomizedSearch?

In [None]:
# List the parameter names exposed by the AdaBoost estimator; these are the
# candidates that can be tuned in the search.
abc.get_params().keys()

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Sampling distributions for the randomized search. scipy's randint(low, high)
# draws integers from low (inclusive) up to high (exclusive).
# To access the parameters of the underlying Decision Tree, use the
# base_estimator__... prefix.
# NOTE(review): the prefix must match the names listed by
# abc.get_params().keys(); verify it against the installed scikit-learn version.
param_distribs = {
    'n_estimators': randint(low=1, high=400),
    # A tree needs at least 2 leaves: max_leaf_nodes=1 is rejected by
    # DecisionTreeClassifier, so the lower bound is 2 rather than 1.
    'base_estimator__max_leaf_nodes': randint(2, 10),
    'base_estimator__max_features': randint(1, 5),
    'base_estimator__min_samples_split': randint(2, 20),
    'base_estimator__min_samples_leaf': randint(1, 20),
}

# 10 sampled candidates, each scored with 5-fold cross-validation.
# NOTE(review): candidates are ranked by F1 although the notebook's stated
# goal is high precision - confirm that 'f1' is the intended scoring.
rnd_search = RandomizedSearchCV(abc, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='f1', random_state=42)

In [None]:
# Run the randomized hyperparameter search on the training partition only.
rnd_search.fit(train_X, train_y)

In [None]:
# Print the mean cross-validated score (3 decimals) next to the parameter
# combination that produced it, one line per sampled candidate.
cvres = rnd_search.cv_results_
scores_and_params = zip(cvres["mean_test_score"], cvres["params"])
for mean_score, params in scores_and_params:
    print(round(mean_score, 3), params)

In [None]:
# Display the estimator that the randomized search selected as best.
rnd_search.best_estimator_