In [None]:
%matplotlib inline

import matplotlib.pyplot as plt 
import matplotlib
import pickle
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.utils.validation import column_or_1d
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

In [None]:
# Load the pre-computed feature matrix and the labels from disk.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only ever point these paths at files produced by our own pipeline.
# The `with` block closes each file on exit, so no explicit close() is needed.
with open('../../data/transformed_data.pkl', 'rb') as f:
    transformed_data = pickle.load(f)

# column_or_1d flattens the stored labels into the 1-D shape scikit-learn expects.
with open('../../data/targets.pkl', 'rb') as f:
    targets = column_or_1d(pickle.load(f))


#df = pd.DataFrame(np.c_[transformed_data, targets])

#df.columns = ['start_date_unix', 'start_date_weekday', 'start_date_dayofyear', 'start_date_day', 
#                'start_date_week', 'start_date_month', 'start_date_hour','time_delta',
#                'comment_why_you_came_strlength',
#                'comment_why_you_came_capsratio', 'comment_where_for_help_strlength',
#                'comment_where_for_help_capsratio','comment_further_comments_strlength',
#                'comment_further_comments_capsratio','target']

In [None]:
# One stratified shuffle split holding out 20% of the rows for testing;
# random_state is fixed so the partition is reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1337)

# n_splits=1 means the generator yields exactly one (train, test) pair of
# index arrays, so take it directly instead of looping and relying on the
# loop variables leaking out of the `for` statement.
train_index, test_index = next(split.split(transformed_data, targets))


In [None]:
# Materialise the train/test partitions from the index arrays.
train_X = transformed_data[train_index]
test_X = transformed_data[test_index]

# column_or_1d (imported in the first cell) guards against the label shape
# error by guaranteeing 1-D label vectors.
train_y = column_or_1d(targets[train_index])
test_y = column_or_1d(targets[test_index])

# Sanity-check the split: partition sizes and the mean of the labels in each
# partition. The two proportions carry distinct labels so the printed lines
# can be told apart.
print('test_m =', len(test_index))
print('train_m =', len(train_index))
print('test proportion of targets =', test_y.mean())
print('train proportion of targets =', train_y.mean())

## RandomForest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Baseline forest: 500 trees, seeded for reproducibility, using every
# available core. oob_score=True makes the out-of-bag score available
# on the fitted estimator as `oob_score_`.
rf = RandomForestClassifier(
    n_estimators=500,
    max_features=None,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)
rf.fit(train_X, train_y)

# Last expression of the cell: display the out-of-bag score.
rf.oob_score_

In [None]:
from sklearn.metrics import accuracy_score

# Score the fitted forest on the held-out test partition.
test_pred = rf.predict(test_X)

# Per-class metrics summary.
class_report = classification_report(y_true=test_y, y_pred=test_pred)
print(class_report)

# Confusion matrix of true labels against predicted labels.
print(confusion_matrix(y_true=test_y, y_pred=test_pred))

So the model often predicts that a survey is 'not ok' (and therefore sends it on to the human classifiers) when it is in fact 'ok'. This is acceptable, because a human will classify it later. Crucially, the model very rarely predicts that a survey is 'ok' when it is not ok. This is the ideal balance, because it prevents us from accidentally removing surveys from the human pool prematurely.

## Optimising parameters with RandomizedSearch/GridSearch

Which variables can we tune in the RandomizedSearch?

In [None]:
# Display the names of all parameters exposed by the forest estimator;
# these are the keys that can be used in a parameter search.
rf.get_params().keys()

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Search space for the randomized search. Both keys are direct constructor
# parameters of RandomForestClassifier (they appear in rf.get_params()), so
# they are addressed by plain name with no `<component>__` prefix.
param_distribs = {
    #'n_estimators': randint(low=1, high=500),
    # scikit-learn rejects max_leaf_nodes < 2 (it must be None or an int >= 2),
    # so the lower bound is 2; a draw of 1 would make that candidate fail to
    # fit. scipy's randint excludes `high`, so this samples from 2..39.
    'max_leaf_nodes': randint(low=2, high=40),
    # Samples tree depths from 1..24 (upper bound exclusive).
    'max_depth': randint(low=1, high=25),
}

# 10 sampled candidates, each scored by 5-fold cross-validated F1;
# random_state is fixed so the sampled candidates are reproducible.
rnd_search = RandomizedSearchCV(rf, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='f1', random_state=42)

In [None]:
# Run the randomized search using the training partition only, so the
# held-out test rows play no part in choosing hyper-parameters.
rnd_search.fit(train_X, train_y)

In [None]:
# Print each sampled candidate next to its mean cross-validated score
# (rounded to three decimals), in the order the search stored them.
cvres = rnd_search.cv_results_
for idx, candidate in enumerate(cvres["params"]):
    print(round(cvres["mean_test_score"][idx], 3), candidate)

In [None]:
# Display the estimator that the randomized search selected as best.
rnd_search.best_estimator_