In [1]:
import pandas as pd
import numpy as np
import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the 'fivethirtyeight' matplotlib style sheet to every figure in this notebook.
plt.style.use('fivethirtyeight')

# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Ask the inline backend for 'retina' figure output (higher-resolution images).
%config InlineBackend.figure_format = 'retina'

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, mean_squared_error,r2_score
from sklearn.metrics import confusion_matrix, accuracy_score, plot_confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import scikitplot as skplt

In [3]:
# Read the classification dataset, parse the observation timestamp and use
# it as the row index (the 'ob_time' column is dropped once it is the index).
LFBclass = (
    pd.read_csv('../Assets/LFBClass.csv')
    .assign(ob_time=lambda frame: pd.to_datetime(frame.ob_time))
    .set_index('ob_time')
)

# Split off the target; pop() also removes it from the predictor frame.
y = LFBclass.pop('IncidentGroup')

# Columns to expand into dummy variables (first level of each is dropped).
dummify = ['wind_dirc', 'PropertyType', 'HourOfCall']
X = pd.get_dummies(
    LFBclass,
    columns=dummify,
    drop_first=True,
)

In [4]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# and seeded so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [5]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [6]:
# Grid-search parameters for the decision tree classifier.
# 8 depths x 3 max_features x 9 min_samples_split x 3 ccp_alpha
# = 648 candidates, each fitted once per cross-validation fold.
dtc_params = {
    'max_depth': list(range(1, 8)) + [None],
    'max_features': [None, 1, 3],
    'min_samples_split': [2, 3, 4, 5, 10, 15, 25, 40, 50],
    'ccp_alpha': [0.001, 0.005, 0.01]
}

# Set up the grid search.
# The tree is seeded because candidates with max_features of 1 or 3 pick
# features at random at every split; without a fixed random_state the CV
# scores (and therefore the selected best_params_) change from run to run.
# The seed matches the one used for the train/test split.
modeldtc = DecisionTreeClassifier(random_state=1)

# 5-fold cross-validation over the full grid; n_jobs=-2 is passed through to
# joblib to run the fits in parallel.
dtc_gs = GridSearchCV(modeldtc, dtc_params, cv=5, verbose=1, n_jobs=-2)

In [7]:

# Run the search over the whole grid on the training split.
dtc_gs.fit(X_train, y_train)

# Keep the winning estimator for the later cells.
dtc_best = dtc_gs.best_estimator_

# Report the selected hyper-parameters and their cross-validated score,
# one per line.
print(dtc_gs.best_params_, dtc_gs.best_score_, sep='\n')

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-2)]: Done 194 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-2)]: Done 444 tasks      | elapsed: 22.4min
[Parallel(n_jobs=-2)]: Done 794 tasks      | elapsed: 44.7min
[Parallel(n_jobs=-2)]: Done 1244 tasks      | elapsed: 152.6min
[Parallel(n_jobs=-2)]: Done 1794 tasks      | elapsed: 194.7min
[Parallel(n_jobs=-2)]: Done 2444 tasks      | elapsed: 364.0min
[Parallel(n_jobs=-2)]: Done 3194 tasks      | elapsed: 470.5min
[Parallel(n_jobs=-2)]: Done 3240 out of 3240 | elapsed: 481.4min finished


{'ccp_alpha': 0.001, 'max_depth': None, 'max_features': None, 'min_samples_split': 2}
0.645183076848859


In [8]:
# Pair each column of the dummy-encoded design matrix with the importance
# assigned by the selected tree, ordered from most to least important.
fi = pd.DataFrame(
    dict(feature=X.columns, importance=dtc_best.feature_importances_)
).sort_values('importance', ascending=False)

# Show the 15 highest-ranked features.
fi.head(15)

Unnamed: 0,feature,importance
264,PropertyType_Small refuse/rubbish container,0.117488
69,PropertyType_Car,0.085432
176,PropertyType_Multiple Vehicles,0.071756
236,PropertyType_Purpose Built Flats/Maisonettes -...,0.068138
159,PropertyType_Loose refuse,0.065959
259,PropertyType_Self contained Sheltered Housing,0.04961
149,PropertyType_Large refuse/rubbish container (e...,0.045675
238,PropertyType_Purpose built office,0.041826
251,PropertyType_Road surface/pavement,0.037893
237,PropertyType_Purpose Built Flats/Maisonettes -...,0.037721
