Before we try more complex models and engage in hyperparameter tuning, it is worth understanding where our optimized model is making errors with respect to the associated feature values.

In [85]:
%load_ext autoreload
%autoreload 2

from utils import code

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [129]:
import pandas as pd
import numpy as np
import scikitplot as skplt
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from support.model import Model
from support.datasets import get_data
from support.experiments import get_auc_scorer
from support.experiments.experiment_1 import CATEGORICAL_FEATURES, baseline_model_predictions
from support.parameters import P_TARGETED, AVG_COST, AVG_REVENUE

%matplotlib inline

Load the best performing model

In [87]:
# Restore the persisted experiment-1 model from disk.
# NOTE(review): the .pkl extension suggests Model.load unpickles the file;
# only load artifacts from a trusted source — confirm.
tuned_model = Model.load('../models/experiment-1-model.pkl')
# Wrap the loaded name, estimator and pipeline in a new Model instance.
# NOTE(review): the estimator and pipeline objects are passed through as-is,
# so `model` presumably shares them with `tuned_model` — confirm.
model = Model(tuned_model.name, tuned_model.model, tuned_model.pipeline)

Load the training set and test sets

In [88]:
# get_data returns a pair for each CSV, unpacked here as features (X) and
# target (y). The test pair is kept aside for the final evaluation.
X, y = get_data('../data/train.csv')
X_test, y_test = get_data('../data/test.csv')

In [89]:
# Row count of the full training set, before a validation split is carved out.
print('Number of training instances: {:,}'.format(X.shape[0]))

Number of training instances: 32,950


Create a validation set

In [90]:
# Hold out 20% of the training data for validation. Stratifying on y keeps
# the class balance the same in both parts, and the fixed seed makes the
# split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
  X,
  y,
  test_size=0.2,
  stratify=y,
  random_state=1,
)

In [91]:
# Report how many rows ended up on each side of the split.
print('Number of training instances: {:,}'.format(X_train.shape[0]))
print('Number of validation instances: {:,}'.format(X_val.shape[0]))

Number of training instances: 26,360
Number of validation instances: 6,590


These datasets are large enough to give us good performance and an accurate estimate of generalization.

Compute the baseline

In [92]:
# Fit on the training split only; the validation split stays unseen so it
# can be used for scoring.
model.train(X_train, y_train)

<support.model.Model at 0x1a1fa0b6a0>

In [93]:
# Baseline: score the fitted model on the validation split with the
# project's AUC scorer. `scorer` is reused by every experiment below.
scorer = get_auc_scorer()
model.score(X_val, y_val, scorer)

0.6800129332118983

This estimate is slightly optimistic since we scored .66 AUC using our held-out test set. We may be overfitting due to high cardinality in some of the variables, as we saw in the exploratory analysis:

* job
* education
* month

Let's try to collapse some of the rarer values and see if that impacts our model. First we need to update the training set with the collapsed values, and then we need to re-split into training and validation.

In [94]:
# Peek at the first rows to recall the feature layout before editing values.
X.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,25,blue-collar,single,high.school,no,yes,no,cellular,jul,mon,619,2,999,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1
1,40,admin.,married,high.school,no,no,no,telephone,jun,thu,97,1,999,0,nonexistent,1.4,94.465,-41.8,4.958,5228.1
2,51,technician,married,university.degree,no,yes,no,cellular,aug,wed,512,5,999,0,nonexistent,1.4,93.444,-36.1,4.964,5228.1
3,37,blue-collar,married,high.school,unknown,yes,no,cellular,jul,tue,423,1,999,0,nonexistent,1.4,93.918,-42.7,4.961,5228.1
4,54,housemaid,married,university.degree,unknown,yes,no,cellular,aug,thu,297,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1


What is the cardinality of categorical variables?

In [128]:
# Number of distinct values per categorical feature, largest first.
(X[CATEGORICAL_FEATURES]
  .nunique()
  .sort_values(ascending=False)
)

job            12
month          10
education       8
day_of_week     5
marital         4
poutcome        3
loan            3
housing         3
default         3
contact         2
dtype: int64

Let's see if there are values that are in the training set but not in the test set or vice-versa.

In [95]:
# For each categorical variable:
#  - count the occurrences of each value in the training set
#  - count the occurrences of each value in the test set
#  - compute the difference as training - test
# A value that appears in only one of the two sets has no partner to align
# with, so its difference is NaN; those rows are what this cell displays.

def _count_values(frame, var):
  """Count how often each value of column `var` occurs in `frame`.

  Returns a one-column DataFrame (column 'occurances') indexed by
  ('variable', 'value'), so that counts taken from two datasets align
  row-by-row when subtracted.
  """
  return (frame[var]
    .value_counts()
    .rename('occurances')
    .rename_axis('value').reset_index()
    .assign(variable=var)
    .set_index(['variable', 'value'])
  )


var_differences = []

for var in CATEGORICAL_FEATURES:
  train_counts = _count_values(X, var)
  test_counts = _count_values(X_test, var)

  # Index-aligned subtraction: unmatched (variable, value) pairs become NaN.
  var_differences.append((train_counts - test_counts).reset_index())

differences = pd.concat(var_differences)
differences[differences['occurances'].isnull()]

Unnamed: 0,variable,value,occurances
2,default,yes,


There are only 3 instances of defaulting customers in the training set anyway, but for completeness we'll merge these values into the 'unknown' category.

In [141]:
# Experiment 2: fold the 'yes' level of `default` into 'unknown'.
# Work on copies so X_train / X_val stay untouched for later experiments.
X_train_2, X_val_2 = X_train.copy(), X_val.copy()

for frame in (X_train_2, X_val_2):
  frame.loc[frame.default == 'yes', 'default'] = 'unknown'

# NOTE(review): every experiment wraps the same tuned_model.model/pipeline
# objects; if Model.train fits them in place, earlier Model instances are
# refit as a side effect — confirm.
(
  Model(tuned_model.name, tuned_model.model, tuned_model.pipeline)
  .train(X_train_2, y_train)
  .score(X_val_2, y_val, scorer)
)

0.6800129332118983

No effect yet. Let's try merging the informative values with low representation together, and then merging the rest of the values with low representation together.

In [142]:
# Experiment 3: merge the 'retired', 'student' and 'unemployed' job levels
# into a single 'rare_high_prob' bucket, on copies of the splits.
X_train_3, X_val_3 = X_train.copy(), X_val.copy()

for frame in (X_train_3, X_val_3):
  in_bucket = frame.job.isin(['retired', 'student', 'unemployed'])
  frame.loc[in_bucket, 'job'] = 'rare_high_prob'

(
  Model(tuned_model.name, tuned_model.model, tuned_model.pipeline)
  .train(X_train_3, y_train)
  .score(X_val_3, y_val, scorer)
)

0.6810389250039639

In [143]:
# List every job level so the remaining ones can be grouped.
X['job'].unique()

array(['blue-collar', 'admin.', 'technician', 'housemaid', 'entrepreneur',
       'unemployed', 'student', 'management', 'services', 'retired',
       'self-employed', 'unknown'], dtype=object)

In [144]:
# Experiment 5: merge the 'entrepreneur', 'self-employed', 'housemaid' and
# 'unknown' job levels into a single 'rare_low_prob' bucket.
X_train_5, X_val_5 = X_train.copy(), X_val.copy()

for frame in (X_train_5, X_val_5):
  in_bucket = frame.job.isin(['entrepreneur', 'self-employed', 'housemaid', 'unknown'])
  frame.loc[in_bucket, 'job'] = 'rare_low_prob'

(
  Model(tuned_model.name, tuned_model.model, tuned_model.pipeline)
  .train(X_train_5, y_train)
  .score(X_val_5, y_val, scorer)
)

0.6793289386838545

In [145]:
# Experiment 6: apply both job groupings at once.
X_train_6, X_val_6 = X_train.copy(), X_val.copy()

# Bucket label -> job levels folded into it. The level lists are disjoint
# and contain neither bucket label, so the order of application is harmless.
job_buckets = {
  'rare_high_prob': ['retired', 'student', 'unemployed'],
  'rare_low_prob': ['entrepreneur', 'self-employed', 'housemaid', 'unknown'],
}

for frame in (X_train_6, X_val_6):
  for bucket, levels in job_buckets.items():
    frame.loc[frame.job.isin(levels), 'job'] = bucket

(
  Model(tuned_model.name, tuned_model.model, tuned_model.pipeline)
  .train(X_train_6, y_train)
  .score(X_val_6, y_val, scorer)
)

0.6803650705565245

In [146]:
# List every education level before deciding how to group them.
X['education'].unique()

array(['high.school', 'university.degree', 'basic.9y',
       'professional.course', 'basic.4y', 'basic.6y', 'unknown',
       'illiterate'], dtype=object)

In [147]:
# Experiment 7: collapse education levels into one bucket.
# NOTE(review): this folds every level except 'illiterate' into the bucket
# labelled 'rare_low_prob', which leaves a near-constant column — confirm
# this grouping is what was intended.
X_train_7, X_val_7 = X_train.copy(), X_val.copy()

for frame in (X_train_7, X_val_7):
  in_bucket = frame.education.isin(['high.school', 'university.degree', 'basic.9y','professional.course', 'basic.4y', 'basic.6y', 'unknown'])
  frame.loc[in_bucket, 'education'] = 'rare_low_prob'

(
  Model(tuned_model.name, tuned_model.model, tuned_model.pipeline)
  .train(X_train_7, y_train)
  .score(X_val_7, y_val, scorer)
)

0.6734700462018944

In [151]:
# Check how the collapsed education column is distributed.
X_train_7['education'].value_counts()

rare_low_prob    26347
illiterate          13
Name: education, dtype: int64

Maybe we should consider dropping education if it's not providing much predictive value.

In [152]:
# Experiment 8: merge the months 'mar', 'oct', 'dec' and 'sep' into a single
# 'rare_high_prob' bucket, on copies of the splits.
X_train_8, X_val_8 = X_train.copy(), X_val.copy()

for frame in (X_train_8, X_val_8):
  in_bucket = frame.month.isin(['mar', 'oct', 'dec', 'sep'])
  frame.loc[in_bucket, 'month'] = 'rare_high_prob'

(
  Model(tuned_model.name, tuned_model.model, tuned_model.pipeline)
  .train(X_train_8, y_train)
  .score(X_val_8, y_val, scorer)
)

0.6814664215839913

In [153]:
# Experiment 9: reduce `month` to two buckets.
X_train_9, X_val_9 = X_train.copy(), X_val.copy()

# Bucket label -> months folded into it. The month lists are disjoint and
# contain neither bucket label, so applying them one after the other is safe.
month_buckets = {
  'rare_high_prob': ['mar', 'oct', 'dec', 'sep'],
  'rare_low_prob': ['may', 'jul', 'aug', 'jun', 'nov', 'apr'],
}

for frame in (X_train_9, X_val_9):
  for bucket, months in month_buckets.items():
    frame.loc[frame.month.isin(months), 'month'] = bucket

(
  Model(tuned_model.name, tuned_model.model, tuned_model.pipeline)
  .train(X_train_9, y_train)
  .score(X_val_9, y_val, scorer)
)

0.6827966157941896

In [154]:
# Check how the two month buckets are populated in the training split.
X_train_9['month'].value_counts()

rare_low_prob     25045
rare_high_prob     1315
Name: month, dtype: int64

In [155]:
# Experiment 10: relabel 'nonexistent' previous outcomes as 'failure', so
# both levels share one label.
X_train_10, X_val_10 = X_train.copy(), X_val.copy()

for frame in (X_train_10, X_val_10):
  in_bucket = frame.poutcome.isin(['failure', 'nonexistent'])
  frame.loc[in_bucket, 'poutcome'] = 'failure'

(
  Model(tuned_model.name, tuned_model.model, tuned_model.pipeline)
  .train(X_train_10, y_train)
  .score(X_val_10, y_val, scorer)
)

0.6823271761534803

In [156]:
# Experiment 11: combine the job, month and poutcome groupings.
X_train_11, X_val_11 = X_train.copy(), X_val.copy()

# (column, levels to fold, replacement label). Each step touches a different
# column, so the steps are independent of one another.
collapse_plan = [
  ('job', ['retired', 'student', 'unemployed'], 'rare_high_prob'),
  ('month', ['mar', 'oct', 'dec', 'sep'], 'rare_high_prob'),
  ('poutcome', ['failure', 'nonexistent'], 'failure'),
]

for frame in (X_train_11, X_val_11):
  for column, levels, bucket in collapse_plan:
    frame.loc[frame[column].isin(levels), column] = bucket

(
  Model(tuned_model.name, tuned_model.model, tuned_model.pipeline)
  .train(X_train_11, y_train)
  .score(X_val_11, y_val, scorer)
)

0.682196737843887

Let's evaluate on the test set with these transformations.