# Bank Term Deposits - Data Preparation and Initial Modeling

### Import Packages and Load Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.impute
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler,FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


from project_utils import *
import os
import requests
import zipfile
# %matplotlib inline 
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


pd.options.display.float_format = '{:.4f}'.format
pd.set_option('display.max_colwidth', -1)


data = pd.read_csv('../data/bank-additional/bank-additional-full.csv',sep=';')

dp_dropCols(data,"duration")

#Convert target to y=1 and n=0 and rename columun
data["y"][data["y"] == 'yes'] = 1
data["y"][data["y"] == 'no'] = 0
data.rename(columns={"y":"target"}, inplace = True)
data["target"] = data["target"].astype('int64')


train_set, test_set = train_test_split(data, test_size=0.2,random_state=42,stratify = data["target"],shuffle = True)

#Check proportions 
countsAndProportions(data["target"])
countsAndProportions(test_set["target"])
countsAndProportions(train_set["target"])

train_set_X = train_set.drop("target", axis=1)
train_set_Y = train_set["target"]

test_set_X = test_set.drop("target", axis=1)
test_set_Y = test_set["target"]



   target  target
0  36548  0.8873 
1  4640   0.1127 
   target  target
0  7310   0.8874 
1  928    0.1126 
   target  target
0  29238  0.8873 
1  3712   0.1127 


### Create Data prep pipeline

In [2]:
cols_OrdinalEncode = ["education"]
cols_OHE = data.select_dtypes(include="object").columns
cols_log1p = ["age"]
cols_Numeric = ['campaign', 'pdays', 'previous','emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m','nr.employed']


full_pipeline = ColumnTransformer([
#For OrdinalEncoder, the categories have to be a list of arrays (per column). Since we have only one column
('oe', OrdinalEncoder(categories = [['illiterate','basic.4y','basic.6y','basic.9y','high.school','professional.course','university.degree','unknown']]),
         cols_OrdinalEncode),
('ohe', OneHotEncoder(sparse=False,handle_unknown="ignore"),cols_OHE),
('logAge',FunctionTransformer(np.log1p),cols_log1p),
('scale', StandardScaler(),cols_Numeric)
])

#Separate fit and transform, otherwise OHE was giving error as one of the categories were missing in Test
full_pipeline.fit(train_set_X)
train_prep = full_pipeline.transform(train_set_X)

#Apply pipeline to test
test_prep = full_pipeline.transform(test_set_X)


ColumnTransformer(transformers=[('oe',
                                 OrdinalEncoder(categories=[['illiterate',
                                                             'basic.4y',
                                                             'basic.6y',
                                                             'basic.9y',
                                                             'high.school',
                                                             'professional.course',
                                                             'university.degree',
                                                             'unknown']]),
                                 ['education']),
                                ('ohe',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month',

### Train few models
* Use default hyperparameters
* Logistic gives low F1 score and the predicted distribution is also not close to the actuals
* Decision Tree is the best model, even though it has low F1
* RF and Neural network have higher F1 score but predicted distributions are way off

In [3]:
print("==================     Logistic Regression     ==================")
from sklearn.linear_model import LogisticRegression
modelTrainPredict(LogisticRegression(),train_prep,train_set_Y,test_prep,test_set_Y)

print("==================     Decision Tree     ==================")
from sklearn.tree import DecisionTreeClassifier
modelTrainPredict(DecisionTreeClassifier(),train_prep,train_set_Y,test_prep,test_set_Y)

print("==================     Random Forest     ==================")
from sklearn.ensemble import RandomForestClassifier
modelTrainPredict(RandomForestClassifier(),train_prep,train_set_Y,test_prep,test_set_Y)

print("==================     Neural Network     ==================")
from sklearn.neural_network import MLPClassifier
modelTrainPredict(MLPClassifier(),train_prep,train_set_Y,test_prep,test_set_Y)


Confusion Matrix :
 [[7218   92]
 [ 723  205]]
Precision : 0.6902356902356902
Recall : 0.2209051724137931
F1 Score : 0.3346938775510204
.............................
y_true Counts :
   target  target
0  7310   0.8874 
1  928    0.1126 
None
pred Counts :
      0      1
0  7941 0.9639
1  297  0.0361
None
Confusion Matrix :
 [[6602  708]
 [ 629  299]]
Precision : 0.2969215491559086
Recall : 0.32219827586206895
F1 Score : 0.3090439276485788
.............................
y_true Counts :
   target  target
0  7310   0.8874 
1  928    0.1126 
None
pred Counts :
      0      1
0  7231 0.8778
1  1007 0.1222
None
Confusion Matrix :
 [[7098  212]
 [ 647  281]]
Precision : 0.5699797160243407
Recall : 0.30280172413793105
F1 Score : 0.39549612948627727
.............................
y_true Counts :
   target  target
0  7310   0.8874 
1  928    0.1126 
None
pred Counts :
      0      1
0  7745 0.9402
1  493  0.0598
None
Confusion Matrix :
 [[7035  275]
 [ 619  309]]
Precision : 0.5291095890410958
Reca

### Improve the Random Forest Model

* Uses random grid search to find the best hyperparameters using 5 fold CV 

In [4]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = [{'n_estimators': [100, 500]
                ,'max_features': [10]
                ,'max_depth':[8]
                ,'class_weight':[None,'balanced','balanced_subsample']
                }
                # ,{'n_estimators': [100, 200,300,400, 500,1000]
                # ,'max_features': ['auto','sqrt','log2',6,8,10]
                # ,'max_depth':[2,3,4,6,8]
                # ,'class_weight':[None,'balanced','balanced_subsample']                }
                #,{'bootstrap': [False], 'n_estimators': [100, 500], 'max_features': [2, 3, 4]}
                ,]

random_search = RandomizedSearchCV(RandomForestClassifier(),param_grid, cv=5,scoring='f1',return_train_score=True, n_iter=4)
random_search.fit(train_prep,train_set_Y)




RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=4,
                   param_distributions=[{'class_weight': [None, 'balanced',
                                                          'balanced_subsample'],
                                         'max_depth': [8], 'max_features': [10],
                                         'n_estimators': [100, 500]}],
                   return_train_score=True, scoring='f1')

### Hyperparameter search results

In [8]:
cvres = random_search.cv_results_
#cvres.keys()
out = pd.DataFrame(data = {"mean_fit_time":cvres["mean_fit_time"],"mean_test_score":cvres["mean_test_score"],"params":cvres["params"]})
out.sort_values(["mean_test_score"],ascending=False)


Unnamed: 0,mean_fit_time,mean_test_score,params
2,8.1867,0.4743,"{'n_estimators': 500, 'max_features': 10, 'max_depth': 8, 'class_weight': 'balanced'}"
3,2.0952,0.4735,"{'n_estimators': 100, 'max_features': 10, 'max_depth': 8, 'class_weight': 'balanced_subsample'}"
0,1.6671,0.4724,"{'n_estimators': 100, 'max_features': 10, 'max_depth': 8, 'class_weight': 'balanced'}"
1,8.4359,0.3244,"{'n_estimators': 500, 'max_features': 10, 'max_depth': 8, 'class_weight': None}"


### Predict on Test using the best estimator

* Better than the default parameters
* Higher F1 score but performance is not better than Decision Tree 



In [11]:
#RandomForestClassifier(class_weight='balanced', max_depth=8, max_features=10,n_estimators=500)
#random_search.best_index_
random_search.best_estimator_

pred = random_search.predict(test_prep)
classificationMetrics(test_set_Y,pred)
    



RandomForestClassifier(class_weight='balanced', max_depth=8, max_features=10,
                       n_estimators=500)

Confusion Matrix :
 [[6449  861]
 [ 337  591]]
Precision : 0.40702479338842973
Recall : 0.6368534482758621
F1 Score : 0.4966386554621849
.............................
y_true Counts :
   target  target
0  7310   0.8874 
1  928    0.1126 
None
pred Counts :
      0      1
0  6786 0.8237
1  1452 0.1763
None


## so far

1. Fit pipeline to train set and transform it, so that it can be fed to the models
2. (Tried various models - Go deeper to see why RF Classifier is not giving the best results )
3. Randomised Grid Search Cross Validation

## TO DO 
1. Create own metric to get higher values for True Positives
2. Discretize continuous
3. Treat some of the data prep steps as hyperparameters
4. Feature importance




In [12]:
x = full_pipeline.named_transformers_["ohe"]
# list(x.categories_)
# random_search.best_estimator_.feature_importances_

full_pipeline.get_feature_names()


AttributeError: Transformer oe (type OrdinalEncoder) does not provide get_feature_names.

In [None]:
%%script false --no-raise-error #avoid cell execution

### Cross Validation - Example
 
from sklearn.model_selection import cross_val_score
#sorted(sklearn.metrics.SCORERS.keys())
scores = cross_val_score(DecisionTreeClassifier(), train_prep, train_set_Y,scoring="f1", cv=10)
scores




In [2]:
%%script false --no-raise-error #avoid cell execution

#The imputer stores the median values in **statistics_** instance variable. We cannot be sure that there won’t be any missing values in new data after the system goes live, so it is safer to apply the imputer to all the numerical attributes
data = dp_ImputeNumericCols(data)

#Encode education as ordinal encoder and see if its inherent ordering property would lead to better results
cols = ["education"]
data = dp_EncodeOrdinalCols(data,cols,categories = [['illiterate','basic.4y','basic.6y','basic.9y','high.school','professional.course','university.degree','unknown']])

cols = data.select_dtypes(include="object").columns
enc = DataframeOneHotEncoder(cols)
data = enc.transform(data)    

standardiseCols = ['age', 'campaign', 'pdays', 'previous','emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m','nr.employed']
data = dp_StandardiseNumericCols(data,standardiseCols)
