# Bank Term Deposits - Data Preparation

## Import Packages and Load Data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.impute
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler,FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


from project_utils import *
import os
import requests
import zipfile
# %matplotlib inline 
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


# Show floats with four decimals in every DataFrame/Series display.
pd.options.display.float_format = '{:.4f}'.format

data = pd.read_csv('../data/bank-additional/bank-additional-full.csv', sep=';')

# NOTE(review): the return value is not assigned, so this presumably drops
# "duration" in place -- confirm against project_utils.dp_dropCols.
dp_dropCols(data, "duration")

# Convert the target to yes=1 / no=0 and rename the column.
# A single .map() assignment is used instead of chained indexing
# (data["y"][mask] = value): chained assignment triggers
# SettingWithCopyWarning and has no effect on `data` when pandas
# copy-on-write is active, which would leave 'yes'/'no' strings behind.
data["y"] = data["y"].map({"yes": 1, "no": 0})
data.rename(columns={"y": "target"}, inplace=True)
# Any label other than 'yes'/'no' becomes NaN above and fails loudly here.
data["target"] = data["target"].astype('int64')


# Stratified 80/20 split so both parts keep the class balance of the target.
train_set, test_set = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data["target"],
    shuffle=True,
)

# Compare the target distribution of the full data, the test part and the
# train part (each call is displayed because ast_node_interactivity is "all").
countsAndProportions(data["target"])
countsAndProportions(test_set["target"])
countsAndProportions(train_set["target"])

# Separate features (X) from the label (Y) for both splits.
train_set_X = train_set.drop("target", axis=1)
train_set_Y = train_set["target"]

test_set_X = test_set.drop("target", axis=1)
test_set_Y = test_set["target"]



In [None]:
%%script false --no-raise-error #avoid cell execution

# Original author's note: the imputer keeps the median values in its
# **statistics_** attribute. Missing values may still appear in new data once
# the system is live, so the imputer is applied to all numerical attributes.
data = dp_ImputeNumericCols(data)

# Ordinal-encode education to check whether its inherent ordering leads to
# better results; 'unknown' is placed last in the ordering.
education_order = [
    'illiterate',
    'basic.4y',
    'basic.6y',
    'basic.9y',
    'high.school',
    'professional.course',
    'university.degree',
    'unknown',
]
cols = ["education"]
data = dp_EncodeOrdinalCols(data, cols, categories=[education_order])

# One-hot encode whatever columns are still of object dtype.
cols = data.select_dtypes(include="object").columns
enc = DataframeOneHotEncoder(cols)
data = enc.transform(data)

# Standardise the numeric columns.
standardiseCols = [
    'age', 'campaign', 'pdays', 'previous',
    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
    'euribor3m', 'nr.employed',
]
data = dp_StandardiseNumericCols(data, standardiseCols)


In [None]:
# Column groups, one per preprocessing step of the pipeline below.
cols_OrdinalEncode = ["education"]
# One-hot encode the remaining text columns. "education" is still of object
# dtype at this point, so it is removed explicitly; otherwise it would be
# encoded twice (ordinal AND one-hot). Columns are taken from the frame the
# pipeline is actually fitted on.
cols_OHE = train_set_X.select_dtypes(include="object").columns.drop(cols_OrdinalEncode)
cols_log1p = ["age"]
cols_Numeric = ['campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx',
                'cons.conf.idx', 'euribor3m', 'nr.employed']


full_pipeline = ColumnTransformer(
    [
        # For OrdinalEncoder, the categories have to be a list of arrays
        # (one per column). There is only one column here.
        ('oe',
         OrdinalEncoder(categories=[['illiterate', 'basic.4y', 'basic.6y', 'basic.9y',
                                     'high.school', 'professional.course',
                                     'university.degree', 'unknown']]),
         cols_OrdinalEncode),
        # Categories unseen during fit are encoded as all zeros.
        ('ohe', OneHotEncoder(handle_unknown="ignore"), cols_OHE),
        ('logAge', FunctionTransformer(np.log1p), cols_log1p),
        ('std', StandardScaler(), cols_Numeric),
    ],
    # Always return a dense array. This replaces OneHotEncoder(sparse=False),
    # whose keyword was renamed to `sparse_output` in newer scikit-learn
    # releases, so the cell works on both old and new versions.
    sparse_threshold=0,
)

# Separate fit and transform, otherwise OHE was giving an error because one of
# the categories was missing in the test set.
full_pipeline.fit(train_set_X)
train_prep = full_pipeline.transform(train_set_X)


In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Imported here because the only other import of RandomForestClassifier sits
# in a cell that is skipped via `%%script false`; without it this cell raises
# NameError.
from sklearn.ensemble import RandomForestClassifier

# Wider grid kept for reference:
# param_grid = [{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},]
param_grid = [{'n_estimators': [5, 10]},]

# 5-fold cross-validated search over the forest size, scored by F1 on the
# positive class; train scores are kept so over-fitting can be inspected.
random_search = RandomizedSearchCV(
    RandomForestClassifier(),
    param_grid,
    cv=5,
    scoring='f1',
    return_train_score=True,
)
random_search.fit(train_prep, train_set_Y)






In [None]:
# Fitted one-hot encoder, kept in `x` for interactive inspection.
x = full_pipeline.named_transformers_["ohe"]
# list(x.categories_)
# random_search.best_estimator_.feature_importances_

# Build the output column names by hand, in the order the ColumnTransformer
# stacks its transformers. ColumnTransformer.get_feature_names() raises for
# transformers that do not provide get_feature_names (OrdinalEncoder,
# FunctionTransformer, StandardScaler) and was removed from later
# scikit-learn releases.
feature_names = []
for _step_name, _fitted, _columns in full_pipeline.transformers_:
    if isinstance(_fitted, OneHotEncoder):
        # One output column per (input column, learned category) pair.
        feature_names.extend(
            f"{col}_{cat}"
            for col, cats in zip(_columns, _fitted.categories_)
            for cat in cats
        )
    elif not isinstance(_fitted, str):
        # The other transformers emit exactly one column per input column.
        # String entries ('drop' / 'passthrough') describe the remainder and
        # contribute no named output here.
        feature_names.extend(_columns)

feature_names


In [None]:
%%script false --no-raise-error #avoid cell execution

### Baseline models
# The models are trained on the preprocessed training matrix, so the test
# features must go through the same (already fitted) pipeline. The raw
# test_set_X still has text columns and a different column layout.
# NOTE(review): assumes modelTrainPredict fits on the first X/Y pair and
# predicts on the second -- confirm against project_utils.
test_prep = full_pipeline.transform(test_set_X)

from sklearn.linear_model import LogisticRegression
modelTrainPredict(LogisticRegression(), train_prep, train_set_Y, test_prep, test_set_Y)

from sklearn.tree import DecisionTreeClassifier
modelTrainPredict(DecisionTreeClassifier(), train_prep, train_set_Y, test_prep, test_set_Y)

from sklearn.ensemble import RandomForestClassifier
modelTrainPredict(RandomForestClassifier(), train_prep, train_set_Y, test_prep, test_set_Y)

from sklearn.neural_network import MLPClassifier
modelTrainPredict(MLPClassifier(), train_prep, train_set_Y, test_prep, test_set_Y)

### Cross Validation - Example

from sklearn.model_selection import cross_val_score
#sorted(sklearn.metrics.SCORERS.keys())
# 10-fold cross-validated F1 of a default decision tree on the training data.
scores = cross_val_score(DecisionTreeClassifier(), train_prep, train_set_Y, scoring="f1", cv=10)
scores


