# Bank Marketing UCI Modeling

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_validate

### Import data

In [5]:
def read_data(path: str, files: list):
    """Load several comma-separated files into pandas DataFrames.

    Each entry of ``files`` is appended directly to ``path`` (plain string
    concatenation, so ``path`` must already end with a separator when it
    names a directory) and parsed with ``pd.read_csv``.

    Args:
        path: Prefix prepended to every file name.
        files: File names to load, in the order the frames are returned.

    Returns:
        A list with one DataFrame per entry of ``files``.
    """
    return [pd.read_csv(path + name, sep=',') for name in files]


# Relative prefix of the input data and the file name(s) to load from it.
path = '../data/'
files = ['bank_data.csv']
# read_data returns one frame per listed file; only one is listed, so keep the first.
bank_data = read_data(path, files)[0]
# Last expression of the cell: the first rows of the frame become the cell output.
bank_data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,ts_month
0,56,housemaid,married,basic.4y,0,0,0,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
1,57,services,married,high.school,unknown,0,0,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
2,37,services,married,high.school,0,1,0,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
3,40,admin.,married,basic.6y,0,0,0,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
4,56,services,married,high.school,0,0,1,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0


### Feature Engineering

In [None]:
def encode_categorical(features, data):
    """One-hot encode the given columns and return the widened frame.

    Each column named in ``features`` is removed from ``data`` and replaced by
    its dummy columns, appended on the right and named ``<feature>_<level>``.
    The first level of every feature is dropped (``drop_first=True``) and
    missing values get no indicator column. The input frame is not modified.

    A random preview of up to five rows of the result is shown as a side
    effect.

    Args:
        features: Iterable of column names to encode.
        data: DataFrame containing those columns.

    Returns:
        A new DataFrame with the encoded columns.
    """
    for feature in features:
        dummies = pd.get_dummies(data[feature], prefix=feature, prefix_sep='_',
                                 drop_first=True, dummy_na=False)
        data = pd.concat([data.drop(feature, axis=1), dummies], axis=1)

    # `display` only exists inside IPython/Jupyter; fall back to print so the
    # function also works when run as a plain script.
    try:
        show = display
    except NameError:
        show = print
    # sample(5) raises on frames with fewer than five rows, so cap the size.
    show(data.sample(min(5, len(data))))
    return data
        
        
# Columns handed to encode_categorical for one-hot encoding.
cat_features = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']
# Rebind bank_data to the encoded frame that encode_categorical returns.
bank_data = encode_categorical(cat_features, bank_data)

### Model Selection & HPO

* Train/valid/test split: the data is split by the `ts_month` feature
* Train: months 0-23
* Valid: month 24
* Test: month 25

In [6]:
class TimeSeriesCycle:
    """Time-ordered hyperparameter search, model comparison and prediction.

    The frame is split by an integer period column: periods up to
    ``train_border`` are used for tuning and training, ``valid_border`` for
    comparing the tuned models and ``test_border`` for the final predictions.

    Args:
        data: Frame holding the features, the period column and the target.
        models_with_params: Maps an estimator instance to a callable that
            takes an Optuna trial and returns the keyword arguments passed to
            ``estimator.set_params``.
        metric: Callable invoked as ``metric(y_true, y_pred)``.
        start_border: First period of the expanding-window cross-validation.
        train_border: Last period used for training during model comparison.
        valid_border: Period used to compare the tuned models.
        test_border: Period for which final predictions are produced.
        cv_step: Step, in periods, between successive validation folds.
        optuna_trials: Number of Optuna trials per model.
        time_col: Name of the period column. Defaults to the previously
            hard-coded ``'date_block_num'``.
        target_col: Name of the target column. Defaults to the previously
            hard-coded ``'item_cnt_month'``.
        clip_range: ``(low, high)`` bounds applied to every prediction, or
            ``None`` to leave predictions untouched. Defaults to the
            previously hard-coded ``(0, 20)``.
        direction: ``'minimize'`` (default) or ``'maximize'``; used both for
            the Optuna study and for picking the best model on validation.

    Raises:
        ValueError: If ``direction`` is not one of the two accepted values.
    """

    def __init__(self,
                 data: pd.DataFrame,
                 models_with_params: dict,
                 metric: object,
                 start_border: int,
                 train_border: int,
                 valid_border: int,
                 test_border: int,
                 cv_step: int,
                 optuna_trials: int,
                 time_col: str = 'date_block_num',
                 target_col: str = 'item_cnt_month',
                 clip_range: 'tuple | None' = (0, 20),
                 direction: str = 'minimize',
                 ):
        if direction not in ('minimize', 'maximize'):
            raise ValueError(
                "direction must be 'minimize' or 'maximize', got {!r}".format(direction)
            )
        self.data = data
        self.models_with_params = models_with_params
        self.metric = metric
        self.start_border = start_border
        self.train_border = train_border
        self.valid_border = valid_border
        self.test_border = test_border
        self.cv_step = cv_step
        self.optuna_trials = optuna_trials
        self.time_col = time_col
        self.target_col = target_col
        self.clip_range = clip_range
        self.direction = direction
        # (estimator, best_params) pairs, appended by evaluate_model.
        self.best_estimators = []
        # (estimator, params) after compare_models; the fitted estimator
        # itself after get_predictions.
        self.final_estimator = None

    def _split(self, mask):
        """Return ``(X, y)`` for the rows selected by the boolean ``mask``."""
        subset = self.data[mask]
        return subset.drop(columns=[self.target_col]), subset[self.target_col]

    def _predict(self, model, X):
        """Predict with ``model`` and apply ``clip_range`` when it is set."""
        preds = model.predict(X)
        if self.clip_range is None:
            return preds
        low, high = self.clip_range
        return preds.clip(low, high)

    def evaluate_model(self, model, params):
        """Tune ``model`` with Optuna and record its best parameters.

        Every trial scores the model on an expanding window: for each fold
        period ``i`` the model is fitted on all periods before ``i`` and
        scored on period ``i``; the trial value is the mean fold score.

        Args:
            model: Estimator exposing ``set_params``, ``fit`` and ``predict``.
            params: Callable mapping an Optuna trial to ``set_params`` kwargs.
        """
        # Imported here because the file's import cell does not provide them.
        import time

        import optuna

        period = self.data[self.time_col]

        def objective(trial):
            model.set_params(**params(trial))
            scores = []
            for i in range(self.start_border + self.cv_step, self.train_border + 1, self.cv_step):
                X_train, y_train = self._split(period < i)
                X_valid, y_valid = self._split(period == i)

                model.fit(X_train, y_train)
                preds = self._predict(model, X_valid)
                scores.append(self.metric(y_valid, preds))
            return np.mean(scores)

        start = time.time()
        study_name = 'study'
        study = optuna.create_study(study_name=study_name, direction=self.direction)
        study.optimize(objective, n_trials=self.optuna_trials, show_progress_bar=True, gc_after_trial=True)
        end = time.time()

        print('Best score: {}'.format(study.best_value))
        print('Best params: {}'.format(study.best_params))
        print('Taken time: {}'.format(int(end - start)))

        self.best_estimators.append((model, study.best_params))

    def compare_models(self):
        """Tune every configured model and keep the best one on validation.

        After tuning, each entry of ``best_estimators`` is refitted on all
        periods up to ``train_border`` and scored on ``valid_border``. The
        winning ``(estimator, params)`` pair is stored in ``final_estimator``.

        Raises:
            ValueError: If there is no tuned estimator to compare.
        """
        for model, params in self.models_with_params.items():
            print('Hyperparameters tuning for ' + str(type(model).__name__))
            self.evaluate_model(model, params)

        if not self.best_estimators:
            raise ValueError('No tuned estimators to compare')

        period = self.data[self.time_col]
        X_train, y_train = self._split(period <= self.train_border)
        X_valid, y_valid = self._split(period == self.valid_border)

        scores = []
        for model, params in self.best_estimators:
            model.set_params(**params)
            model.fit(X_train, y_train)
            preds = self._predict(model, X_valid)
            # Same (y_true, y_pred) order as in evaluate_model; asymmetric
            # metrics give different results when the arguments are swapped.
            score = self.metric(y_valid, preds)
            print('Validation set score for {} = '.format(type(model).__name__) + str(score))
            scores.append(score)

        pick_best = np.argmin if self.direction == 'minimize' else np.argmax
        self.final_estimator = self.best_estimators[pick_best(scores)]

    def get_predictions(self):
        """Refit the selected model and predict the test period.

        The model is fitted on all periods up to ``valid_border`` and then
        used to predict ``test_border``. Afterwards ``final_estimator`` holds
        the fitted estimator itself.

        Returns:
            Predictions for the rows of the test period.

        Raises:
            RuntimeError: If no final estimator has been selected yet.
        """
        import time

        start = time.time()

        if self.final_estimator is None:
            raise RuntimeError('No final estimator selected; run compare_models first')
        if isinstance(self.final_estimator, tuple):
            model, params = self.final_estimator
            model.set_params(**params)
        else:
            # A previous call already replaced the pair with the configured model.
            model = self.final_estimator
        print('Final model: ' + str(model))
        period = self.data[self.time_col]
        X, y = self._split(period <= self.valid_border)
        print('Fitting...')
        model.fit(X, y)
        X_test, _ = self._split(period == self.test_border)
        y_test = self._predict(model, X_test)

        end = time.time()
        print('Took ' + str(int(end - start)) + ' seconds to get final predictions')
        self.final_estimator = model
        return y_test

    def explain_model(self, model):
        """Draw a SHAP waterfall plot for the first row of the test period.

        Args:
            model: Fitted estimator handed to ``shap.Explainer``.
        """
        import shap

        X_test, _ = self._split(self.data[self.time_col] == self.test_border)
        explainer = shap.Explainer(model)
        shap_values = explainer(X_test)
        shap.plots.waterfall(shap_values[0])

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so none of it runs.
# NOTE(review): it references StratifiedKFold, which the imports visible in this
# file do not provide - add the import before re-enabling.
"""X = bank_data.drop(columns='y')
y = bank_data['y']

rf = RandomForestClassifier(n_estimators=500)

scoring = {'accuracy': 'accuracy', 'recall': 'recall'}
scores = cross_validate(rf, X, y, scoring=scoring, cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
                        return_train_score=True)
sorted(scores.keys())"""

In [None]:
# Disabled cell: a string literal, so nothing is printed. The code inside reads
# a `scores` mapping, which is only assigned inside the other disabled cell.
"""print('Train accuracy: ', scores['train_accuracy'])
print('Valid accuracy: ', scores['test_accuracy'])
print('Train recall: ', scores['train_recall'])
print('Valid recall: ', scores['test_recall'])"""

### Final model quality check