In [1]:
import pickle

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
import xgboost as xgb
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_csv('predict-term-deposit-data.csv')

In [3]:
df.iloc[100]

Id                  1101
age                 44.0
job          blue-collar
marital          married
education      secondary
default               no
balance           -674.0
housing              yes
loan                  no
contact          unknown
day                    5
month                may
duration             257
campaign               1
pdays                 -1
previous               0
poutcome         unknown
y                     no
Name: 100, dtype: object

In [4]:
df.columns = df.columns.str.lower()

### converting `day` and `month` into `day_of_year`

In [5]:
df['day'] = df['day'].map(str)

In [6]:
month_mapping = {
    'jan': '1',
    'feb': '2',
    'mar': '3',
    'apr': '4',
    'may': '5',
    'jun': '6',
    'jul': '7', 
    'aug': '8', 
    'sep': '9',
    'oct': '10', 
    'nov': '11', 
    'dec': '12' 
}
df['month'] = df['month'].map(month_mapping)

In [7]:
df['date_formatted'] = pd.to_datetime(
    dict(         
        year='2055',
        month=df['month'], 
        day=df['day']
    )
)

In [8]:
df['day_of_year']=df['date_formatted'].dt.dayofyear

In [9]:
df = df.drop(columns = ['id','day','month','date_formatted'])

### Converting 'yes - no' columns into 1-0 columns

In [10]:
df.default = (df.default == 'yes').astype(int)
df.housing = (df.housing == 'yes').astype(int)
df.loan = (df.loan == 'yes').astype(int)
df.y = (df.y == 'yes').astype(int)

## 6. Splitting full dataset into train, validation and test datasets

In [11]:
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state = 11)

In [12]:
df_full_train = df_full_train.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

In [13]:
y_full_train = df_full_train.y.values
y_test = df_test.y.values

In [14]:
del df_full_train['y']
del df_test['y']

### Dealing with -1 values in `pdays`

In [15]:
df_test['pdays'] = df_test['pdays'].replace([-1], 999999999)
df_full_train['pdays'] = df_full_train['pdays'].replace([-1], 999999999)

In [16]:
df_full_train['age'] = df_full_train['age'].replace([-1, 999], df_full_train['age'].mode()[0])
df_test['age'] = df_test['age'].replace([-1, 999], df_test['age'].mode()[0])

### Dealing with Missing Values

In [17]:
df_full_train['age'] = df_full_train['age'].fillna(df_full_train['age'].mode()[0])
df_test['age'] = df_test['age'].fillna(df_test['age'].mode()[0])

df_full_train['balance'] = df_full_train['balance'].fillna(df_full_train['balance'].mode()[0])
df_test['balance'] = df_test['balance'].fillna(df_test['balance'].mode()[0])

In [18]:
cols_numerical = list(df_full_train.select_dtypes(include='number').columns)
cols_categorical = list(df_full_train.select_dtypes(exclude='number').columns)

## 7. Training

### 7.1 One hot encoding

## 10. Gradient boosting with XGBoost 

In [19]:
dv = DictVectorizer(sparse = False)

full_train_dict = df_full_train.to_dict(orient='records')
test_dict = df_test.to_dict(orient='records')


X_full_train = dv.fit_transform(full_train_dict)
X_test = dv.transform(test_dict)

features = dv.get_feature_names()
dfulltrain = xgb.DMatrix(X_full_train, label=y_full_train, feature_names=features)
dtest = xgb.DMatrix(X_test, feature_names=features)



In [20]:
xgb_params = {
    'eta': 0.1,    
    'max_depth': 6,
    'min_child_weight': 10,
    
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    #'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dfulltrain, num_boost_round=190)
y_pred = model.predict(dtest)
print('Result of Final Gradient Boosting Model:')
print(f'ROC AUC score: {roc_auc_score(y_test, y_pred).round(3)}')

Result of Final Gradient Boosting Model:
ROC AUC score: 0.941


In [34]:
output_file = 'model.bin'
with open(output_file,'wb') as f_out: 
    pickle.dump((dv, model), f_out)

Load the model

Important:
- go to `Kernel` and click on `Restart` to start a new kernel
- Then run the below cells

In [1]:
import pickle
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer


import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
import xgboost as xgb
from sklearn.metrics import roc_auc_score

In [2]:
model_file = 'model.bin'

In [3]:
with open(model_file, 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [4]:
dv, model

(DictVectorizer(sparse=False), <xgboost.core.Booster at 0x7efecb7b1ee0>)

In [5]:
customer = {
	'age': 44.0,
	'job': 'blue-collar',
	'marital': 'married',
	'education': 'secondary',
	'default': 'no',
	'balance': -674.0,
	'housing': 'yes',
	'loan': 'no',
	'contact': 'unknown',
	'day': 5,
	'month': 'may',
	'duration': 257,
	'campaign': 1,
	'pdays': -1,
	'previous': 0,
	'poutcome': 'unknown',
}

In [6]:
X = dv.transform([customer])
X

array([[  44., -674.,    1.,    0.,    0.,    1.,    0.,    0.,  257.,
           0.,    1.,    0.,    0.,    0.,    0.,    1.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    1.,    0.,   -1.,    0.,    0.,    0.,    1.,    0.]])

In [7]:
dtest = xgb.DMatrix(X, feature_names=dv.get_feature_names())
y_pred = model.predict(dtest)
y_pred



array([0.01549412], dtype=float32)

In [None]:
xg_reg.