In [227]:
#import libraries
import pandas as pd
import xgboost as xgboost
import numpy as np

In [228]:
#import dataset
dataset = pd.read_csv('./dataset/bank-full.csv',sep=';')

In [229]:
#dataset.describe()
dataset.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [230]:
dataset.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [231]:
#isolate the x and y variables
y = dataset.iloc[:, -1].values
X = dataset._get_numeric_data()


In [232]:
#Split dataset into traininig and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2,random_state=1502)

In [233]:
#Transform y factor variables
y_train = np.where(y_train == 'yes', 1, 0)
y_test = np.where(y_test == 'yes', 1, 0)
np.mean(y_train) #11.5% is our baseline, we need to beat this
np.mean(y_test) #12.17%

0.12175163109587527

In [234]:
#Create xgbosst matrices
Train = xgboost.DMatrix(X_train, label = y_train)
Test = xgboost.DMatrix(X_test, label = y_test)

In [235]:
#Set parameters
parameters1 = {
    'learning_rate': 0.3,
    'max_depth': 2,
    'colsample_bytree': 1,
    'subsample': 1,
    'min_child_weight': 1,
    'gamma': 0,
    'random_state': 1502,
    'eval_metric': 'auc', 
    'objective': 'binary:logistic'
}

In [236]:
#run XGBoost
model = xgboost.train(params = parameters1, 
                      dtrain = Train, 
                      num_boost_round = 200,
                      evals = [(Test, 'Test')],
                      verbose_eval = 50
                      )

[0]	Test-auc:0.75049


[50]	Test-auc:0.87633
[100]	Test-auc:0.88087
[150]	Test-auc:0.88275
[199]	Test-auc:0.88375


In [237]:
#Predictions 
predictions1 =  model.predict(Test)
predictions1 = np.where(predictions1 > 0.5, 1, 0)

In [238]:
#Confusion matrix
# see lesson 27 for more details
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
confusion_matrix1 = confusion_matrix(y_test, predictions1)
print(confusion_matrix1)
report1 = classification_report(y_test, predictions1)
print(report1)

[[7690  252]
 [ 736  365]]
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      7942
           1       0.59      0.33      0.42      1101

    accuracy                           0.89      9043
   macro avg       0.75      0.65      0.68      9043
weighted avg       0.87      0.89      0.88      9043



In [239]:
#########################################################
# Transforming categorical variables into dummy variables
# Isolate the categorical variables
dataset_categorical = dataset.select_dtypes(exclude = "number")

# Transform categorical variables into dummy variables
dataset_categorical = pd.get_dummies(dataset_categorical, drop_first = True)