XGBoost is one of the most powerful implementations of gradient boosting, in terms of:
- model performance
- execution speed
- interpretability: you can keep the interpretation of the problem and the model

In [1]:
# Part 1 -> Data Preprocessing
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the raw data set from the CSV file next to the notebook
df = pd.read_csv('Churn_Modelling.csv')

# Positional slicing: feature matrix = columns at positions 3..12,
# target vector = the last column. Both are pulled out as NumPy arrays.
X, y = (
    df.iloc[:, 3:13].values,
    df.iloc[:, -1].values,
)

# Encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Replace the string categories in columns 1 and 2 of X with integer codes
# (column 1 is the country column; column 2 is presumably gender, judging by
# the encoder's name -- TODO confirm against the CSV header).
label_encoder_X_geo = LabelEncoder()
X[:, 1] = label_encoder_X_geo.fit_transform(X[:, 1])
label_encoder_X_gen = LabelEncoder()
X[:, 2] = label_encoder_X_gen.fit_transform(X[:, 2])

# One-hot encode the country column (index 1).
# `OneHotEncoder(categorical_features=[1])` was deprecated in scikit-learn 0.20
# and removed in 0.22; ColumnTransformer is the supported replacement.
# The output layout matches the legacy behaviour: the dummy columns come first
# and the untouched columns follow in their original order.
onehotencoder = ColumnTransformer(
    transformers=[('country', OneHotEncoder(), [1])],
    remainder='passthrough',  # keep every other column as is
    sparse_threshold=0,       # always return a dense array (was `.toarray()`)
)
# X is an object array at this point, so cast to float to get the same numeric
# matrix the legacy encoder produced.
X = onehotencoder.fit_transform(X).astype(float)
# Removing one column to avoid dummy variable trap
X = X[:, 1:]

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Feature scaling is deliberately skipped: the notebook treats it as
# unnecessary for the model trained below.

If a LabelEncoder was used before this OneHotEncoder only to convert the categories to integers, that step is no longer needed: the OneHotEncoder can now be applied to the categories directly.


In [2]:
# Sanity check: (number of rows, number of columns) of the loaded DataFrame
df.shape

(10000, 14)

In [9]:
# Train an XGBoost classifier, with the library's default hyper-parameters,
# on the training split. The last expression displays the fitted estimator.
from xgboost import XGBClassifier

classifier = XGBClassifier()
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [8]:
import xgboost

In [11]:
# Predicting the result for the test set.
# `classifier` is the estimator fitted in the XGBoost cell; `y_pred` is
# consumed by the confusion-matrix cell that follows.
y_pred = classifier.predict(X_test)

In [12]:
# Confusion matrix of the test-set predictions
# (scikit-learn convention: rows = actual class, columns = predicted class).
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[1521,   74],
       [ 197,  208]])

In [13]:
# Test-set accuracy = correctly classified samples (the diagonal of the
# confusion matrix) divided by the total number of samples.
# Derived from `cm` instead of hand-copied counts so the figure stays correct
# when the model, the split or the data changes.
cm.trace() / cm.sum()

0.8645

In [14]:
# Applying k-fold cross-validation (k = 10) to the classifier on the training
# split; `accuracies` receives one score per fold.
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(
    estimator=classifier,
    X=X_train,
    y=y_train,
    cv=10,
)

In [15]:
# Display the per-fold scores produced by cross_val_score (one value per fold)
accuracies

array([0.86891386, 0.8576779 , 0.885     , 0.86625   , 0.85875   ,
       0.855     , 0.86625   , 0.85      , 0.8485607 , 0.87359199])

In [16]:
# Average score across the cross-validation folds
np.mean(accuracies)

0.8629994451163204

In [17]:
# Spread of the fold scores (NumPy's default population standard deviation)
np.std(accuracies)

0.010677872171663988