In [1]:
import pandas as pd
import numpy as np
import pylab as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn import metrics

In [3]:
import os
os.chdir(r'C:\Users\Vish\Documents\Data\Dataset')
df = pd.read_csv("diabetes.csv")

In [4]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
#Check class imbalance
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [6]:
#Splitting the data into independent and dependent variables
X = df.iloc[:,:8]
y = df['Outcome'] 

In [7]:
#Evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [8]:
#Model building
model = XGBClassifier()
#General Parameters
#booster:{"gbtree", "dart" ,"gblinear"} ,default= gbtree--type of model, gbtree equals tree-based models and gblinear equals linear models
#n_jobs:int, range=0-Inf, default=1--Number of parallel threads. When set to zero, then the optimal number of threads will be inferred automatically
#silent:int,default=0--Silent mode is activated is set to 1, i.e. no running messages will be printed.

#Booster Parameters
#n_estimators=int, range=0-Inf, default=100--The no of trees to be built
#eta:float , range=0-1, default=0.3--The learning rate of the model.
#gamma:int, range=0-Inf, default=0--A node is split only when the resulting split gives a positive reduction in the loss function. 
#Gamma specifies the minimum loss reduction required to make a split.
#max_depth:int, range=0-Inf, default=6--The maximum depth of a tree. Used to control over-fitting as higher depth will allow model to learn relations very specific to a particular sample
#min_child_weight:int, range=0-Inf, default=1--Defines the minimum sum of weights of all observations required in a child.
#Used to control over-fitting. Too high values can lead to under-fitting.
#max_leaf_nodes:int, range=0-Inf, default=Inf--The maximum number of terminal nodes or leaves in a tree.
#subsample:float, range=0-1, default=1--Denotes the fraction of observations to be randomly samples for each tree.
#Lower values make the algorithm more conservative and prevents overfitting but too small values might lead to under-fitting. Typical range is 0.5-1
#colsample_bytree:float, range=0-1, default=1--Denotes the fraction of columns to be randomly samples for each tree.Typical range is 0.5-1
#colsample_bylevel:float, range=0-1, default=1--Denotes the subsample ratio of columns for each split in each level
#colsample_bynode:float, range=0-1, default=1--Denotes the subsample ratio of columns for each split point.
#max_delta_step:float, range=0-Inf, default=0--The maximum delta step allowed for the weight estimation of each tree.
# If the value is set to 0, it means there is no constraint. If it is set to a positive value, it can help making the update step more conservative.
# Decreasing this hyperparameter reduces the likelihood of overfitting
#reg_lambda:int, range=0-Inf, default=1--L2 regularization term
#reg_alpha:int, range=0-Inf, default=0--L1 regularization term
#scale_pos_weight:float,range=0-Inf, default=1--A value greater than 0 should be used in case of high class imbalance as it helps in faster convergence.
#sampling_method:{"uniform","gradient_based"}, default="uniform"--The method to use to sample the training instances. If uniform
# is selected, each training instance has an equal probability of being selected. If gradient_based is selected, the selection probability for each training instance is proportional to the regularized absolute value of gradients
#tree_method:{"auto", "exact", "approx", "hist", "gpu_hist"}, default= auto--The tree construction algorithm used in XGBoost
#predictor:{"auto","cpu_predictor","gpu_predictor"}, default=auto--Provides the same results but allows the use of GPU or CPU.

#Learning Parameters
#objective: {"reg:squarederror","reg:squaredlogerror", "reg:logistic", "binary:logistic", "binary:logitraw", "binary:hinge",
#"multi:softmax","multi:softprob" } ,default= 'reg:linear','binary:logistic'-- This defines the loss function to be minimized
#eval_metric: {"rmse","rmsle","mae","logloss","error","merror","mlogloss","auc"}, default= 'rmse','error'--The metric to be used for validation data.
#seed:int, default=0--Can be used for generating reproducible results
model.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1.5, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [9]:
#Fit the classifier
#model = XGBRegressor()

In [10]:
#Predictions. The predict fuction converts probability values > .5 to 1 else 0
y_pred = model.predict(X_test)

In [12]:
#Evaluation
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("Recall: ", metrics.precision_score(y_test,y_pred))
print("F1 score: ", metrics.f1_score(y_test,y_pred))

Accuracy:  0.7489177489177489
Recall:  0.6142857142857143
F1 score:  0.5972222222222221


In [13]:
# Using our own threshold
# decisions = (model.predict_proba(X_test) >= 0.6).astype(int)
# y_pred=decisions[:,1]
# print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
# print("Recall: ", metrics.precision_score(y_test,y_pred))
# print("F1 score: ", metrics.f1_score(y_test,y_pred))

In [14]:
#AUC ROC

# Auc roc works on class probabilities. Although it can work on predictions of 0,1 as well but it works well with probabilities
# class probabilities are indepedent of threshold because first class probabilites are calculated and then threshold is used
#to predict 0 and 1. So auc will remain same for all threshold values

# generate class probabilities
# notice that 2 elements will be returned in probs array
probs = model.predict_proba(X_test)

In [15]:
# 1st element is probability for negative class,
# 2nd element gives probability for positive class
probs

array([[7.61757493e-02, 9.23824251e-01],
       [9.70411062e-01, 2.95889359e-02],
       [9.71625268e-01, 2.83747204e-02],
       [2.62779236e-01, 7.37220764e-01],
       [9.79701757e-01, 2.02982631e-02],
       [9.99315083e-01, 6.84945437e-04],
       [1.48052931e-01, 8.51947069e-01],
       [1.18364692e-02, 9.88163531e-01],
       [8.90477002e-01, 1.09522998e-01],
       [6.58192039e-01, 3.41807932e-01],
       [4.67805266e-02, 9.53219473e-01],
       [6.33091331e-02, 9.36690867e-01],
       [9.94742572e-01, 5.25741745e-03],
       [9.02732611e-01, 9.72673595e-02],
       [5.49686432e-01, 4.50313538e-01],
       [1.07227743e-01, 8.92772257e-01],
       [5.92291355e-03, 9.94077086e-01],
       [9.98471141e-01, 1.52888347e-03],
       [2.69970000e-01, 7.30030000e-01],
       [8.30653846e-01, 1.69346169e-01],
       [5.33286572e-01, 4.66713458e-01],
       [9.83711183e-01, 1.62888113e-02],
       [8.31155062e-01, 1.68844968e-01],
       [3.56995940e-01, 6.43004060e-01],
       [9.990526

In [16]:
#Probability for positive class
y_pred_prob = probs[:, 1]
y_pred_prob

array([9.23824251e-01, 2.95889359e-02, 2.83747204e-02, 7.37220764e-01,
       2.02982631e-02, 6.84945437e-04, 8.51947069e-01, 9.88163531e-01,
       1.09522998e-01, 3.41807932e-01, 9.53219473e-01, 9.36690867e-01,
       5.25741745e-03, 9.72673595e-02, 4.50313538e-01, 8.92772257e-01,
       9.94077086e-01, 1.52888347e-03, 7.30030000e-01, 1.69346169e-01,
       4.66713458e-01, 1.62888113e-02, 1.68844968e-01, 6.43004060e-01,
       9.47339169e-04, 1.00519270e-01, 1.68096647e-03, 8.96308362e-01,
       4.94609866e-03, 9.05864965e-03, 3.77114803e-01, 2.35506713e-01,
       4.94636362e-03, 9.06101823e-01, 9.85055417e-03, 9.89166439e-01,
       8.48109424e-01, 8.20163463e-04, 5.19757450e-01, 9.91708100e-01,
       5.46459675e-01, 2.97988649e-03, 3.25890891e-02, 9.72337902e-01,
       9.10901845e-01, 5.46590774e-04, 3.81328585e-03, 6.04359731e-02,
       6.09834611e-01, 1.49582803e-01, 3.00684512e-01, 2.68609654e-02,
       8.68643880e-01, 9.71611917e-01, 3.45323868e-02, 6.87754771e-04,
      

In [17]:
#AUC ROC curve
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
roc_auc = metrics.auc(fpr, tpr)
print("Area under the ROC curve : %f" % roc_auc)

# fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
# roc_auc = metrics.auc(fpr, tpr)
# print("Area under the ROC curve : %f" % roc_auc)

Area under the ROC curve : 0.803925


In [19]:
#Which Error is Costly??
i = np.arange(len(tpr)) # index for df
roc = pd.DataFrame({'fpr' : pd.Series(fpr, index=i),'tpr' : pd.Series(tpr,
index = i),'1-fpr' : pd.Series(1-fpr, index = i)})
print(roc)

         fpr       tpr     1-fpr
0   0.000000  0.000000  1.000000
1   0.000000  0.013514  1.000000
2   0.012739  0.013514  0.987261
3   0.012739  0.040541  0.987261
4   0.025478  0.040541  0.974522
..       ...       ...       ...
66  0.821656  0.972973  0.178344
67  0.821656  0.986486  0.178344
68  0.866242  0.986486  0.133758
69  0.866242  1.000000  0.133758
70  1.000000  1.000000  0.000000

[71 rows x 3 columns]


In [20]:
#Which Error is Costly??
#i = np.arange(len(tpr)) # index for df
roc_with_threshold = pd.DataFrame({'fpr' : pd.Series(fpr, index=thresholds),'tpr' : pd.Series(tpr,
index = thresholds),'1-fpr' : pd.Series(1-fpr, index = thresholds)})
print(roc_with_threshold)

               fpr       tpr     1-fpr
1.998250  0.000000  0.000000  1.000000
0.998250  0.000000  0.013514  1.000000
0.994369  0.012739  0.013514  0.987261
0.991980  0.012739  0.040541  0.987261
0.991125  0.025478  0.040541  0.974522
...            ...       ...       ...
0.002167  0.821656  0.972973  0.178344
0.001977  0.821656  0.986486  0.178344
0.000939  0.866242  0.986486  0.133758
0.000912  0.866242  1.000000  0.133758
0.000148  1.000000  1.000000  0.000000

[71 rows x 3 columns]
