In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [3]:
import matplotlib.pylab as plt

In [21]:
df = pd.read_csv('diabetes.csv')

In [22]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [24]:
var_columns = [c for c in df.columns if c not in ['Outcome']]

In [25]:
var_columns

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

In [26]:
X = df.loc[:,var_columns]
y = df.loc[:,'Outcome']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

In [29]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((614, 8), (154, 8), (614,), (154,))

In [31]:
model_gbm = GradientBoostingClassifier(n_estimators=5000, learning_rate=0.05, max_depth=3, subsample=0.5, validation_fraction=0.1, n_iter_no_change=20, max_features='log2', verbose=1)

In [32]:
model_gbm.fit(X_train,y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.2809           0.0132            7.50m
         2           1.2356           0.0281            3.96m
         3           1.1977           0.0215            2.78m
         4           1.1548           0.0166            2.19m
         5           1.1441           0.0172            1.75m
         6           1.1266           0.0212            1.53m
         7           1.1064           0.0147            1.37m
         8           1.0392           0.0130            1.20m
         9           1.1200           0.0108            1.11m
        10           1.0465           0.0085            1.04m
        20           0.9325           0.0046           36.11s
        30           0.8664           0.0033           26.51s
        40           0.8150          -0.0001           21.70s
        50           0.7824           0.0012           19.31s
        60           0.7462          -0.0007           17.78s
       

GradientBoostingClassifier(learning_rate=0.05, max_features='log2',
                           n_estimators=5000, n_iter_no_change=20,
                           subsample=0.5, verbose=1)

In [33]:
len(model_gbm)

106

In [37]:
y_train_pred = model_gbm.predict_proba(X_train)[:,1]
y_test_pred = model_gbm.predict_proba(X_test)[:,1]

In [38]:
roc_auc_score(y_train, y_train_pred)

0.9389256547451421

In [39]:
roc_auc_score(y_test,y_test_pred)

0.814868804664723

In [40]:
pd.DataFrame({'Variable_Name': var_columns,
             'Importance': model_gbm.feature_importances_}).sort_values('Importance', ascending=False)

Unnamed: 0,Variable_Name,Importance
1,Glucose,0.345314
5,BMI,0.178201
7,Age,0.141629
6,DiabetesPedigreeFunction,0.10591
4,Insulin,0.081607
0,Pregnancies,0.059397
3,SkinThickness,0.051096
2,BloodPressure,0.036846
