In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.metrics import roc_auc_score,confusion_matrix,accuracy_score 

In [2]:
df=pd.read_csv('diabetes.csv')
df = df.sample(frac=1,random_state=4).reset_index(drop=True)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,100,66,29,196,32.0,0.444,42,0
1,9,57,80,37,0,32.8,0.096,41,0
2,0,100,70,26,50,30.8,0.597,21,0
3,1,119,88,41,170,45.3,0.507,26,0
4,2,102,86,36,120,45.5,0.127,23,1


In [3]:
y=df['Outcome']
X=df.drop('Outcome',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### AdaBoost Classifier

In [4]:
from sklearn.ensemble import AdaBoostClassifier

In [5]:
adac = AdaBoostClassifier(random_state=4)

adac.fit(X_train,y_train)

AdaBoostClassifier(random_state=4)

In [6]:
y_train_pred=adac.predict(X_train)
y_train_prob=adac.predict_proba(X_train)[:,1]

print('Train')
print('Confusion Matrix:\n',confusion_matrix(y_train,y_train_pred))
print('\nAccuracy Score:',accuracy_score(y_train,y_train_pred))
print('AUC:',roc_auc_score(y_train,y_train_prob))


y_test_pred=adac.predict(X_test)
y_test_prob=adac.predict_proba(X_test)[:,1]

print('\n\nTest')
print('Confusion Matrix:\n',confusion_matrix(y_test,y_test_pred))
print('\nAccuracy Score:',accuracy_score(y_test,y_test_pred))
print('AUC:',roc_auc_score(y_test,y_test_prob))

Train
Confusion Matrix:
 [[320  29]
 [ 49 139]]

Accuracy Score: 0.8547486033519553
AUC: 0.9358425288057063


Test
Confusion Matrix:
 [[128  23]
 [ 40  40]]

Accuracy Score: 0.7272727272727273
AUC: 0.7894867549668874


## LightGBM

In [7]:
!pip install lightgbm



In [8]:
import lightgbm as lgb

In [9]:
lgbmc= lgb.LGBMClassifier(random_state=4)

lgbmc.fit(X_train,y_train)

LGBMClassifier(random_state=4)

In [10]:
y_train_pred=lgbmc.predict(X_train)
y_train_prob=lgbmc.predict_proba(X_train)[:,1]

print('Train')
print('Confusion Matrix:\n',confusion_matrix(y_train,y_train_pred))
print('\nAccuracy Score:',accuracy_score(y_train,y_train_pred))
print('AUC:',roc_auc_score(y_train,y_train_prob))


y_test_pred=lgbmc.predict(X_test)
y_test_prob=lgbmc.predict_proba(X_test)[:,1]

print('\n\nTest')
print('Confusion Matrix:\n',confusion_matrix(y_test,y_test_pred))
print('\nAccuracy Score:',accuracy_score(y_test,y_test_pred))
print('AUC:',roc_auc_score(y_test,y_test_prob))

Train
Confusion Matrix:
 [[349   0]
 [  0 188]]

Accuracy Score: 1.0
AUC: 1.0


Test
Confusion Matrix:
 [[120  31]
 [ 35  45]]

Accuracy Score: 0.7142857142857143
AUC: 0.7836092715231788


### Hyperparameter Tuning LightGBM

In [11]:
from sklearn.model_selection import RandomizedSearchCV

In [14]:
lgbmc = lgb.LGBMClassifier(random_state=4)

param_dist = {'n_estimators':sp_randint(50,250),
             'max_depth':sp_randint(1,15),
             'learning_rate':sp_uniform(0,0.5)}

rsearch = RandomizedSearchCV(lgbmc,param_distributions=param_dist,cv=3,n_iter=100,
                             scoring='roc_auc',random_state=4,n_jobs=-1)

rsearch.fit(X,y)
rsearch.best_params_

{'learning_rate': 0.1822765876628779, 'max_depth': 1, 'n_estimators': 57}

In [17]:
lgbmc = lgb.LGBMClassifier(**rsearch.best_params_,importance_type='gain',random_state=4)
lgbmc.fit(X_train,y_train)

y_train_pred=lgbmc.predict(X_train)
y_train_prob=lgbmc.predict_proba(X_train)[:,1]

print('Train')
print('Confusion Matrix:\n',confusion_matrix(y_train,y_train_pred))
print('\nAccuracy Score:',accuracy_score(y_train,y_train_pred))
print('AUC:',roc_auc_score(y_train,y_train_prob))


y_test_pred=lgbmc.predict(X_test)
y_test_prob=lgbmc.predict_proba(X_test)[:,1]

print('\n\nTest')
print('Confusion Matrix:\n',confusion_matrix(y_test,y_test_pred))
print('\nAccuracy Score:',accuracy_score(y_test,y_test_pred))
print('AUC:',roc_auc_score(y_test,y_test_prob))

Train
Confusion Matrix:
 [[318  31]
 [ 62 126]]

Accuracy Score: 0.8268156424581006
AUC: 0.8987532768396025


Test
Confusion Matrix:
 [[129  22]
 [ 41  39]]

Accuracy Score: 0.7272727272727273
AUC: 0.7923013245033113


In [18]:
pd.DataFrame(lgbmc.feature_importances_,X.columns,['Importance']).sort_values(by='Importance',ascending=False)

Unnamed: 0,Importance
Glucose,514.958633
BMI,140.735138
Age,99.068299
Pregnancies,29.983
DiabetesPedigreeFunction,29.7579
BloodPressure,9.57662
SkinThickness,0.0
Insulin,0.0


In [1]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.7.0-py3-none-any.whl (167 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.7.0 imblearn-0.0
