In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
%matplotlib inline
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.tools as tls
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)

In [5]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [6]:
df = pd.read_csv('../input/pima-indians-diabetes-database/diabetes.csv')

In [7]:
df.head()

In [8]:
len(df)

In [9]:
df.isnull().sum()

In [10]:
df.info()

In [11]:
df.describe()

In [12]:
# Treat zeros in these measurement columns as missing values.
# NOTE(review): presumably 0 is a placeholder for "not recorded" in this dataset - confirm.
cols = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df[cols] = df[cols].replace(0, np.nan)

In [13]:
df.isnull().sum()

# EDA

In [14]:
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# Histogram of each of the first eight columns, laid out on a 4x2 grid.
plt.figure(figsize=(20, 40))
feature_names = df.columns
for position, feature in enumerate(feature_names[:8], start=1):
    plt.subplot(4, 2, position)
    plt.hist(df[feature]);
    plt.xlabel(feature)


# Outlier Detection

In [16]:
plt.figure(figsize=(20,10))
# One box per column; missing values are dropped column by column before plotting.
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
plt.boxplot([vals.dropna() for _, vals in df.items()]);
# Derive tick positions from the frame instead of hard-coding nine of them.
plt.xticks(range(1, len(df.columns) + 1), df.columns);

In [17]:
def treat_outliers(dataframe=None, columns=None, whisker=1.5):
    """Clip outliers to the box-plot whiskers, modifying the frame in place.

    For every selected column the bounds are ``Q1 - whisker * IQR`` and
    ``Q3 + whisker * IQR``; values outside are replaced by the nearest bound.
    Missing values are left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame, optional
        Frame to treat. Defaults to the notebook-level ``df`` so that the
        existing zero-argument call keeps working.
    columns : iterable of column labels, optional
        Columns to treat. Defaults to every column of ``dataframe``.
    whisker : float, default 1.5
        Multiple of the inter-quartile range used for the bounds.

    Returns
    -------
    None
    """
    if dataframe is None:
        dataframe = df
    if columns is None:
        columns = dataframe.columns

    for col in columns:
        Q1 = dataframe[col].quantile(0.25)
        Q3 = dataframe[col].quantile(0.75)
        IQR = Q3 - Q1
        Lower_Whisker = Q1 - whisker * IQR
        Upper_Whisker = Q3 + whisker * IQR
        # Replacing the whole column lets an integer column become float when
        # a bound is fractional, instead of writing a float into an int column
        # through .loc (deprecated silent upcasting in recent pandas).
        dataframe[col] = dataframe[col].clip(lower=Lower_Whisker, upper=Upper_Whisker)
    
    

In [18]:
plt.figure(figsize=(20,10))
# Clip outliers in df (in place), then redraw the per-column boxplots.
treat_outliers()
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
plt.boxplot([vals.dropna() for _, vals in df.items()]);
# Derive tick positions from the frame instead of hard-coding nine of them.
plt.xticks(range(1, len(df.columns) + 1), df.columns);

# Missing Values

In [19]:
import missingno
missingno.heatmap(df, figsize=(10,5), fontsize=12);

In [20]:
df.pivot_table(df, index=["Outcome"])

In [21]:
sns.heatmap(df.corr(), annot=True)

In [22]:
import plotly.figure_factory as ff
%matplotlib inline
# Insulin readings (missing values excluded) split by outcome class.
insulin_known = df[df['Insulin'].notnull()]
insulin_diabetic = insulin_known.loc[insulin_known['Outcome'] == 1, 'Insulin']
insulin_healthy = insulin_known.loc[insulin_known['Outcome'] == 0, 'Insulin']

hist_data = [insulin_diabetic, insulin_healthy]
group_labels = ['diabetic', 'healthy']
colors = ['#FFD700', '#7EC0EE']

# Overlaid histogram + KDE curve for the two groups.
fig = ff.create_distplot(
    hist_data,
    group_labels,
    colors=colors,
    show_hist=True,
    bin_size=0,
    curve_type='kde',
)
fig['layout'].update(title='Insulin')

py.iplot(fig, filename='Density plot')

In [23]:
pd.DataFrame(df.groupby('Outcome').median()).loc[:,['Glucose','BloodPressure','SkinThickness','Insulin','BMI']]

In [24]:
# Fill the remaining missing values column by column, using the median of the
# rows that share the same Outcome value (0 and 1 are handled separately).
# Columns without missing values are unaffected because the row mask is empty.
# NOTE(review): the imputed values depend on the target column, so information
# about Outcome flows into the features that are later cross-validated -
# confirm this is acceptable for the reported scores.
for col in df.columns:
    # Rows with Outcome == 0: median of that column among Outcome == 0 rows.
    df.loc[(df["Outcome"] == 0) & (df[col].isnull()), col] = df[df["Outcome"] == 0][col].median()
    # Rows with Outcome == 1: median of that column among Outcome == 1 rows.
    df.loc[(df["Outcome"] == 1) & (df[col].isnull()), col] = df[df["Outcome"] == 1][col].median()

In [25]:
df.isnull().sum()

In [26]:
plt.figure(figsize = (12,8))
sns.heatmap(df.corr(), annot=True)

# Feature Extraction

In [27]:
corr = df.corr()
plt.figure(figsize=(14,9))
sns.set(font_scale=1.4)
g = sns.heatmap(corr,annot=True,cmap="YlGnBu");

In [28]:
import plotly.express as px
%matplotlib inline
fig = px.scatter(df,y="Insulin",x="Glucose",color="Outcome",color_continuous_scale='geyser',title='Insulin vs Glucose',width=900, height=700)
fig.update_layout(
    font=dict(
        size=18,
    ))
fig.show()

In [29]:
fig = px.scatter(df,y="BMI",x="SkinThickness",color="Outcome",color_continuous_scale='geyser',title='BMI vs SkinThickness',width=900, height=700)
fig.update_layout(
    font=dict(
        size=18,
    ))
fig.show()

In [30]:
fig = px.scatter(df,y="BMI",x="BloodPressure",color="Outcome",color_continuous_scale='geyser',title='BMI vs BloodPressure',width=900, height=700)
fig.update_layout(
    font=dict(
        size=18,
    ))
fig.show()

In [31]:
df['IG'] = df['Insulin'] * df['Glucose']
df['BS'] = df['BMI'] * df['SkinThickness']
# df['AP'] = df['Pregnancies'] * df['Age']
# df['AB'] = df['Age'] * df['BloodPressure']
df['BB'] = df['BMI'] * df['BloodPressure']

In [32]:
plt.figure(figsize = (16,10))
sns.heatmap(df.corr(), annot=True)

In [33]:
from sklearn import preprocessing

In [34]:
def label_encoder(dataframe, binary_col):
    """Label-encode the given columns of ``dataframe`` in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are overwritten with integer codes.
    binary_col : iterable of column labels
        Columns to encode. An empty iterable leaves the frame unchanged.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in (not a copy).
    """
    # Iterate over the argument, not the notebook-level ``binary_cols``:
    # the original silently ignored the parameter.
    for column in binary_col:
        # fit_transform refits on every call, so one encoder per column is
        # equivalent to reusing a single instance.
        dataframe[column] = preprocessing.LabelEncoder().fit_transform(dataframe[column])
    return dataframe

def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """Return ``dataframe`` with ``categorical_cols`` expanded into dummy columns.

    Thin wrapper around ``pd.get_dummies``; ``drop_first`` is forwarded as is.
    The input frame is not modified.
    """
    return pd.get_dummies(
        dataframe,
        columns=categorical_cols,
        drop_first=drop_first,
    )

def standard_scalar(dataframe, num_cols):
    """Standardise ``num_cols`` and return them as a new frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the columns to scale.
    num_cols : iterable of column labels
        Columns to standardise with ``preprocessing.StandardScaler``.

    Returns
    -------
    pandas.DataFrame
        Scaled values with columns ``num_cols`` and the same index as the
        frame they were read from, so the result can be merged back by index.
        With no columns requested, an empty frame with that index is returned.
    """
    num_cols = list(num_cols)

    if all(column in dataframe.columns for column in num_cols):
        source = dataframe
    else:
        # Backward compatibility: the original ignored ``dataframe`` and always
        # read the notebook-level ``df``, and the notebook calls this after
        # dropping ``num_cols`` from the frame it passes. Keep that call working.
        source = df

    if not num_cols:
        return pd.DataFrame(index=source.index)

    scaled = preprocessing.StandardScaler().fit_transform(source[num_cols])
    # Keep the source index; the original returned a fresh 0..n-1 index, which
    # misaligns an index-based merge whenever the input index is not the default.
    return pd.DataFrame(scaled, columns=num_cols, index=source.index)

In [35]:
pre_data = df.copy()

In [36]:
#cat_cols = [col for col in df.columns if 12 >= len(df[col].unique()) > 2]

# Columns with fewer than 12 distinct values are treated as categorical.
unique_counts = df.nunique()
cat_cols = unique_counts[unique_counts < 12].index.tolist()
#pre_data = one_hot_encoder(df,cat_cols)


In [37]:
cat_cols

In [38]:
# Object-typed columns with exactly two distinct values.
binary_cols = [col for col in df.columns if df[col].dtypes == "O"
               and len(df[col].unique()) == 2]

# Encode the working copy rather than df: label_encoder writes into the frame
# it receives and returns that same object, so passing df would both mutate
# the source data and turn pre_data into an alias of df.
pre_data = label_encoder(pre_data, binary_cols)
target_col = ['Outcome']

In [39]:
# Low-cardinality columns that are neither binary-encoded nor the target;
# these are the ones expanded into dummy columns below.
multi_cols = [i for i in cat_cols if i not in binary_cols + target_col]

# One-hot encode them; get_dummies returns a new frame, which replaces pre_data.
pre_data = pd.get_dummies(data = pre_data,columns = multi_cols )

In [40]:

# Everything that is neither categorical nor the target is scaled.
num_cols   = [x for x in df.columns if x not in cat_cols + target_col]

# Scale while pre_data still contains the numeric columns, then replace the
# raw columns with the scaled ones (joined on the row index). Dropping first,
# as before, hands the scaler a frame that no longer has the columns to scale.
scaled = standard_scalar(pre_data, num_cols)
pre_data = pre_data.drop(columns = num_cols).merge(scaled,left_index=True,right_index=True,how = "left")


In [41]:
pre_data.head()

In [42]:
pre_data.columns

In [43]:
plt.figure(figsize = (16,10))
sns.heatmap(df.corr(), annot=True)

# Model Training

In [44]:
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier, export_graphviz, export_text
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgbm
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score

In [45]:
# y = pre_data["Outcome"]
# X = pre_data.drop(["Outcome"], axis=1)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=17)

In [46]:
y = pre_data["Outcome"]
X = pre_data.drop(["Outcome"], axis=1)

# Hold-out split used by the "test error" and XGBoost cells further down,
# which reference X_train / X_test / y_train / y_test. Same settings as the
# split that was previously commented out.
# NOTE(review): X_test is a subset of X, so any estimator tuned or fitted on
# the full X has already seen these rows - hold-out metrics for those
# estimators are optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=17)


In [47]:
def evaluate(model, test_features=None, test_labels=None):
    """Evaluate a classifier by cross-validation or on a hold-out set.

    Cross-validation mode (``evaluate(model)``):
        prints the confusion matrix of 5-fold out-of-fold predictions on the
        notebook-level ``X`` and ``y``, then fits ``model`` on ``X``/``y`` so
        that later cells can call ``predict`` on it. Returns ``None``.

    Hold-out mode (``evaluate(model, test_features, test_labels)``):
        ``model`` must already be fitted. Prints and returns its accuracy on
        the given data, rounded to 5 decimals. ``X`` and ``y`` are not used.

    Parameters
    ----------
    model : estimator with ``fit`` / ``predict``
    test_features, test_labels : array-like, optional
        Hold-out data; pass both or neither.

    Raises
    ------
    ValueError
        If only one of ``test_features`` / ``test_labels`` is given.
    """
    if (test_features is None) != (test_labels is None):
        raise ValueError("test_features and test_labels must be given together")

    if test_features is not None:
        predictions = model.predict(test_features)
        # Share of matching labels, computed with numpy only.
        matches = np.asarray(predictions) == np.asarray(test_labels)
        accuracy = round(float(np.mean(matches)), 5)
        print('Model Performance')
        print('Accuracy:', accuracy)
        return accuracy

    # Confusion matrix from out-of-fold predictions. cross_val_predict works
    # on clones of the estimator, so ``model`` itself is not fitted here.
    y_pred = cross_val_predict(model, X, y, cv=5)
    conf_matrix = confusion_matrix(y, y_pred)
    print(conf_matrix)

    # The original refitted ``model`` once per fold of a separate KFold loop
    # and used none of the results; the only lasting effect was a fitted
    # estimator. One fit on all of X/y keeps that effect without wasted fits.
    model.fit(X, y)
    
    
    

In [48]:
def scores_table(model, subtitle):
    """Print a table of 5-fold cross-validation scores for ``model``.

    One row per fold plus a ``mean`` and a ``std`` row; one column per metric
    (accuracy, precision, recall, f1, roc_auc). Scores are computed on the
    notebook-level ``X`` and ``y``.

    Parameters
    ----------
    model : estimator accepted by ``cross_val_score``
    subtitle : str
        Label printed above the table.

    Returns
    -------
    None
    """
    metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    # One array of fold scores per metric (the original reused the name
    # ``scores`` for both the metric list and each result).
    fold_scores = {
        metric: cross_val_score(model, X, y, cv = 5, scoring = metric)
        for metric in metric_names
    }
    table = pd.DataFrame(fold_scores, columns=metric_names)

    # Compute both summaries from the fold rows before appending either one.
    # Previously 'std' was taken after 'mean' had been added as a row, so the
    # mean counted as an extra observation and the deviation was too small.
    fold_mean = table.mean()
    fold_std = table.std()
    table.loc['mean'] = fold_mean
    table.loc['std'] = fold_std

    print(subtitle)
    print(table)

#     trace = go.Table(
#         header=dict(values=['<b>Fold', '<b>Accuracy', '<b>Precision', '<b>Recall', '<b>F1 score', '<b>Roc auc'],
#                     line = dict(color='#7D7F80'),
#                     fill = dict(color='#a1c3d1'),
#                     align = ['center'],
#                     font = dict(size = 15)),
#         cells=dict(values=[('1','2','3','4','5','6', '7', '8', '9', '10','mean', 'std'),
#                            np.round(df['accuracy'],3),
#                            np.round(df['precision'],3),
#                            np.round(df['recall'],3),
#                            np.round(df['f1'],3),
#                            np.round(df['roc_auc'],3)],
#                    line = dict(color='#7D7F80'),
#                    fill = dict(color='#EDFAFF'),
#                    align = ['center'], font = dict(size = 15)))

#     layout = dict(width=800, height=400, title = '<b>Cross Validation - 5 folds</b><br>'+subtitle, font = dict(size = 15))
#     fig = dict(data=[trace], layout=layout)

#     py.iplot(fig, filename = 'styled_table')


## Decision Tree Classifier

In [49]:
cart_model = DecisionTreeClassifier(random_state=17)

In [50]:
cart_params = {'max_depth': range(1, 11),
               "min_samples_split": [2, 3, 4]}

cart_cv = GridSearchCV(cart_model, cart_params, cv=10, n_jobs=-1, verbose=True)
cart_cv.fit(X,y)

In [51]:
cart_cv.best_params_

In [52]:
# Keep the seed used during the grid search so the tuned tree is reproducible.
cart_tuned = DecisionTreeClassifier(random_state=17, **cart_cv.best_params_)

In [53]:
evaluate(cart_tuned)

In [54]:
scores_table(cart_tuned, "Decision Tree")

## Random Forest Classifier

In [None]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
random_for = RandomForestClassifier(random_state=17)
grid_search = GridSearchCV(estimator = random_for, param_grid = param_grid, 
                          cv = 10, n_jobs = -1, verbose = 4)
grid_search.fit(X,y)

In [None]:
grid_search.best_params_

In [None]:
param_rf ={'bootstrap': True,
 'max_depth': 80,
 'max_features': 3,
 'min_samples_leaf': 5,
 'min_samples_split': 8,
 'n_estimators': 100}

In [None]:
#rf_tuned = RandomForestClassifier(**grid_search.best_params_)

In [None]:
# Keep the seed used during the grid search so the forest is reproducible.
rf_tuned = RandomForestClassifier(random_state=17, **param_rf)

In [None]:
evaluate(rf_tuned)

In [None]:
scores_table(rf_tuned, "Random Forest")

In [None]:
# test error
y_pred = rf_tuned.predict(X_test)
y_prob = rf_tuned.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred,digits=4))
roc_auc_score(y_test, y_prob)

In [None]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)

## KNN

In [None]:
param_grid = {
      'n_neighbors': np.arange(20,30)
}

knn_clf = KNeighborsClassifier()
grid_search_knn = GridSearchCV(estimator = knn_clf, param_grid = param_grid, 
                          cv = 10, n_jobs = -1, verbose = 4)
grid_search_knn.fit(X, y)

In [None]:
grid_search_knn.best_params_

In [None]:
knn_tuned = KNeighborsClassifier(**grid_search_knn.best_params_)

In [None]:
evaluate(knn_tuned)

In [None]:
scores_table(knn_tuned, "KNN")

In [None]:
# test error
y_pred_knn = knn_tuned.predict(X_test)
y_prob_knn = knn_tuned.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred_knn,digits=4))
roc_auc_score(y_test, y_prob_knn)

In [None]:
best_grid_knn = grid_search_knn.best_estimator_
grid_accuracy_knn = evaluate(best_grid_knn, X_test, y_test)

In [None]:
# plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay.from_estimator
# is its replacement (available from 1.0). Fall back for older installations.
if hasattr(RocCurveDisplay, "from_estimator"):
    RocCurveDisplay.from_estimator(knn_tuned, X_test, y_test)
else:
    plot_roc_curve(knn_tuned, X_test, y_test)
plt.show()

## LGBM + KNN

In [None]:
# fit_params{
# "eval_metric" : 'auc', 
#              "eval_set" : [(X_train, y_train)],
#              'eval_names': ['valid'],
#              'verbose': 0,
#              'categorical_feature': 'auto'
#              }

# param_test = {'learning_rate' : [0.01, 0.02, 0.03, 0.04, 0.05, 0.08, 0.1, 0.2, 0.3, 0.4],
#               'n_estimators' : [100, 200, 300, 400, 500, 600, 800, 1000, 1500, 2000],
#               'num_leaves': sp_randint(6, 50), 
#               'min_child_samples': sp_randint(100, 500), 
#               'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
#               'subsample': sp_uniform(loc=0.2, scale=0.8), 
#               'max_depth': [1, 2, 3, 4, 5, 6, 7],
#               'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
#               'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
#               'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]
#              }

#number of combinations
#n_iter = 240


In [None]:
class_weight = [None,'balanced']
boosting_type = ['gbdt']
num_leaves = [30,50,100,150] #list(range(30, 150)),
#learning_rate = list(np.logspace(np.log(0.005), np.log(0.2), base = np.exp(1), num = 10)) #1000
learning_rate = [0.02, 0.04, 0.06, 0.08]
max_bin = [500, 700, 1000]
param_test = dict(class_weight=class_weight, boosting_type=boosting_type, num_leaves=num_leaves, learning_rate =learning_rate, max_bin=max_bin)

In [None]:
# lgbm_clf = lgbm.LGBMClassifier(random_state=42, silent=True, metric='None', n_jobs=4)
# grid_search = RandomizedSearchCV(
#     estimator=lgbm_clf, param_distributions=param_test, 
#     n_iter=n_iter,
#     scoring='accuracy',
#     cv=5,
#     refit=True,
#     random_state=42,
#     verbose=True)

In [None]:
lgbm_clf = lgbm.LGBMClassifier(random_state=42, silent=True, metric='None', n_jobs=4)
grid_search = RandomizedSearchCV(
    estimator=lgbm_clf, param_distributions=param_test, 
    scoring='accuracy',
    cv=5,
    refit=True,
    random_state=42,
    verbose=True)

In [None]:
grid_search.fit(X, y)

In [None]:
opt_parameters =  grid_search.best_params_
opt_parameters

In [None]:
opt_parameters =  grid_search.best_params_
lgbm_tuned = lgbm.LGBMClassifier(**opt_parameters)

In [None]:
evaluate(lgbm_tuned)

In [None]:
scores_table(lgbm_tuned, "LGBM")

In [None]:
knn_clf = KNeighborsClassifier()

voting_clf = VotingClassifier(estimators=[ 
    ('lgbm_clf', lgbm_tuned),
    ('knn', KNeighborsClassifier())], voting='soft', weights = [1,1])

params = {
      'knn__n_neighbors': np.arange(1,30)
      }

In [None]:
grid_lgbmknn = GridSearchCV(estimator=voting_clf, param_grid=params, cv=10, verbose=4)

In [None]:
grid_lgbmknn.fit(X, y)

In [None]:
grid_lgbmknn.best_params_

In [None]:
knn_clf = KNeighborsClassifier(n_neighbors =27)

voting_clf = VotingClassifier (
        estimators = [('knn', knn_clf), ('lgbm', lgbm_tuned)],
                     voting='soft', weights = [1,1])

In [None]:
evaluate(voting_clf)

In [None]:
scores_table(voting_clf, "LGBM+KNN")

## ADABOOST

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300, 1000],
    'learning_rate' : [0.001, 0.01,0.1,0.2,0.5],
}

In [None]:
adaboost = AdaBoostClassifier(random_state=17)
grid_search_ada = GridSearchCV(estimator = adaboost, param_grid = param_grid, 
                          cv = 10, n_jobs = -1, verbose = 15)

In [None]:
grid_search_ada.fit(X, y)

In [None]:
grid_search_ada.best_params_

In [None]:
# Keep the seed used during the grid search so the tuned model is reproducible.
ada_tuned = AdaBoostClassifier(random_state=17, **grid_search_ada.best_params_)

In [None]:
evaluate(ada_tuned)

In [None]:
scores_table(ada_tuned, "ADABOOST")

## ADABOOST + LGBM

In [None]:
ada_clf = AdaBoostClassifier()

# Weight each member by its mean 5-fold cross-validated accuracy on X / y.
# The original called predict() on ada_clf before it had been fitted and took
# the weights from test-set accuracy; cross_val_score fits clones internally
# and needs no hold-out data.
voting_weights = [
    cross_val_score(lgbm_tuned, X, y, cv=5, scoring='accuracy').mean(),
    cross_val_score(ada_clf, X, y, cv=5, scoring='accuracy').mean(),
]

voting_clf2 = VotingClassifier(
    estimators=[('lgbm_clf', lgbm_tuned), ('adaboost', ada_clf)],
    voting='soft',
    weights=voting_weights,
)

# AdaBoost hyper-parameters searched inside the voting ensemble.
params = {
      'adaboost__n_estimators': [100, 200, 300, 1000],
      'adaboost__learning_rate' : [0.001, 0.01,0.1,0.2,0.5],
      }

In [None]:
grid_adalgbm = GridSearchCV(estimator=voting_clf2, param_grid=params, cv=10,verbose=4)

In [None]:
grid_adalgbm.fit(X, y)

In [None]:
grid_adalgbm.best_params_

In [None]:
#ada_tuned_lgbm = AdaBoostClassifier(learning_rate=0.01, n_estimators=1000)

voting_clf2 = VotingClassifier (
        estimators = [('adaboost', ada_tuned), ('lgbm', lgbm_tuned)],
                     voting='soft', weights = [0.872413, 0.898515])

In [None]:
evaluate(voting_clf2)

In [None]:
scores_table(voting_clf2, "LGBM + ADABOOST")

In [None]:
y_pred_adalgbm = voting_clf2.predict(X_test)
y_prob_adalgbm = voting_clf2.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred_adalgbm,digits=4))
roc_auc_score(y_test, y_prob_adalgbm)

In [None]:
best_grid_adalgbm = grid_adalgbm.best_estimator_
grid_accuracy_adalgbm = evaluate(best_grid_adalgbm, X_test, y_test)

In [None]:
grid_adalgbm.best_score_

## LGBM + XGBOOST

In [None]:
xgboost = XGBClassifier(random_state=17, min_child_weight = 5, gamma = 1.5, subsample= 1.0, max_depth = 5, use_label_encoder=False)

In [None]:
evaluate(xgboost)

In [None]:
scores_table(xgboost, "XGBOOST")

In [None]:
voting_clf3 = VotingClassifier (
        estimators = [('xgboost', xgboost), ('lgbm', lgbm_tuned)],
                     voting='soft', weights = [1,1])

In [None]:
evaluate(voting_clf3)

In [None]:
scores_table(voting_clf3, "XGBOOST + LGBM")

## SVM

In [None]:
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']} 
  
grid_svm = GridSearchCV(SVC(), param_grid, refit = True, verbose = 4)
  
# fitting the model for grid search
grid_svm.fit(X, y)

In [None]:
grid_svm.best_params_

In [None]:
svm_tuned = SVC(**grid_svm.best_params_)

In [None]:
evaluate(svm_tuned)

In [None]:
scores_table(svm_tuned, "SVM")

In [None]:
# test error
y_pred_svm = svm_tuned.predict(X_test)
#y_prob_svm = svm_tuned.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred_svm,digits=4))
#roc_auc_score(y_test, y_prob_svm)

In [None]:
best_grid_svm = grid_svm.best_estimator_
grid_accuracy_svm = evaluate(best_grid_svm, X_test, y_test)

## XGBOOST

In [None]:
param_grid = {
     "learning_rate"    : [0.01, 0.05, 0.10] ,
     "max_depth"        : [ 3, 5, 8],
     "min_child_weight" : [ 1, 3, 5,],
     "gamma"            : [1],
     "colsample_bytree" : [ 0.3]
}

In [None]:
# param_grid = {
#     'learning_rate': [0.01, 0.1],
#     'max_depth': [3, 5, 7, 10],
#     'min_child_weight': [1, 3, 5],
#     'subsample': [0.5, 0.7],
#     'colsample_bytree': [0.5, 0.7],
#     'n_estimators' : [100, 200, 500],
#     'objective': ['reg:squarederror']
# }

In [None]:
# param_grid = {
#     'learning_rate': [0.01, 0.1],
#     'max_depth': [3, 5],
#     'min_child_weight': [1, 3],
#     'subsample': [0.5, 0.7],
#     'colsample_bytree': [0.5, 0.7],
#     'objective': ['reg:squarederror']
# }

In [None]:
xgboost = XGBClassifier(random_state=17)
grid_search_xg = GridSearchCV(estimator = xgboost, param_grid = param_grid, 
                          cv = 10, n_jobs = -1, verbose = 4)
grid_search_xg.fit(X_train, y_train)

In [None]:
grid_search_xg.best_params_

In [None]:
xg_tuned = XGBClassifier(**grid_search_xg.best_params_).fit(X_train, y_train)

In [None]:
# test error
y_pred_xg = xg_tuned.predict(X_test)
y_prob_xg = xg_tuned.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred_xg,digits=4))
roc_auc_score(y_test, y_prob_xg)

In [None]:
best_grid_xg = grid_search_xg.best_estimator_
grid_accuracy_xg = evaluate(best_grid_xg, X_test, y_test)

In [None]:
confusion_matrix(y_test, y_pred_xg)

In [None]:
# plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay.from_estimator
# is its replacement (available from 1.0). Fall back for older installations.
if hasattr(RocCurveDisplay, "from_estimator"):
    RocCurveDisplay.from_estimator(xg_tuned, X_test, y_test)
else:
    plot_roc_curve(xg_tuned, X_test, y_test)
plt.show()