In [None]:
import pandas as pd
import numpy as np

# Plots
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.tools as tls
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)
import squarify

from scipy import stats

# Data processing, metrics and modeling
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_curve, precision_recall_curve, accuracy_score, classification_report, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict

import tensorflow as tf
from tensorflow.keras import models, layers, optimizers, losses, callbacks
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU, Dropout, Flatten, Input, Activation, PReLU, LeakyReLU, ThresholdedReLU
import keras.backend as K

import xgboost
import lightgbm as lgbm

#ignore warning messages 
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the Pima Indians diabetes CSV from the Kaggle-style input directory.
DIABETES_CSV_PATH = '../input/pima-indians-diabetes-database/diabetes.csv'
data = pd.read_csv(DIABETES_CSV_PATH)

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# on its own; passing its return value to display() would render a stray "None".
data.info()
display(data.head())

In [None]:
# Outlier removal followed by mean imputation of zero-coded measurements.
#
# NOTE(review): the zero placeholders are still present while the z-scores and
# quartiles below are computed, so they influence both filters — confirm that
# this ordering is intended.
# NOTE(review): the imputation means are computed on the full frame, before any
# train/test split — confirm this leakage is acceptable for the analysis.

# Z score: keep rows whose every column lies within 3 standard deviations.
z = np.abs(stats.zscore(data))
data = data[(z < 3).all(axis=1)]

# IQR: drop rows with any value outside the 1.5 * IQR fences.
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1
outside_fences = (data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))
# .copy() makes the filtered result an independent frame so the column
# assignments below do not trigger SettingWithCopyWarning.
data = data[~outside_fences.any(axis=1)].copy()

# In these columns a 0 is treated as a missing value: convert it to NaN
# (np.nan — the np.NaN alias was removed in NumPy 2.0) and fill with the
# column mean. 'Age' is imputed too, so no NaN created here is left behind.
zero_as_missing_cols = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI', 'Age']
data[zero_as_missing_cols] = data[zero_as_missing_cols].replace(0, np.nan)
data[zero_as_missing_cols] = data[zero_as_missing_cols].fillna(data[zero_as_missing_cols].mean())

In [None]:
# Re-check dtypes and non-null counts after the outlier filtering and imputation.
data.info()

In [None]:
# Disabled alternative to imputation: keep only rows whose measurements are > 0
# data = data[data['Insulin'] > 0]
# data = data[data['Glucose'] > 0]
# data = data[data['BloodPressure'] > 0]
# data = data[data['SkinThickness'] > 0]
# data = data[data['BMI'] > 0]
# display(data.info(),data.head())

In [None]:
# Encode low-cardinality columns and min-max scale the numerical ones.
#
# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split, so test rows influence the scaling range — confirm this is acceptable.

# Fresh 0..n-1 index so the scaled frame built below lines up row-for-row.
data = data.reset_index(drop=True)

target_col = ["Outcome"]

# Distinct-value count per column, computed once and reused for every grouping.
unique_counts = data.nunique()

# Columns with fewer than 12 distinct values are treated as categorical.
cat_cols   = unique_counts[unique_counts < 12].keys().tolist()
# Numerical columns: everything that is neither categorical nor the target.
num_cols   = [x for x in data.columns if x not in cat_cols + target_col]
# Binary columns with exactly 2 values.
bin_cols   = unique_counts[unique_counts == 2].keys().tolist()
# Categorical columns with more than 2 values.
multi_cols = [i for i in cat_cols if i not in bin_cols]

# Label-encode the binary columns.
le = LabelEncoder()
for i in bin_cols:
    data[i] = le.fit_transform(data[i])

# One-hot encode the multi-valued categorical columns.
data = pd.get_dummies(data=data, columns=multi_cols)

# Scale the numerical columns to [0, 1]. The name `std` is kept for any later
# cell that refers to it, although the scaler is a MinMaxScaler.
std = MinMaxScaler()
scaled = pd.DataFrame(
    std.fit_transform(data[num_cols]),
    columns=num_cols,
    index=data.index,  # explicit alignment instead of relying on a default RangeIndex
)

# Keep an unscaled copy, then swap the original numerical columns for the scaled ones.
df_data_og = data.copy()
data = data.drop(columns=num_cols)
data = data.merge(scaled, left_index=True, right_index=True, how="left")

In [None]:
# Preview the frame after encoding and min-max scaling.
data.head()

In [None]:
# Split the frame into the feature matrix X and the target vector y.
# `columns=` is used because passing the axis positionally (`drop('Outcome', 1)`)
# raises a TypeError on pandas >= 2.0.
X = data.drop(columns='Outcome')
y = data['Outcome']

In [None]:
# Hold out 20% of the rows for testing. The fixed seed makes the split (and every
# score computed from it) reproducible across kernel restarts; stratifying on y
# keeps the class ratio the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234, stratify=y
)

In [None]:
# Feed-forward binary classifier: two PReLU-activated hidden layers (128 and 64
# units) followed by a single sigmoid output unit.
model = models.Sequential(name="pimamodel")
network_layers = [
    Input(shape=[X.shape[1]], name="Features"),
    Dense(128),
    PReLU(),
    Dense(64),
    PReLU(),
    Dense(1, activation='sigmoid'),
]
for network_layer in network_layers:
    model.add(network_layer)

model.compile(
    optimizer=optimizers.Adam(),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Print the model's layer-by-layer summary.
model.summary()

In [None]:
# Checkpoint callback: write the model to disk only when validation accuracy improves.
mc = tf.keras.callbacks.ModelCheckpoint(
    filepath='best_model_nn.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Train for 50 epochs, holding out 10% of the training rows for validation.
history_model = model.fit(
    x=X_train,
    y=y_train,
    validation_split=0.1,
    epochs=50,
    batch_size=32,
    callbacks=[mc],
    verbose=1,
)

In [None]:
def score_custom_network(y_train, y_test, features=None, network=None,
                         weights_path='best_model_nn.h5', threshold=0.5):
    """Load checkpointed weights into the network and print test-set metrics.

    Args:
        y_train: Unused; kept so existing two-argument calls continue to work.
        y_test: True labels for the rows in ``features``.
        features: Inputs to predict on. Defaults to the notebook-level ``X_test``.
        network: Model to score. Defaults to the notebook-level ``model``.
        weights_path: Weights file loaded into ``network`` before predicting.
        threshold: Predicted values strictly above this are labelled 1, else 0.

    Side effects:
        Overwrites the weights of ``network`` with those stored at
        ``weights_path`` and prints the classification report, accuracy,
        precision, recall and F1 score.
    """
    # Fall back to the notebook globals so the original call signature still works.
    if features is None:
        features = X_test
    if network is None:
        network = model

    print("=====> Scoring Custom Network <=====")

    network.load_weights(weights_path)
    # predict() output is flattened to 1-D before thresholding.
    probabilities = network.predict(features).flatten()
    y_pred = np.where(probabilities > threshold, 1, 0)

    print (classification_report(y_test, y_pred, digits=4))
    print ("Accuracy:", accuracy_score(y_test, y_pred))
    print ("Precision", precision_score(y_test, y_pred))
    print ("Recall:", recall_score(y_test, y_pred))
    print ("F1 Score:", f1_score(y_test, y_pred))

score_custom_network(y_train, y_test)

In [None]:
# Logistic regression: fit on the training split, report metrics on the test split.
lreg = LogisticRegression(max_iter=5000, random_state=1234)
preds = lreg.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, preds, digits=4))
for metric_name, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_name, metric_fn(y_test, preds))

In [None]:
# XGBoost classifier with default hyper-parameters and a fixed seed.
xgb_model = xgboost.XGBClassifier(random_state=1234)
xgb_model.fit(X_train, y_train)
preds = xgb_model.predict(X_test)

print(classification_report(y_test, preds, digits=4))
for metric_name, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_name, metric_fn(y_test, preds))

In [None]:
# Support-vector classifier (C=100, gamma='scale') scored on the test split.
svm = SVC(gamma='scale', C=100, random_state=1234)
preds = svm.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, preds, digits=4))
for metric_name, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_name, metric_fn(y_test, preds))

In [None]:
# k-nearest-neighbours classifier using 25 neighbours, scored on the test split.
knn = KNeighborsClassifier(n_neighbors=25)
preds = knn.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, preds, digits=4))
for metric_name, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_name, metric_fn(y_test, preds))

In [None]:
# Gaussian naive Bayes with default settings, scored on the test split.
gnb = GaussianNB()
preds = gnb.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, preds, digits=4))
for metric_name, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_name, metric_fn(y_test, preds))