In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the read-only input directory,
# then print them one per line in the order os.walk yields them.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# IMPORTING LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras import callbacks
from keras.layers import Dense, BatchNormalization, Dropout, LSTM
from keras.models import Sequential
from keras.optimizers import SGD
from keras.utils import set_random_seed, to_categorical

# LOADING DATA

In [None]:
# Read the diabetes CSV from the Kaggle input mount and preview the first rows.
csv_path = "/kaggle/input/pima-indians-diabetes-database/diabetes.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the loaded frame.
data.info()

**About the data:**

The dataset contains 8 attributes (predictors) and a target variable (outcome). The attributes are:

- Pregnancies: number of times pregnant
- Glucose: plasma glucose concentration at 2 hours in an oral glucose tolerance test
- BloodPressure: diastolic blood pressure (mm Hg)
- SkinThickness: triceps skin fold thickness (mm)
- Insulin: 2-hour serum insulin (mu U/ml)
- BMI: body mass index (weight in kg/(height in m)^2)
- DiabetesPedigreeFunction: diabetes pedigree function
- Age: age in years

# DATA ANALYSIS

In [None]:
# Check the target distribution first to see whether the classes are imbalanced.
# `data` was already loaded in the LOADING DATA cell, so the CSV is not read again here.
cols = ["#6daa9f", "#774571"]  # two-colour palette, reused by the later Outcome plots
sns.countplot(x=data["Outcome"], palette=cols)

As the count plot above shows, **there is an imbalance in the data**: the two outcome classes are not equally represented.

In [None]:
# Examine the pairwise correlation matrix of all columns as an annotated heatmap.
corrmat = data.corr()
cmap = sns.diverging_palette(275, 150, s=40, l=65, n=9)
fig, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(corrmat, cmap=cmap, annot=True, square=True, ax=ax);

**Notable points:**
- SkinThickness is the first most important feature. The higher it is, the stronger the indication that subcutaneous fat is present in the body, which increases the risk of developing diabetes.
- BloodPressure is the second most important feature, as it is a known risk factor for insulin resistance, which increases the risk of developing diabetes.
- Glucose is the third most correlated feature, and is one of the key diagnostic criteria for the disease.

*Note: this ranking should be checked against the `Outcome` row of the correlation heatmap above.*


**Next, we will examine the count plot of age.**

In [None]:
# Evaluate the age distribution, with one bar per age value split by Outcome.
plt.figure(figsize=(20, 12))
# Column names are resolved against `data`, so the frame is passed only once.
age_ax = sns.countplot(data=data, x="Age", hue="Outcome", palette=cols)
age_ax.set_title("Distribution Of Age", color="#774571");

In [None]:
# For every predictor, overlay the individual observations (swarm plot) on a
# boxen plot of the same column, grouped by Outcome. One figure per predictor.
feature = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]
for column in feature:
    plt.figure(figsize=(30, 20))
    outcome, values = data["Outcome"], data[column]
    sns.swarmplot(x=outcome, y=values, color="black", alpha=0.5)
    sns.boxenplot(x=outcome, y=values, palette=cols)
    plt.show()

**Next, we examine the KDE plot of blood pressure against age, as both are significant features.**

In [None]:
# Bivariate density of BloodPressure against Age, drawn separately for each Outcome class.
sns.kdeplot(data=data, x="BloodPressure", y="Age", hue="Outcome", palette=cols)

In [None]:
# Summary statistics for each column, transposed so that every column of `data` becomes a row.
data.describe().T

# DATA PROCESSING

In [None]:
# Separate the label column (y) from the predictor columns (X).
y = data["Outcome"]
X = data.drop(columns=["Outcome"])

In [None]:
# Standardise every feature column and wrap the resulting array back into a
# DataFrame that carries the original column names, then summarise it.
col_names = list(X.columns)
s_scaler = preprocessing.StandardScaler()
X_df = pd.DataFrame(s_scaler.fit_transform(X), columns=col_names)
X_df.describe().T

In [None]:
# Boxen plot of every standardised feature side by side on one axis.
colours = ["#774571", "#b398af", "#f1f1f1", "#afcdc7", "#6daa9f"]
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxenplot(data=X_df, palette=colours, ax=ax)
plt.xticks(rotation=90)  # feature names are long; rotate them so they do not overlap
plt.show()

In [None]:
# Split the raw (unscaled) features first, then fit the scaler on the training rows only.
# Fitting the scaler on the whole dataset before splitting lets the test rows' mean and
# variance influence the training features (data leakage) and inflates the test scores.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=7)

split_scaler = StandardScaler()
X_train = pd.DataFrame(
    split_scaler.fit_transform(X_train_raw),  # statistics are learned from the training rows
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw),  # the test rows reuse the training statistics
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)

# MODEL BUILDING

In [None]:
# Seed the Python, NumPy and backend random generators so that weight initialisation,
# dropout masks and batch shuffling are repeatable from one run to the next.
set_random_seed(7)

# Stop training once the validation loss has stopped improving, and roll the model
# back to the best weights seen so far.
early_stopping = callbacks.EarlyStopping(
    monitor="val_loss",  # stated explicitly; this is also the Keras default
    min_delta=0.001,  # minimum amount of change to count as an improvement
    patience=20,  # how many epochs to wait before stopping
    restore_best_weights=True)

# Take the input width from the training data instead of hard-coding the column count.
n_features = X_train.shape[1]

# Initialising the network
model = Sequential()

# Layers: three shrinking ReLU blocks with dropout, then one sigmoid unit that
# outputs the probability of the positive class.
model.add(Dense(units = 16, kernel_initializer = 'uniform', activation = 'relu', input_dim = n_features))
model.add(Dense(units = 8, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dropout(0.25))
model.add(Dense(units = 4, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

# Compiling the network for binary classification
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Training the network; 20% of the training rows are held out for validation,
# which is what early stopping monitors.
history = model.fit(X_train, y_train, batch_size = 32, epochs = 500,callbacks=[early_stopping], validation_split=0.2)

In [None]:
# Report the validation accuracy at the epoch with the lowest validation loss.
# A plain mean over every epoch mixes in the early, barely-trained epochs and does
# not correspond to the accuracy of any single trained model.
best_epoch = int(np.argmin(history.history['val_loss']))
val_accuracy = history.history['val_accuracy'][best_epoch]
print("\n%s: %.2f%%" % ('val_accuracy', val_accuracy*100))

**Plotting training and validation loss over epochs**

In [None]:
# Training and validation loss, one point per epoch, drawn on the same axes.
history_df = pd.DataFrame(history.history)

loss_curves = (
    ("loss", "#6daa9f", "Training loss"),
    ("val_loss", "#774571", "Validation loss"),
)
for column, colour, label in loss_curves:
    plt.plot(history_df.loc[:, [column]], colour, label=label)

plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(loc="best")

plt.show()

**Plotting training and validation accuracy over epochs**

In [None]:
# Training and validation accuracy, one point per epoch, drawn on the same axes.
history_df = pd.DataFrame(history.history)

accuracy_curves = (
    ("accuracy", "#6daa9f", "Training accuracy"),
    ("val_accuracy", "#774571", "Validation accuracy"),
)
for column, colour, label in accuracy_curves:
    plt.plot(history_df.loc[:, [column]], colour, label=label)

plt.title('Training and Validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# CONCLUSIONS

In [None]:
# Predict the positive-class probability for every test row, then threshold at 0.5.
y_prob = model.predict(X_test)
# Flatten to a 1-D integer array so the predictions have the same form as y_test
# (a flat column of 0/1 labels) when they are passed to the metric functions.
y_pred = (y_prob > 0.5).astype(int).ravel()

In [None]:
# Confusion matrix on the test set, with each cell shown as a fraction of all test samples.
cf_matrix = confusion_matrix(y_test, y_pred)
cmap1 = sns.diverging_palette(275, 150, s=40, l=65, n=6)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(cf_matrix / cf_matrix.sum(), cmap=cmap1, annot=True, annot_kws={'size': 15}, ax=ax)

In [None]:
# Text report of the per-class metrics computed from the test labels and predictions.
report = classification_report(y_test, y_pred)
print(report)

# FINISHED
