In [148]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from keras import models, layers

In [149]:
## Load the diabetes prediction dataset from the CSV file
df = pd.read_csv(filepath_or_buffer='diabetes_prediction_dataset.csv')

In [150]:
## Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


## Data Pre-Processing


### Vectorization

In [151]:
## List the distinct values present in the gender column
pd.unique(df['gender'])

array(['Female', 'Male', 'Other'], dtype=object)

In [152]:
## Encode gender as an integer code: Female -> 0, Male -> 1, anything else -> 2.
# Guarded so the cell is safe to re-run: once the column is numeric, the
# string comparisons would never match and every row would be recoded to 2.
if not pd.api.types.is_numeric_dtype(df['gender']):
    gender_codes = {'Female': 0, 'Male': 1}
    # Values absent from the dict come back from .map as NaN and fall through to 2
    df['gender'] = df['gender'].map(gender_codes).fillna(2).astype('int64')

In [153]:
## Verify the gender column now holds only the integer codes
df.loc[:, 'gender'].unique()

array([0, 1, 2], dtype=int64)

In [154]:
## List the distinct values present in the smoking_history column

pd.unique(df['smoking_history'])

array(['never', 'No Info', 'current', 'former', 'ever', 'not current'],
      dtype=object)

In [155]:
## Map each smoking-history category to an integer code
hist = {'never':0, 'No Info':1, 'current':2, 'former':3, 'ever':4, 'not current':5}

# Guarded so the cell is safe to re-run: mapping the already-encoded numeric
# column through `hist` would match no key and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['smoking_history']):
    df['smoking_history'] = df['smoking_history'].map(hist)

In [156]:
## Verify the smoking_history column now holds only the integer codes
df.loc[:, 'smoking_history'].unique()

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [157]:
## Show the dtype of every column
df.dtypes

gender                   int64
age                    float64
hypertension             int64
heart_disease            int64
smoking_history          int64
bmi                    float64
HbA1c_level            float64
blood_glucose_level      int64
diabetes                 int64
dtype: object

In [158]:
## Every column is numeric at this point, so cast the whole frame to 32-bit floats

df = df.astype('float32')

In [159]:
## Show the dtype of every column again, to confirm the result of the cast
df.dtypes

gender                 float32
age                    float32
hypertension           float32
heart_disease          float32
smoking_history        float32
bmi                    float32
HbA1c_level            float32
blood_glucose_level    float32
diabetes               float32
dtype: object

### Handling missing values

In [160]:
## Count the missing values in each column
df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [161]:
## Separate the column to be predicted from the feature columns

target_col = 'diabetes'
y = df[target_col].copy()
df = df.drop(columns=[target_col])

### Value Normalization

In [162]:
## Split into train and test sets (70% for training, fixed random seed)

X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    train_size=0.7,
    random_state=42,
)

In [163]:
## Standardize the features using statistics computed on the training split only,
## then apply the same shift and scale to the test split.
train_mean = X_train.mean(axis=0)

# Population standard deviation (ddof=0), which is what np.std computes by default
train_std = X_train.std(axis=0, ddof=0)
# A zero-variance column would divide by zero and yield NaN/inf; scale it by 1 instead
train_std = train_std.replace(0, 1)

# Build new frames instead of modifying the split outputs in place
X_train = (X_train - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

### Feature Engineering

#### No additional feature engineering is required for this dataset

### Model Training


In [165]:
## Print the shapes of the training features and the training labels
train_shapes = (X_train.shape, y_train.shape)
print(*train_shapes)

(70000, 8) (70000,)


In [173]:
### Create neural network

# Feed-forward binary classifier: two ReLU hidden layers (128 and 64 units)
# followed by a single sigmoid output unit.
model = models.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# Binary cross-entropy pairs with the sigmoid output; accuracy is tracked as a metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [174]:
## Train the network for 20 epochs and keep the object returned by fit
history = model.fit(x=X_train, y=y_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [175]:
## Evaluate the trained network on the test split
results = model.evaluate(x=X_test, y=y_test)

