# Load Dataset

In [1]:
import pandas as pd
df = pd.read_csv('diabetics.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63.0,0
764,2.0,122.0,70.0,27.0,0.0,36.8,0.340,27.0,0
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30.0,0
766,1.0,126.0,60.0,0.0,0.0,30.1,0.349,47.0,1


In [2]:
df.dtypes

Pregnancies                 float64
Glucose                     float64
BloodPressure               float64
SkinThickness               float64
Insulin                     float64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                         float64
Outcome                       int64
dtype: object

In [3]:
df.shape

(768, 9)

# Data Preparation

***Find missing values***

In [4]:
df.isna().sum()

Pregnancies                  6
Glucose                     11
BloodPressure                8
SkinThickness               20
Insulin                      6
BMI                         11
DiabetesPedigreeFunction     5
Age                          7
Outcome                      0
dtype: int64

***Handle missing values***

Replace all the missing data with the mean, median, or mode of the column

In [5]:
df['Pregnancies'].fillna( df['Pregnancies'].median(), inplace = True )
df['Glucose'].fillna( df['Glucose'].mean(), inplace = True )
df['BloodPressure'].fillna( df['BloodPressure'].mean(), inplace = True )
df['SkinThickness'].fillna( df['SkinThickness'].mean(), inplace = True)
df['Insulin'].fillna( df['Insulin'].mean(), inplace = True )
df['BMI'].fillna( df['BMI'].mean(), inplace = True)
df['DiabetesPedigreeFunction'].fillna( df['DiabetesPedigreeFunction'].mean(), inplace= True)
df['Age'].fillna( df['Age'].median(), inplace = True )


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Pregnancies'].fillna( df['Pregnancies'].median(), inplace = True )
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Glucose'].fillna( df['Glucose'].mean(), inplace = True )
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate

In [6]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

# Data Seperation

***Seperate input and output from dataset***

In [7]:
x = df.iloc[:,:-1].values              # [row, col]    : means full row    :-1 means except last col
x            # input

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

In [8]:
y = df.iloc[:,-1].values
y             # output

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,

# Data Splitting

In [9]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split( x , y, test_size=0.30, random_state = 52)     # test_size = .3 ie, 30% data
x_train

array([[ 12.   , 151.   ,  70.   , ...,  41.8  ,   0.742,  38.   ],
       [  3.   , 158.   ,  70.   , ...,  35.5  ,   0.344,  35.   ],
       [  6.   , 154.   ,  74.   , ...,  29.3  ,   0.839,  39.   ],
       ...,
       [  4.   , 114.   ,  65.   , ...,  21.9  ,   0.432,  37.   ],
       [  3.   ,  87.   ,  60.   , ...,  21.8  ,   0.444,  21.   ],
       [  6.   ,  98.   ,  58.   , ...,  34.   ,   0.43 ,  43.   ]])

In [10]:
x_test

array([[3.00e+00, 1.80e+02, 6.40e+01, ..., 3.40e+01, 2.71e-01, 2.60e+01],
       [3.00e+00, 9.90e+01, 8.00e+01, ..., 1.93e+01, 2.84e-01, 3.00e+01],
       [4.00e+00, 1.48e+02, 6.00e+01, ..., 3.09e+01, 1.50e-01, 2.90e+01],
       ...,
       [3.00e+00, 1.70e+02, 6.40e+01, ..., 3.45e+01, 3.56e-01, 3.00e+01],
       [2.00e+00, 1.14e+02, 6.80e+01, ..., 2.87e+01, 9.20e-02, 2.50e+01],
       [3.00e+00, 1.07e+02, 6.20e+01, ..., 2.29e+01, 6.78e-01, 2.30e+01]])

In [11]:
y_train

array([1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,

In [12]:
y_test

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1], dtype=int64)

# Data Normalization

***Normalization technique used - standard scalar***

**z = (x-u)/s ***  

x is mean of training data

u is exact value

s is std deviation of training data

Feature scaling is a common preprocessing step in machine learning to ensure that all features have the same scale.


In [13]:
from sklearn.preprocessing import StandardScaler    # another one - MinMaxScalar
scalar = StandardScaler()
scalar.fit(x_train)
x_train = scalar.transform(x_train)      # fit and transform can be together performed. scalar.fit_transform(x_train)
x_test = scalar.transform(x_test)


# Model creation

In [14]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [15]:
model = Sequential()
model.add(Dense(32, activation = 'relu', input_dim = 8))
model.add(Dropout(0.5))  # Increased from 0.2/0.3

model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'Adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.fit(x_train, y_train, batch_size=32, epochs=10, validation_data=(x_test,y_test) )

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.4407 - loss: 0.7984 - val_accuracy: 0.4675 - val_loss: 0.7442
Epoch 2/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4803 - loss: 0.7936 - val_accuracy: 0.5758 - val_loss: 0.6924
Epoch 3/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5563 - loss: 0.7233 - val_accuracy: 0.6277 - val_loss: 0.6529
Epoch 4/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6063 - loss: 0.6438 - val_accuracy: 0.6494 - val_loss: 0.6244
Epoch 5/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5948 - loss: 0.6748 - val_accuracy: 0.6580 - val_loss: 0.6025
Epoch 6/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6402 - loss: 0.6420 - val_accuracy: 0.7143 - val_loss: 0.5833
Epoch 7/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x23ab5ee36d0>

In [16]:
model = Sequential()
model.add(Dense(32, activation = 'relu', input_dim = 8))
model.add(Dropout(0.5))  # Increased from 0.2/0.3

model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'Adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.fit(x_train, y_train, batch_size=32, epochs=10, validation_data=(x_test,y_test) )

Epoch 1/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.4709 - loss: 0.7662 - val_accuracy: 0.5974 - val_loss: 0.6834
Epoch 2/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6195 - loss: 0.6838 - val_accuracy: 0.6883 - val_loss: 0.6446
Epoch 3/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6084 - loss: 0.7196 - val_accuracy: 0.7100 - val_loss: 0.6166
Epoch 4/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6556 - loss: 0.6399 - val_accuracy: 0.7273 - val_loss: 0.5938
Epoch 5/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6848 - loss: 0.6149 - val_accuracy: 0.7316 - val_loss: 0.5741
Epoch 6/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6959 - loss: 0.6098 - val_accuracy: 0.7403 - val_loss: 0.5595
Epoch 7/10
[1m17/17[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x23ab603a850>

# Select good optimizer

In [17]:
!pip install keras-tuner --upgrade





[notice] A new release of pip available: 22.3 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
import keras_tuner as kt
import keras

In [19]:
def build_model(hp):

  model = Sequential()
  model.add(Dense(32, activation = 'relu', input_dim = 8))
  model.add(Dense(1, activation = 'sigmoid'))

  optimizer = hp.Choice('optimizer', values=['adam', 'sgd', 'rmsprop', 'adadelta'])
  model.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = ['accuracy'])

  return model

In [20]:
tuner = kt.RandomSearch(build_model, objective='val_accuracy', max_trials=5, directory='my_dir', project_name='optimizer')

Reloading Tuner from my_dir\optimizer\tuner0.json


In [21]:
tuner.search(x_train, y_train, epochs=5, validation_data=(x_test, y_test))


In [22]:
best_optimizer = tuner.get_best_hyperparameters()[0].values
best_optimizer

{'optimizer': 'rmsprop'}

In [23]:
model = tuner.get_best_models(num_models=1)[0]





  saveable.load_own_variables(weights_store.get(inner_path))


In [24]:
model.summary()

In [25]:
model.fit(x_train, y_train, batch_size=64, epochs=60, validation_data=(x_test,y_test) )

Epoch 1/60
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.7677 - loss: 0.5272 - val_accuracy: 0.7706 - val_loss: 0.5239
Epoch 2/60
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7581 - loss: 0.5320 - val_accuracy: 0.7835 - val_loss: 0.5129
Epoch 3/60
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7617 - loss: 0.5211 - val_accuracy: 0.7792 - val_loss: 0.5055
Epoch 4/60
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7644 - loss: 0.5052 - val_accuracy: 0.7835 - val_loss: 0.4992
Epoch 5/60
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7769 - loss: 0.4828 - val_accuracy: 0.7749 - val_loss: 0.4949
Epoch 6/60
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7645 - loss: 0.4981 - val_accuracy: 0.7749 - val_loss: 0.4911
Epoch 7/60
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0

<keras.src.callbacks.history.History at 0x23ab878f990>

In [26]:
from sklearn.metrics import classification_report
y_pred = (model.predict(x_test) > 0.5).astype(int)
print(classification_report(y_test, y_pred))

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       155
           1       0.68      0.61      0.64        76

    accuracy                           0.77       231
   macro avg       0.75      0.73      0.74       231
weighted avg       0.77      0.77      0.77       231



In [27]:
model.save('trained_model.h5')




In [28]:
import joblib

joblib.dump(scalar, 'scaler.pkl')


['scaler.pkl']