In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

Load the data

In [6]:
data = pd.read_csv('diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

EDA

How to tackle 0's in the above features like BP, Insulin, SkinThickness etc.
Outliers -> Specially in insulin (right-skewed distribution)
Feature Selection
Data Standardization [StandardScaler, Min-Max Scaler]
Data Imbalance

Data Split

In [9]:
x = data.drop(columns='Outcome', axis=1)
y = data['Outcome']

In [10]:
x

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [11]:
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=44)

In [14]:
X_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
407,0,101,62,0,0,21.9,0.336,25
765,5,121,72,23,112,26.2,0.245,30
558,11,103,68,40,0,46.2,0.126,42
82,7,83,78,26,71,29.3,0.767,36
165,6,104,74,18,156,29.9,0.722,41
...,...,...,...,...,...,...,...,...
96,2,92,62,28,0,31.6,0.130,24
571,2,130,96,0,0,22.6,0.268,21
173,1,79,60,42,48,43.5,0.678,23
753,0,181,88,44,510,43.3,0.222,26


In [15]:
y_train

407    0
765    0
558    0
82     0
165    1
      ..
96     0
571    0
173    0
753    1
419    1
Name: Outcome, Length: 576, dtype: int64

In [16]:
X_test

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
452,0,91,68,32,210,39.9,0.381,25
370,3,173,82,48,465,38.4,2.137,25
746,1,147,94,41,0,49.3,0.358,27
122,2,107,74,30,100,33.6,0.404,23
614,11,138,74,26,144,36.1,0.557,50
...,...,...,...,...,...,...,...,...
458,10,148,84,48,237,37.6,1.001,51
524,3,125,58,0,0,31.6,0.151,24
43,9,171,110,24,240,45.4,0.721,54
66,0,109,88,30,0,32.5,0.855,38


In [17]:
y_test

452    0
370    1
746    1
122    0
614    1
      ..
458    1
524    0
43     1
66     1
76     0
Name: Outcome, Length: 192, dtype: int64

In [18]:
# Data Imbalance
data['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

1. Oversampling
2. Undersampling

In [19]:
type(y_train)

pandas.core.series.Series

In [20]:
from collections import Counter
Counter(y_train)

Counter({0: 385, 1: 191})

RandomOverSampler: Randomly duplicating samples from the minority class until both the classes have the same number of samples.

In [21]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

model_over_sampling = RandomOverSampler(random_state=44)
X_train_over, y_train_over = model_over_sampling.fit_resample(X_train, y_train)

Advanced Over Sampling Techniques: SMOTE and ADASYN (Synthetic Data)

Model Training

In [22]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train_over, y_train_over)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Prediction

In [23]:
y_pred = model.predict(X_test)
y_pred

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0])

In [24]:
result_comparison = pd.DataFrame({'Actual Outcome': y_test, 'Predicted Outcome': y_pred})
result_comparison

Unnamed: 0,Actual Outcome,Predicted Outcome
452,0,0
370,1,1
746,1,1
122,0,0
614,1,1
...,...,...
458,1,1
524,0,0
43,1,1
66,1,0


In [25]:
y_pred_prob = model.predict_proba(X_test)
y_pred_prob

array([[0.78241645, 0.21758355],
       [0.01872862, 0.98127138],
       [0.32274941, 0.67725059],
       [0.72060927, 0.27939073],
       [0.19424402, 0.80575598],
       [0.3705494 , 0.6294506 ],
       [0.46734821, 0.53265179],
       [0.87255371, 0.12744629],
       [0.79175654, 0.20824346],
       [0.6199314 , 0.3800686 ],
       [0.54599169, 0.45400831],
       [0.44975473, 0.55024527],
       [0.83571231, 0.16428769],
       [0.83615794, 0.16384206],
       [0.83997952, 0.16002048],
       [0.12382667, 0.87617333],
       [0.72121608, 0.27878392],
       [0.39655717, 0.60344283],
       [0.89995543, 0.10004457],
       [0.367595  , 0.632405  ],
       [0.62770924, 0.37229076],
       [0.19974123, 0.80025877],
       [0.73493051, 0.26506949],
       [0.52196943, 0.47803057],
       [0.95312831, 0.04687169],
       [0.76180242, 0.23819758],
       [0.49808429, 0.50191571],
       [0.4367006 , 0.5632994 ],
       [0.85312103, 0.14687897],
       [0.16344716, 0.83655284],
       [0.

Model Evaluation

In [26]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
confusion_matrix(y_test, y_pred)

array([[94, 21],
       [22, 55]])

In [27]:
recall = recall_score(y_test, y_pred)
print(recall)

0.7142857142857143
