In [1]:
import numpy as np
import pandas as pd

# Loading Dataset

In [7]:
df = pd.read_csv('diabetes.csv')


# Descriptive Stats

In [8]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
df.shape

(768, 9)

In [10]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [14]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [13]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


# Replacing 0(missing values) to NaN

In [16]:
df['Glucose'].replace(0, np.nan, inplace=True)
df['BloodPressure'].replace(0, np.nan, inplace=True)
df['SkinThiSkinThicknessckness'].replace(0, np.nan, inplace=True)
df['Insulin'].replace(0, np.nan, inplace=True)
df['BMI'].replace(0, np.nan, inplace=True)

In [17]:
df.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

# Data Cleaning

## Univariate Feature Implutation

### Implutation for SkinThickness

In [18]:
arr = df['SkinThickness'].values.reshape(-1,1)
arr.shape

(768, 1)

In [19]:
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp.fit(df['SkinThickness'].values.reshape(-1,1))
df['SkinThickness'] = imp.transform(df['SkinThickness'].values.reshape(-1,1))

In [20]:
df['SkinThickness'].describe()

count    768.000000
mean      29.994792
std        8.886506
min        7.000000
25%       25.000000
50%       32.000000
75%       32.000000
max       99.000000
Name: SkinThickness, dtype: float64

In [21]:
df.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness                 0
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

### Implutation for Glucose

In [22]:
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp.fit(df['Glucose'].values.reshape(-1,1))
df['Glucose'] = imp.transform(df['Glucose'].values.reshape(-1,1))

In [23]:
df['Glucose'].describe()

count    768.000000
mean     121.656250
std       30.438286
min       44.000000
25%       99.750000
50%      117.000000
75%      140.250000
max      199.000000
Name: Glucose, dtype: float64

In [24]:
df.isnull().sum()

Pregnancies                   0
Glucose                       0
BloodPressure                35
SkinThickness                 0
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

### Implutation for BloodPressure

In [25]:
mp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(df['BloodPressure'].values.reshape(-1,1))
df['BloodPressure'] = imp.transform(df['BloodPressure'].values.reshape(-1,1))

In [26]:
df['BloodPressure'].describe()

count    768.000000
mean      72.386719
std       12.096642
min       24.000000
25%       64.000000
50%       72.000000
75%       80.000000
max      122.000000
Name: BloodPressure, dtype: float64

### Implutation for BMI

In [27]:
mp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=32)
imp.fit(df['BMI'].values.reshape(-1,1))
df['BMI'] = imp.transform(df['BMI'].values.reshape(-1,1))

In [28]:
df['BMI'].describe()

count    768.000000
mean      32.455208
std        6.875177
min       18.200000
25%       27.500000
50%       32.300000
75%       36.600000
max       67.100000
Name: BMI, dtype: float64

## Multivariate Implutation

In [29]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [30]:
df_feature = df.drop('Outcome', axis=1)
df_label = df[['Outcome']]

df_feature.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.0,,33.6,0.627,50
1,1,85.0,66.0,29.0,,26.6,0.351,31
2,8,183.0,64.0,32.0,,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33


In [31]:
imp = IterativeImputer(max_iter=10000, random_state=0)


In [32]:
imp.fit(df_feature)

IterativeImputer(max_iter=10000, random_state=0)

In [33]:
df_feature_arr = imp.transform(df_feature)

In [34]:
df_feature_arr.shape

(768, 8)

In [39]:
df_feature = pd.DataFrame(df_feature_arr, columns=df_feature.columns)

df_feature.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,35.0,219.029355,33.6,0.627,50.0
1,1.0,85.0,66.0,29.0,70.34693,26.6,0.351,31.0
2,8.0,183.0,64.0,32.0,270.585967,23.3,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0


In [38]:
df = pd.concat([df_feature, df_label], axis=1)

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,219.029355,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,70.34693,26.6,0.351,31.0,0
2,8.0,183.0,64.0,32.0,270.585967,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1


In [40]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64