## Data Pre-Processing And Cleaning

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('SDG 3 Diabetes.csv')

print(df.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [15]:
# Replacing zeros with NaN
columns_to_replace = ['Glucose', 'BloodPressure', 'Pregnancies']
df[columns_to_replace] = df[columns_to_replace].replace(0, np.nan)

# Replacing missing values of Blood pressure with Mean
df['BloodPressure'].fillna(df['BloodPressure'].mean(), inplace=True)

# Replacing missing values of Insulin,BMI and SkinThickness with Median
df['Pregnancies'].fillna(df['Pregnancies'].median(), inplace=True)

# Displaying updated dataset
print(df.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0          6.0    148.0           72.0             35        0  33.6   
1          1.0     85.0           66.0             29        0  26.6   
2          8.0    183.0           64.0              0        0  23.3   
3          1.0     89.0           66.0             23       94  28.1   
4          4.0    137.0           40.0             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['BloodPressure'].fillna(df['BloodPressure'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Pregnancies'].fillna(df['Pregnancies'].median(), inplace=True)


In [16]:
# Scaling the data
scaler = StandardScaler()
X = df.drop('Outcome', axis=1)
y = df['Outcome']
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

## SMOTE for handling class imbalance

In [20]:
%pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [23]:
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_imputed, y_train)