<a href="https://colab.research.google.com/github/abhuvan345/6thSem-ML-Lab/blob/main/1BM24CS403_Lab_1_DataProcessing_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
# =========================
# 1. Import Libraries
# =========================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# =========================
# 2. Load Dataset
# =========================
# Upload CSV in Colab first, then update path if needed
df = pd.read_csv('/content/Diabetes.csv')

print("Initial Shape:", df.shape)
df.head()


Initial Shape: (1000, 14)


Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,502,17975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
1,735,34221,M,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N
2,420,47975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
3,680,87656,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
4,504,34223,M,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N


In [42]:
# Check missing values
df.isnull().sum()


Unnamed: 0,0
ID,0
No_Pation,0
Gender,0
AGE,0
Urea,0
Cr,0
HbA1c,0
Chol,0
TG,0
HDL,0


In [43]:
# Numerical columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

# Categorical columns
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])


In [44]:
df['Gender'] = df['Gender'].astype(str).str.strip().str.upper()
df['CLASS'] = df['CLASS'].astype(str).str.strip().str.upper()


In [45]:
# Gender encoding
df['Gender'] = df['Gender'].map({'M': 1, 'F': 0})

# Target encoding
df['CLASS'] = df['CLASS'].map({'N': 0, 'Y': 1})


In [46]:
df

Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,502,17975,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,0.0
1,735,34221,1,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,0.0
2,420,47975,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,0.0
3,680,87656,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,0.0
4,504,34223,1,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,200,454317,1,71,11.0,97,7.0,7.5,1.7,1.2,1.8,0.6,30.0,1.0
996,671,876534,1,31,3.0,60,12.3,4.1,2.2,0.7,2.4,15.4,37.2,1.0
997,669,87654,1,30,7.1,81,6.7,4.1,1.1,1.2,2.4,8.1,27.4,1.0
998,99,24004,1,38,5.8,59,6.7,5.3,2.0,1.6,2.9,14.0,40.5,1.0


In [47]:
def remove_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return data[(data[column] >= lower) & (data[column] <= upper)]


In [48]:
# Apply outlier removal to numerical columns
for col in num_cols:
    df = remove_outliers_iqr(df, col)


In [49]:
print("Dataset shape after outlier removal:", df.shape)


Dataset shape after outlier removal: (629, 14)


In [50]:
minmax_scaler = MinMaxScaler()

df_minmax = df.copy()
df_minmax[num_cols] = minmax_scaler.fit_transform(df_minmax[num_cols])

df_minmax.head()


Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
5,0.79198,0.451603,0,0.193548,0.157895,0.02381,0.153846,0.152542,0.148936,0.4,0.183673,0.142857,0.1,0.0
6,0.901003,0.451616,0,0.354839,0.118421,0.333333,0.153846,0.271186,0.212766,0.333333,0.306122,0.238095,0.25,0.0
7,0.525063,0.451643,1,0.290323,0.473684,0.297619,0.153846,0.152542,0.106383,0.333333,0.204082,0.142857,0.25,0.0
13,0.162907,0.451736,0,0.193548,0.460526,0.380952,0.238462,0.372881,0.297872,0.533333,0.326531,0.333333,0.2,0.0
14,0.501253,0.451749,0,0.354839,0.315789,0.202381,0.153846,0.338983,0.255319,0.533333,0.326531,0.285714,0.25,0.0


In [51]:
standard_scaler = StandardScaler()

df_standard = df.copy()
df_standard[num_cols] = standard_scaler.fit_transform(df_standard[num_cols])

df_standard.head()


Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
5,1.26687,0.302737,0,-1.807219,-1.521396,-1.964305,-1.773485,-1.702821,-1.130824,-0.355744,-1.071193,-1.316184,-1.916821,0.0
6,1.638333,0.30281,0,-0.900004,-1.723499,-0.457681,-1.773485,-1.069745,-0.831334,-0.669665,-0.463216,-0.872875,-1.256753,0.0
7,0.357426,0.302957,1,-1.26289,0.095429,-0.631522,-1.773485,-1.702821,-1.330483,-0.669665,-0.969863,-1.316184,-1.256753,0.0
13,-0.876516,0.303469,0,-1.807219,0.028061,-0.225892,-1.33927,-0.527108,-0.432014,0.272098,-0.361886,-0.429566,-1.476776,0.0
14,0.276301,0.303542,0,-0.900004,-0.712984,-1.095098,-1.773485,-0.707987,-0.631674,0.272098,-0.361886,-0.65122,-1.256753,0.0


In [52]:
df.head()

Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
5,634,34224,0,45,2.3,24,4.0,2.9,1.0,1.0,1.5,0.4,21.0,0.0
6,721,34225,0,50,2.0,50,4.0,3.6,1.3,0.9,2.1,0.6,24.0,0.0
7,421,34227,1,48,4.7,47,4.0,2.9,0.8,0.9,1.6,0.4,24.0,0.0
13,132,34234,0,45,4.6,54,5.1,4.2,1.7,1.2,2.2,0.8,23.0,0.0
14,402,34235,0,50,3.5,39,4.0,4.0,1.5,1.2,2.2,0.7,24.0,0.0


In [53]:
df.isnull().sum()

Unnamed: 0,0
ID,0
No_Pation,0
Gender,0
AGE,0
Urea,0
Cr,0
HbA1c,0
Chol,0
TG,0
HDL,0
