In [8]:
# Install the third-party packages this notebook uses via the %pip line magic.
# NOTE(review): no versions are pinned here — consider pinning for reproducible runs.
%pip install plotly kaleido pandas numpy


Note: you may need to restart the kernel to use updated packages.


In [9]:
import os
import pandas as pd
import numpy as np
import plotly.express as px


In [10]:
# Read the raw (uncleaned) diabetes table and report its dimensions.
raw_csv = '../data/diabetes_unclean.csv'
df = pd.read_csv(raw_csv)
print(f"Original shape: {df.shape}")

# Bare last expression so the notebook renders a preview of the first rows.
df.head()


Original shape: (1009, 14)


Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,502,17975,F,50.0,4.7,46.0,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
1,735,34221,M,26.0,4.5,62.0,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N
2,420,47975,F,50.0,4.7,46.0,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
3,680,87656,F,50.0,4.7,46.0,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
4,504,34223,M,33.0,7.1,46.0,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N


In [11]:
# Clean the loaded frame. Each step is written so this cell can be re-run on an
# already-cleaned `df` and produce the same result.

# Drop columns that are not used downstream; names already absent are ignored.
df = df.drop(columns=['ID', 'No_Pation', 'Gender', 'Urea', 'Cr'], errors='ignore')

# Treat 0 in the measurement columns as missing, then impute with the column
# median (computed after the zeros have been masked out).
meas = ['HbA1c', 'Chol', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI']
for var in meas:
    if var in df.columns:
        masked = df[var].replace(0, np.nan)
        df[var] = masked.fillna(masked.median())

# Impute missing AGE values with the median age.
if 'AGE' in df.columns:
    df['AGE'] = df['AGE'].fillna(df['AGE'].median())

# Encode CLASS as 1 (Y / YES / DIABETIC, case- and whitespace-insensitive) or 0.
# Skip the step when the column is already numeric: re-encoding would stringify
# 1 -> '1', match none of the labels, and silently turn every positive into 0.
# NOTE(review): every other label — including missing values, which stringify
# to 'NAN' — is encoded as 0; confirm that this is intended.
if 'CLASS' in df.columns and not pd.api.types.is_numeric_dtype(df['CLASS']):
    class_labels = df['CLASS'].astype(str).str.strip().str.upper()
    df['CLASS'] = class_labels.isin(['Y', 'YES', 'DIABETIC']).astype('int64')

# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" line).
df.info()
# sep="\n" keeps the header on its own line without prefixing the first row
# of the counts with a separator space.
print("Missing per column:", df.isnull().sum(), sep="\n")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AGE     1009 non-null   float64
 1   HbA1c   1009 non-null   float64
 2   Chol    1009 non-null   float64
 3   TG      1009 non-null   float64
 4   HDL     1009 non-null   float64
 5   LDL     1009 non-null   float64
 6   VLDL    1009 non-null   float64
 7   BMI     1009 non-null   float64
 8   CLASS   1009 non-null   int64  
dtypes: float64(8), int64(1)
memory usage: 71.1 KB
None
Missing per column:
 AGE      0
HbA1c    0
Chol     0
TG       0
HDL      0
LDL      0
VLDL     0
BMI      0
CLASS    0
dtype: int64


In [12]:
# Persist the cleaned frame as CSV (row index omitted), creating the target
# folder first if it does not exist yet.
# NOTE(review): this path is relative to the working directory, whereas the raw
# input is read from '../data' — confirm the output location is intended.
cleaned_dir = 'data/cleaned'
cleaned_csv = 'data/cleaned/cleaned_diabetes.csv'

os.makedirs(cleaned_dir, exist_ok=True)
df.to_csv(cleaned_csv, index=False)
print(f"✅ Cleaned dataset saved to: {cleaned_csv}")


✅ Cleaned dataset saved to: data/cleaned/cleaned_diabetes.csv


In [13]:
# Build three summary figures from the cleaned frame and export each as a PNG.
images_dir = "../outputs/images"
os.makedirs(images_dir, exist_ok=True)

# 1. Class Distribution — row counts per CLASS value, one colour per value.
fig1 = px.histogram(
    df,
    x="CLASS",
    color="CLASS",
    barmode="group",
    title="Class Distribution",
    labels={"CLASS": "Diabetes (1=Yes, 0=No)"},
)
fig1.write_image(f"{images_dir}/class_distribution.png")

# 2. BMI vs HbA1c — CLASS is passed as a string Series for the point colours.
fig2 = px.scatter(
    df,
    x="BMI",
    y="HbA1c",
    color=df["CLASS"].astype(str),
    opacity=0.7,
    title="BMI vs HbA1c Scatter Plot",
    labels={"color": "Class"},
)
fig2.write_image(f"{images_dir}/bmi_vs_hba1c.png")

# 3. Average Cholesterol by Age — mean Chol for each distinct AGE value.
chol_age = df.groupby("AGE", as_index=False)["Chol"].mean()
fig3 = px.line(
    chol_age,
    x="AGE",
    y="Chol",
    markers=True,
    title="Average Cholesterol by Age",
)
fig3.write_image(f"{images_dir}/avg_chol_by_age.png")

print("✅ All plots saved to ../outputs/images/")


✅ All plots saved to ../outputs/images/


In [6]:
# Final sanity check of the frame: first rows, schema, and per-column null counts.
print(df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() emitted a stray "None" line after the report.
df.info()
# sep="\n" keeps the header on its own line without prefixing the first row
# of the counts with a separator space.
print("Missing per column:", df.isnull().sum(), sep="\n")


    AGE  HbA1c  Chol   TG  HDL  LDL  VLDL   BMI  CLASS
0  50.0    4.9   4.2  0.9  2.4  1.4   0.5  24.0      0
1  26.0    4.9   3.7  1.4  1.1  2.1   0.6  23.0      0
2  50.0    4.9   4.2  0.9  2.4  1.4   0.5  24.0      0
3  50.0    4.9   4.2  0.9  2.4  1.4   0.5  24.0      0
4  33.0    4.9   4.9  1.0  0.8  2.0   0.4  21.0      0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AGE     1009 non-null   float64
 1   HbA1c   1009 non-null   float64
 2   Chol    1009 non-null   float64
 3   TG      1009 non-null   float64
 4   HDL     1009 non-null   float64
 5   LDL     1009 non-null   float64
 6   VLDL    1009 non-null   float64
 7   BMI     1009 non-null   float64
 8   CLASS   1009 non-null   int64  
dtypes: float64(8), int64(1)
memory usage: 71.1 KB
None
Missing per column:
 AGE      0
HbA1c    0
Chol     0
TG       0
HDL      0
LDL      0
VLDL     0
BMI