<a href="https://colab.research.google.com/github/appliedcode/mthree-c422/blob/mthree-c422-rama/Excercises/Day-7/Diabetes_Risk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# Step 1: Import Required Libraries

import pandas as pd
import numpy as np


# Step 2: Load Dataset

# Location of the source CSV (hosted in the plotly/datasets GitHub repository).
url = "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
# Download and parse the CSV into a DataFrame; this is the raw input that the
# pipeline below passes to clean_data().
df = pd.read_csv(url)


# Step 3: Define clean_data() Function

def clean_data(df):
    """Return a cleaned copy of *df* with zero placeholders imputed.

    In the columns 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin'
    and 'BMI', every 0 is treated as a missing value and replaced by the
    median of that column's remaining values. Fully duplicated rows are
    then dropped.

    Args:
        df: DataFrame containing at least the five columns listed above.

    Returns:
        A new DataFrame; the DataFrame passed in is left unmodified.

    Raises:
        KeyError: if one of the columns to clean is absent from *df*.
    """
    # Columns to clean
    cols_to_clean = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

    # Work on a copy: assigning into df[col] would otherwise overwrite the
    # caller's DataFrame as a hidden side effect.
    df = df.copy()

    # Replace 0 with NaN and fill with median
    for col in cols_to_clean:
        df[col] = df[col].replace(0, np.nan)
        # The median is taken after the zeros became NaN, so the placeholder
        # zeros do not drag the imputed value down.
        median = df[col].median()
        df[col] = df[col].fillna(median)

    # Drop duplicate rows
    return df.drop_duplicates()


# Step 4: Define engineer_features() Function

def engineer_features(df):
    """Return a copy of *df* with three derived feature columns added.

    Added columns:
        BMI_category: 'Underweight' (< 18.5), 'Normal' (< 25),
            'Overweight' (< 30) or 'Obese' (>= 30); NaN when BMI is missing.
        age_bin: Age floored to the start of its decade (e.g. 31 -> 30).
        Glucose_Insulin: product of the Glucose and Insulin columns.

    Args:
        df: DataFrame with 'BMI', 'Age', 'Glucose' and 'Insulin' columns.

    Returns:
        A new DataFrame; the DataFrame passed in is left unmodified.
    """
    # Create BMI category
    def bmi_category(bmi):
        # Every `<` comparison with NaN is False, so without this guard a
        # missing BMI would fall through to the final branch and be
        # labelled 'Obese'. Propagate the missing value instead.
        if pd.isna(bmi):
            return np.nan
        if bmi < 18.5:
            return 'Underweight'
        if bmi < 25:
            return 'Normal'
        if bmi < 30:
            return 'Overweight'
        return 'Obese'

    # Work on a copy so the new columns are not added to the caller's frame.
    df = df.copy()

    # Apply BMI category
    df['BMI_category'] = df['BMI'].apply(bmi_category)

    # Create age_bin by decade
    df['age_bin'] = (df['Age'] // 10) * 10

    # Create interaction term Glucose * Insulin
    df['Glucose_Insulin'] = df['Glucose'] * df['Insulin']

    return df


# Step 5: Define validate_data() Function

def validate_data(df):
    """Check the engineered DataFrame and print a confirmation on success.

    Three checks run in order:
      1. no null values anywhere in *df*;
      2. every 'BMI_category' value is one of 'Underweight', 'Normal',
         'Overweight', 'Obese';
      3. every 'age_bin' value lies in the inclusive range 10..100.

    Args:
        df: DataFrame containing 'BMI_category' and 'age_bin' columns.

    Raises:
        AssertionError: if nulls are present or an unexpected BMI category
            is found.
        ValueError: if any age_bin value is outside 10..100.
    """
    # The first two checks raise AssertionError explicitly rather than via
    # `assert` statements: asserts are removed when Python runs with -O,
    # which would let invalid data through silently.

    # Check for nulls
    if df.isnull().sum().sum() != 0:
        raise AssertionError("There are null values in the dataset.")

    # Validate BMI categories
    expected_bmi_categories = {'Underweight', 'Normal', 'Overweight', 'Obese'}
    actual_bmi_categories = set(df['BMI_category'].unique())
    unexpected_bmi_categories = actual_bmi_categories - expected_bmi_categories
    if unexpected_bmi_categories:
        raise AssertionError(
            f" Unexpected BMI categories found: {unexpected_bmi_categories}"
        )

    # Check age_bin range: decade values from 10 to 100 inclusive are accepted
    # (so an age below 10, whose bin is 0, is rejected).
    if not df['age_bin'].between(10, 100).all():
        raise ValueError(" Some age_bin values are out of expected range (10–100).")

    print(" Validation passed: No nulls and categorical values are within expected ranges.")

# ---------------------------------------
# Step 6: Run the Full Pipeline
# ---------------------------------------
# Run the stages in order: cleaning, then feature engineering, then validation.
# validate_data() raises if any check fails, so execution only reaches the
# preview below when all checks passed.
df_cleaned = clean_data(df)
df_features = engineer_features(df_cleaned)
validate_data(df_features)

# Preview final data (as the last expression of a notebook cell, the result of
# head() is rendered as the cell output)
df_features.head()


✅ Validation passed: No nulls and categorical values are within expected ranges.


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BMI_category,age_bin,Glucose_Insulin
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50,1,Obese,50,18500.0
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,0,Overweight,30,10625.0
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32,1,Normal,30,22875.0
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0,Overweight,20,8366.0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1,Obese,30,23016.0
