<a href="https://colab.research.google.com/github/appliedcode/mthree-c422/blob/mthree-422-srilatha/DRPP_FE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
#  Step 1: Import libraries
import pandas as pd
import numpy as np

In [51]:
# Load data
# Read the diabetes CSV straight from its GitHub raw URL into `df_raw`
# (this cell needs network access to run).
url = "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
df_raw = pd.read_csv(url)


In [52]:
#  Step 2: Define clean_data() function
def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Zeros in the physiological measurement columns are treated as missing
    readings and replaced by that column's median (computed after the zeros
    have been masked out). Exact duplicate rows are then removed.
    """
    zero_means_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

    cleaned = df.copy()

    # Mask the placeholder zeros so they do not take part in the medians.
    cleaned[zero_means_missing] = cleaned[zero_means_missing].replace(0, np.nan)

    # Impute each measurement column with its own median.
    column_medians = {name: cleaned[name].median() for name in zero_means_missing}
    for name, median_value in column_medians.items():
        cleaned[name] = cleaned[name].fillna(median_value)

    # Duplicates are dropped last, i.e. rows are compared after imputation.
    return cleaned.drop_duplicates()

In [53]:
#  Step 3: Define engineer_features() function
def engineer_features(df):
    """Return a copy of ``df`` with three derived feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``BMI``, ``Age``, ``Glucose`` and ``Insulin``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is not modified) with these added columns:

        * ``BMI_Category`` -- "Underweight" (< 18.5), "Normal" (< 25),
          "Overweight" (< 30) or "Obese" (>= 30); missing where BMI is missing.
        * ``Age_Bin`` -- categorical decade label such as "20s" or "30s"; the
          decades span from the youngest to the oldest age present.
        * ``Glucose_Insulin`` -- product of ``Glucose`` and ``Insulin``.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    df = df.copy()

    # BMI Category
    def categorize_bmi(bmi):
        if bmi < 18.5:
            return "Underweight"
        if bmi < 25:
            return "Normal"
        if bmi < 30:
            return "Overweight"
        return "Obese"

    # na_action="ignore" keeps a missing BMI missing. Every comparison against
    # NaN is False, so without it a missing value would be labelled "Obese".
    df["BMI_Category"] = df["BMI"].map(categorize_bmi, na_action="ignore")

    # Age Bin (by decade, dynamic)
    ages = df["Age"]
    if ages.notna().any():
        bin_start = int(ages.min() // 10 * 10)
        # Go one decade past the maximum so the oldest age falls inside the
        # last half-open interval [start, end).
        bin_end = int((ages.max() // 10 + 1) * 10)
        bins = list(range(bin_start, bin_end + 1, 10))
        labels = [f"{edge}s" for edge in bins[:-1]]
        df["Age_Bin"] = pd.cut(ages, bins=bins, labels=labels, include_lowest=True, right=False)
    else:
        # Empty frame or no known ages: min()/max() would be NaN and int(NaN)
        # raises, so emit an all-missing categorical column instead.
        df["Age_Bin"] = pd.Categorical([np.nan] * len(df), ordered=True)

    # Interaction term
    df["Glucose_Insulin"] = df["Glucose"] * df["Insulin"]

    return df


In [54]:

#Step 4: Define validate_data() function
def validate_data(df):
    """Sanity-check an engineered feature frame.

    Checks, in order: the frame contains no null values, every value in
    ``BMI_Category`` is one of the four known labels, and ``Age_Bin`` has at
    least one non-null entry. Raises ``AssertionError`` on the first failed
    check; prints a confirmation message and returns ``None`` otherwise.
    """
    # 1) No missing values in any column.
    nulls_per_column = df.isnull().sum()
    total_nulls = nulls_per_column.sum()
    assert total_nulls == 0, f"There are null values in the data:\n{nulls_per_column[nulls_per_column > 0]}"

    # 2) Only known BMI labels may appear (a subset of them is accepted).
    allowed_labels = {"Underweight", "Normal", "Overweight", "Obese"}
    seen_labels = set(df["BMI_Category"].unique())
    assert seen_labels <= allowed_labels, f"Unexpected BMI categories: {seen_labels}"

    # 3) At least one row must have been assigned to an age bin.
    any_age_bin = df["Age_Bin"].notna().any()
    assert any_age_bin, "Age binning failed — all bins are NaN."

    print(" Validation passed! No nulls and all expected categories are present.")



In [55]:
# Run pipeline
# Clean the raw frame, derive the engineered columns from the cleaned copy,
# then validate the result (validate_data raises AssertionError if a check fails).
df_cleaned = clean_data(df_raw)
df_features = engineer_features(df_cleaned)
validate_data(df_features)

 Validation passed! No nulls and all expected categories are present.


In [56]:
# Preview the first rows of the engineered frame, including the derived columns.
df_features.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BMI_Category,Age_Bin,Glucose_Insulin
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50,1,Obese,50s,18500.0
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,0,Overweight,30s,10625.0
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32,1,Normal,30s,22875.0
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0,Overweight,20s,8366.0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1,Obese,30s,23016.0
