In [1]:


# Step 1: Install and Import Libraries

!pip install pandas numpy -q

import pandas as pd
import numpy as np

In [12]:


# Step 2: Load the Dataset (Pima Indians Diabetes)

# The CSV is read straight from the plotly/datasets GitHub repository, so this
# cell needs network access on every run.
url = "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
df = pd.read_csv(url)
# Preview the first rows as the cell's rich output.
df.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [13]:


# Step 3: Data Cleaning
# 1. Detect Missing Values and Data Types

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
df.info()

# Per-column count of null entries.
print(df.isnull().sum())




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

# Task
Clean the data by replacing zero values in physiological measures (Glucose, BloodPressure, SkinThickness, Insulin, BMI) with column medians and dropping duplicate rows. Engineer features by creating BMI categories, computing age bins by decade, and generating a Glucose*Insulin interaction term. Validate the results by asserting no nulls remain and checking that all new categorical bins cover expected ranges. Display the cleaned and prepared data and summarize the changes made.

## Data cleaning

### Subtask:
Identify and replace zero values in physiological measures (Glucose, BloodPressure, SkinThickness, Insulin, BMI) with column medians. Drop duplicate rows.


**Reasoning**:
Identify zero values in the specified columns and replace them with the median of the non-zero values, then drop duplicate rows.



In [14]:
cols_to_impute = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Zeros in these measurements are treated as placeholders. For each column,
# work out the median of its non-zero entries first, then substitute it for
# every zero in that column.
nonzero_medians = {
    col: df.loc[df[col] != 0, col].median()
    for col in cols_to_impute
}
for col, median_val in nonzero_medians.items():
    df[col] = df[col].replace(0, median_val)

# Remove fully identical rows from the frame in place.
df.drop_duplicates(inplace=True)

display(df.head())

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,125,33.6,0.627,50,1
1,1,85,66,29,125,26.6,0.351,31,0
2,8,183,64,29,125,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Feature engineering

### Subtask:
Create BMI categories, compute age bins by decade, and generate a Glucose*Insulin interaction term.


**Reasoning**:
Create the new features 'BMI_category', 'age_bin', and 'Glucose_Insulin_Interaction' and display the head of the dataframe.



In [17]:
# 1. Create BMI categories
# Standard adult BMI cut-offs: Underweight < 18.5 <= Normal < 25 <= Overweight
# < 30 <= Obese. Left-closed intervals (right=False) put each boundary value in
# the upper class, so a BMI of exactly 18.5 is 'Normal', and values such as
# 24.95 or 29.95 stay in the lower class instead of spilling into the next one.
bins_bmi = [0, 18.5, 25, 30, float('inf')]
labels_bmi = ['Underweight', 'Normal', 'Overweight', 'Obese']
# With left-closed bins a BMI of 0 would be labelled 'Underweight' rather than
# NaN, so refuse to bin un-imputed zero placeholders.
assert df['BMI'].gt(0).all(), "BMI must be positive; impute zero placeholders before binning."
df['BMI_category'] = pd.cut(df['BMI'], bins=bins_bmi, labels=labels_bmi, right=False)

# 2. Compute age bins by decade
# Fixed decade edges 0, 10, ..., 100 with left-closed intervals, so age 30 lands
# in '30-39'. Ages of 100 or more fall outside the last bin and become NaN.
# Labels are derived from the edges so the two lists cannot drift apart.
bins_age = list(range(0, 101, 10))
labels_age = [f'{lo}-{hi - 1}' for lo, hi in zip(bins_age[:-1], bins_age[1:])]
df['age_bin'] = pd.cut(df['Age'], bins=bins_age, labels=labels_age, right=False)


# 3. Generate Glucose*Insulin interaction term
df['Glucose_Insulin_Interaction'] = df['Glucose'] * df['Insulin']

# Display the first few rows with new columns
display(df.head())

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BMI_category,age_bin,Glucose_Insulin_Interaction
0,6,148,72,35,125,33.6,0.627,50,1,Obese,50-59,18500
1,1,85,66,29,125,26.6,0.351,31,0,Overweight,30-39,10625
2,8,183,64,29,125,23.3,0.672,32,1,Normal,30-39,22875
3,1,89,66,23,94,28.1,0.167,21,0,Overweight,20-29,8366
4,0,137,40,35,168,43.1,2.288,33,1,Obese,30-39,23016


## Validation

### Subtask:
Assert that no null values remain in the DataFrame and check that the new categorical columns (`BMI_category`, `age_bin`) cover the expected ranges.


**Reasoning**:
Check for null values and validate the unique values of the newly created categorical columns to ensure the data cleaning and feature engineering steps were successful.



In [16]:
# 1. Null check: show the per-column counts, then require that they total zero
null_counts = df.isnull().sum()
print("Null values in the DataFrame:")
print(null_counts)
assert null_counts.sum() == 0, "There are null values in the DataFrame."

# 2. BMI categories: the declared categories must be exactly the expected labels
expected_bmi_categories = ['Underweight', 'Normal', 'Overweight', 'Obese']
actual_bmi_categories = df['BMI_category'].cat.categories.tolist()
print("\nUnique BMI categories:")
print(actual_bmi_categories)
none_missing = set(expected_bmi_categories).issubset(actual_bmi_categories)
same_count = len(actual_bmi_categories) == len(expected_bmi_categories)
assert none_missing and same_count, "BMI categories do not match expected values."


# 3. Age bins: the declared 'lo-hi' labels must span every observed age
actual_age_bins = df['age_bin'].cat.categories.tolist()
print("\nUnique age bins:")
print(actual_age_bins)
assert len(actual_age_bins) > 0, "Age bins were not created."
# Only string labels of the form 'lo-hi' are parsed for their bounds.
string_labels = [label for label in actual_age_bins if isinstance(label, str)]
min_age_in_bins = min(int(label.split('-')[0]) for label in string_labels)
max_age_in_bins = max(int(label.split('-')[1]) for label in string_labels)
print(f"Age bins cover range: {min_age_in_bins}-{max_age_in_bins}")
youngest, oldest = df['Age'].min(), df['Age'].max()
assert min_age_in_bins <= youngest and oldest <= max_age_in_bins, "Age bins do not cover the full age range."

display(df.head())

Null values in the DataFrame:
Pregnancies                    0
Glucose                        0
BloodPressure                  0
SkinThickness                  0
Insulin                        0
BMI                            0
DiabetesPedigreeFunction       0
Age                            0
Outcome                        0
BMI_category                   0
age_bin                        0
Glucose_Insulin_Interaction    0
dtype: int64

Unique BMI categories:
['Underweight', 'Normal', 'Overweight', 'Obese']

Unique age bins:
['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90-99']
Age bins cover range: 0-99


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BMI_category,age_bin,Glucose_Insulin_Interaction
0,6,148,72,35,125,33.6,0.627,50,1,Obese,50-59,18500
1,1,85,66,29,125,26.6,0.351,31,0,Overweight,30-39,10625
2,8,183,64,29,125,23.3,0.672,32,1,Normal,30-39,22875
3,1,89,66,23,94,28.1,0.167,21,0,Overweight,20-29,8366
4,0,137,40,35,168,43.1,2.288,33,1,Obese,30-39,23016


## Summary:

### Data Analysis Key Findings

*   Zero values in 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', and 'BMI' columns were successfully replaced with the median of non-zero values for each respective column.
*   Any exact duplicate rows were dropped from the dataset with `drop_duplicates`.
*   Three new features were successfully engineered: `BMI_category` (categorizing BMI into 'Underweight', 'Normal', 'Overweight', 'Obese'), `age_bin` (binning Age into decades), and `Glucose_Insulin_Interaction` (the product of 'Glucose' and 'Insulin').
*   Validation checks confirmed that no null values remain in the DataFrame.
*   Validation checks confirmed that the created `BMI_category` and `age_bin` columns contain the expected range of categories. The age bins specifically cover the full range of ages present in the original 'Age' column.

### Insights or Next Steps

*   The cleaned and prepared data is now suitable for further analysis or model training, free from zero placeholders in key physiological measures and duplicate entries, and enriched with relevant engineered features.
*   Further analysis could explore the distribution of the new categorical features (`BMI_category`, `age_bin`) in relation to the target variable (Outcome) to identify potential relationships.
