# Data Setup

## Imports

In [86]:
import kagglehub
import pandas as pd
import numpy as np

from src.__00__paths import raw_data_dir, processed_data_dir
from pathlib import Path
import shutil

## Dataset Download

In [76]:
# Expected location of the raw CSV inside the project's raw-data directory
raw_dataset = raw_data_dir / "patient_dataset.csv"

# Download only when the CSV is not already present locally
if raw_dataset.exists():
    print("✔️ Dataset is already downloaded.")
else:
    # dataset_download's return value is treated below as the local directory
    # that holds the downloaded files
    dataset_path = Path(kagglehub.dataset_download("arjunnsharma/patient-dataset-for-clustering-raw-data"))

    if not dataset_path.exists():
        raise FileNotFoundError("⚠ Dataset not found.")

    # Use the nested "Data" folder as the root when the download contains one
    data_root = dataset_path / "Data" if (dataset_path / "Data").exists() else dataset_path

    # shutil.copy2 does not create the destination directory, so make sure it
    # exists before copying (no-op when it is already there)
    raw_data_dir.mkdir(parents=True, exist_ok=True)

    # Copy top-level files only; sub-directories of data_root are skipped
    for item in data_root.iterdir():
        if item.is_file():
            shutil.copy2(item, raw_data_dir / item.name)

    # Fail here with a clear message instead of later inside pd.read_csv
    # when the download did not contain the file this notebook expects
    if not raw_dataset.exists():
        raise FileNotFoundError(
            f"⚠ '{raw_dataset.name}' was not found among the downloaded files in {data_root}."
        )

    print("✔️ Dataset successfully downloaded.")

✔️ Dataset is already downloaded.


## Load Dataset

In [77]:
# Read the raw patient CSV; the preprocessing cell works on a copy of this frame
raw_df = pd.read_csv(filepath_or_buffer=raw_dataset)

# Preview the first five rows
raw_df.head(n=5)

Unnamed: 0,age,gender,chest_pain_type,blood_pressure,cholesterol,max_heart_rate,exercise_angina,plasma_glucose,skin_thickness,insulin,bmi,diabetes_pedigree,hypertension,heart_disease,residence_type,smoking_status
0,24,1.0,4,250,139,212,0,108.0,33.0,109.0,37.999303,0.480277,1,1,Urban,Smoker
1,29,0.0,4,132,187,147,0,202.0,42.0,,25.588346,0.283986,1,1,Urban,Unknown
2,46,0.0,3,271,185,193,0,149.0,43.0,102.0,37.892029,2.472309,1,0,Rural,Non-Smoker
3,73,,2,102,200,125,0,105.0,77.0,165.0,18.660241,1.472052,0,1,Rural,Smoker
4,49,1.0,3,91,163,192,0,162.0,31.0,170.0,12.76798,0.537627,1,1,Rural,Smoker


## Dataset Auditing

In [85]:
# Overall dimensions of the raw frame
print("Shape:", raw_df.shape)

# Number of missing (NaN) entries in each column
nan_cnt = raw_df.isna().sum()

# Number of literal 'Unknown' entries per column: only smoking_status is
# inspected, every other column keeps a count of zero
unknown_cnt = pd.Series(0, index=raw_df.columns)
if 'smoking_status' in raw_df.columns:
    # Compare case-insensitively and ignore surrounding whitespace
    smoking_normalized = raw_df['smoking_status'].astype(str).str.strip().str.lower()
    unknown_cnt['smoking_status'] = (smoking_normalized == 'unknown').sum()

# Summary table: one row per raw column
unknown_col = "'Unknown' (smoking_status)"
gaps = pd.DataFrame({"NaN": nan_cnt, unknown_col: unknown_cnt})
gaps["Total gaps"] = gaps["NaN"].add(gaps[unknown_col])
gaps["% of rows (total)"] = gaps["Total gaps"].div(len(raw_df)).mul(100).round(2)

# Columns with the most gaps come first
gaps = gaps.sort_values(by="Total gaps", ascending=False)

display(gaps)

Shape: (6000, 16)


Unnamed: 0,NaN,'Unknown' (smoking_status),Total gaps,% of rows (total)
skin_thickness,614,0,614,10.23
plasma_glucose,609,0,609,10.15
insulin,568,0,568,9.47
smoking_status,0,476,476,7.93
gender,472,0,472,7.87
residence_type,455,0,455,7.58
age,0,0,0,0.0
chest_pain_type,0,0,0,0.0
blood_pressure,0,0,0,0.0
cholesterol,0,0,0,0.0


## Data Preprocessing

In [79]:
# Work on a copy so raw_df itself is not modified by the cleaning steps
df = raw_df.copy()

# Step 1: remove every row that has at least one missing value
before = df.shape
df = df.dropna(axis=0, how="any").reset_index(drop=True)
print(f"After Drop NA: {before} -> {df.shape}")

# Step 2: normalise the smoking_status labels (trim whitespace, Title-Case),
# then discard the rows whose label is 'Unknown'
before = df.shape
df = df.assign(smoking_status=df['smoking_status'].str.strip().str.title())
df = df.loc[df['smoking_status'].ne('Unknown')].reset_index(drop=True)
print(f"After removing 'Unknown' smoking status: {before} -> {df.shape}")

# Step 3: remove fully duplicated rows, keeping the first occurrence
before = df.shape
df = df.drop_duplicates(keep="first").reset_index(drop=True)
print(f"Dropped duplicates: {before} -> {df.shape}")

After Drop NA: (6000, 16) -> (3734, 16)
After removing 'Unknown' smoking status: (3734, 16) -> (3442, 16)
Dropped duplicates: (3442, 16) -> (3442, 16)


In [80]:
# Integer codes for the two text columns, keyed by source column name
category_codes = {
    'residence_type': {'Urban': 0, 'Rural': 1},
    'smoking_status': {'Non-Smoker': 0, 'Smoker': 1},
}

# Encode into temporaries first: df is only reassigned after every column has
# been validated, so a failed (or accidentally repeated) run of this cell
# cannot leave df half-encoded.
encoded = {}
for column, codes in category_codes.items():
    encoded[column] = df[column].map(codes)

    # Series.map yields NaN for any value missing from the dict; surface those
    # labels instead of silently writing NaN into the processed dataset.
    unmapped = df.loc[encoded[column].isna(), column].unique().tolist()
    if unmapped:
        raise ValueError(f"Unexpected values in '{column}': {unmapped}")

# gender is cast to int, residence_type is replaced by its code, and
# smoking_status is replaced by a new 'smoker' column appended at the end.
df = df.assign(
    gender=df['gender'].astype(int),
    residence_type=encoded['residence_type'],
    smoker=encoded['smoking_status'],
).drop(columns=['smoking_status'])

## Save Processed Data

In [81]:
# Destination of the cleaned, encoded dataset
processed_data_path = processed_data_dir / "processed_patients_dataset.csv"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists (no-op when it is already there)
processed_data_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the positional index out of the file
df.to_csv(processed_data_path, index=False)

# Report only the last three path components rather than the full absolute path
print(f"✔️ Processed data saved → {'/'.join(processed_data_path.parts[-3:])}")
df.head()

✔️ Processed data saved → data/processed/processed_patients_dataset.csv


Unnamed: 0,age,gender,chest_pain_type,blood_pressure,cholesterol,max_heart_rate,exercise_angina,plasma_glucose,skin_thickness,insulin,bmi,diabetes_pedigree,hypertension,heart_disease,residence_type,smoker
0,24,1,4,250,139,212,0,108.0,33.0,109.0,37.999303,0.480277,1,1,0,1
1,46,0,3,271,185,193,0,149.0,43.0,102.0,37.892029,2.472309,1,0,1,0
2,49,1,3,91,163,192,0,162.0,31.0,170.0,12.76798,0.537627,1,1,1,1
3,63,1,3,18,154,107,0,103.0,67.0,102.0,22.373849,1.062411,0,0,1,0
4,37,1,4,263,201,201,0,186.0,21.0,180.0,35.663404,0.151236,0,0,0,1
