In [8]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Make the project's `src` directory importable from this notebook.
# The location is resolved to an absolute path so that later imports do not
# depend on the working directory staying the same, and it is added only if
# absent so that re-running this cell does not pile up duplicate entries.
SRC_DIR = str((Path("..") / "src").resolve())
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from ds_project_arch_lab.utils.preprocessing import standardize_column_names

# Input file and output folder, both expressed relative to the notebook's
# working directory (one level below the project root).
RAW_PATH = Path("..", "data", "raw", "asthma_disease_data.csv")
PROC_DIR = Path("..", "data", "processed")

# Create the output folder (and any missing parents) up front; an already
# existing folder is left as it is.
PROC_DIR.mkdir(exist_ok=True, parents=True)

# Data Cleaning and Preprocessing

In this step, we prepared the dataset for further exploration and modeling.  
Based on EDA (Problem 2), the following preprocessing steps were applied:

- **Dropped columns:**
  - `PatientID` → pure identifier, no predictive power.
  - `DoctorInCharge` → constant value (`Dr_Confid`), no variance.
- **Standardized column names** using a helper function
  `standardize_column_names` (in `src/ds_project_arch_lab/utils/preprocessing.py`):  
  - All names converted to lowercase snake_case.  
  - Example: `EducationLevel` → `education_level`.  
- **No missing values** and **no duplicates** were found.  
- The cleaned dataset was saved to `data/processed/clean_for_eda.csv`.  

This makes the dataset consistent and ready for further EDA and modeling.

In [3]:
# Read the raw CSV into memory, report its (rows, columns) shape, and preview
# the first rows; the trailing expression is rendered as the cell's output.
df_raw = pd.read_csv(RAW_PATH)
print(f"Raw shape: {df_raw.shape}")
df_raw.head(5)

Raw shape: (2392, 29)


Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,...,LungFunctionFEV1,LungFunctionFVC,Wheezing,ShortnessOfBreath,ChestTightness,Coughing,NighttimeSymptoms,ExerciseInduced,Diagnosis,DoctorInCharge
0,5034,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,...,1.369051,4.941206,0,0,1,0,0,1,0,Dr_Confid
1,5035,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,...,2.197767,1.702393,1,0,0,1,1,1,0,Dr_Confid
2,5036,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,...,1.698011,5.022553,1,1,1,0,1,1,0,Dr_Confid
3,5037,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,...,3.032037,2.300159,1,0,1,1,1,0,0,Dr_Confid
4,5038,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,...,3.470589,3.067944,1,1,1,0,0,1,0,Dr_Confid


In [6]:
# Build the cleaned frame from a copy so that `df_raw` is never modified and
# this cell gives the same result every time it is re-run.
df = df_raw.copy()

# Identifier / constant columns that should not reach EDA or modeling.
drop_cols = ["PatientID", "DoctorInCharge"]

# `errors="ignore"` would otherwise skip absent columns without a trace;
# report them so that a renamed or misspelled column does not go unnoticed.
absent_cols = [col for col in drop_cols if col not in df.columns]
if absent_cols:
    print("Warning: columns not found (nothing dropped for them):", absent_cols)
df = df.drop(columns=drop_cols, errors="ignore")

# Project helper that normalizes the column names.
df = standardize_column_names(df)

print("After cleaning:", df.shape)
print("Columns now standardized:")
print(df.columns[:10].tolist(), "...")

# Report data-quality counts so that statements about missing values and
# duplicate rows are backed by what this notebook actually computed.
print("Missing values:", int(df.isna().sum().sum()))
print("Duplicate rows:", int(df.duplicated().sum()))

After cleaning: (2392, 27)
Columns now standardized:
['age', 'gender', 'ethnicity', 'education_level', 'bmi', 'smoking', 'physical_activity', 'diet_quality', 'sleep_quality', 'pollution_exposure'] ...


In [7]:
# Write the cleaned frame to the processed-data folder. The index is left out
# of the file so that it is not read back as an extra column later.
PROC_PATH = PROC_DIR.joinpath("clean_for_eda.csv")
df.to_csv(PROC_PATH, index=False)
print("Clean dataset saved to", PROC_PATH)

Clean dataset saved to ..\data\processed\clean_for_eda.csv
