In [10]:
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif

In [2]:
pd.set_option("display.max_columns", None)

ROOT = Path("..").resolve()
PROC_DIR = ROOT / "data" / "processed"
PROC_DIR.mkdir(parents=True, exist_ok=True)

CLEAN_CSV    = PROC_DIR / "clean_asthma_disease_data.csv"
FEATURES_ALL = PROC_DIR / "asthma_features_all.csv"
FEATURES_X   = PROC_DIR / "asthma_features_X.csv"
FEATURES_Y   = PROC_DIR / "asthma_features_y.csv"
FEATURES_TOP = PROC_DIR / "asthma_features_topk.csv"

Feature Manipulation

**Goal:** starting from the cleaned dataset produced in Problem 3 (`data/processed/clean_asthma_disease_data.csv`), create a **model‑ready, numeric (floats‑only)** feature table.  
This step is model‑agnostic: we engineer/select features that a later model can use.

Steps:
1) Load cleaned data  
2) Engineer intuitive features (ratios/scores/composites)  
3) One‑hot encode categoricals  
4) Scale continuous features  
5) (Optional) Univariate feature selection (Mutual Information)  
6) Save float‑only tables to `data/processed/`

### 1) Input & assumptions
- Input file: `data/processed/clean_asthma_disease_data.csv`
- Columns are already standardized to `snake_case`.
- Target column: `diagnosis` (0/1).
- Identifiers/constant columns were removed in Problem 3.

In [3]:
df = pd.read_csv(CLEAN_CSV)
print("Loaded:", CLEAN_CSV, "| shape:", df.shape)
df.head(3)

Loaded: C:\Users\WALL-E\PycharmProjects\pythonProject\Projects_2024\Data_Science_Course_2025\09_Data_Science_Project_Architecture_Lab\ds_project_arch_lab\data\processed\clean_asthma_disease_data.csv | shape: (2392, 27)


Unnamed: 0,age,gender,ethnicity,education_level,bmi,smoking,physical_activity,diet_quality,sleep_quality,pollution_exposure,pollen_exposure,dust_exposure,pet_allergy,family_history_asthma,history_of_allergies,eczema,hay_fever,gastroesophageal_reflux,lung_function_fev_1,lung_function_fvc,wheezing,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis
0,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,7.388481,2.855578,0.974339,1,1,0,0,0,0,1.369051,4.941206,0,0,1,0,0,1,0
1,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,1.969838,7.457665,6.584631,0,0,1,0,0,0,2.197767,1.702393,1,0,0,1,1,1,0
2,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,1.460593,1.448189,5.445799,0,1,1,0,1,0,1.698011,5.022553,1,1,1,0,1,1,0


### 2) Feature groups
- **Categorical (coded):** `gender`, `ethnicity`, `education_level`  
- **Continuous:** `age`, `bmi`, `physical_activity`, `diet_quality`, `sleep_quality`,  
  `pollution_exposure`, `pollen_exposure`, `dust_exposure`,  
  `lung_function_fev1`, `lung_function_fvc`
- **Binary indicators:** remaining 0/1 features (e.g., symptoms, allergy/history flags)

> Rationale: handling types differently lets us apply the right transformations (one‑hot vs. scaling vs. passthrough).

In [4]:
target = "diagnosis"
assert target in df.columns, f"Expected '{target}' in {df.columns.tolist()}"

cat_cols = [c for c in ["gender", "ethnicity", "education_level"] if c in df.columns]

cont_cols = [c for c in [
    "age","bmi","physical_activity","diet_quality","sleep_quality",
    "pollution_exposure","pollen_exposure","dust_exposure",
    "lung_function_fev1","lung_function_fvc"
] if c in df.columns]

binary_cols = [
    c for c in df.columns
    if c not in cat_cols + cont_cols + [target]
    and set(df[c].unique()).issubset({0, 1})
]

print("Categorical:", cat_cols)
print("Continuous:", cont_cols)
print("Binary (examples):", binary_cols[:10])

Categorical: ['gender', 'ethnicity', 'education_level']
Continuous: ['age', 'bmi', 'physical_activity', 'diet_quality', 'sleep_quality', 'pollution_exposure', 'pollen_exposure', 'dust_exposure', 'lung_function_fvc']
Binary (examples): ['smoking', 'pet_allergy', 'family_history_asthma', 'history_of_allergies', 'eczema', 'hay_fever', 'gastroesophageal_reflux', 'wheezing', 'shortness_of_breath', 'chest_tightness']


### 3) Feature engineering (additive, non‑destructive)
- **`fev1_fvc_ratio`** = `lung_function_fev1 / lung_function_fvc`  
  *Motivation:* proxy for airflow limitation; clinically relevant in asthma/COPD contexts.
- **`symptom_score`** = sum of common symptom indicators  
  (`wheezing`, `shortness_of_breath`, `chest_tightness`, `coughing`, `nighttime_symptoms`, `exercise_induced`)  
  *Motivation:* a compact signal of symptom burden.
- **`exposure_index`** = mean of (`pollution_exposure`, `pollen_exposure`, `dust_exposure`)  
  *Motivation:* aggregate environmental trigger exposure.

> These features capture interactions/aggregates that may be more predictive than any single raw column.

In [5]:
df_fe = df.copy()

if {"lung_function_fev1","lung_function_fvc"}.issubset(df_fe.columns):
    df_fe["fev1_fvc_ratio"] = (df_fe["lung_function_fev1"] / df_fe["lung_function_fvc"]).replace([np.inf, -np.inf], np.nan)

symptom_cols = [c for c in ["wheezing","shortness_of_breath","chest_tightness","coughing","nighttime_symptoms","exercise_induced"] if c in df_fe.columns]
if symptom_cols:
    df_fe["symptom_score"] = df_fe[symptom_cols].sum(axis=1)

expo_cols = [c for c in ["pollution_exposure","pollen_exposure","dust_exposure"] if c in df_fe.columns]
if expo_cols:
    df_fe["exposure_index"] = df_fe[expo_cols].mean(axis=1)

print("Engineered added:", [c for c in ["fev1_fvc_ratio","symptom_score","exposure_index"] if c in df_fe.columns])
df_fe.head(3)

Engineered added: ['symptom_score', 'exposure_index']


Unnamed: 0,age,gender,ethnicity,education_level,bmi,smoking,physical_activity,diet_quality,sleep_quality,pollution_exposure,pollen_exposure,dust_exposure,pet_allergy,family_history_asthma,history_of_allergies,eczema,hay_fever,gastroesophageal_reflux,lung_function_fev_1,lung_function_fvc,wheezing,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis,symptom_score,exposure_index
0,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,7.388481,2.855578,0.974339,1,1,0,0,0,0,1.369051,4.941206,0,0,1,0,0,1,0,2,3.739466
1,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,1.969838,7.457665,6.584631,0,0,1,0,0,0,2.197767,1.702393,1,0,0,1,1,1,0,4,5.337378
2,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,1.460593,1.448189,5.445799,0,1,1,0,1,0,1.698011,5.022553,1,1,1,0,1,1,0,5,2.78486


### 4) Encoding and scaling
- **One‑hot encode**: `gender`, `ethnicity`, `education_level` (drop first category to avoid multicollinearity).
- **Scale continuous‑like** features with `StandardScaler`:
  - `age`, `bmi`, `physical_activity`, `diet_quality`, `sleep_quality`,  
    `pollution_exposure`, `pollen_exposure`, `dust_exposure`,  
    `lung_function_fev1`, `lung_function_fvc`,  
    engineered: `fev1_fvc_ratio`, `symptom_score`, `exposure_index`
- **Ensure float‑only**: cast all feature columns to `float`; keep `diagnosis` as 0/1 (float).

> Rationale: scaling benefits linear models; it’s harmless for tree‑based models.

In [7]:
df_enc = pd.get_dummies(df_fe, columns=cat_cols, drop_first=True)
print("After one-hot:", df_enc.shape)
df_enc.head(3)

After one-hot: (2392, 33)


Unnamed: 0,age,bmi,smoking,physical_activity,diet_quality,sleep_quality,pollution_exposure,pollen_exposure,dust_exposure,pet_allergy,family_history_asthma,history_of_allergies,eczema,hay_fever,gastroesophageal_reflux,lung_function_fev_1,lung_function_fvc,wheezing,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis,symptom_score,exposure_index,gender_1,ethnicity_1,ethnicity_2,ethnicity_3,education_level_1,education_level_2,education_level_3
0,63,15.848744,0,0.894448,5.488696,8.701003,7.388481,2.855578,0.974339,1,1,0,0,0,0,1.369051,4.941206,0,0,1,0,0,1,0,2,3.739466,False,True,False,False,False,False,False
1,26,22.757042,0,5.897329,6.341014,5.153966,1.969838,7.457665,6.584631,0,0,1,0,0,0,2.197767,1.702393,1,0,0,1,1,1,0,4,5.337378,True,False,True,False,False,True,False
2,57,18.395396,0,6.739367,9.196237,6.840647,1.460593,1.448189,5.445799,0,1,1,0,1,0,1.698011,5.022553,1,1,1,0,1,1,0,5,2.78486,False,False,True,False,True,False,False


In [9]:
to_scale = [c for c in [
    "age","bmi","physical_activity","diet_quality","sleep_quality",
    "pollution_exposure","pollen_exposure","dust_exposure",
    "lung_function_fev1","lung_function_fvc",
    "fev1_fvc_ratio","symptom_score","exposure_index"
] if c in df_enc.columns]

df_scaled = df_enc.copy()
if to_scale:
    scaler = StandardScaler()
    df_scaled[to_scale] = scaler.fit_transform(df_scaled[to_scale])

for col in df_scaled.columns:
    if col != target:
        df_scaled[col] = pd.to_numeric(df_scaled[col], errors="coerce").astype(float)
df_scaled[target] = df_scaled[target].astype(float)

print("Scaled + float-only:", df_scaled.shape)
df_scaled.head(3)

Scaled + float-only: (2392, 33)


Unnamed: 0,age,bmi,smoking,physical_activity,diet_quality,sleep_quality,pollution_exposure,pollen_exposure,dust_exposure,pet_allergy,family_history_asthma,history_of_allergies,eczema,hay_fever,gastroesophageal_reflux,lung_function_fev_1,lung_function_fvc,wheezing,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis,symptom_score,exposure_index,gender_1,ethnicity_1,ethnicity_2,ethnicity_3,education_level_1,education_level_2,education_level_3
0,0.96574,-1.582769,0.0,-1.432099,0.160113,0.971063,0.809355,-0.780866,-1.401921,1.0,1.0,0.0,0.0,0.0,0.0,1.369051,0.920608,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.061248,-0.775459,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,-0.747054,-0.6233,0.0,0.291269,0.453069,-1.076746,-1.036866,0.810184,0.560684,0.0,0.0,1.0,0.0,0.0,0.0,2.197767,-1.564256,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.558213,0.179981,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.687989,-1.229074,0.0,0.58133,1.434458,-0.102976,-1.210374,-1.267434,0.162295,0.0,1.0,1.0,0.0,1.0,0.0,1.698011,0.983019,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.367944,-1.346247,0.0,0.0,1.0,0.0,1.0,0.0,0.0


### 5) Optional univariate selection (Mutual Information)
- Compute **mutual information (MI)** between each feature and `diagnosis`.
- Produce a **Top‑K** subset (e.g., K = 30) for quick experiments.
- Note: MI is univariate; keep the full set for models that capture interactions.

In [None]:
X = df_scaled.drop(columns=[target]).fillna(0.0)
y = df_scaled[target].astype(int).values

discrete_mask = [set(df_scaled[c].dropna().unique()).issubset({0.0,1.0}) for c in X.columns]

mi = mutual_info_classif(X, y, discrete_features=discrete_mask, random_state=42)
mi_series = pd.Series(mi, index=X.columns).sort_values(ascending=False)

top_k = min(30, X.shape[1])
top_features = mi_series.head(top_k).index.tolist()

print("Top 10 MI features:\n", mi_series.head(10))

Top 10 MI features:

 bmi                  0.005352
pollen_exposure      0.005240
diet_quality         0.001788
lung_function_fvc    0.001553
exercise_induced     0.001518
education_level_3    0.000862
education_level_2    0.000840
exposure_index       0.000783
chest_tightness      0.000774
wheezing             0.000376
dtype: float64


### 6) Outputs (saved to `data/processed/`)
- **Full features + target (float‑only):**
`data/processed/asthma_features_all.csv`
- **Separated matrices:**
`data/processed/asthma_features_X.csv`
`data/processed/asthma_features_y.csv`
- **Optional MI subset (Top‑K + target):**
`data/processed/asthma_features_topk.csv`

In [14]:
# 1) Full feature table (features + target)
df_scaled.to_csv(FEATURES_ALL, index=False)

# 2) Separate X/y
X.to_csv(FEATURES_X, index=False)
pd.Series(y, name=target).to_csv(FEATURES_Y, index=False)

# 3) Optional MI subset (with target)
df_topk = df_scaled[top_features + [target]].copy()
df_topk.to_csv(FEATURES_TOP, index=False)

print("Saved:")
print(" -", FEATURES_ALL)
print(" -", FEATURES_X)
print(" -", FEATURES_Y)
print(" -", FEATURES_TOP, "(optional)")

Saved:
 - C:\Users\WALL-E\PycharmProjects\pythonProject\Projects_2024\Data_Science_Course_2025\09_Data_Science_Project_Architecture_Lab\ds_project_arch_lab\data\processed\asthma_features_all.csv
 - C:\Users\WALL-E\PycharmProjects\pythonProject\Projects_2024\Data_Science_Course_2025\09_Data_Science_Project_Architecture_Lab\ds_project_arch_lab\data\processed\asthma_features_X.csv
 - C:\Users\WALL-E\PycharmProjects\pythonProject\Projects_2024\Data_Science_Course_2025\09_Data_Science_Project_Architecture_Lab\ds_project_arch_lab\data\processed\asthma_features_y.csv
 - C:\Users\WALL-E\PycharmProjects\pythonProject\Projects_2024\Data_Science_Course_2025\09_Data_Science_Project_Architecture_Lab\ds_project_arch_lab\data\processed\asthma_features_topk.csv (optional)


### 7) What changed vs. Problem 3/4
- Built **composite/interaction features** (`fev1_fvc_ratio`, `symptom_score`, `exposure_index`).
- Converted the dataset to a **purely numeric, floats‑only** table.
- Prepared **encoded & scaled** features suitable for a variety of models.
- (Optionally) produced a **feature‑selected** view via MI for rapid prototyping.
