# 🩺 Exploratory Data Analysis — Combined UCI Heart Dataset

This notebook performs **clinical-style EDA** on the merged cross-regional heart disease dataset:

- Cleveland  
- Hungarian  
- Switzerland  
- VA (Long Beach)

It reads the unified file `data/processed/combined_uci_heart.csv`, resolved relative to the parent of the notebook's working directory.


In [None]:
import matplotlib
matplotlib.use("Agg")  # safe backend
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pandas as pd
import numpy as np

%matplotlib inline
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_palette("deep")

# Paths
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
DATA_DIR = os.path.join(BASE_DIR, "data", "processed")
FIG_DIR = os.path.join(BASE_DIR, "experiments", "figures")
os.makedirs(FIG_DIR, exist_ok=True)

csv_path = os.path.join(DATA_DIR, "combined_uci_heart.csv")
df = pd.read_csv(csv_path)

print("✅ Loaded:", csv_path)
print("Shape:", df.shape)
df.head()


## 📊 Dataset overview

In [None]:
# Column inventory of the loaded frame.
print("Columns:", list(df.columns))

# Number of rows per value of the `source` column.
print("\nSource distribution:")
source_counts = df["source"].value_counts()
print(source_counts)

# Summary statistics for every column (numeric and non-numeric), transposed
# so that each row of the displayed table describes one column of `df`.
print("\nBasic stats:")
summary_table = df.describe(include="all").T
display(summary_table)


## 🩹 Missingness and data quality

In [None]:
# Heatmap of the boolean missing-value mask: one row per record, one column
# per field of `df`.
fig, ax = plt.subplots(figsize=(10, 4))
sns.heatmap(df.isna(), cbar=False, ax=ax)
ax.set_title("Missing Value Heatmap")
fig.tight_layout()

save_path = os.path.join(FIG_DIR, "combined_missingness_heatmap.png")
fig.savefig(save_path, dpi=200)
plt.show()
print("💾 Saved:", save_path)

# Count of missing cells per column, largest first.
missing_by_col = df.isna().sum().sort_values(ascending=False)
print("Missing values per column:")
print(missing_by_col)


## ⚖️ Class distribution (heart disease vs no disease)

In [None]:
# Bar chart of how many records fall into each value of `target`.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(data=df, x="target", ax=ax)
ax.set_title("Class Distribution (0 = No Disease, 1 = Disease)")
ax.set_xlabel("Target")
ax.set_ylabel("Count")

save_path = os.path.join(FIG_DIR, "combined_class_distribution.png")
fig.savefig(save_path, dpi=200)
plt.show()
print("💾 Saved:", save_path)

# Same information as relative frequencies.
print("Proportions:")
class_shares = df["target"].value_counts(normalize=True)
print(class_shares)


## 📈 Feature distributions — key clinical variables

In [None]:
core_features = ["age", "trestbps", "chol", "thalach", "oldpeak"]

# Only plot the features that are actually present in the loaded frame.
present_features = [name for name in core_features if name in df.columns]

for col in present_features:
    # One density histogram per feature, drawn separately for each target
    # value; common_norm=False normalises each target group on its own.
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.histplot(
        data=df,
        x=col,
        hue="target",
        element="step",
        stat="density",
        common_norm=False,
        ax=ax,
    )
    ax.set_title(f"Distribution of {col} by Target")
    fig.tight_layout()

    save_path = os.path.join(FIG_DIR, f"dist_{col}.png")
    fig.savefig(save_path, dpi=200)
    plt.show()
    print("💾 Saved:", save_path)


## 👩‍⚕️ Sex-based clinical differences

In [None]:
# The whole section is skipped when the frame has no `sex` column.
if "sex" in df.columns:
    sex_axis_label = "Sex (0 = female, 1 = male)"

    # Record counts per sex, split by target value.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df, x="sex", hue="target", ax=ax)
    ax.set_title("Sex vs Heart Disease")
    ax.set_xlabel(sex_axis_label)
    save_path = os.path.join(FIG_DIR, "sex_vs_target.png")
    fig.savefig(save_path, dpi=200)
    plt.show()
    print("💾 Saved:", save_path)

    # Box plot of each available measurement, grouped by sex.
    measurements = [
        name for name in ["age", "chol", "trestbps", "thalach"] if name in df.columns
    ]
    for col in measurements:
        fig, ax = plt.subplots(figsize=(6, 3))
        sns.boxplot(data=df, x="sex", y=col, ax=ax)
        ax.set_title(f"{col} by Sex")
        ax.set_xlabel(sex_axis_label)
        fig.tight_layout()
        save_path = os.path.join(FIG_DIR, f"box_{col}_by_sex.png")
        fig.savefig(save_path, dpi=200)
        plt.show()
        print("💾 Saved:", save_path)


## 🔬 Clinical scatter plots

In [None]:
# Each entry describes one scatter plot: (x column, y column, title, file name).
#   1. Age vs Cholesterol
#   2. Resting BP vs Max Heart Rate
scatter_specs = [
    ("age", "chol", "Age vs Cholesterol by Disease Status", "scatter_age_chol.png"),
    (
        "trestbps",
        "thalach",
        "Resting BP vs Max Heart Rate by Disease Status",
        "scatter_trestbps_thalach.png",
    ),
]

for x_col, y_col, plot_title, file_name in scatter_specs:
    # A plot is drawn only when both of its columns exist in the frame.
    if not {x_col, y_col}.issubset(df.columns):
        continue

    fig, ax = plt.subplots(figsize=(6, 5))
    sns.scatterplot(data=df, x=x_col, y=y_col, hue="target", alpha=0.7, ax=ax)
    ax.set_title(plot_title)

    save_path = os.path.join(FIG_DIR, file_name)
    fig.savefig(save_path, dpi=200)
    plt.show()
    print("💾 Saved:", save_path)


## 🔗 Correlation matrix (core medical features)

In [None]:
# Candidate columns for the correlation analysis; only those present in the
# loaded frame are kept. `core_cols` and `corr` are reused by later cells.
candidate_cols = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target",
]
core_cols = [name for name in candidate_cols if name in df.columns]

# Pairwise correlation matrix (pandas default method) of the kept columns.
corr = df[core_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
# center=0 puts zero correlation at the midpoint of the diverging colour map.
sns.heatmap(corr, annot=False, cmap="coolwarm", center=0, ax=ax)
ax.set_title("Correlation Heatmap — Core Clinical Features")
fig.tight_layout()

save_path = os.path.join(FIG_DIR, "corr_heatmap_core.png")
fig.savefig(save_path, dpi=200)
plt.show()
print("💾 Saved:", save_path)

# Correlation of every kept column with `target`, strongest positive first.
print("Correlation with target:")
target_corr = corr["target"].sort_values(ascending=False)
display(target_corr)


## 🧬 Clustered correlation (feature grouping)

In [None]:
# Clustered heatmap of `corr`, the correlation matrix built in the
# correlation-matrix cell. clustermap creates and returns its own grid/figure,
# so keep a handle on it instead of relying on pyplot's "current figure".
cluster_grid = sns.clustermap(corr, cmap="coolwarm", center=0, figsize=(8, 8))
cluster_grid.figure.suptitle("Clustered Correlation — Core Clinical Features", y=1.02)

save_path = os.path.join(FIG_DIR, "corr_clustermap_core.png")
# The title is anchored at y=1.02, i.e. above the top edge of the figure
# canvas. A plain savefig crops to the canvas and cuts the title off in the
# PNG (the inline display hides this because it already uses a tight bounding
# box). bbox_inches="tight" expands the saved area to include the title.
cluster_grid.savefig(save_path, dpi=200, bbox_inches="tight")
plt.show()
print("💾 Saved:", save_path)


## 🧠 PCA visualization (first 2 components)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Project the core features (everything in `core_cols` except the label)
# onto their first two principal components and colour the points by target.
features_no_target = [c for c in core_cols if c != "target"]

# PCA rejects NaN input with a ValueError, while StandardScaler passes NaNs
# straight through, so missing values must be handled before scaling.
# Columns with no observed values carry no information and cannot be imputed,
# so they are dropped; remaining gaps are filled with the column median.
# This imputation is for visualisation only. When the data has no missing
# values both steps are no-ops and the result equals a plain PCA on the frame.
X_frame = df[features_no_target].dropna(axis=1, how="all")
n_incomplete_rows = int(X_frame.isna().any(axis=1).sum())
if n_incomplete_rows:
    print(
        f"ℹ️ Median-imputing missing values in {n_incomplete_rows} "
        f"of {len(X_frame)} rows before PCA"
    )
X_frame = X_frame.fillna(X_frame.median())

X = X_frame.values
y = df["target"].values

# Standardise each feature so that PCA is not dominated by the features with
# the largest numeric scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Rows of X_pca are in the same order as the rows of df, so the label array
# can be attached positionally.
pca_df = pd.DataFrame(X_pca, columns=["PC1", "PC2"])
pca_df["target"] = y

fig, ax = plt.subplots(figsize=(7, 6))
sns.scatterplot(data=pca_df, x="PC1", y="PC2", hue="target", alpha=0.8, ax=ax)
ax.set_title("PCA Projection (PC1 vs PC2) — Combined Dataset")
save_path = os.path.join(FIG_DIR, "pca_pc1_pc2.png")
fig.savefig(save_path, dpi=200)
plt.show()
print("💾 Saved:", save_path)

# Share of total variance captured by PC1 and PC2 respectively.
print("Explained variance ratio:", pca.explained_variance_ratio_)
