In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)

# =========================
# 1. Load raw data
# =========================
# Read the CSV and strip whitespace from the header so that padded column
# names cannot defeat the schema check below.
DATA_PATH = "../heart.csv"   # change this if your CSV is elsewhere
df = pd.read_csv(DATA_PATH)
df.columns = df.columns.str.strip()

# Fail fast if any column used later in this cell is absent from the file.
expected = [
    "Age", "Sex", "ChestPainType", "RestingBP", "Cholesterol", "FastingBS",
    "RestingECG", "MaxHR", "ExerciseAngina", "Oldpeak", "ST_Slope", "HeartDisease"
]
missing = [c for c in expected if c not in df.columns]
if missing:
    raise ValueError(f"missing column(s): {missing}")

print("Raw data info():")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line to the output.
df.info()
print("-" * 30)
print("Number of missing values per column:")
print(df.isna().sum())

# =========================
# 2. Build X, y
# =========================
# The target is the HeartDisease column; every other column is a feature.
y = df["HeartDisease"].astype(int)
X = df.drop(columns=["HeartDisease"]).copy()

# =========================
# 3. Manual encoding for categorical features
# =========================
# Each text column is replaced by the integer code given in its mapping.
# NOTE(review): ChestPainType and RestingECG each become a single integer
# column, so a linear model sees their codes as ordered magnitudes - confirm
# this is intended (one-hot encoding is the usual alternative).

# ChestPainType: ATA, NAP, ASY, TA -> 0,1,2,3
chestPainType = {"ATA": 0, "NAP": 1, "ASY": 2, "TA": 3}
# RestingECG: Normal, ST, LVH -> 0,1,2
restingECGType = {"Normal": 0, "ST": 1, "LVH": 2}
# ST_Slope: Up, Flat, Down -> 0,1,2
slope = {"Up": 0, "Flat": 1, "Down": 2}

category_codes = {
    "Sex": {"M": 1, "F": 0},             # Sex: M/F -> 1/0
    "ExerciseAngina": {"Y": 1, "N": 0},  # ExerciseAngina: Y/N -> 1/0
    "ChestPainType": chestPainType,
    "RestingECG": restingECGType,
    "ST_Slope": slope,
}

for col_name, codes in category_codes.items():
    labels = X[col_name].str.strip()
    encoded = labels.map(codes)
    # Series.map yields NaN for any label that is not a key of the mapping
    # (and for missing values); the integer cast would then fail with an error
    # that names neither the column nor the offending value. Report both.
    unmapped = encoded.isna()
    if unmapped.any():
        bad_labels = labels[unmapped].drop_duplicates().tolist()
        raise ValueError(
            f"column {col_name!r} has value(s) with no encoding: {bad_labels}"
        )
    X[col_name] = encoded.astype(int)

# Make sure FastingBS is integer (0/1)
X["FastingBS"] = X["FastingBS"].astype(int)

print("\nEncoded X.head():")
print(X.head())
print("\nX shape:", X.shape)
print("y shape:", y.shape)

# =========================
# 4. (Optional) export encoded data to CSV
# =========================
# All artefacts go under ./processed, which is created if it does not exist.
out_dir = Path("processed")
out_dir.mkdir(exist_ok=True)

X_path, y_path, cols_path = (
    out_dir / file_name
    for file_name in ("X_encoded.csv", "y.csv", "feature_names.txt")
)

X.to_csv(X_path, index=False)
y.to_csv(y_path, index=False, header=["HeartDisease"])

# One feature name per line, in the column order of X.
with open(cols_path, "w", encoding="utf-8") as names_file:
    names_file.writelines(column + "\n" for column in X.columns)

print(
    "\nData cleaning and encoding completed.",
    f"X_encoded shape: {X.shape} -> {X_path}",
    f"y shape        : {y.shape} -> {y_path}",
    sep="\n",
)
print("Feature names saved to:", cols_path)

# =========================
# 5. Logistic Regression Baseline (hold-out test)
# =========================
# 80/20 hold-out split, stratified on y to preserve the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# fit() returns the estimator, so construction and training are chained.
log_reg = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

y_pred_baseline = log_reg.predict(X_test)

# Accuracy, plus precision / recall / F1 computed for the positive label (1).
baseline_acc = accuracy_score(y_test, y_pred_baseline)
precision_pos, recall_pos, f1_pos = (
    metric(y_test, y_pred_baseline, pos_label=1)
    for metric in (precision_score, recall_score, f1_score)
)

print(
    "\n=== Logistic Regression Baseline (Hold-out Test) ===",
    f"Accuracy        : {baseline_acc:.4f}",
    f"Precision (pos) : {precision_pos:.4f}",
    f"Recall    (pos) : {recall_pos:.4f}",
    f"F1-score  (pos) : {f1_pos:.4f}",
    sep="\n",
)

print("\nClassification report:")
class_names = ["Normal (0)", "HeartDisease (1)"]
print(classification_report(y_test, y_pred_baseline, target_names=class_names))

# =========================
# 6. K-Fold cross-validation (accuracy only)
# =========================
k = 5  # change to 10 for 10-fold CV, etc.
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# One accuracy value per stratified fold, computed over the full X / y.
cv_scores = cross_val_score(log_reg, X, y, cv=skf, scoring="accuracy")

print(f"\n=== Logistic Regression {k}-Fold CV ===")
print("Each fold accuracy:", np.round(cv_scores, 4))
print("Mean CV accuracy :", np.round(cv_scores.mean(), 4))
print("Std  CV accuracy :", np.round(cv_scores.std(), 4))


Raw data info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB
None
------------------------------
Number of missing values per column:
Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS   

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)

# =========================
# 1. Load raw data
# =========================
# Read the CSV and strip whitespace from the header so that padded column
# names cannot defeat the schema check below.
DATA_PATH = "../heart.csv"   # change this if your CSV is elsewhere
df = pd.read_csv(DATA_PATH)
df.columns = df.columns.str.strip()

# Fail fast if any column used later in this cell is absent from the file.
expected = [
    "Age", "Sex", "ChestPainType", "RestingBP", "Cholesterol", "FastingBS",
    "RestingECG", "MaxHR", "ExerciseAngina", "Oldpeak", "ST_Slope", "HeartDisease"
]
missing = [c for c in expected if c not in df.columns]
if missing:
    raise ValueError(f"missing column(s): {missing}")

print("Raw data info():")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line to the output.
df.info()
print("-" * 30)
print("Number of missing values per column:")
print(df.isna().sum())

# =========================
# 2. Build X, y
# =========================
# The target is the HeartDisease column; every other column is a feature.
y = df["HeartDisease"].astype(int)
X = df.drop(columns=["HeartDisease"]).copy()

# =========================
# 3. Manual encoding for categorical features
# =========================
# Each text column is replaced by the integer code given in its mapping.
# NOTE(review): ChestPainType and RestingECG each become a single integer
# column, so a linear model sees their codes as ordered magnitudes - confirm
# this is intended (one-hot encoding is the usual alternative).

# ChestPainType: ATA, NAP, ASY, TA -> 0,1,2,3
chestPainType = {"ATA": 0, "NAP": 1, "ASY": 2, "TA": 3}
# RestingECG: Normal, ST, LVH -> 0,1,2
restingECGType = {"Normal": 0, "ST": 1, "LVH": 2}
# ST_Slope: Up, Flat, Down -> 0,1,2
slope = {"Up": 0, "Flat": 1, "Down": 2}

category_codes = {
    "Sex": {"M": 1, "F": 0},             # Sex: M/F -> 1/0
    "ExerciseAngina": {"Y": 1, "N": 0},  # ExerciseAngina: Y/N -> 1/0
    "ChestPainType": chestPainType,
    "RestingECG": restingECGType,
    "ST_Slope": slope,
}

for col_name, codes in category_codes.items():
    labels = X[col_name].str.strip()
    encoded = labels.map(codes)
    # Series.map yields NaN for any label that is not a key of the mapping
    # (and for missing values); the integer cast would then fail with an error
    # that names neither the column nor the offending value. Report both.
    unmapped = encoded.isna()
    if unmapped.any():
        bad_labels = labels[unmapped].drop_duplicates().tolist()
        raise ValueError(
            f"column {col_name!r} has value(s) with no encoding: {bad_labels}"
        )
    X[col_name] = encoded.astype(int)

# Make sure FastingBS is integer (0/1)
X["FastingBS"] = X["FastingBS"].astype(int)

print("\nEncoded X.head():")
print(X.head())
print("\nX shape:", X.shape)
print("y shape:", y.shape)

# =========================
# 4. (Optional) export encoded data to CSV
# =========================
# All artefacts go under ./processed, which is created if it does not exist.
out_dir = Path("processed")
out_dir.mkdir(exist_ok=True)

X_path, y_path, cols_path = (
    out_dir / file_name
    for file_name in ("X_encoded.csv", "y.csv", "feature_names.txt")
)

X.to_csv(X_path, index=False)
y.to_csv(y_path, index=False, header=["HeartDisease"])

# One feature name per line, in the column order of X.
with open(cols_path, "w", encoding="utf-8") as names_file:
    names_file.writelines(column + "\n" for column in X.columns)

print(
    "\nData cleaning and encoding completed.",
    f"X_encoded shape: {X.shape} -> {X_path}",
    f"y shape        : {y.shape} -> {y_path}",
    sep="\n",
)
print("Feature names saved to:", cols_path)

# =====================================================
# 5. Logistic Regression Baseline (hold-out + K-Fold)
# =====================================================
# 80/20 hold-out split, stratified on y to preserve the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# fit() returns the estimator, so construction and training are chained.
log_reg = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

y_pred_baseline = log_reg.predict(X_test)

# Accuracy, plus precision / recall / F1 computed for the positive label (1).
baseline_acc = accuracy_score(y_test, y_pred_baseline)
precision_pos, recall_pos, f1_pos = (
    metric(y_test, y_pred_baseline, pos_label=1)
    for metric in (precision_score, recall_score, f1_score)
)

print(
    "\n=== Logistic Regression Baseline (Hold-out Test) ===",
    f"Accuracy        : {baseline_acc:.4f}",
    f"Precision (pos) : {precision_pos:.4f}",
    f"Recall    (pos) : {recall_pos:.4f}",
    f"F1-score  (pos) : {f1_pos:.4f}",
    sep="\n",
)

print("\nClassification report (Logistic Regression):")
logreg_class_names = ["Normal (0)", "HeartDisease (1)"]
print(classification_report(y_test, y_pred_baseline, target_names=logreg_class_names))

# K-Fold cross-validation (accuracy only)
k = 5  # change to 10 for 10-fold CV, etc.
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# One accuracy value per stratified fold, computed over the full X / y.
cv_scores = cross_val_score(log_reg, X, y, cv=skf, scoring="accuracy")

print(f"\n=== Logistic Regression {k}-Fold CV ===")
print("Each fold accuracy:", np.round(cv_scores, 4))
print("Mean CV accuracy :", np.round(cv_scores.mean(), 4))
print("Std  CV accuracy :", np.round(cv_scores.std(), 4))

# =====================================================
# 6. Linear Regression Baseline (treated as classifier)
# =====================================================
# LinearRegression yields a continuous score per row; scores at or above 0.5
# are labelled 1 and everything else 0.

# Same split arguments as the logistic baseline (80/20, seed 42, stratified).
X_train_linreg, X_test_linreg, y_train_linreg, y_test_linreg = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# fit() returns the estimator, so construction and training are chained.
lin_reg = LinearRegression().fit(X_train_linreg, y_train_linreg)

# Continuous predictions, then threshold at 0.5
y_pred_cont = lin_reg.predict(X_test_linreg)
above_threshold = y_pred_cont >= 0.5
y_pred_lincls = above_threshold.astype(int)

# Accuracy, plus precision / recall / F1 computed for the positive label (1).
acc_lin = accuracy_score(y_test_linreg, y_pred_lincls)
precision_lin, recall_lin, f1_lin = (
    metric(y_test_linreg, y_pred_lincls, pos_label=1)
    for metric in (precision_score, recall_score, f1_score)
)

print(
    "\n=== Linear Regression Baseline (0.5-threshold classifier) ===",
    f"Accuracy        : {acc_lin:.4f}",
    f"Precision (pos) : {precision_lin:.4f}",
    f"Recall    (pos) : {recall_lin:.4f}",
    f"F1-score  (pos) : {f1_lin:.4f}",
    sep="\n",
)

print("\nClassification report (Linear Regression as classifier):")
linreg_class_names = ["Normal (0)", "HeartDisease (1)"]
print(classification_report(y_test_linreg, y_pred_lincls, target_names=linreg_class_names))


Raw data info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB
None
------------------------------
Number of missing values per column:
Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS   