In [None]:
# Module Imported 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

In [None]:
import pandas as pd
from pathlib import Path

# ===== 1) read data =====
DATA_PATH = "dataSet/heart.csv"   
df = pd.read_csv(DATA_PATH)
df.columns = df.columns.str.strip()

# ===== 2) basic check =====
expected = ['Age','Sex','ChestPainType','RestingBP','Cholesterol','FastingBS',
            'RestingECG','MaxHR','ExerciseAngina','Oldpeak','ST_Slope','HeartDisease']
missing = [c for c in expected if c not in df.columns]
if missing:
    raise ValueError(f"缺少列: {missing}")

# ===== 3) target y =====
y = df['HeartDisease'].astype(int) #*********这个就是处理好的y**********

# ===== 4) features X=====
X = df.drop(columns=['HeartDisease']).copy()

# ===== 5) change it to 0/1 =====
# Sex: M/F -> 1/0
if X['Sex'].dtype == object:
    X['Sex'] = X['Sex'].str.strip().map({'M': 1, 'F': 0}).astype(int)

# ExerciseAngina: Y/N -> 1/0
if X['ExerciseAngina'].dtype == object:
    X['ExerciseAngina'] = X['ExerciseAngina'].str.strip().map({'Y': 1, 'N': 0}).astype(int)

# FastingBS 
X['FastingBS'] = X['FastingBS'].astype(int)

# ===== 6) do one-hot =====
cat_cols = []
for c in ['ChestPainType', 'RestingECG', 'ST_Slope']:
    if c in X.columns and X[c].dtype == object:
        cat_cols.append(c)

X_encoded = pd.get_dummies(X, columns=cat_cols, prefix=cat_cols, drop_first=False)  #*****这个就是X处理好的可以直接用******


# ===== 7) save output =====
out_dir = Path("processed")
out_dir.mkdir(exist_ok=True)

X_path = out_dir / "X_encoded.csv"
y_path = out_dir / "y.csv"
cols_path = out_dir / "feature_names.txt"

X_encoded.to_csv(X_path, index=False)
y.to_csv(y_path, index=False, header=['HeartDisease'])

with open(cols_path, "w", encoding="utf-8") as f:
    for c in X_encoded.columns:
        f.write(c + "\n")

print("Data clean is done")
print(f"X_encoded shap: {X_encoded.shape} -> {X_path}")
print(f"y shap        : {y.shape} -> {y_path}")
print("Has been saved at:", cols_path)



✅ 数据清洗与编码完成
X_encoded 形状: (918, 18) -> processed\X_encoded.csv
y 形状        : (918,) -> processed\y.csv
特征列已保存到: processed\feature_names.txt


In [2]:
X_encoded

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289,0,172,0,0.0,False,True,False,False,False,True,False,False,False,True
1,49,0,160,180,0,156,0,1.0,False,False,True,False,False,True,False,False,True,False
2,37,1,130,283,0,98,0,0.0,False,True,False,False,False,False,True,False,False,True
3,48,0,138,214,0,108,1,1.5,True,False,False,False,False,True,False,False,True,False
4,54,1,150,195,0,122,0,0.0,False,False,True,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,110,264,0,132,0,1.2,False,False,False,True,False,True,False,False,True,False
914,68,1,144,193,1,141,0,3.4,True,False,False,False,False,True,False,False,True,False
915,57,1,130,131,0,115,1,1.2,True,False,False,False,False,True,False,False,True,False
916,57,0,130,236,0,174,0,0.0,False,True,False,False,True,False,False,False,True,False
