In [49]:
import joblib
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [50]:
# Read the raw table, discard the identifier/source columns plus `chol` and
# `fbs`, and keep only rows that have no missing values.
df = (
    pd.read_csv("./heart.csv")
    .drop(columns=["id", "dataset", "chol", "fbs"])
    .dropna(how="any")
)
df

Unnamed: 0,age,sex,cp,trestbps,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,41,Female,atypical angina,130.0,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...
299,68,Male,asymptomatic,144.0,normal,141.0,False,3.4,flat,2.0,reversable defect,2
300,57,Male,asymptomatic,130.0,normal,115.0,True,1.2,flat,1.0,reversable defect,3
301,57,Female,atypical angina,130.0,lv hypertrophy,174.0,False,0.0,flat,1.0,normal,1
508,47,Male,asymptomatic,150.0,normal,98.0,True,1.5,flat,0.0,reversable defect,1


In [51]:
# Columns replaced by integer codes. This includes the numeric `ca` column and
# the `num` label, not only the string-valued columns.
columns_to_encode = ["thal", "ca", "slope", "exang", "restecg", "cp", "sex", "num"]

# One encoder per column, kept by column name so the codes can be mapped back.
# NOTE(review): the encoders are fitted on this file's values — confirm they
# produce the same codes the loaded model was trained with.
label_encoders = {col: LabelEncoder() for col in columns_to_encode}

for col, le in label_encoders.items():
    # Fit on the column and overwrite it with the resulting integer codes.
    df[col] = le.fit_transform(df[col])

df

Unnamed: 0,age,sex,cp,trestbps,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,1,3,145.0,0,150.0,0,2.3,0,0,0,0
1,67,1,0,160.0,0,108.0,1,1.5,1,3,1,2
2,67,1,0,120.0,0,129.0,1,2.6,1,2,2,1
3,37,1,2,130.0,1,187.0,0,3.5,0,0,1,0
4,41,0,1,130.0,0,172.0,0,1.4,2,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
299,68,1,0,144.0,1,141.0,0,3.4,1,2,2,2
300,57,1,0,130.0,1,115.0,1,1.2,1,1,2,3
301,57,0,1,130.0,0,174.0,0,0.0,1,1,1,1
508,47,1,0,150.0,1,98.0,1,1.5,1,0,2,1


In [52]:
# Collapse `num` into a 0/1 `target` column (1 whenever num > 0) and drop the
# original `num` column in the same step.
df = df.assign(target=(df["num"] > 0) * 1).drop(columns=["num"])
df

Unnamed: 0,age,sex,cp,trestbps,restecg,thalch,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145.0,0,150.0,0,2.3,0,0,0,0
1,67,1,0,160.0,0,108.0,1,1.5,1,3,1,1
2,67,1,0,120.0,0,129.0,1,2.6,1,2,2,1
3,37,1,2,130.0,1,187.0,0,3.5,0,0,1,0
4,41,0,1,130.0,0,172.0,0,1.4,2,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
299,68,1,0,144.0,1,141.0,0,3.4,1,2,2,1
300,57,1,0,130.0,1,115.0,1,1.2,1,1,2,1
301,57,0,1,130.0,0,174.0,0,0.0,1,1,1,1
508,47,1,0,150.0,1,98.0,1,1.5,1,0,2,1


In [53]:
# Create an empty classifier and populate it from the saved JSON model file.
xgb_model = xgb.XGBClassifier()
xgb_model.load_model("../models/best_model_xgboost.json")

In [54]:
# Load the dictionary of per-column MinMaxScaler objects stored in ../models.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load scalers.pkl from a trusted source.
min_max_scalers: dict[str, MinMaxScaler] = joblib.load("../models/scalers.pkl")
min_max_scalers

{'oldpeak': MinMaxScaler(),
 'thalch': MinMaxScaler(),
 'chol': MinMaxScaler(),
 'trestbps': MinMaxScaler(),
 'age': MinMaxScaler()}

In [55]:
# Continuous columns that have a matching entry in `min_max_scalers`.
columns_to_scale = ["oldpeak", "thalch", "trestbps", "age"]

# Work on a copy so `df` keeps the unscaled values.
scaled = df.copy()
for col in columns_to_scale:
    # The scalers come from scalers.pkl; they are only applied here, not refitted.
    fitted_scaler = min_max_scalers[col]
    scaled[col] = fitted_scaler.transform(scaled[[col]])

scaled

Unnamed: 0,age,sex,cp,trestbps,restecg,thalch,exang,oldpeak,slope,ca,thal,target
0,0.714286,1,3,0.490741,0,0.609023,0,0.370968,0,0,0,0
1,0.795918,1,0,0.629630,0,0.293233,1,0.241935,1,3,1,1
2,0.795918,1,0,0.259259,0,0.451128,1,0.419355,1,2,2,1
3,0.183673,1,2,0.351852,1,0.887218,0,0.564516,0,0,1,0
4,0.265306,0,1,0.351852,0,0.774436,0,0.225806,2,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
299,0.816327,1,0,0.481481,1,0.541353,0,0.548387,1,2,2,1
300,0.591837,1,0,0.351852,1,0.345865,1,0.193548,1,1,2,1
301,0.591837,0,1,0.351852,0,0.789474,0,0.000000,1,1,1,1
508,0.387755,1,0,0.537037,1,0.218045,1,0.241935,1,0,2,1


In [56]:
def test(predictions, targets):
    tp, fp, tn, fn = 0 ,0, 0, 0
    for prediction, target in zip(predictions, targets):
        if prediction == 1 and target == 1:
            tp += 1
        elif prediction == 1 and target == 0:
            fp += 1
        elif prediction == 0 and target == 0:
            tn += 1
        elif prediction == 0 and target == 1:
            fn += 1
    precision = tp / (tp + fp) * 100 if tp + fp > 0 else 0
    recall = tp / (tp + fn) * 100 if tp + fn > 0 else 0
    print(f"{precision = :.2f}%, {recall = :.2f}%")

In [58]:
# Predict from every column except the label, then print precision and recall
# against that label.
features = scaled.drop(columns=["target"])
predictions = xgb_model.predict(features)

test(predictions, scaled["target"])

precision = 91.79%, recall = 88.49%


In [60]:
# Attach the predictions to the unscaled frame so they can be read next to
# `target`. `scaled` was copied from `df`, so the row order matches.
df['pred'] = predictions
df

Unnamed: 0,age,sex,cp,trestbps,restecg,thalch,exang,oldpeak,slope,ca,thal,target,pred
0,63,1,3,145.0,0,150.0,0,2.3,0,0,0,0,0
1,67,1,0,160.0,0,108.0,1,1.5,1,3,1,1,1
2,67,1,0,120.0,0,129.0,1,2.6,1,2,2,1,1
3,37,1,2,130.0,1,187.0,0,3.5,0,0,1,0,0
4,41,0,1,130.0,0,172.0,0,1.4,2,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,68,1,0,144.0,1,141.0,0,3.4,1,2,2,1,1
300,57,1,0,130.0,1,115.0,1,1.2,1,1,2,1,1
301,57,0,1,130.0,0,174.0,0,0.0,1,1,1,1,1
508,47,1,0,150.0,1,98.0,1,1.5,1,0,2,1,1
