In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("..\\Data\\heart_disease_uci.csv")

In [3]:
df.columns

Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')

In [4]:
df = df.drop(columns=['id', 'dataset'])

In [5]:
df.head(3)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1


In [6]:
df.isna().sum()

age           0
sex           0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

In [7]:
df = df.drop(columns=['slope','ca','thal'])

In [8]:
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalch',
       'exang', 'oldpeak', 'num'],
      dtype='object')

In [9]:
(df == 0).sum()


age           0
sex           0
cp            0
trestbps      1
chol        172
fbs         692
restecg       0
thalch        0
exang       528
oldpeak     370
num         411
dtype: int64

In [10]:
df.fillna({"trestbps":df["trestbps"].median()}, inplace=True)


In [11]:
df.fillna({"chol":df["chol"].mean()}, inplace=True)
df.fillna({"thalch":df["thalch"].mean()}, inplace=True)
df.fillna({"oldpeak":df["oldpeak"].mean()}, inplace=True)
df.fillna({"fbs":df["fbs"].mode()[0]}, inplace=True)
df.fillna({"exang":df["exang"].mode()[0]}, inplace=True)
df.fillna({"restecg":df["restecg"].mode()[0]}, inplace=True)

  df.fillna({"fbs":df["fbs"].mode()[0]}, inplace=True)
  df.fillna({"exang":df["exang"].mode()[0]}, inplace=True)


In [12]:
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
num         0
dtype: int64

In [13]:
(df == 0).sum()


age           0
sex           0
cp            0
trestbps      1
chol        172
fbs         782
restecg       0
thalch        0
exang       583
oldpeak     370
num         411
dtype: int64

In [14]:
import numpy as np

df.replace({"trestbps":0}, {"trestbps":np.nan}, inplace=True)
df.replace({"chol":0}, {"chol":np.nan}, inplace=True)

In [15]:
df.fillna({"trestbps": df["trestbps"].median()}, inplace=True)
df.fillna({"chol": df["chol"].mean()}, inplace=True)

In [16]:
df["sex"] = df["sex"].map({'Male':1, 'Female':0})
df["fbs"] = df["fbs"].map({True:1, False:0})
df["exang"] = df["exang"].map({True:1, False:0})
df = pd.get_dummies(df, columns=['cp', 'restecg'], drop_first=True)

In [17]:
df.head(3)

Unnamed: 0,age,sex,trestbps,chol,fbs,thalch,exang,oldpeak,num,cp_atypical angina,cp_non-anginal,cp_typical angina,restecg_normal,restecg_st-t abnormality
0,63,1,145.0,233.0,1,150.0,0,2.3,0,False,False,True,False,False
1,67,1,160.0,286.0,0,108.0,1,1.5,2,False,False,False,False,False
2,67,1,120.0,229.0,0,129.0,1,2.6,1,False,False,False,False,False


In [18]:
df["cp_atypical angina"] = df["cp_atypical angina"].map({True:1, False:0})
df["cp_non-anginal"] = df["cp_non-anginal"].map({True:1, False:0})
df["cp_typical angina"] = df["cp_typical angina"].map({True:1, False:0})
df["restecg_normal"] = df["restecg_normal"].map({True:1, False:0})
df["restecg_st-t abnormality"] = df["restecg_st-t abnormality"].map({True:1, False:0})

In [19]:
df["target"] = (df["num"] > 0).astype(int)
df = df.drop(columns=["num"])


In [20]:
df.head(3)

Unnamed: 0,age,sex,trestbps,chol,fbs,thalch,exang,oldpeak,cp_atypical angina,cp_non-anginal,cp_typical angina,restecg_normal,restecg_st-t abnormality,target
0,63,1,145.0,233.0,1,150.0,0,2.3,0,0,1,0,0,0
1,67,1,160.0,286.0,0,108.0,1,1.5,0,0,0,0,0,1
2,67,1,120.0,229.0,0,129.0,1,2.6,0,0,0,0,0,1


## Now, we move into the Modeling.
### Since our dataset is small, we are gonna use Cross-Validation approach to train our model.

In [21]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression


In [22]:
log_reg = LogisticRegression(max_iter=1000)


In [23]:
X = df.drop("target", axis=1)
y=pd.DataFrame()
y["target"] = df["target"]


In [24]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(log_reg, X, y, cv=cv, scoring="accuracy")

print("Accuracies for each fold:", scores)
print("Mean accuracy:", np.mean(scores))


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracies for each fold: [0.80434783 0.81521739 0.78804348 0.78804348 0.79891304]
Mean accuracy: 0.7989130434782609


  y = column_or_1d(y, warn=True)


# Conclusion
**Using a simple 80/20 train-test split, our Logistic Regression model achieved a relatively high accuracy (~82%). However, this result may be optimistic, since it depends heavily on how the dataset was divided. A single test set cannot fully capture how the model would generalize to unseen data, and the accuracy can vary if the split changes.**

**In contrast, applying 5-fold cross-validation produced a slightly lower mean accuracy (~79%), but this value is more reliable. Cross-validation evaluates the model across multiple train-test splits, ensuring that every data point is used for both training and testing. While the score is lower, it offers a more realistic estimate of generalization performance.**