In [2]:
# import libraries

# 1. to handle the data
import pandas as pd
import numpy as np

# to visualize the dataset
import matplotlib.pyplot as plt
import seaborn as sns


# To preprocess the data
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
# import iterative imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

#metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error, r2_score

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [3]:


# load the data from csv file placed locally in our pc
df = pd.read_csv('heart_disease_uci.csv')

# print the first 5 rows of the dataframe
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [4]:
df.info()

df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


(920, 16)

In [5]:
df.isnull().sum()[df.isnull().sum() > 0].sort_values(ascending=False)
missing_data_cols = df.isnull().sum()[df.isnull().sum() > 0].index.tolist()
missing_data_cols

['trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalch',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal']

In [6]:
categorical_cols = ['thal', 'ca', 'slope', 'exang', 'restecg','fbs', 'cp', 'sex', 'num']
bool_cols = ['fbs', 'exang']
numeric_cols = ['oldpeak', 'thalch', 'chol', 'trestbps', 'age']

In [10]:
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

MISSING_TOKEN = "__MISSING__"
UNK_TOKEN = "__UNK__"

def fit_label_encoders_per_column(X: pd.DataFrame) -> dict:
    """
    Fit a separate LabelEncoder for each object/category column in X.
    Adds MISSING_TOKEN and UNK_TOKEN to the classes for safe transforms later.
    """
    encoders = {}
    for col in X.columns:
        if X[col].dtype == 'object' or str(X[col].dtype).startswith('category'):
            le = LabelEncoder()
            s = X[col].astype(str).fillna(MISSING_TOKEN)

            # Fit on observed values + reserve UNK token
            le.fit(pd.Index(s.unique()).append(pd.Index([UNK_TOKEN])))

            encoders[col] = le
    return encoders

def transform_with_encoders(X: pd.DataFrame, encoders: dict) -> pd.DataFrame:
    """
    Transform object/category columns using fitted encoders.
    Unseen values are mapped to UNK_TOKEN.
    """
    X_out = X.copy()
    for col, le in encoders.items():
        s = X_out[col].astype(str).fillna(MISSING_TOKEN)

        # map unseen categories to UNK_TOKEN
        known = set(le.classes_)
        s = s.where(s.isin(known), UNK_TOKEN)

        X_out[col] = le.transform(s)
    return X_out

def impute_categorical_missing_data(df, passed_col, missing_data_cols, bool_cols):
    df_null = df[df[passed_col].isnull()].copy()
    df_not_null = df[df[passed_col].notnull()].copy()

    X = df_not_null.drop(passed_col, axis=1).copy()
    y = df_not_null[passed_col].copy()

    other_missing_cols = [col for col in missing_data_cols if col != passed_col]

    # ---- FIX: fit encoders ONCE on df_not_null X and reuse ----
    encoders = fit_label_encoders_per_column(X)
    X = transform_with_encoders(X, encoders)

    # y encoding (if boolean-like column stored as strings)
    if passed_col in bool_cols:
        y_le = LabelEncoder()
        y = y_le.fit_transform(y.astype(str).fillna(MISSING_TOKEN))
    else:
        # If y is categorical strings, encode it too (recommended)
        if y.dtype == 'object' or str(y.dtype).startswith('category'):
            y_le = LabelEncoder()
            y = y_le.fit_transform(y.astype(str).fillna(MISSING_TOKEN))
        else:
            y_le = None

    # Iterative impute other missing columns inside X (numeric-only expectation)
    iterative_imputer = IterativeImputer(
        estimator=RandomForestRegressor(random_state=42),
        add_indicator=True,
        random_state=42
    )

    for col in other_missing_cols:
        if col in X.columns and X[col].isnull().sum() > 0:
            X[[col]] = iterative_imputer.fit_transform(X[[col]])[:, [0]]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    rf_classifier = RandomForestClassifier(random_state=42)
    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)

    acc_score = accuracy_score(y_test, y_pred)
    print(f"The feature '{passed_col}' has been imputed with {round(acc_score*100, 2)} accuracy\n")

    # ---- Predict missing rows ----
    if len(df_null) > 0:
        X_null = df_null.drop(passed_col, axis=1).copy()

        # IMPORTANT: reuse SAME encoders (no refit!)
        X_null = transform_with_encoders(X_null, encoders)

        for col in other_missing_cols:
            if col in X_null.columns and X_null[col].isnull().sum() > 0:
                X_null[[col]] = iterative_imputer.fit_transform(X_null[[col]])[:, [0]]

        pred = rf_classifier.predict(X_null)

        # decode back if we encoded y
        if passed_col in bool_cols:
            # if original bool expected: map back
            # y_le classes are strings; adapt if you want True/False
            # safer: keep as 0/1 for model features
            df_null[passed_col] = pred
        else:
            if y_le is not None:
                df_null[passed_col] = y_le.inverse_transform(pred)
            else:
                df_null[passed_col] = pred

    df_combined = pd.concat([df_not_null, df_null], axis=0).sort_index()
    return df_combined[passed_col]

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

def impute_continuous_missing_data(df, passed_col, missing_data_cols):
    df_null = df[df[passed_col].isnull()].copy()
    df_not_null = df[df[passed_col].notnull()].copy()

    X = df_not_null.drop(passed_col, axis=1).copy()
    y = df_not_null[passed_col].copy()

    other_missing_cols = [col for col in missing_data_cols if col != passed_col]

    # ---- fit encoders ONCE and reuse ----
    encoders = fit_label_encoders_per_column(X)
    X = transform_with_encoders(X, encoders)

    iterative_imputer = IterativeImputer(
        estimator=RandomForestRegressor(random_state=42),
        add_indicator=True,
        random_state=42
    )

    for col in other_missing_cols:
        if col in X.columns and X[col].isnull().sum() > 0:
            X[[col]] = iterative_imputer.fit_transform(X[[col]])[:, [0]]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    rf_regressor = RandomForestRegressor(random_state=42)
    rf_regressor.fit(X_train, y_train)
    y_pred = rf_regressor.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)   # squared=False kullanma
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print("MAE =", mae)
    print("RMSE =", rmse)
    print("R2 =", r2, "\n")

    # ---- Predict missing rows ----
    if len(df_null) > 0:
        X_null = df_null.drop(passed_col, axis=1).copy()
        X_null = transform_with_encoders(X_null, encoders)

        for col in other_missing_cols:
            if col in X_null.columns and X_null[col].isnull().sum() > 0:
                X_null[[col]] = iterative_imputer.fit_transform(X_null[[col]])[:, [0]]

        df_null[passed_col] = rf_regressor.predict(X_null)

    df_combined = pd.concat([df_not_null, df_null], axis=0).sort_index()
    return df_combined[passed_col]

In [11]:
df.isnull().sum()[df.isnull().sum() > 0].sort_values(ascending=False)

ca          611
thal        486
slope       309
fbs          90
oldpeak      62
trestbps     59
thalch       55
exang        55
chol         30
restecg       2
dtype: int64

In [12]:
for col in missing_data_cols:
    print("Missing Values", col, ":", f"{round((df[col].isnull().sum()/len(df))*100, 2)}%")

    if col in categorical_cols:
        df[col] = impute_categorical_missing_data(
            df=df,
            passed_col=col,
            missing_data_cols=missing_data_cols,
            bool_cols=bool_cols
        )
    elif col in numeric_cols:
        df[col] = impute_continuous_missing_data(
            df=df,
            passed_col=col,
            missing_data_cols=missing_data_cols
        )

Missing Values trestbps : 6.41%
MAE = 13.281445086705203
RMSE = 17.310355165442367
R2 = 0.06440098823305063 

Missing Values chol : 3.26%
MAE = 44.93601123595506
RMSE = 63.67904325600377
R2 = 0.6787732491150087 

Missing Values fbs : 9.78%
The feature 'fbs' has been imputed with 80.12 accuracy

Missing Values restecg : 0.22%
The feature 'restecg' has been imputed with 66.3 accuracy

Missing Values thalch : 5.98%
MAE = 16.85670520231214
RMSE = 21.749592315626373
R2 = 0.31215346312728454 

Missing Values exang : 5.98%
The feature 'exang' has been imputed with 80.35 accuracy

Missing Values oldpeak : 6.74%
MAE = 0.5712093023255813
RMSE = 0.8086901554006085
R2 = 0.37695092112852147 

Missing Values slope : 33.59%
The feature 'slope' has been imputed with 67.48 accuracy

Missing Values ca : 66.41%
The feature 'ca' has been imputed with 66.13 accuracy

Missing Values thal : 52.83%
The feature 'thal' has been imputed with 73.56 accuracy



In [13]:
df.isnull().sum()

id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

In [14]:
data = df.copy()
# In some of the features, there is space will will create problem later on. 
# So we rename those attributes to handle problems in the future.
data['thal'].replace({'fixed defect':'fixed_defect' , 'reversable defect': 'reversable_defect' }, inplace =True)
data['cp'].replace({'typical angina':'typical_angina', 'atypical angina': 'atypical_angina' }, inplace =True)
data['restecg'].replace({'normal': 'normal' , 'st-t abnormality': 'ST-T_wave_abnormality' , 'lv hypertrophy': 'left_ventricular_hypertrophy' }, inplace =True)

# Genrating New Dataset with Less Columns Which Are Necessary .
data_1 = data[['age','sex','cp','dataset', 'trestbps', 'chol', 'fbs','restecg' , 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal']].copy()
# Some Changes in Target Variable | Only Two Categories (0,1) . 0 for No-Disease , 1 for Disease
data_1['target'] = ((data['num'] > 0)*1).copy()
# Encoding Sex 
data_1['sex'] = (data['sex'] == 'Male')*1
# Encoding Fbs and exang
data_1['fbs'] = (data['fbs'])*1
data_1['exang'] = (data['exang'])*1
# Renaming COlumns Names.
data_1.columns = ['age', 'sex', 'chest_pain_type','country' ,'resting_blood_pressure', 
              'cholesterol', 'fasting_blood_sugar','Restecg',
              'max_heart_rate_achieved', 'exercise_induced_angina', 
              'st_depression', 'st_slope_type', 'num_major_vessels', 
              'thalassemia_type', 'target']
# Load Data Sample 
data_1.head()

Unnamed: 0,age,sex,chest_pain_type,country,resting_blood_pressure,cholesterol,fasting_blood_sugar,Restecg,max_heart_rate_achieved,exercise_induced_angina,st_depression,st_slope_type,num_major_vessels,thalassemia_type,target
0,63,1,typical_angina,Cleveland,145.0,233.0,1,left_ventricular_hypertrophy,150.0,0,2.3,downsloping,0.0,fixed_defect,0
1,67,1,asymptomatic,Cleveland,160.0,286.0,0,left_ventricular_hypertrophy,108.0,1,1.5,flat,3.0,normal,1
2,67,1,asymptomatic,Cleveland,120.0,229.0,0,left_ventricular_hypertrophy,129.0,1,2.6,flat,2.0,reversable_defect,1
3,37,1,non-anginal,Cleveland,130.0,250.0,0,normal,187.0,0,3.5,downsloping,0.0,normal,0
4,41,0,atypical_angina,Cleveland,130.0,204.0,0,left_ventricular_hypertrophy,172.0,0,1.4,upsloping,0.0,normal,0


In [16]:
# Load the dataset
file_path = 'heart.csv'
data = pd.read_csv(file_path)

# Display first 5 rows
print("--- First 5 rows of the data ---")
display(data.head())

# Check structure
print("\n--- Information & Missing Values ---")
print(data.info())

# Statistical Summary
print("\n--- Statistical Summary ---")
display(data.describe())

--- First 5 rows of the data ---


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0



--- Information & Missing Values ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB
None

--- Statistical Summary ---


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# data_1 = ...  # senin oluşturduğun son tablo (target dahil) burada hazır

# Kategorik ve sayısal sütunları belirle
categorical_cols = ['chest_pain_type', 'Restecg', 'st_slope_type', 'thalassemia_type', 'country']
numeric_cols = [
    'age', 'resting_blood_pressure', 'cholesterol',
    'max_heart_rate_achieved', 'st_depression', 'num_major_vessels'
]

df_processed = data_1.copy()

# Label Encoding - Kategorik Değişkenler
# (Not: En ideali bunu da CV fold içinde yapmak; şimdilik scaling leakage'ını çözüyoruz.)
label_encoders = {}

for col in categorical_cols:
    if col in df_processed.columns:
        le = LabelEncoder()
        df_processed[col] = le.fit_transform(df_processed[col].astype(str))
        label_encoders[col] = le

# DO NOT scale numeric columns here (leakage-free requirement)
# Scaling will be done after train-test split / inside CV folds.

# (Opsiyonel) numeric kolonları float'a çek
for col in numeric_cols:
    if col in df_processed.columns:
        df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')

# Quick sanity checks
print("Missing values per column:\n", df_processed.isnull().sum())
print("\nShape:", df_processed.shape)
if 'target' in df_processed.columns:
    print("Target distribution:\n", df_processed['target'].value_counts())

# Kaydet
output_filename = 'preprocessed_heart_disease_uci_unscaled.csv'
df_processed.to_csv(output_filename, index=False)
print(f"\nSaved: {output_filename}")

Missing values per column:
 age                        0
sex                        0
chest_pain_type            0
country                    0
resting_blood_pressure     0
cholesterol                0
fasting_blood_sugar        0
Restecg                    0
max_heart_rate_achieved    0
exercise_induced_angina    0
st_depression              0
st_slope_type              0
num_major_vessels          0
thalassemia_type           0
target                     0
dtype: int64

Shape: (920, 15)
Target distribution:
 target
1    509
0    411
Name: count, dtype: int64

Saved: preprocessed_heart_disease_uci_unscaled.csv


In [23]:
import pandas as pd
import numpy as np

data = pd.read_csv("heart.csv")  # örnek: sen zaten data'yı yüklemişsin

# --- Preprocessing Pipeline (UNSCALED) ---

# 1) Feature groups
numeric_features = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
categorical_features = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# 2) Copy to avoid modifying original
df_processed = data.copy()

# 3) Explicit binary encoding (stable)
if 'Sex' in df_processed.columns:
    df_processed['Sex'] = df_processed['Sex'].map({'M': 1, 'F': 0})

if 'ExerciseAngina' in df_processed.columns:
    df_processed['ExerciseAngina'] = df_processed['ExerciseAngina'].map({'Y': 1, 'N': 0})

# 4) One-hot encode remaining categorical string columns (drop_first=True)
# We only one-hot encode columns that are still object dtype
ohe_cols = [
    col for col in categorical_features
    if col in df_processed.columns and df_processed[col].dtype == 'object'
]
df_processed = pd.get_dummies(df_processed, columns=ohe_cols, drop_first=True)

# 5) Clean column names
df_processed.columns = df_processed.columns.str.strip()

# 6) DO NOT scale here (leakage-free requirement)
# Scaling (StandardScaler/MinMax) will be done after train-test split / inside CV folds.

# 7) Quick sanity checks
print("Missing values per column:\n", df_processed.isnull().sum())
print("\nShape:", df_processed.shape)
if 'HeartDisease' in df_processed.columns:
    print("Target distribution:\n", df_processed['HeartDisease'].value_counts())

# 8) Save processed dataset (UNSCALED)
output_filename = 'processed_heart_unscaled.csv'
df_processed.to_csv(output_filename, index=False)
print(f"\nSaved: {output_filename}")

Missing values per column:
 Age                  0
Sex                  0
RestingBP            0
Cholesterol          0
FastingBS            0
MaxHR                0
ExerciseAngina       0
Oldpeak              0
HeartDisease         0
ChestPainType_ATA    0
ChestPainType_NAP    0
ChestPainType_TA     0
RestingECG_Normal    0
RestingECG_ST        0
ST_Slope_Flat        0
ST_Slope_Up          0
dtype: int64

Shape: (918, 16)
Target distribution:
 HeartDisease
1    508
0    410
Name: count, dtype: int64

Saved: processed_heart_unscaled.csv
