<a href="https://www.kaggle.com/code/aabdollahii/heart-attack-prediction?scriptVersionId=257322025" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Understanding the Data

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the raw heart-attack CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/heart-attack-prediction/data.csv"

# Load the file with pandas' default parsing; nothing is cleaned at this point.
df = pd.read_csv(DATA_PATH)

In [2]:
# Quick structural overview: dimensions first, then a preview of the leading rows.
print(f"Shape of dataset: {df.shape}")
print("First 20 rows:")
display(df.head(n=20))


Shape of dataset: (294, 14)
First 20 rows:


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,28,1,2,130,132,0,2,185,0,0.0,?,?,?,0
1,29,1,2,120,243,0,0,160,0,0.0,?,?,?,0
2,29,1,2,140,?,0,0,170,0,0.0,?,?,?,0
3,30,0,1,170,237,0,1,170,0,0.0,?,?,6,0
4,31,0,2,100,219,0,1,150,0,0.0,?,?,?,0
5,32,0,2,105,198,0,0,165,0,0.0,?,?,?,0
6,32,1,2,110,225,0,0,184,0,0.0,?,?,?,0
7,32,1,2,125,254,0,0,155,0,0.0,?,?,?,0
8,33,1,3,120,298,0,0,185,0,0.0,?,?,?,0
9,34,0,2,130,161,0,0,190,0,0.0,?,?,?,0


In [3]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appends a stray "None" line.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         294 non-null    int64  
 1   sex         294 non-null    int64  
 2   cp          294 non-null    int64  
 3   trestbps    294 non-null    object 
 4   chol        294 non-null    object 
 5   fbs         294 non-null    object 
 6   restecg     294 non-null    object 
 7   thalach     294 non-null    object 
 8   exang       294 non-null    object 
 9   oldpeak     294 non-null    float64
 10  slope       294 non-null    object 
 11  ca          294 non-null    object 
 12  thal        294 non-null    object 
 13  num         294 non-null    int64  
dtypes: float64(1), int64(4), object(9)
memory usage: 32.3+ KB
None


In [4]:
# Per-column count of true NaN values. The '?' placeholder strings visible in
# the preview are ordinary strings, not NaN, so they are not counted here.
print(df.isna().sum())


age           0
sex           0
cp            0
trestbps      0
chol          0
fbs           0
restecg       0
thalach       0
exang         0
oldpeak       0
slope         0
ca            0
thal          0
num           0
dtype: int64


# Data Preprocessing & Feature Engineering

In [5]:
# Quantify how many '?' placeholders each of the three suspect features holds,
# both as an absolute count and as a share of all rows.
sparse_features = ['slope', 'thal', 'ca']
placeholder_counts = [(df[name] == '?').sum() for name in sparse_features]

missing_analysis = pd.DataFrame({
    'feature': sparse_features,
    'total_missing': placeholder_counts,
    'missing_percentage': [(count / len(df)) * 100 for count in placeholder_counts],
})

print("📊 Missing Value Analysis:")
print(missing_analysis)

📊 Missing Value Analysis:
  feature  total_missing  missing_percentage
0   slope            190           64.625850
1    thal            266           90.476190
2      ca            291           98.979592


In [6]:
# Remove the three mostly-'?' features. drop() returns a new frame, so the raw
# `df` is left untouched.
df_clean = df.drop(columns=['slope', 'thal', 'ca'])

# Before/after shapes confirm that only columns (not rows) were removed.
print(
    f"Original shape: {df.shape}",
    f"After dropping: {df_clean.shape}",
    "✅ Dropped slope, thal, and ca columns due to excessive missing data",
    sep="\n",
)

Original shape: (294, 14)
After dropping: (294, 11)
✅ Dropped slope, thal, and ca columns due to excessive missing data


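* More than 64% of the values in each of these three columns are missing (about 65% for `slope`, 90% for `thal`, and 99% for `ca`), so there is too little usable data to impute and we drop the columns.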

# ML Pipeline

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [8]:
# Drop rows that contain a real NaN in any column. Note: dropna() only
# recognises NaN/None -- '?' placeholder strings are not treated as missing
# and pass through this step unchanged.
df_clean = df_clean.dropna(axis=0, how='any')

# Summarise what is left after this step.
print(
    "Final Clean Dataset:",
    f"Shape: {df_clean.shape}",
    f"Columns: {df_clean.columns.tolist()}",
    sep="\n",
)

Final Clean Dataset:
Shape: (294, 11)
Columns: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'num       ']


In [9]:
# Trim leading/trailing whitespace from every header so columns can be
# addressed by their plain names (e.g. 'num' rather than a padded label).
df_clean = df_clean.rename(columns=str.strip)

# Show the resulting headers to confirm the padding is gone.
print("✅ Column names after stripping:", df_clean.columns.tolist(), sep="\n")

✅ Column names after stripping:
['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'num']


In [10]:
def _strip_if_text(column):
    """Return the column with surrounding whitespace removed when it is object-typed."""
    if column.dtype == "object":
        return column.str.strip()
    return column


# Normalise string cells column by column; non-object columns pass through unchanged.
df_clean = df_clean.apply(_strip_if_text)

# Binary target: 1 when 'num' is greater than zero, otherwise 0.
df_clean['heart_disease'] = df_clean['num'].gt(0).astype(int)

# Target vector, and the feature matrix without either form of the label.
y = df_clean['heart_disease']
X = df_clean.drop(columns=['num', 'heart_disease'])

In [11]:
# Class balance of the binary target, plus the accuracy obtained by always
# predicting the most frequent class.
class_shares = y.value_counts(normalize=True)

print(" Target distribution:")
print(y.value_counts())
print(f"Baseline accuracy: {class_shares.max():.3f}")

 Target distribution:
heart_disease
0    188
1    106
Name: count, dtype: int64
Baseline accuracy: 0.639


In [12]:
# Locate the '?' placeholders that are still present in the working frame.
print("🔍 Checking for remaining '?' values in numerical columns:")
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang']

# Numerical features: every column gets a line, whether it is clean or not.
for name in numerical_cols:
    if name not in df_clean.columns:
        continue
    n_placeholders = (df_clean[name] == '?').sum()
    if n_placeholders > 0:
        print(f"  {name}: {n_placeholders} '?' values found")
    else:
        print(f"  {name}: No '?' values ✅")

# Categorical features: only columns that still contain '?' are reported.
for name in categorical_cols:
    if name not in df_clean.columns:
        continue
    n_placeholders = (df_clean[name] == '?').sum()
    if n_placeholders > 0:
        print(f"  {name}: {n_placeholders} '?' values found")


🔍 Checking for remaining '?' values in numerical columns:
  age: No '?' values ✅
  trestbps: 1 '?' values found
  chol: 23 '?' values found
  thalach: 1 '?' values found
  oldpeak: No '?' values ✅
  fbs: 8 '?' values found
  restecg: 1 '?' values found
  exang: 1 '?' values found


In [13]:
import pandas as pd
import numpy as np

def clean_heart_disease_data(df):
    """
    Clean a raw heart-disease frame and add a binary target column.

    Steps, in order:
      1. Strip whitespace from column names.
      2. Drop the sparse columns 'slope', 'thal' and 'ca' when present.
      3. Replace '?' placeholder cells with NaN.
      4. Coerce the listed feature columns to numeric (unparseable -> NaN).
      5. Drop every row that still contains a NaN.
      6. Add 'heart_disease' = 1 where 'num' > 0, else 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame. It is copied, so the caller's frame is not modified.
        A 'num' column (optionally whitespace-padded) is required.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, including the original 'num' column and the new
        'heart_disease' column.

    Raises
    ------
    KeyError
        If no 'num' column exists after the column names are stripped.
    """
    # Make a copy to avoid modifying original
    df_clean = df.copy()

    # Step 1: Clean column names FIRST. Stripping before the drop ensures a
    # padded header such as 'slope ' is still matched below; otherwise it
    # would survive, turn into an all-NaN column, and make dropna() remove
    # every row.
    df_clean.columns = df_clean.columns.str.strip()

    # Step 2: Drop high-missing columns (silently skipped if already absent)
    columns_to_drop = ['slope', 'thal', 'ca']
    df_clean = df_clean.drop(columns=columns_to_drop, errors='ignore')

    # Step 3: Replace '?' with NaN for proper handling
    df_clean = df_clean.replace('?', np.nan)

    # Step 4: Convert numerical and categorical feature columns to a numeric
    # dtype; anything that cannot be parsed becomes NaN and is dropped next.
    numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
    categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang']
    for col in numerical_cols + categorical_cols:
        if col in df_clean.columns:
            df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

    # Step 5: Drop rows with any remaining missing values
    initial_rows = len(df_clean)
    df_clean = df_clean.dropna()
    final_rows = len(df_clean)

    # The shape reported here is taken before the target column is appended.
    print(f"📊 Data cleaning completed:")
    print(f"  Rows removed due to missing values: {initial_rows - final_rows}")
    print(f"  Final dataset shape: {df_clean.shape}")

    # Step 6: Create binary target
    df_clean['heart_disease'] = (df_clean['num'] > 0).astype(int)

    return df_clean

# Rebuild the working frame from the raw data using the cleaning function.
df_clean = clean_heart_disease_data(df)

# Sanity check: report, column by column, whether any '?' placeholder remains.
print(" Verification - No more '?' values:")
for column_name, column_values in df_clean.items():
    still_has_placeholder = (column_values == '?').any()
    if still_has_placeholder:
        print(f"  WARNING: Still found '?' in {column_name}")
    else:
        print(f"  {column_name}: Clean ✅")


📊 Data cleaning completed:
  Rows removed due to missing values: 33
  Final dataset shape: (261, 11)
 Verification - No more '?' values:
  age: Clean ✅
  sex: Clean ✅
  cp: Clean ✅
  trestbps: Clean ✅
  chol: Clean ✅
  fbs: Clean ✅
  restecg: Clean ✅
  thalach: Clean ✅
  exang: Clean ✅
  oldpeak: Clean ✅
  num: Clean ✅
  heart_disease: Clean ✅
