In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


🔍 Column Explanation
1. age: The age of the patient (in years).
2. sex: The gender of the patient.
  *1 → Male
  *0 → Female
3. cp (Chest Pain Type):

0 → Typical angina
1 → Atypical angina
2 → Non-anginal pain
3 → Asymptomatic
 
4. trestbps (Resting Blood Pressure): The patient’s resting blood pressure (in mm Hg).
   Example: 145, 130, 120

5. chol (Cholesterol): Serum cholesterol level in mg/dl.
   Example: 233, 250, 204
6. fbs (Fasting Blood Sugar > 120 mg/dl):

1 → True (high fasting blood sugar)
0 → False (normal fasting blood sugar)

7. restecg (Resting Electrocardiographic Results):

0 → Normal
1 → ST-T wave abnormality (e.g., T wave inversions)
2 → Left ventricular hypertrophy (LVH)

8. thalach (Maximum Heart Rate Achieved):

Example: 150, 187, 172
Higher values typically indicate better cardiovascular fitness.

9. exang (Exercise-Induced Angina):

1 → Yes (angina induced by exercise)
0 → No

10. oldpeak (ST Depression Induced by Exercise):
    Represents depression relative to the resting state.
    Example: 2.3, 3.5, 1.4
    
12. slope (Slope of the Peak Exercise ST Segment):
0 → Upsloping
1 → Flat
2 → Downsloping

12. ca (Number of Major Vessels Colored by Fluoroscopy):
   Ranges from 0 to 4 (higher values indicate more vessel blockage).

13. thal (Thalassemia):
  1 → Normal
  2 → Fixed defect
  3 → Reversible defect

14. target (Diagnosis of Heart Disease):
 1 → Heart disease present
 0 → No heart disease

In [3]:
df = pd.read_csv(r"D:\EDA of DataSet\heart-disease.csv")

df.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
df.rename(columns={
    'age': 'Age (Years)',
    'sex': 'Gender (0=Female, 1=Male)',
    'cp': 'Chest Pain Type',
    'trestbps': 'Resting BP (mmHg)',
    'chol': 'Cholesterol (mg/dl)',
    'fbs': 'Fasting Blood Sugar (>120 mg/dl)',
    'restecg': 'Resting ECG',
    'thalach': 'Max Heart Rate',
    'exang': 'Exercise-Induced Angina (1=Yes, 0=No)',
    'oldpeak': 'ST Depression',
    'slope': 'Slope of Peak ST',
    'ca': 'No. of Major Vessels',
    'thal': 'Thalassemia Type',
    'target': 'Heart Disease (1=Yes, 0=No)'}, inplace=True)

In [5]:
df.head(2)

Unnamed: 0,Age (Years),"Gender (0=Female, 1=Male)",Chest Pain Type,Resting BP (mmHg),Cholesterol (mg/dl),Fasting Blood Sugar (>120 mg/dl),Resting ECG,Max Heart Rate,"Exercise-Induced Angina (1=Yes, 0=No)",ST Depression,Slope of Peak ST,No. of Major Vessels,Thalassemia Type,"Heart Disease (1=Yes, 0=No)"
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1


In [6]:
df.isna().sum()

Age (Years)                              0
Gender (0=Female, 1=Male)                0
Chest Pain Type                          0
Resting BP (mmHg)                        0
Cholesterol (mg/dl)                      0
Fasting Blood Sugar (>120 mg/dl)         0
Resting ECG                              0
Max Heart Rate                           0
Exercise-Induced Angina (1=Yes, 0=No)    0
ST Depression                            0
Slope of Peak ST                         0
No. of Major Vessels                     0
Thalassemia Type                         0
Heart Disease (1=Yes, 0=No)              0
dtype: int64

In [7]:
df.isnull().sum()

Age (Years)                              0
Gender (0=Female, 1=Male)                0
Chest Pain Type                          0
Resting BP (mmHg)                        0
Cholesterol (mg/dl)                      0
Fasting Blood Sugar (>120 mg/dl)         0
Resting ECG                              0
Max Heart Rate                           0
Exercise-Induced Angina (1=Yes, 0=No)    0
ST Depression                            0
Slope of Peak ST                         0
No. of Major Vessels                     0
Thalassemia Type                         0
Heart Disease (1=Yes, 0=No)              0
dtype: int64

In [9]:
df.shape

(303, 14)

In [16]:
df.drop_duplicates(inplace=True)

In [18]:
df.shape

(302, 14)

In [20]:
df.columns

Index(['Age (Years)', 'Gender (0=Female, 1=Male)', 'Chest Pain Type',
       'Resting BP (mmHg)', 'Cholesterol (mg/dl)',
       'Fasting Blood Sugar (>120 mg/dl)', 'Resting ECG', 'Max Heart Rate',
       'Exercise-Induced Angina (1=Yes, 0=No)', 'ST Depression',
       'Slope of Peak ST', 'No. of Major Vessels', 'Thalassemia Type',
       'Heart Disease (1=Yes, 0=No)'],
      dtype='object')

In [22]:
x = df.iloc[:,:-1]
y = df['Heart Disease (1=Yes, 0=No)']

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2,random_state = None)


In [25]:
model = RandomForestClassifier(n_estimators = 20,max_features = 10,criterion = 'entropy')

model.fit(x_train,y_train)

In [28]:
y_pred = model.predict(x_test)


In [30]:
from sklearn.metrics import accuracy_score,f1_score

accuracy_score(y_test,y_pred)

0.7868852459016393

In [32]:
f1_score(y_test,y_pred)

0.8115942028985508

In [34]:
import pickle
path = "C:\\Users\\rimjh\\OneDrive\\Desktop\\ML Project\\heart disease prediction\\model.pkl"

with open(path,'wb') as file:
   pickle.dump(model,file)
    

In [50]:
x



Unnamed: 0,Age (Years),"Gender (0=Female, 1=Male)",Chest Pain Type,Resting BP (mmHg),Cholesterol (mg/dl),Fasting Blood Sugar (>120 mg/dl),Resting ECG,Max Heart Rate,"Exercise-Induced Angina (1=Yes, 0=No)",ST Depression,Slope of Peak ST,No. of Major Vessels,Thalassemia Type
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


test the SVM model


In [36]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

x_train_scale =  sc.fit_transform(x_train)
x_test_scale = sc.fit_transform(x_test)



In [37]:
from sklearn.svm import SVC

svm_rbf_model = SVC(kernel = 'rbf',random_state = None)

svm_rbf_model.fit(x_train_scale,y_train)

In [39]:
y_pred_rbf = svm_rbf_model.predict(x_test_scale)

accuracy_score(y_test,y_pred_rbf)

0.8032786885245902