In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the cleaned heart-disease CSV into a DataFrame.
# NOTE(review): absolute '/content/...' path — presumably a Colab upload location; adjust when running elsewhere.
heart_disease = pd.read_csv('/content/heart_2020_cleaned.csv')

In [None]:
# Preview the first five rows of the data as loaded, before any encoding.
heart_disease.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [None]:
# Columns holding string categories that must become integer codes.
categorical_columns = [
    'HeartDisease',
    'Smoking',
    'AlcoholDrinking',
    'Stroke',
    'DiffWalking',
    'Sex',
    'AgeCategory',
    'Race',
    'Diabetic',
    'PhysicalActivity',
    'GenHealth',
    'Asthma',
    'KidneyDisease',
    'SkinCancer',
]

# One fitted encoder per column is kept so the integer codes can be mapped
# back to the original labels later (via each encoder's classes_).
label_encoders = {}
for column in categorical_columns:
    encoder = label_encoders[column] = LabelEncoder()
    heart_disease[column] = encoder.fit_transform(heart_disease[column])

# Show a sample of the encoded frame.
print("Transformed Dataset (First 5 Rows):")
print(heart_disease.head())

# Persist the encoded frame next to the notebook (row index omitted).
heart_disease.to_csv('heart_2020_cleaned_encoded.csv', index=False)

Transformed Dataset (First 5 Rows):
   HeartDisease    BMI  Smoking  AlcoholDrinking  Stroke  PhysicalHealth  \
0             0  16.60        1                0       0             3.0   
1             0  20.34        0                0       1             0.0   
2             0  26.58        1                0       0            20.0   
3             0  24.21        0                0       0             0.0   
4             0  23.71        0                0       0            28.0   

   MentalHealth  DiffWalking  Sex  AgeCategory  Race  Diabetic  \
0          30.0            0    0            7     5         2   
1           0.0            0    0           12     5         0   
2          30.0            0    1            9     5         2   
3           0.0            0    0           11     5         0   
4           0.0            1    0            4     5         0   

   PhysicalActivity  GenHealth  SleepTime  Asthma  KidneyDisease  SkinCancer  
0                 1          4 

In [None]:
# (rows, columns) of the encoded DataFrame.
heart_disease.shape

(319795, 18)

In [None]:
# Separate the label column from the predictors.
target = heart_disease['HeartDisease']
features = heart_disease.drop(columns='HeartDisease')

In [None]:
# Print the predictor frame and the label column to check the separation.
print(features)
print(target)

          BMI  Smoking  AlcoholDrinking  Stroke  PhysicalHealth  MentalHealth  \
0       16.60        1                0       0             3.0          30.0   
1       20.34        0                0       1             0.0           0.0   
2       26.58        1                0       0            20.0          30.0   
3       24.21        0                0       0             0.0           0.0   
4       23.71        0                0       0            28.0           0.0   
...       ...      ...              ...     ...             ...           ...   
319790  27.41        1                0       0             7.0           0.0   
319791  29.84        1                0       0             0.0           0.0   
319792  24.24        0                0       0             0.0           0.0   
319793  32.81        0                0       0             0.0           0.0   
319794  46.56        0                0       0             0.0           0.0   

        DiffWalking  Sex  A

In [None]:
# Row counts per encoded Race value.
heart_disease.value_counts('Race')

Unnamed: 0_level_0,count
Race,Unnamed: 1_level_1
5,245212
3,27446
2,22939
4,10928
1,8068
0,5202


American Indian/Alaskan Native = 0

Asian = 1

Black = 2

Hispanic = 3

Other = 4

White = 5

In [None]:
# Row counts per encoded AgeCategory value.
heart_disease.value_counts('AgeCategory')

Unnamed: 0_level_0,count
AgeCategory,Unnamed: 1_level_1
9,34151
8,33686
10,31065
7,29757
6,25382
12,24153
5,21791
11,21482
0,21064
4,21006


18-24 = 0

25-29 = 1

30-34 = 2

35-39 = 3

40-44 = 4

45-49 = 5

50-54 = 6

55-59 = 7

60-64 = 8

65-69 = 9

70-74 = 10

75-79 = 11

80 or older = 12

In [None]:
# Row counts per encoded GenHealth value.
heart_disease.value_counts('GenHealth')

Unnamed: 0_level_0,count
GenHealth,Unnamed: 1_level_1
4,113858
2,93129
0,66842
1,34677
3,11289


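Excellent = 0

Fair = 1

Good = 2

Poor = 3

Very good = 4

(`LabelEncoder` numbers the classes alphabetically, so these codes do not follow the health ordering.)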

In [None]:
# Row counts per encoded Diabetic value.
heart_disease.value_counts('Diabetic')

Unnamed: 0_level_0,count
Diabetic,Unnamed: 1_level_1
0,269653
2,40802
1,6781
3,2559


No = 0

"No, borderline diabetes" = 1

Yes = 2

Yes (during pregnancy) = 3

Every other categorical variable is binary and of the form

0 -> No

1 -> Yes

(except `Sex`, where 0 -> Female and 1 -> Male)

**Data Standardization**

In [None]:
# Standardize every predictor column with a single fitted StandardScaler.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so held-out rows contribute to the scaling statistics.
scaler = StandardScaler().fit(features)
standardized_data = scaler.transform(features)
print(standardized_data)

# From here on `features` refers to the scaled array, not the DataFrame.
features, target = standardized_data, heart_disease['HeartDisease']

[[-1.84475016  1.19347355 -0.27031975 ...  2.54151479 -0.19555439
   3.11841916]
 [-1.25633812 -0.83789038 -0.27031975 ... -0.39346613 -0.19555439
  -0.3206753 ]
 [-0.27460254  1.19347355 -0.27031975 ...  2.54151479 -0.19555439
  -0.3206753 ]
 ...
 [-0.64275338 -0.83789038 -0.27031975 ... -0.39346613 -0.19555439
  -0.3206753 ]
 [ 0.70555975 -0.83789038 -0.27031975 ... -0.39346613 -0.19555439
  -0.3206753 ]
 [ 2.86883929 -0.83789038 -0.27031975 ... -0.39346613 -0.19555439
  -0.3206753 ]]


**Train Test Split**

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=7,
)
print(features.shape, x_train.shape, x_test.shape)

(319795, 17) (255836, 17) (63959, 17)


In [None]:
# Train one SVM per kernel on the training split and report hold-out metrics.
kernels = ['linear', 'rbf']
models = {}  # kernel name -> fitted SVC, reused by later cells
for kernel in kernels:
    # probability=True is intentionally not set: it makes fit() run an extra
    # internal 5-fold cross-validation, which is very costly on this many rows,
    # and it was only needed to feed roc_auc_score. Re-enable it if a later
    # cell needs model.predict_proba().
    model = SVC(kernel=kernel, random_state=42)
    model.fit(x_train, y_train)
    models[kernel] = model
    y_pred = model.predict(x_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    # AUC only needs a ranking score, so the signed distance to the decision
    # boundary is used in place of calibrated probabilities.
    auc = roc_auc_score(y_test, model.decision_function(x_test))

    print(f"\nSVM with {kernel} kernel:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"AUC: {auc:.4f}")

In [None]:
# Visualization of decision boundary (using two features for simplicity)
# Select two features for 2D visualization: BMI and PhysicalHealth.
# Indices refer to the scaled feature matrix, which no longer contains the
# HeartDisease column, so BMI is column 0 and PhysicalHealth is column 4.
X_vis = standardized_data[:, [0, 4]]
X_train_vis, X_test_vis, y_train_vis, y_test_vis = train_test_split(X_vis, target, test_size=0.2, random_state=42)

# Train a new SVM model with linear kernel for visualization
svm_vis = SVC(kernel='linear', random_state=42)
svm_vis.fit(X_train_vis, y_train_vis)

# Create a mesh to plot the decision boundary
h = 0.02  # Step size in the mesh
x_min, x_max = X_train_vis[:, 0].min() - 1, X_train_vis[:, 0].max() + 1
y_min, y_max = X_train_vis[:, 1].min() - 1, X_train_vis[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Classify every mesh point, then fold the flat predictions back into the grid shape
Z = svm_vis.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Plot the decision boundary
plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu, alpha=0.4)
plt.scatter(X_train_vis[:, 0], X_train_vis[:, 1], c=y_train_vis, cmap=plt.cm.RdYlBu, edgecolors='k')
plt.xlabel('BMI (Scaled)')
plt.ylabel('PhysicalHealth (Scaled)')
plt.title('SVM Decision Boundary with Linear Kernel')
plt.colorbar(label='HeartDisease (0: No, 1: Yes)')
plt.show()

# Confusion Matrix for the best model (linear kernel as an example)
# Uses the full-feature hold-out split (x_test / y_test) the stored models were evaluated on.
y_pred_linear = models['linear'].predict(x_test)
cm = confusion_matrix(y_test, y_pred_linear)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (Linear Kernel)')
plt.show()