###### ARTIFICIAL INTELLIGENCE ALGORITHMS
### Practical Assignment
# **#2**
---

In [185]:
import pandas as pd
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

In [186]:
heart_df = pd.read_csv("./heart.csv")  # Load the dataset
display(heart_df.head(5))  # Display dataframe
display(heart_df.shape)
display(heart_df.dtypes)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


(918, 12)

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

The dataframe consists of 918 samples with 12 features.

In [187]:
### Handling outliers

numeric_columns = heart_df.select_dtypes(include=["int64", "float64"])
z_scores = numeric_columns.apply(zscore)
outliers = (z_scores.abs() > 3).any(axis=1)  # Filter rows that are outside 3 zscore.

# Filter out rows with outliers
heart_df_final = heart_df[~outliers]

In [188]:
# Apply one-hot-encoding to transform non-numeric columns to numeric
non_numeric_columns = heart_df.select_dtypes(exclude=["int64", "float64"])
heart_df_final = pd.get_dummies(
    heart_df_final, columns=non_numeric_columns.columns.tolist()
).astype("int")

In [189]:
target = heart_df_final["HeartDisease"]  # Backup targets before scaling
# Apply scaling for numeric columns
scaler = StandardScaler()
heart_df_final[numeric_columns.columns] = scaler.fit_transform(
    heart_df_final[numeric_columns.columns]
)
heart_df_final["HeartDisease"] = target  # Restore targets after scaling.

# Display final dataframe
heart_df_final.head(5)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,-1.428154,0.4659,0.849636,-0.550362,1.38432,-0.745287,0,0,1,0,...,0,0,0,1,0,1,0,0,0,1
1,-0.475855,1.634714,-0.168122,-0.550362,0.752973,0.330175,1,1,0,0,...,1,0,0,1,0,1,0,0,1,0
2,-1.745588,-0.118507,0.793612,-0.550362,-1.535661,-0.745287,0,0,1,0,...,0,0,0,0,1,1,0,0,0,1
3,-0.581666,0.349019,0.149344,-0.550362,-1.141069,0.330175,1,1,0,1,...,0,0,0,1,0,0,1,0,1,0
4,0.0532,1.050307,-0.028064,-0.550362,-0.58864,-0.745287,0,0,1,0,...,1,0,0,1,0,1,0,0,0,1


In [190]:
# Split data into dependent and indepent variables
X = heart_df_final.drop("HeartDisease", axis=1)
y = heart_df_final["HeartDisease"]  # Unscaled from original dataframe

In [191]:
### Model Training

# Split data into training and testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# SVM - Standalone
svm_std_model = SVC(kernel="linear")
svm_std_model.fit(X_train, y_train)

# SVM - Bagging Model
svm_bag_model = BaggingClassifier(
    estimator=SVC(kernel="linear"),
    n_estimators=500,
    max_samples=0.8,
    oob_score=True,
    random_state=16,
)
svm_bag_model.fit(X_train, y_train)

# Decision Tree - Standalone
tree_std_model = DecisionTreeClassifier(random_state=42)
tree_std_model.fit(X_train, y_train)

# Decision Tree - Bagged
tree_bag_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.8,
    oob_score=True,
    random_state=16,
)
tree_bag_model.fit(X_train, y_train)

In [192]:
### Prediction
y_pred_svm_std = svm_std_model.predict(X_test)
y_pred_svm_bag = svm_bag_model.predict(X_test)
y_pred_tree_std = tree_std_model.predict(X_test)
y_pred_tree_bag = tree_bag_model.predict(X_test)

In [193]:
### Evaluation
print("\tMODEL\t\t|\tACCURACY\t|\tPRECISION\t|\tRECALL\t\t|\tF1 SCORE")
print(
    "________________________________________________________________________________________________________________"
)
print(
    f"Standalone SVM\t\t|\t{accuracy_score(y_test, y_pred_svm_std):.3f}\t\t|\t{precision_score(y_test, y_pred_svm_std):.3f}\t\t|\t{recall_score(y_test, y_pred_svm_std):.3f}\t\t|\t{f1_score(y_test, y_pred_svm_std):.3f}\t\t"
)
print(
    f"Bagging SVM\t\t|\t{accuracy_score(y_test, y_pred_svm_bag):.3f}\t\t|\t{precision_score(y_test, y_pred_svm_bag):.3f}\t\t|\t{recall_score(y_test, y_pred_svm_bag):.3f}\t\t|\t{f1_score(y_test, y_pred_svm_bag):.3f}\t\t"
)
print(
    f"Standalone Decision Tree|\t{accuracy_score(y_test, y_pred_tree_std):.3f}\t\t|\t{precision_score(y_test, y_pred_tree_std):.3f}\t\t|\t{recall_score(y_test, y_pred_tree_std):.3f}\t\t|\t{f1_score(y_test, y_pred_tree_std):.3f}\t\t"
)
print(
    f"Bagging Decision Tree\t|\t{accuracy_score(y_test, y_pred_tree_bag):.3f}\t\t|\t{precision_score(y_test, y_pred_tree_bag):.3f}\t\t|\t{recall_score(y_test, y_pred_tree_bag):.3f}\t\t|\t{f1_score(y_test, y_pred_tree_bag):.3f}\t\t"
)
print("\n\nConfusion Matrices:")
print("\nStandalone SVM\n", confusion_matrix(y_test, y_pred_svm_std))
print("\nBagging SVM\n", confusion_matrix(y_test, y_pred_svm_bag))
print("\nStandalone Decision Tree\n", confusion_matrix(y_test, y_pred_tree_std))
print("\nBagging Decision Tree\n", confusion_matrix(y_test, y_pred_tree_bag))

	MODEL		|	ACCURACY	|	PRECISION	|	RECALL		|	F1 SCORE
________________________________________________________________________________________________________________
Standalone SVM		|	0.883		|	0.841		|	0.957		|	0.896		
Bagging SVM		|	0.878		|	0.833		|	0.957		|	0.891		
Standalone Decision Tree|	0.844		|	0.837		|	0.872		|	0.854		
Bagging Decision Tree	|	0.872		|	0.838		|	0.936		|	0.884		


Confusion Matrices:

Standalone SVM
 [[69 17]
 [ 4 90]]

Bagging SVM
 [[68 18]
 [ 4 90]]

Standalone Decision Tree
 [[70 16]
 [12 82]]

Bagging Decision Tree
 [[69 17]
 [ 6 88]]


According to the observed metric results, the Standalone SVM produced overall best results. It produced higher accuracy, higher precision, higher recall, and higher f1 score among 4 models.