<a href="https://colab.research.google.com/github/asheta66/Machine-Learning-2022/blob/main/Classification2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import label_binarize
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Read the dataset from disk.
# Alternative source kept from the original notebook:
#   pd.read_excel('geometric_features_labels.xlsx')
data = pd.read_csv('heart_disease.csv')

# The last column is the target; every column before it is a predictor.
labels = data.iloc[:, -1]
features = data.iloc[:, :-1]

# Hold out 20% of the rows for testing, stratified on the target, with a
# fixed seed for the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=0,
    stratify=labels,
)

# Classifiers to compare, keyed by the short name that the evaluation code
# uses in plot legends, output file names and the metrics table.
#
# Every estimator that draws random numbers gets a fixed random_state (0, the
# same seed used for the train/test split) so that re-running the notebook
# reproduces the same scores. GaussianNB and KNeighborsClassifier take no seed.
algorithms = {
    # probability=True is required for predict_proba (used for the ROC curves).
    'SVC': SVC(probability=True, random_state=0),
    'GBC': GradientBoostingClassifier(random_state=0),
    'GNB': GaussianNB(),
    'RFC': RandomForestClassifier(random_state=0),
    'KNC': KNeighborsClassifier()
}

# Accumulators filled by the evaluation loop:
#   metrics_list - one dict per (algorithm, train/test split) with its scores
#   cv_results   - one array of 5-fold cross-validation accuracies per algorithm
metrics_list = []
cv_results = []

# Shared ROC figure. Explicit figure/axes handles are kept because the loop
# below opens and closes a separate confusion-matrix figure for every model;
# drawing through the handles avoids depending on pyplot's "current figure".
roc_fig, roc_ax = plt.subplots(figsize=(10, 8))
roc_ax.set_title('ROC Curves for All Algorithms', fontsize=16)

# One line style / colour per algorithm (cycled if there are more algorithms
# than entries, instead of raising IndexError).
line_styles = ['-', '--', '-.', ':', (0, (3, 1, 1, 1))]
colors = ['b', 'g', 'r', 'c', 'm']


def _metrics_row(algorithm, split, y_true, y_pred):
    """Build one row of the metrics table.

    Parameters
    ----------
    algorithm : str
        Short name of the classifier (stored under 'Algorithm').
    split : str
        Which data the scores refer to (stored under 'Data').
    y_true, y_pred : array-like
        True and predicted labels for that split.

    Returns
    -------
    dict
        Accuracy plus weighted-average precision, recall and F1.
    """
    return {
        'Algorithm': algorithm,
        'Data': split,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, average='weighted'),
        'Recall': recall_score(y_true, y_pred, average='weighted'),
        'F1 Score': f1_score(y_true, y_pred, average='weighted')
    }


for i, (name, model) in enumerate(algorithms.items()):
    linestyle = line_styles[i % len(line_styles)]
    color = colors[i % len(colors)]

    # Fit model and collect predictions
    model.fit(x_train, y_train)
    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)
    y_pred_prob = model.predict_proba(x_test)
    # predict_proba columns are ordered like model.classes_, so binarize the
    # test labels against that same ordering (not against the classes that
    # happen to occur in y_test) to keep column j aligned with class j.
    y_test_bin = label_binarize(y_test, classes=model.classes_)

    # Confusion matrices, with a fixed label order so the training and testing
    # matrices always have the same shape.
    confMatTrain = confusion_matrix(y_train, y_pred_train, labels=model.classes_)
    confMatTest = confusion_matrix(y_test, y_pred_test, labels=model.classes_)

    # Save confusion matrices (training on the left, testing on the right)
    cm_fig, cm_axes = plt.subplots(1, 2, figsize=(12, 6))
    panels = zip(cm_axes, (confMatTrain, confMatTest), ('Training', 'Testing'))
    for cm_ax, matrix, split in panels:
        sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                    annot_kws={"size": 16, "weight": 'bold'}, ax=cm_ax)
        cm_ax.set_title(f'{name} Confusion Matrix ({split})', fontsize=14)
        cm_ax.set_xlabel('Predicted', fontsize=12)
        cm_ax.set_ylabel('True', fontsize=12)
    cm_fig.tight_layout()
    cm_fig.savefig(f'{name}_confusion_matrix.png')
    plt.close(cm_fig)  # one figure per model: close it to avoid piling up figures

    # ROC Curve
    if y_test_bin.shape[1] > 1:  # multi-class: one one-vs-rest curve per class
        fpr = {}
        tpr = {}
        roc_auc = {}
        for j in range(y_test_bin.shape[1]):
            y_true_j = y_test_bin[:, j]
            if y_true_j.min() == y_true_j.max():
                # Class j has no positives (or no negatives) in the test set,
                # so its ROC curve / AUC is undefined; skip it.
                continue
            fpr[j], tpr[j], _ = roc_curve(y_true_j, y_pred_prob[:, j])
            roc_auc[j] = round(roc_auc_score(y_true_j, y_pred_prob[:, j]), 4)
            roc_ax.plot(fpr[j], tpr[j], linestyle=linestyle, color=color,
                        label=f'{name} class {j} (AUC = {roc_auc[j]})')
    else:
        # Binary case: label_binarize yields a single column that is 1 for
        # model.classes_[1], matching probability column 1.
        y_true_bin = y_test_bin.ravel()
        fpr, tpr, _ = roc_curve(y_true_bin, y_pred_prob[:, 1])
        roc_auc = round(roc_auc_score(y_true_bin, y_pred_prob[:, 1]), 4)
        roc_ax.plot(fpr, tpr, linestyle=linestyle, color=color,
                    label=f'{name} (AUC = {roc_auc})')

    # Metrics on both splits
    metrics_list.append(_metrics_row(name, 'Training', y_train, y_pred_train))
    metrics_list.append(_metrics_row(name, 'Testing', y_test, y_pred_test))

    # Cross-validation results: 5-fold accuracy over the full dataset
    cv_results.append(cross_val_score(model, features, labels, cv=5, scoring='accuracy'))

# ROC Curve Plot: finish and save the shared figure
roc_ax.set_xlabel('False Positive Rate', fontsize=14)
roc_ax.set_ylabel('True Positive Rate', fontsize=14)
roc_ax.legend(loc='best')
roc_ax.grid()
roc_fig.savefig('roc_curves.png')
plt.close(roc_fig)

# Gather the per-algorithm train/test scores into a single table and echo it
# as plain text under a heading line.
metrics_df = pd.DataFrame(data=metrics_list)
print("Metrics DataFrame:", metrics_df, sep="\n")

# Boxplot for cross-validation results: one box per classifier, built from the
# per-fold accuracies collected in cv_results.
#
# The tick labels are set separately instead of through boxplot's `labels`
# keyword: that keyword was renamed to `tick_labels` in Matplotlib 3.9, so
# setting the labels on the axes works on both older and newer versions.
algorithm_names = list(algorithms)  # concrete list rather than a dict_keys view
box_fig, box_ax = plt.subplots(figsize=(10, 6))
box_ax.boxplot(cv_results)
# Boxes are drawn at positions 1..N by default.
box_ax.set_xticks(list(range(1, len(algorithm_names) + 1)))
box_ax.set_xticklabels(algorithm_names)
box_ax.set_title('Boxplot of Cross-Validation Accuracy for Classifiers', fontsize=16)
box_ax.set_ylabel('Accuracy', fontsize=14)
box_ax.grid(True)
box_fig.savefig('boxplot.png')
plt.close(box_fig)

# Correlation Matrix
# Pairwise correlations are computed over numeric/boolean columns only, so a
# non-numeric column (e.g. a string-valued label) does not make DataFrame.corr
# raise. The matrix is computed before the figure is created so that a failure
# here does not leave an empty figure open.
corr_matrix = data.select_dtypes(include=['number', 'bool']).corr()

corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=corr_ax)
corr_ax.set_title('Correlation Matrix of Input Data', fontsize=16)
corr_fig.savefig('correlation_matrix.png')
plt.close(corr_fig)


Metrics DataFrame:
  Algorithm      Data  Accuracy  Precision    Recall  F1 Score
0       SVC  Training  0.669421   0.679855  0.669421  0.654439
1       SVC   Testing  0.639344   0.645698  0.639344  0.623870
2       GBC  Training  1.000000   1.000000  1.000000  1.000000
3       GBC   Testing  0.852459   0.852441  0.852459  0.852219
4       GNB  Training  0.822314   0.822109  0.822314  0.822084
5       GNB   Testing  0.803279   0.803531  0.803279  0.802529
6       RFC  Training  1.000000   1.000000  1.000000  1.000000
7       RFC   Testing  0.803279   0.803531  0.803279  0.802529
8       KNC  Training  0.743802   0.745850  0.743802  0.740563
9       KNC   Testing  0.704918   0.704918  0.704918  0.704918
