# Coronary Heart Disease Prediction Using SuperLearner

## Import necessary libraries and Dataset

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from tensorflow import keras
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, roc_curve, auc, roc_auc_score
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import learning_curve
from sklearn import preprocessing

In [None]:
# Read the processed Cleveland heart-disease CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running this notebook on another machine.
dataset_path = 'C:\\Users\\23480\\Documents\\Personal\\SHU Academic\\3rd Semester\\Dissertation\\Dataset\\heart+disease\\processed_cleveland.csv'
df = pd.read_csv(dataset_path)

## Exploratory Data Analysis

In [None]:
# Print the first rows (DataFrame.head() default) to get an overview of the data
print(df.head())

In [None]:
# Number of rows and columns in the raw dataset, as a (rows, columns) tuple
df.shape

In [None]:
# Print summary statistics (count, mean, std, quartiles, ...) from DataFrame.describe()
print(df.describe())

In [None]:
# Count NaN values per column.
# NOTE: '?' placeholders in 'ca' and 'thal' are plain strings, so they are not
# counted here; they are filtered out in a later cell.
print(df.isnull().sum())

In [None]:
# Inspect the dtype of every column in the raw dataset
df.dtypes

In [None]:
# Drop rows whose 'ca' or 'thal' value is the '?' missing-data placeholder.
# A single combined mask followed by .copy() gives an independent DataFrame,
# so later in-place edits do not act on a view of the original data
# (avoids SettingWithCopyWarning).
placeholder = '?'
has_known_values = (df['ca'] != placeholder) & (df['thal'] != placeholder)
df = df[has_known_values].copy()

In [None]:
# Number of rows and columns after removing the '?' rows
df.shape

In [None]:
# Re-check the column dtypes after removing the '?' rows
df.dtypes

## Data Visualization

In [None]:
# Feature distributions: histograms for the numerical features and bar charts
# of value counts for the categorical features.

numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal', 'ca']

# --- Numerical features -----------------------------------------------------
# Size the grid from the number of features: allocating one row per feature
# with two columns left most of the figure blank.
num_cols = 2
num_rows = int(np.ceil(len(numerical_features) / num_cols))
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 15), squeeze=False)
fig.subplots_adjust(hspace=0.5)
numeric_axes = axes.ravel()

for ax, feature in zip(numeric_axes, numerical_features):
    df[feature].plot(kind='hist', ax=ax, legend=True)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)

# Remove the unused cells of the grid
for ax in numeric_axes[len(numerical_features):]:
    fig.delaxes(ax)

# --- Categorical features ---------------------------------------------------
cat_cols = 4
cat_rows = int(np.ceil(len(categorical_features) / cat_cols))
fig, axes = plt.subplots(cat_rows, cat_cols, figsize=(15, 10), squeeze=False)
fig.subplots_adjust(hspace=0.5)
categorical_axes = axes.ravel()

for ax, feature in zip(categorical_axes, categorical_features):
    df[feature].value_counts().plot(kind='bar', ax=ax, legend=True)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)

# Remove the unused cells; derived from the grid size rather than a fixed
# count, so every empty cell is removed whatever the number of features.
for ax in categorical_axes[len(categorical_features):]:
    fig.delaxes(ax)

plt.show()


In [None]:
# Count plot of 'exang', with bars split by 'thal' category
exang_axes = sns.countplot(x='exang', hue='thal', data=df)
exang_axes.set_title('exang v/s Thalassemia\n')

In [None]:
# Consider only people with heart disease according to the dataset.

# Keep the rows whose target value is 1, 2, 3, or 4 (df2 is reused by the
# plots in the following cells).
df2 = df[df['num'].isin([1, 2, 3, 4])]

# Hexbin joint plot of age against resting blood pressure. The previously
# created light-blue palette was never passed to the plot and has been removed.
print('Age vs trestbps(Resting Blood Pressure)')
sns.jointplot(data=df2, x='age', y='trestbps', kind='hex', cmap='Reds')

In [None]:
# Joint KDE plot of "age" against "thalach" for the diseased subset (df2)
sns.set(style="whitegrid")  # white background with grid lines
sns.jointplot(x="age", y="thalach", data=df2, height=7, kind="kde")

# Render the figure
plt.show()

In [None]:
# Joint KDE plot of "cp" against "chol" for the diseased subset (df2)
sns.set(style="whitegrid")  # white background with grid lines
sns.jointplot(x="cp", y="chol", data=df2, height=7, kind="kde")

# Render the figure
plt.show()

In [None]:
# Heart disease severity ('num') broken down by sex
sex_axes = sns.countplot(x='sex', hue='num', data=df2)
sex_axes.set_title('Sex v/s Heart disease Severity')

In [None]:
# Heart disease severity ('num') broken down by chest pain type ('cp')
cp_axes = sns.countplot(x='cp', hue='num', data=df2)
cp_axes.set_title('Chest Pain v/s Heart disease Severity')

In [None]:
# Heart disease severity ('num') broken down by number of coloured major vessels ('ca')
ca_axes = sns.countplot(x='ca', hue='num', data=df2)
ca_axes.set_title('Number of Coloured Major Vessels v/s Heart disease Severity')

In [None]:
# Human-readable meaning of each coded value, keyed by feature name
variable_descriptions = {
    'sex': {0: 'Female', 1: 'Male'},
    'restecg': {
        0: 'Normal',
        1: 'ST-T Wave Abnormality',
        2: 'Left Ventricular Hypertrophy',
    },
    'slope': {1: 'Upsloping', 2: 'Flat', 3: 'Downsloping'},
    'thal': {3: 'Normal', 6: 'Fixed Defect', 7: 'Reversible Defect'},
    'exang': {0: 'No', 1: 'Yes'},
    'cp': {
        1: 'Typical Anginal',
        2: 'Atypical Anginal',
        3: 'Non-Anginal Pain',
        4: 'Asymptomatic',
    },
    'fbs': {0: 'No', 1: 'Yes'},
    'num': {0: 'Normal', 1: 'SIHD', 2: 'Unstable Angina', 3: 'NSTEMI', 4: 'STEMI'},
}

# Build a frame with one column per feature, flip it so that each feature
# becomes a row (codes become columns), and label the row index.
variable_table = (
    pd.DataFrame(variable_descriptions)
    .transpose()
    .rename_axis('Feature')
)

# Show the lookup table
print(variable_table)

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of df
correlation_matrix = df.corr()
corr_fig, corr_axes = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=corr_axes)
corr_axes.set_title('Correlation Matrix')
plt.show()

In [None]:
# Plain (un-annotated) seaborn heatmap of the same correlation matrix,
# with both axes labelled by column name
axis_labels = correlation_matrix.columns
sns.heatmap(correlation_matrix, xticklabels=axis_labels, yticklabels=axis_labels)
plt.show()

## Data Preprocessing

In [None]:
# One-hot encode the categorical features that take more than two values and
# assemble the feature matrix X and target vector y.

# Readable labels for the integer codes of each multi-valued feature.
category_labels = {
    'cp': {1: 'typical_angina', 2: 'atypical_angina', 3: 'non_anginal_pain', 4: 'asymptomatic'},
    'restecg': {0: 'normal', 1: 'ST-T_wave_abnormality', 2: 'left_ventricular_hypertrophy'},
    'slope': {1: 'upsloping', 2: 'flat', 3: 'downsloping'},
    'thal': {3: 'normal', 6: 'fixed_defect', 7: 'reversible_defect'},
}

# DataFrame.replace with a nested dict maps values column by column and
# returns a new frame. The previous `df[col].replace(..., inplace=True)` form
# is a chained in-place update, which leaves df unchanged when pandas
# Copy-on-Write is active.
df = df.replace(category_labels)

# Columns to expand into dummy variables; values are cast to str so that the
# generated column names are '<feature>_<value>'.
encoded_columns = ['cp', 'thal', 'restecg', 'slope']
categorical_dummies = pd.get_dummies(df[encoded_columns].astype(str))

# Every remaining column except the target and the columns replaced by dummies
features = [col for col in df.columns if col != 'num' and col not in encoded_columns]

# Name the target Series explicitly (assigning to `y.columns` had no effect,
# because a Series has no columns).
y = df['num'].rename('target')
X = pd.concat([df[features], categorical_dummies], axis=1)

# NOTE(review): the original cell called `X.drop([92, 138, 163, 164, 251])`
# without using the returned frame, so those rows were never removed. The
# no-op has been dropped to keep results unchanged; if the rows should be
# excluded, drop them from BOTH X and y so the two stay aligned.
X.head()

In [None]:
# (rows, columns) of the encoded feature matrix
X.shape

In [None]:
# Count NaN values per column of the encoded feature matrix
print(X.isnull().sum())

In [None]:
# Distinct target values present in y
y.unique()

In [None]:
# Number of samples per target class before oversampling
y.value_counts()

In [None]:
 # Plot the class distribution BEFORE applying SMOTE (y has not been resampled yet)
y.value_counts().plot(kind='bar')
plt.xlabel("Target")
plt.title('Class Distribution before Applying SMOTE')
plt.show()

In [None]:
# Class balancing using SMOTE.
# A fixed random_state makes the synthetic samples, and therefore every
# downstream metric, reproducible between runs (matching the seed used for
# the other estimators in this notebook).
# NOTE(review): oversampling is applied to the full dataset before the
# train/test split, so synthetic points derived from test rows can end up in
# the training set. Consider resampling only the training portion.
oversample = SMOTE(random_state=42)
X, y = oversample.fit_resample(X, y)

In [None]:
# Number of samples per target class after oversampling
y.value_counts()

In [None]:
# Frequency of each distinct row (combination of feature values) in X
X.value_counts()

In [None]:
 # Plot the class distribution after applying SMOTE (y is the resampled target)
y.value_counts().plot(kind='bar')
plt.xlabel("Target")
plt.title('Class Distribution After Applying SMOTE')
plt.show()

## Feature Selection

In [None]:
# Recursive Feature Elimination (RFE) with cross-validation

# Random forest used to rank the features; seeded so the selection is
# reproducible between runs.
clf = RandomForestClassifier(random_state=42)

# Eliminate one feature per step, scoring each subset by 5-fold CV accuracy
rfecv = RFECV(estimator=clf, step=1, cv=5, scoring='accuracy')
rfecv.fit(X, y)

# Per-feature result: `support_` is the boolean "kept" mask and `ranking_` is
# the elimination rank (1 = selected). The original table labelled the mask
# as the ranking.
feature_ranking_df = pd.DataFrame({
    "Feature": X.columns,
    "Selected": rfecv.support_,
    "Feature ranking": rfecv.ranking_,
})
print(feature_ranking_df)

# Print the optimal number of features and the rank of every feature
print("Optimal number of features : %d" % rfecv.n_features_)
print("Feature ranking: ", rfecv.ranking_)

In [None]:
# Univariate feature selection with SelectKBest and the chi-squared statistic

# Fit the selector so that a score and a p-value are computed for every feature
selector = SelectKBest(score_func=chi2)
selector.fit(X, y)

feature_scores = selector.scores_
feature_pvalues = selector.pvalues_

# One row per feature, ordered from the highest chi-squared score to the lowest
feature_ranking_df = pd.DataFrame(
    {
        "Feature": X.columns,
        "Score": feature_scores,
        "P-Value": feature_pvalues,
    }
).sort_values(by="Score", ascending=False)

print(feature_ranking_df)

In [None]:
# Tree-based feature importance

# Fit a seeded random forest on the full (resampled) data
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X, y)

# Importance score of every feature, as reported by the fitted forest
feature_importances = clf.feature_importances_

# Table of features ordered from most to least important
feature_importance_df = pd.DataFrame(
    {"Feature": X.columns, "Importance": feature_importances}
).sort_values(by="Importance", ascending=False)

print("Feature Importance Scores:")
print(feature_importance_df)

# Keep the features whose importance reaches the threshold
threshold = 0.01
meets_threshold = feature_importance_df['Importance'] >= threshold
selected_features = feature_importance_df.loc[meets_threshold, 'Feature']
X_selected = X[selected_features]

## Testing Selected Features with Relevant ML Algorithms

In [None]:
# Random Forest classifier on the selected feature subset

# Features kept after the selection experiments above
selected_features = ['age', 'sex','trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak', 'ca', 'cp_asymptomatic',
                     'cp_atypical_angina', 'cp_non_anginal_pain', 'cp_typical_angina', 'thal_3', 'thal_7',
                     'restecg_left_ventricular_hypertrophy', 'restecg_normal', 'slope_flat', 'slope_upsloping']
X_selected = X[selected_features]

# 80/20 shuffled train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, shuffle=True, random_state=42)

# Standardise the features: statistics are learned on the training split only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_selected = scaler.transform(X_train)
X_test_selected = scaler.transform(X_test)

# Train the forest and predict the held-out split
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train_selected, y_train)
y_pred = rfc.predict(X_test_selected)

# Compute the evaluation artefacts, then report them
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(confusion)
print('Classification Report:')
print(classification_rep)

In [None]:
# XGBoost classifier on the selected feature subset

# Selected Features
selected_features = ['age', 'sex','trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak', 'ca', 'cp_asymptomatic',
                     'cp_atypical_angina', 'cp_non_anginal_pain', 'cp_typical_angina', 'thal_3', 'thal_7',
                     'restecg_left_ventricular_hypertrophy', 'restecg_normal', 'slope_flat', 'slope_upsloping']

# Select the desired features from the DataFrame
X_selected = X[selected_features]

# Train-test dataset split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42, shuffle=True)

# Standardise the features (statistics learned on the training split only)
scaler = StandardScaler()
X_train_selected = scaler.fit_transform(X_train)
X_test_selected = scaler.transform(X_test)

# Initialise the XGBoost classifier. It is bound to `xgb_clf` rather than
# `xgb` so that the `xgboost as xgb` module import is not shadowed by an
# estimator instance.
xgb_clf = XGBClassifier()

# Fit the model to the training data with the selected features
xgb_clf.fit(X_train_selected, y_train)

# Make predictions on the test data
y_pred = xgb_clf.predict(X_test_selected)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Generate a confusion matrix
confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(confusion)

# Display a classification report
classification_rep = classification_report(y_test, y_pred)
print('Classification Report:')
print(classification_rep)


In [None]:
# Linear Support Vector Machine on the selected feature subset

# Features kept after the selection experiments above
selected_features = ['age', 'sex','trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak', 'ca', 'cp_asymptomatic',
                     'cp_atypical_angina', 'cp_non_anginal_pain', 'cp_typical_angina', 'thal_3', 'thal_7',
                     'restecg_left_ventricular_hypertrophy', 'restecg_normal', 'slope_flat', 'slope_upsloping']
X_selected = X[selected_features]

# 80/20 shuffled train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, shuffle=True, random_state=42)

# Standardise the features: statistics are learned on the training split only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_selected = scaler.transform(X_train)
X_test_selected = scaler.transform(X_test)

# Train the linear-kernel SVM and predict the held-out split
clf = SVC(kernel='linear', C=1.0)
clf.fit(X_train_selected, y_train)
y_pred = clf.predict(X_test_selected)

# Compute the evaluation artefacts, then report them
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(confusion)
print('Classification Report:')
print(classification_rep)

In [None]:
# Feed-forward neural network (Keras) on the selected feature subset

# Seed NumPy and TensorFlow so weight initialisation and batch shuffling are
# repeatable, in line with the fixed seeds used for the other models.
np.random.seed(42)
tf.random.set_seed(42)

# Selected Features
selected_features = ['age', 'sex','trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak', 'ca', 'cp_asymptomatic',
                     'cp_atypical_angina', 'cp_non_anginal_pain', 'cp_typical_angina', 'thal_3', 'thal_7',
                     'restecg_left_ventricular_hypertrophy', 'restecg_normal', 'slope_flat', 'slope_upsloping']

# Select the desired features from the DataFrame
X_selected = X[selected_features]

# Train-test dataset split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42, shuffle=True)

# Standardise the features (statistics learned on the training split only)
scaler = StandardScaler()
X_train_selected = scaler.fit_transform(X_train)
X_test_selected = scaler.transform(X_test)

# Width of the softmax layer, derived from the target instead of a literal 5.
# sparse_categorical_crossentropy expects integer labels 0..n_classes-1.
n_classes = len(np.unique(y))

# Simple feed-forward network: two hidden ReLU layers and a softmax output
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(n_classes, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model, holding out 20% of the training split for validation
model.fit(X_train_selected, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Predict class probabilities and take the most probable class per sample
y_pred = model.predict(X_test_selected)
y_pred_classes = y_pred.argmax(axis=-1)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_classes)
print(f'Accuracy: {accuracy:.2f}')

# Generate a confusion matrix
confusion = confusion_matrix(y_test, y_pred_classes)
print('Confusion Matrix:')
print(confusion)

# Display a classification report
classification_rep = classification_report(y_test, y_pred_classes)
print('Classification Report:')
print(classification_rep)

In [None]:
# K-Nearest Neighbours on the selected feature subset

# Features kept after the selection experiments above
selected_features = ['age', 'sex','trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak', 'ca', 'cp_asymptomatic',
                     'cp_atypical_angina', 'cp_non_anginal_pain', 'cp_typical_angina', 'thal_3', 'thal_7',
                     'restecg_left_ventricular_hypertrophy', 'restecg_normal', 'slope_flat', 'slope_upsloping']
X_selected = X[selected_features]

# 80/20 shuffled train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, shuffle=True, random_state=42)

# Standardise the features: statistics are learned on the training split only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_selected = scaler.transform(X_train)
X_test_selected = scaler.transform(X_test)

# KNN with k neighbours, trained on the scaled training split
k = 5
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train_selected, y_train)

# Predict the held-out split
y_pred = knn.predict(X_test_selected)

# Accuracy, then the per-class precision / recall / F1 report
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

In [None]:
# Multinomial Logistic Regression on the selected feature subset

# Selected Features
selected_features = ['age', 'sex','trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak', 'ca', 'cp_asymptomatic',
                     'cp_atypical_angina', 'cp_non_anginal_pain', 'cp_typical_angina', 'thal_3', 'thal_7',
                     'restecg_left_ventricular_hypertrophy', 'restecg_normal', 'slope_flat', 'slope_upsloping']

# Select the desired features from the DataFrame
X_selected = X[selected_features]

# Train-test dataset split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42, shuffle=True)

# Standardise the features (statistics learned on the training split only)
scaler = StandardScaler()
X_train_selected = scaler.fit_transform(X_train)
X_test_selected = scaler.transform(X_test)

# The `multi_class` argument is deprecated in recent scikit-learn releases.
# With the lbfgs solver and a multiclass target the model is already fitted
# with the multinomial loss, so the argument is omitted.
lrg = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)

# Fit the model to the training data
lrg.fit(X_train_selected, y_train)

# Make predictions on the test data
y_pred = lrg.predict(X_test_selected)

# Evaluate the model for multiclass classification (class-frequency weighted averages)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print a classification report with precision, recall, and F1-score
print(classification_report(y_test, y_pred))

# Create a confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the evaluation metrics
print("Model Evaluation:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Confusion Matrix:")
print(conf_matrix)

In [None]:

# Data preparation and base learners for the Super Learner

# Selected Features
selected_features = ['age', 'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak', 'cp_asymptomatic',
                     'cp_atypical_angina', 'cp_non_anginal_pain', 'cp_typical_angina', 'thal_3', 'thal_7',
                     'restecg_left_ventricular_hypertrophy', 'restecg_normal', 'slope_flat', 'slope_upsloping', 'ca']

# Select the features and convert 'ca' to float in one step. astype returns a
# new frame, so nothing is assigned into a slice of X (the previous
# `X_selected['ca'] = ...` triggered SettingWithCopyWarning).
X_selected = X[selected_features].astype({'ca': float})

# Train-Test dataset split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42, shuffle=True)

# Standardise the features (statistics learned on the training split only)
scaler = StandardScaler()
X_train_selected = scaler.fit_transform(X_train)
X_test_selected = scaler.transform(X_test)

# Define base learners
knn = KNeighborsClassifier(n_neighbors=5)
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(learning_rate=0.1,objective="multi:softmax", n_estimators=100, random_state=42)
svm = SVC(kernel='linear', C=1)
nn = MLPClassifier(hidden_layer_sizes=(128, 128, 128), max_iter=500, random_state=42)

# Create a SuperLearner class
class SuperLearner(ClassifierMixin, BaseEstimator):
    """Stacked ensemble: base learners feed a random-forest meta-learner.

    The meta-learner is trained on *out-of-fold* predictions of the base
    learners. Training it on in-sample predictions (as before) lets base
    models that memorise the training set look perfect, so the meta-learner
    learns to trust them blindly.

    Mixins are listed before ``BaseEstimator``, following the scikit-learn
    convention for custom estimators.

    Parameters
    ----------
    base_learners : list of estimators
        Classifiers exposing ``fit`` and ``predict``. They are fitted in
        place.
    cv : int, default=5
        Number of folds used to build the out-of-fold predictions.
    """

    def __init__(self, base_learners, cv=5):
        self.base_learners = base_learners
        self.cv = cv
        self.meta_learner = RandomForestClassifier(n_estimators=100, random_state=42)

    @staticmethod
    def _take_rows(data, positions):
        """Select rows by position from a pandas object or an array-like."""
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    def fit(self, X_train_selected, y):
        """Fit the base learners and the meta-learner; return ``self``."""
        splitter = KFold(n_splits=self.cv, shuffle=True, random_state=42)
        # One column of held-out predictions per base learner
        oof_predictions = np.empty(
            (len(y), len(self.base_learners)), dtype=np.asarray(y).dtype)

        for column, model in enumerate(self.base_learners):
            for fit_idx, holdout_idx in splitter.split(X_train_selected):
                model.fit(self._take_rows(X_train_selected, fit_idx),
                          self._take_rows(y, fit_idx))
                oof_predictions[holdout_idx, column] = model.predict(
                    self._take_rows(X_train_selected, holdout_idx))
            # Final refit on all rows: this is the model used at predict time
            model.fit(X_train_selected, y)

        self.meta_learner.fit(oof_predictions, y)
        self.classes_ = self.meta_learner.classes_
        return self

    def predict(self, X_train_selected):
        """Predict labels by passing base-learner predictions to the meta-learner."""
        predictions = np.column_stack([model.predict(X_train_selected) for model in self.base_learners])
        return self.meta_learner.predict(predictions)

# Initialize the Super Learner with the base learners
super_learner = SuperLearner(base_learners=[knn, rfc, xgb, svm, nn])

# 10-fold cross-validation on the training split.
# Each fold fits its own scaler on that fold's training rows. Previously the
# folds used the unscaled X_train while the final model was trained on scaled
# data, so the CV estimate did not describe the model being reported.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
accuracies = []

for train_index, test_index in kf.split(X_train):
    fold_scaler = StandardScaler()
    X_train_cv = fold_scaler.fit_transform(X_train.iloc[train_index])
    X_test_cv = fold_scaler.transform(X_train.iloc[test_index])
    y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[test_index]

    # Fit the Super Learner on the fold's training rows
    super_learner.fit(X_train_cv, y_train_cv)

    # Score the fold's validation rows
    y_pred_cv = super_learner.predict(X_test_cv)
    accuracies.append(accuracy_score(y_test_cv, y_pred_cv))

# Mean accuracy over all folds
mean_accuracy = np.mean(accuracies)

# Fit the Super Learner on the entire (scaled) training split
super_learner.fit(X_train_selected, y_train)

# Predict the (scaled) test split
y_pred = super_learner.predict(X_test_selected)

# Test accuracy and confusion matrix
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the results
print(f'Mean Cross-Validation Accuracy: {mean_accuracy:.2f}')
print(f'Super Learner Test Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(conf_matrix)

In [None]:
# Hyperparameter tuning of the base learners with grid search

# Selected Features
selected_features = ['age', 'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak', 'cp_asymptomatic',
                     'cp_atypical_angina', 'cp_non_anginal_pain', 'cp_typical_angina', 'thal_3', 'thal_7',
                     'restecg_left_ventricular_hypertrophy', 'restecg_normal', 'slope_flat', 'slope_upsloping', 'ca']

# Select the features and convert 'ca' to float without assigning into a
# slice of X (avoids SettingWithCopyWarning).
X_selected = X[selected_features].astype({'ca': float})

# Train-Test dataset split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42, shuffle=True)

# Standardise the features before the search. KNN and the MLP are sensitive
# to feature scale, and every other model cell in this notebook trains on
# standardised data, so tuning on raw values selected parameters for a
# different problem.
# NOTE(review): the scaler is fitted once on the whole training split, so the
# inner CV folds share scaling statistics; wrap each model in a Pipeline if
# strictly leak-free tuning is required.
scaler = StandardScaler()
X_train_selected = scaler.fit_transform(X_train)
X_test_selected = scaler.transform(X_test)

# Base Learners
knn = KNeighborsClassifier()
rfc = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(objective="multi:softmax", random_state=42)
nn = MLPClassifier(max_iter=1000, random_state=0)

# Hyperparameter Grids
knn_param_grid = {'n_neighbors': [3, 5, 7]}
rfc_param_grid = {'n_estimators': [100, 200, 300]}
xgb_param_grid = {'n_estimators': [100, 200, 300]}
nn_param_grid = {'hidden_layer_sizes': [(50, 50), (100, 100), (50, 50, 50)]}

# Grid Search (5-fold CV, accuracy)
knn_grid = GridSearchCV(knn, knn_param_grid, cv=5, scoring='accuracy')
rfc_grid = GridSearchCV(rfc, rfc_param_grid, cv=5, scoring='accuracy')
xgb_grid = GridSearchCV(xgb_model, xgb_param_grid, cv=5, scoring='accuracy')
nn_grid = GridSearchCV(nn, nn_param_grid, cv=5, scoring='accuracy')

# Fit base learners with hyperparameter tuning
for grid in (knn_grid, rfc_grid, xgb_grid, nn_grid):
    grid.fit(X_train_selected, y_train)

# Access best hyperparameters
best_knn_params = knn_grid.best_params_
best_rfc_params = rfc_grid.best_params_
best_xgb_params = xgb_grid.best_params_
best_nn_params = nn_grid.best_params_

print(best_knn_params)
print(best_rfc_params)
print(best_xgb_params)
print(best_nn_params)

## Model Evaluation

In [None]:
# Data preparation and base learners for the final Super Learner evaluation

# Selected Features
selected_features = ['age', 'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak', 'cp_asymptomatic',
                     'cp_atypical_angina', 'cp_non_anginal_pain', 'cp_typical_angina', 'thal_3', 'thal_7',
                     'restecg_left_ventricular_hypertrophy', 'restecg_normal', 'slope_flat', 'slope_upsloping', 'ca']

# Select the features and convert 'ca' to float in one step. astype returns a
# new frame, so nothing is assigned into a slice of X (the previous
# `X_selected['ca'] = ...` triggered SettingWithCopyWarning).
X_selected = X[selected_features].astype({'ca': float})

# Train-Test dataset split (note: seed 0 here, unlike the earlier cells)
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=0, shuffle=True)

# Standardise the features (statistics learned on the training split only)
scaler = StandardScaler()
X_train_selected = scaler.fit_transform(X_train)
X_test_selected = scaler.transform(X_test)

# Define base learners
knn = KNeighborsClassifier(n_neighbors=5)
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(learning_rate=0.1, objective="multi:softmax", n_estimators=100, random_state=42)
svm = SVC(kernel='linear', C=1, probability=True)
nn = MLPClassifier(hidden_layer_sizes=(128, 128, 128), max_iter=1000, random_state=42)

# Create a SuperLearner class
class SuperLearner(ClassifierMixin, BaseEstimator):
    """Stacked ensemble: base learners feed a random-forest meta-learner.

    The meta-learner is trained on *out-of-fold* predictions of the base
    learners. Training it on in-sample predictions (as before) lets base
    models that memorise the training set look perfect, so the meta-learner
    learns to trust them blindly.

    Mixins are listed before ``BaseEstimator``, following the scikit-learn
    convention for custom estimators.

    Parameters
    ----------
    base_learners : list of estimators
        Classifiers exposing ``fit`` and ``predict``. They are fitted in
        place.
    cv : int, default=5
        Number of folds used to build the out-of-fold predictions.
    """

    def __init__(self, base_learners, cv=5):
        self.base_learners = base_learners
        self.cv = cv
        self.meta_learner = RandomForestClassifier(n_estimators=100, random_state=0)

    @staticmethod
    def _take_rows(data, positions):
        """Select rows by position from a pandas object or an array-like."""
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    def _base_predictions(self, X):
        """Stack the label predictions of all base learners column-wise."""
        return np.column_stack([model.predict(X) for model in self.base_learners])

    def fit(self, X, y):
        """Fit the base learners and the meta-learner; return ``self``."""
        splitter = KFold(n_splits=self.cv, shuffle=True, random_state=42)
        # One column of held-out predictions per base learner
        oof_predictions = np.empty(
            (len(y), len(self.base_learners)), dtype=np.asarray(y).dtype)

        for column, model in enumerate(self.base_learners):
            for fit_idx, holdout_idx in splitter.split(X):
                model.fit(self._take_rows(X, fit_idx), self._take_rows(y, fit_idx))
                oof_predictions[holdout_idx, column] = model.predict(
                    self._take_rows(X, holdout_idx))
            # Final refit on all rows: this is the model used at predict time
            model.fit(X, y)

        self.meta_learner.fit(oof_predictions, y)
        self.classes_ = self.meta_learner.classes_
        return self

    def predict(self, X):
        """Predict labels by passing base-learner predictions to the meta-learner."""
        return self.meta_learner.predict(self._base_predictions(X))

    def predict_proba(self, X):
        """Return the meta-learner's class probabilities for ``X``."""
        return self.meta_learner.predict_proba(self._base_predictions(X))

# Initialize the Super Learner with the base learners
super_learner = SuperLearner(base_learners=[knn, rfc, xgb, svm, nn])

# 10-fold cross-validation on the training split.
# Each fold fits its own scaler on that fold's training rows, so the
# validation rows do not influence the scaling statistics.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
accuracies = []

for train_index, test_index in kf.split(X_train):
    fold_scaler = StandardScaler()
    X_train_cv = fold_scaler.fit_transform(X_train.iloc[train_index])
    X_test_cv = fold_scaler.transform(X_train.iloc[test_index])
    y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[test_index]

    # Fit the Super Learner on the fold's training rows
    super_learner.fit(X_train_cv, y_train_cv)

    # Score the fold's validation rows
    y_pred_cv = super_learner.predict(X_test_cv)
    accuracies.append(accuracy_score(y_test_cv, y_pred_cv))

# Mean accuracy over all folds
mean_accuracy = np.mean(accuracies)

# Fit the Super Learner on the entire (scaled) training split
super_learner.fit(X_train_selected, y_train)

# Predict the (scaled) test split
y_pred = super_learner.predict(X_test_selected)

# Test accuracy and confusion matrix
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the results. The cross-validation estimate is reported alongside the
# test score instead of being computed and then discarded.
print(f'Mean Cross-Validation Accuracy: {mean_accuracy:.2f}')
print(f'Super Learner Test Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(conf_matrix)

# One-vs-rest ROC analysis: binarise the true labels and take the class
# probabilities predicted by the Super Learner
num_classes = len(np.unique(y))
class_ids = list(range(num_classes))
y_bin_test = label_binarize(y_test, classes=class_ids)
y_bin_pred = super_learner.predict_proba(X_test_selected)

# Per-class ROC curve and area under it
fpr, tpr, roc_auc = {}, {}, {}
for i in class_ids:
    fpr[i], tpr[i], _ = roc_curve(y_bin_test[:, i], y_bin_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average: pool every (sample, class) decision into one curve
fpr["micro"], tpr["micro"], _ = roc_curve(y_bin_test.ravel(), y_bin_pred.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Macro-average: interpolate each class curve on the union of all false
# positive rates, then average the true positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in class_ids]))
mean_tpr = np.zeros_like(all_fpr)
for i in class_ids:
    mean_tpr = mean_tpr + np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr = mean_tpr / num_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot the two averaged curves first, then one curve per class
plt.figure()
for average_name, line_colour in (("micro", 'deeppink'), ("macro", 'navy')):
    plt.plot(fpr[average_name], tpr[average_name],
             label=f'{average_name}-average ROC curve (AUC = {roc_auc[average_name]:.2f})',
             color=line_colour, linestyle=':', linewidth=4)

colors = ['aqua', 'darkorange', 'cornflowerblue', 'green', 'red']
for i, color in zip(class_ids, colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label=f'ROC curve of class {i} (AUC = {roc_auc[i]:.2f})')

# Chance diagonal, axis limits and labelling
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Multi-Class Classification')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Making predictions with the super learner

# Fit the Super Learner on the entire training data.
# NOTE(review): this refit uses the UNSCALED X_train / X_test, unlike the
# evaluation cell, which uses the standardised arrays. It is left as-is
# because the following cells also predict on the unscaled X_test.
super_learner.fit(X_train, y_train)

# A fitted model predicts deterministically, so predict once and repeat the
# result for each of the 20 panels instead of calling predict 20 times.
num_predictions = 20
y_pred = super_learner.predict(X_test)
predictions = np.tile(np.asarray(y_pred, dtype=float), (num_predictions, 1))

# Plot by position: passing the y_test Series directly would place the actual
# values at their index labels while the predictions sit at 0..n-1, so the
# two point clouds would not line up.
actual_values = np.asarray(y_test)

# Grid sized from the number of panels (the previous 5x10 grid left 30 empty
# slots and made each panel very small).
panel_cols = 5
panel_rows = int(np.ceil(num_predictions / panel_cols))
plt.figure(figsize=(15, 10))

for i in range(num_predictions):
    plt.subplot(panel_rows, panel_cols, i + 1)
    plt.plot(actual_values, 'b.', label='Real Values')
    plt.plot(predictions[i], 'g.', alpha=0.5, label=f'Prediction {i + 1}')
    plt.title(f'Prediction {i + 1}')
    plt.xlabel('Data Point')
    plt.ylabel('Value')

# Adjust subplot layout
plt.tight_layout()

# Show the plots
plt.show()

In [None]:
# Repeat the super learner's test-set prediction a few times and tabulate
# the results next to the true labels
num_predictions = 4
predictions = np.zeros((num_predictions, len(X_test)))
for run in range(num_predictions):
    y_pred = super_learner.predict(X_test)
    predictions[run] = y_pred

# First 20 actual values, followed by one column per prediction run
comparison_df = pd.DataFrame({'Actual': y_test.iloc[:20]})
for run, run_predictions in enumerate(predictions):
    comparison_df[f'Prediction {run + 1}'] = run_predictions[:20]

# Display the DataFrame
print(comparison_df)


In [None]:
# Compare the first predictions with the actual classes
num_predictions = 20
y_pred = super_learner.predict(X_test)

# Take the first `num_predictions` samples by position. Plotting the y_test
# Series directly would use its index labels as x-coordinates, while y_pred is
# plotted at 0..n-1, so the two lines would not share an x-axis.
actual_head = np.asarray(y_test)[:num_predictions]
predicted_head = np.asarray(y_pred)[:num_predictions]

plt.figure(figsize=(10, 6))
plt.plot(actual_head, label='Actual', marker='o')
plt.plot(predicted_head, label='Prediction 1', linestyle='--', marker='x')
plt.xlabel('Sample Index')
plt.ylabel('Class')
plt.title('Comparison of First Prediction with Actual')
plt.legend()
plt.grid()
plt.show()


In [None]:
import plotly.graph_objects as go

# Predict the test split and collect actual / predicted labels side by side
y_pred = super_learner.predict(X_test)
actual = list(y_test)
predcition = list(y_pred)

dic = {'Actual': actual, 'Prediction': predcition}
result = pd.DataFrame(dic)

# Interactive scatter: true labels as a connected line, predictions as markers
sample_positions = np.arange(0, len(y_test))
fig = go.Figure()
fig.add_trace(go.Scatter(x=sample_positions, y=y_test,
                         mode='markers+lines',
                         name='Test'))
fig.add_trace(go.Scatter(x=sample_positions, y=y_pred,
                         mode='markers',
                         name='Pred'))

In [None]:

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation scores against training-set size.

    Scores come from ``sklearn.model_selection.learning_curve``; each curve is
    drawn as the mean over the CV folds with a shaded band of one standard
    deviation either side.

    Parameters
    ----------
    estimator : estimator passed through to ``learning_curve``.
    title : figure title.
    X, y : training data and targets.
    ylim : optional (ymin, ymax) pair for the y-axis.
    cv, n_jobs, train_sizes : forwarded to ``learning_curve``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the figure drawn on it.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (mean, std, colour, legend label) for each of the two curves
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1), "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1), "g", "Cross-validation score"),
    ]

    plt.grid()

    # Shaded +/- one standard deviation bands first ...
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    # ... then the mean curves on top
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

# Fresh Super Learner built from the base learners defined above
curve_learners = [knn, rfc, xgb, svm, nn]
super_learner = SuperLearner(base_learners=curve_learners)

# Draw its learning curve on the training split using 5-fold cross-validation
plot_learning_curve(super_learner, "Learning Curve", X_train, y_train, cv=5)

plt.show()
