**CS-C3240 - Machine Learning D**
**Project**

**Authors: Aaron Gutierrez-Hernandez & Alexandre Cojot**

**Date created: 10-sep-2023**

**Last modified: 22-sep-2023**

# Import Libraries and Data

In [None]:
# Import libraries
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

%matplotlib inline

# Load the dataset from 'heart_data.csv' (path is relative to the working directory)
data = pd.read_csv('heart_data.csv')
data.head() # preview the first rows to check that the file was parsed as expected

# Clean Data

In [None]:
data.info() # print column names, dtypes and non-null counts of the dataframe

In [None]:
data.duplicated().sum() # number of rows that are exact duplicates of an earlier row

In [None]:
data.drop_duplicates(inplace=True) # drop duplicated rows in place (the first occurrence is kept)

# EDA

In [None]:
data.describe() # basic descriptive statistics (count, mean, std, quartiles, ...) of the dataframe

In [None]:
# Pairwise correlation between all columns (features and response variable)
corr_matrix = data.corr()
plt.figure(figsize=(16, 9))
sns.heatmap(corr_matrix, annot=True) # annotate every cell with its correlation coefficient

In [None]:
# Joint distribution of each pair of columns (features and response variable).
# sns.pairplot is a figure-level function: it always builds its own figure, so
# calling plt.figure() beforehand would only leave an empty figure in the output.
sns.pairplot(data)

In [None]:
# Check how the response variable is distributed: a strongly unbalanced target
# would let a model score well by always predicting a single value
target_counts = data['target'].value_counts() # absolute number of rows per class
target_ratios = target_counts/len(data)       # the same information as fractions of the dataset
print(target_counts, 50*'_', target_ratios, sep='\n')
sns.countplot(x=data['target'])

# Prepare Data

In [None]:
# Give the categorical columns a categorical dtype
categorical_features = ['sex','cp','fbs','restecg','exang','thal','target'] # columns containing categorical features
for col in categorical_features:
    data[col] = data[col].astype('category') # replace the column with its 'category' version
data.info()

In [None]:
# Standardize numerical features (zero mean, unit variance per column)
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# cross-validation and train/test split performed later in this notebook, so
# held-out rows contribute to the scaling statistics. Consider fitting the
# scaler on training data only (e.g. inside a pipeline).
numeric_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'] # columns containing numerical features
scaler = StandardScaler().fit(data[numeric_features])             # learn mean and std of each numeric column
data[numeric_features] = scaler.transform(data[numeric_features]) # overwrite the columns with their standardized values
data.describe() # descriptive statistics after scaling

In [None]:
y = data['target']              # labels (response variable)
X = data.drop(columns='target') # features: every column except the labels
print(X.shape,y.shape)

# ML Models

In [None]:
# Proposed classifiers; the rest of the notebook iterates over them in this order
clfs = [
    LogisticRegression(solver='liblinear', penalty='l2'),
    # random_state is fixed (like for the k-fold and the train/test split) so
    # that the forest, and therefore the reported scores, are reproducible
    RandomForestClassifier(criterion='entropy', max_depth=5, random_state=0),
]

## K-Fold Cross Validation

In [None]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0) # stratified 5-fold splitter with a fixed seed
acc_valid = []                                                 # mean validation accuracy of each classifier
warnings.filterwarnings('ignore', category=FutureWarning)      # silence FutureWarnings once, before looping
for clf in clfs:
    scrs = cross_val_score(clf, X, y, cv=kf, scoring='accuracy') # one accuracy value per fold
    mean_score = scrs.mean()
    acc_valid.append(mean_score)
    # Report: separator, classifier, per-fold scores and their mean (one per line)
    print(70*'_', clf, f'Scores: {scrs}', f'Scores mean: {mean_score:.4f}', sep='\n')

## Split data

In [None]:
# Stratified 80/20 split into training and testing sets (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,
                                                    stratify=y, random_state=0)
y_sets = {'y_train': y_train, 'y_test': y_test}
X_sets = {'y_train': X_train, 'y_test': X_test} # feature set belonging to each label set

# For each subset, report its size and check that the label balance was preserved
for y_name, y_set in y_sets.items():
    X_set = X_sets[y_name]
    y_counts = y_set.value_counts() # count response variable values to see if they are not unbalanced
    y_ratios = y_counts/len(y_set)  # get the same information in a ratios format
    print(50*'_')
    print(y_name)
    print('Set sizes')
    print(X_set.shape,y_set.shape)  # features shape, labels shape
    print('Label counts')
    print(y_counts)
    print('Label ratios')
    print(y_ratios)
    sns.countplot(x=y_set)
    plt.title(str(y_name)+' label counts')
    plt.show()

# Model Evaluation

In [None]:
def plt_confmat(y_true, y_pred, title, ax):
    """Draw the confusion matrix of a set of predictions as an annotated heatmap.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels, compared against `y_true`.
    title  : title given to the plot.
    ax     : matplotlib axes on which the heatmap is drawn.

    Returns nothing; the plot is drawn on `ax`.
    """
    # Tick labels are hard-coded for a two-class problem.
    # NOTE(review): assumes the first class means "no disease" - confirm.
    class_names = ['No Heart Disease', 'Heart Disease']
    heatmap_options = dict(annot=True, fmt='d', square=True, cbar=False,
                           xticklabels=class_names, yticklabels=class_names,
                           ax=ax)
    sns.heatmap(confusion_matrix(y_true, y_pred), **heatmap_options)
    ax.set(xlabel='Predicted Labels', ylabel='True Labels', title=title)

In [None]:
acc_train, acc_test = [], [] # training / test accuracy of each classifier, in `clfs` order
for clf in clfs:
    clf.fit(X_train, y_train)         # fit the classifier to the training data
    train_pred = clf.predict(X_train) # predictions on the data the model was trained on
    y_pred = clf.predict(X_test)      # predictions on the held-out test data
    # Accuracy on both subsets
    train_score = accuracy_score(y_train, train_pred)
    test_score  = accuracy_score(y_test, y_pred)
    acc_train.append(train_score)
    acc_test.append(test_score)
    # Text report for the current classifier
    print(60*'_')
    print(clf)
    print(f'Train Accuracy: {100*train_score:.2f}','%')
    print(f'Test Accuracy : {100*test_score:.2f}','%')
    print('Report:\n',classification_report(y_test, y_pred))
    # Train and test confusion matrices side by side
    _, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(12,6))
    plt_confmat(y_train, train_pred, 'Train Confusion Matrix', ax_train)
    plt_confmat(y_test, y_pred, 'Test Confusion Matrix', ax_test)
    plt.tight_layout()
    plt.show()

## Subset scores

In [None]:
# Summary table: one column per classifier (same order as the accuracy lists),
# one row per metric. The table is built in a single step from a float array
# instead of appending rows to an empty frame, so the columns are numeric
# regardless of how pandas infers dtypes when a frame is enlarged.
accuracies = np.asarray([acc_train, acc_valid, acc_test], dtype=float) # rows: train, validation, test
errors = 1 - accuracies                                                # error = 1 - accuracy
summary = np.vstack([accuracies,
                     errors,
                     accuracies.mean(axis=0), # average accuracy over the three subsets
                     errors.mean(axis=0)])    # average error over the three subsets
rows = {
    0: 'Training Accuracy',
    1: 'Validation Accuracy',
    2: 'Test Accuracy',
    3: 'Training Error',
    4: 'Validation Error',
    5: 'Test Error',
    6: 'Average Accuracy',
    7: 'Average Error'
}
df = pd.DataFrame(summary, columns=['Logistic Regression', 'Random Forest'])
df = df.rename(index=rows)
df = df.round(4)
df

In [None]:
# Bar chart of accuracy (rows 0-2 of df) and error (rows 3-5 of df) per data subset.
# Both groups are drawn on the same explicitly created axes: DataFrame.plot
# opens a new figure when no `ax` is given, which would ignore the figsize.
fig, ax = plt.subplots(figsize=(8,4.5))
df[0:3].plot(kind='bar',rot=0,ax=ax, alpha=.6) # accuracies, semi-transparent
df[3:6].plot(kind='bar',rot=0,ax=ax, alpha=1)  # errors, drawn at the same x positions
plt.legend(title='Classifiers', loc='upper left', bbox_to_anchor=(1, 1),
           labels=['Log Regression Accuracy','Random Forest Accuracy',
                   'Log Regression Error   ','Random Forest Error'])
plt.xlabel('Data subsets')
plt.xticks([0,1,2],['Training','Validation','Test']) # third group holds the test scores
plt.ylabel('Accuracy/Error')
plt.title('Accuracy and Error for data subsets')
plt.show()