**CS-C3240 - Machine Learning D**
**Project**

**Authors: Aaron Gutierrez-Hernandez & Alexandre Cojot**

**Date created: 10-sep-2023**

**Last modified: 22-sep-2023**

# Import Libraries and Data

In [None]:
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

%matplotlib inline

# Load the dataset from the CSV file (path is relative to the working directory)
csv_path = 'heart_data.csv'
data = pd.read_csv(csv_path)
data.head()  # preview the first rows

# Clean Data

In [None]:
# Print a concise summary of the dataframe: column names, non-null counts, dtypes and memory usage
data.info()

In [None]:
duplicate_rows = data.duplicated()  # boolean mask, True for every row that repeats an earlier one
duplicate_rows.sum()                # number of duplicated rows

In [None]:
data = data.drop_duplicates()  # drop duplicated rows, keeping the first occurrence of each

# EDA

In [None]:
# Basic descriptive statistics of the dataframe, as computed by DataFrame.describe with its defaults
data.describe()

In [None]:
# Annotated correlation matrix: feature-to-feature and feature-to-response correlations
fig, ax = plt.subplots(figsize=(16, 9))
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True, ax=ax)

In [None]:
# sns.pairplot is a figure-level function: it builds its own figure (a PairGrid), so a
# preceding plt.figure(...) call would only leave an extra empty figure behind and its
# figsize would never apply. Panel size is controlled through pairplot's `height`/`aspect`.
sns.pairplot(data) # see distribution between each pair of features and features with response variable

In [None]:
# Check how the response variable is split between its classes, to avoid fitting the model to just one value
n_rows = data.shape[0]
target_counts = data['target'].value_counts()  # absolute frequency of each response value
target_ratios = target_counts.div(n_rows)      # the same information as a fraction of all rows
for summary in (target_counts, target_ratios):
    print(summary)
sns.countplot(data=data, x='target')

# Prepare Data

In [None]:
# Give the categorical columns the pandas 'category' dtype
categorical_features = ['sex','cp','fbs','restecg','exang','thal','target'] # columns holding categorical values
category_dtypes = dict.fromkeys(categorical_features, 'category')           # column name -> target dtype
data = data.astype(category_dtypes)                                         # all other columns keep their dtype
data.info()

In [None]:
# Standardize the numerical features (StandardScaler: zero mean, unit variance per column)
# NOTE(review): the scaler is fit on every row of `data`, i.e. before the cross-validation and
# the train/test split further down, so held-out rows contribute to the scaling statistics.
numeric_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'] # columns treated as numerical
scaler = StandardScaler()
data[numeric_features] = scaler.fit_transform(data[numeric_features]) # learn mean/std and rescale in one step
data.describe() # statistics after scaling

In [None]:
# Separate the response column from the predictors
y = data['target']                # labels
X = data.drop(columns='target')   # every remaining column is a feature
print(X.shape, y.shape)

## K-Fold Cross Validation

In [None]:
# 5-fold cross-validation of the proposed classifier.
# The numeric columns are standardized INSIDE each fold: the scaler is fit on the training part
# of the fold only and then applied to the validation part, so no validation-fold statistics
# leak into the preprocessing. Standardization is an affine map, so re-fitting it per fold gives
# the same values as standardizing the raw columns on that fold, even if the columns were
# already standardized on the whole dataset beforehand.
clf  = LogisticRegression(solver='liblinear', penalty='l2')  # initialize proposed classifier
kf   = KFold(n_splits=5, shuffle=True, random_state=0)       # initialize k-folds
fold_scaler = ColumnTransformer(
    [('num', StandardScaler(), numeric_features)],  # scale only the numeric columns
    remainder='passthrough',                        # leave the other columns untouched
)
cv_model = make_pipeline(fold_scaler, clf)  # cross_val_score clones it, so `clf` itself stays unfitted
scrs = cross_val_score(cv_model, X, y, cv=kf, scoring='accuracy') # apply cv
print(scrs)
scrs.mean()

## Split data

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

# ML Model