In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the semicolon-delimited dataset (fields may be wrapped in double quotes).
csv_path = 'Heart_Attack_Classification.csv'
df = pd.read_csv(csv_path, sep=';', quotechar='"')

# 'Risk of heart attack' is the target variable; every other column is a feature.
y = df['Risk of heart attack']
X = df.drop(columns=['Risk of heart attack'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=109
)

# Preprocess copies so the original split stays available unchanged.
X_train_copy, X_test_copy = X_train.copy(), X_test.copy()


Inspect the variability of each feature in the data, measured here as the number of distinct values the feature takes in the training set (a rough indication of how spread out its values are).

In [2]:
# Number of distinct values per feature (column-wise) in the training split.
unique = X_train.nunique()
unique

Age                         39
Sex                          2
Chest pain                   4
Blood pressure              43
Cholestoral                127
Diabetes                     2
Max heart rate              87
Exercice induced angina      2
Number of vessels            5
Other observations           4
dtype: int64

In [3]:
# One-column table of distinct-value counts, ordered from most to fewest.
# The counts Series is unnamed, so the resulting column is labelled 0.
variability = unique.to_frame().sort_values(by=0, ascending=False)
variability

Unnamed: 0,0
Cholestoral,127
Max heart rate,87
Blood pressure,43
Age,39
Number of vessels,5
Chest pain,4
Other observations,4
Sex,2
Diabetes,2
Exercice induced angina,2


As we can see from the table above, cholesterol and max heart rate have the highest variability (the most distinct values).

We’ll separate the data into categorical and numerical columns so that the missing values in each group can be filled using a different method.

In [4]:
# Handle missing values for object types:
# pull the object-dtype columns out of each split.
X_train_object, X_test_object = (
    frame.select_dtypes(include='object')
    for frame in (X_train_copy, X_test_copy)
)

The NaNs in each object column will be replaced with that column's mode (its most frequent value), computed on the training set.

In [5]:
# Fill missing values in every object column with the mode learned from the
# training split. The same training-set value is reused for the test split so
# that no test-set statistic enters the preprocessing.
for i in X_train_object.columns:
    modes = X_train_object[i].mode()
    if modes.empty:
        # The column is entirely NaN in the training split: there is no mode
        # to fill with, so leave it untouched instead of raising on modes[0].
        continue
    mode_value = modes.iloc[0]
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the column selection. The latter is chained assignment: it warns on
    # recent pandas and leaves the frame unchanged under Copy-on-Write.
    X_train_copy[i] = X_train_copy[i].fillna(mode_value)
    X_test_copy[i] = X_test_copy[i].fillna(mode_value)


The NaN values in the numeric columns will be filled with each column's median, computed on the training set.

In [6]:
# Handle missing values for numerical types
X_train_value = X_train_copy.select_dtypes(exclude=['object'])
X_test_value = X_test_copy.select_dtypes(exclude=['object'])

# Fill each non-object column with the median of the training split; the test
# split is filled with that same training median.
for i in X_train_value.columns:
    median_value = X_train_copy[i].median()
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the column selection. The latter is chained assignment: it warns on
    # recent pandas and leaves the frame unchanged under Copy-on-Write.
    X_train_copy[i] = X_train_copy[i].fillna(median_value)
    X_test_copy[i] = X_test_copy[i].fillna(median_value)

Encode categorical values with appropriate labels using Label Encoder

In [7]:
from sklearn.preprocessing import LabelEncoder

lb = LabelEncoder()

# Re-select the object-dtype columns from the current state of the training frame.
X_train_object = X_train_copy.select_dtypes(include=['object'])

for i in X_train_object.columns:
    # Learn the label -> integer mapping on the training column, then apply
    # that same mapping to the matching test column. The single encoder is
    # refit for every column.
    # NOTE(review): transform() is expected to raise if the test column holds
    # a label that never appeared in training - confirm this cannot happen.
    X_train_copy[i] = lb.fit_transform(X_train_copy[i])
    X_test_copy[i] = lb.transform(X_test_copy[i])
    

Make sure the test set contains no columns that are absent from the training set: check for any columns in X_test_copy that are not in X_train_copy and drop them.

In [8]:
# Collect the test-set columns that the training set does not have, then drop
# them from the test frame in one call.
extra_cols = [col for col in X_test_copy.columns if col not in X_train_copy.columns]
X_test_copy.drop(columns=extra_cols, inplace=True)

Use the Z-score to identify and remove outliers from the dataset

In [9]:
from scipy import stats
import numpy as np

# Absolute z-score of every value in the training features.
z_scores = np.abs(stats.zscore(X_train_copy))
# A row is kept only when every one of its features has |z| < 3.
filtered_entries = (z_scores < 3).all(axis=1)

# Apply the same row mask to features and labels so they stay aligned.
X_train_copy, y_train = X_train_copy[filtered_entries], y_train[filtered_entries]


Scale the data with two different scalers, so that we can compare which one gives the best AUC score.

In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Using StandardScaler
scaler_std = StandardScaler()
X_train_std = scaler_std.fit_transform(X_train_copy)
X_test_std = scaler_std.fit_transform(X_test_copy)

# Using MinMaxScaler
scaler_minmax = MinMaxScaler()
X_train_minmax = scaler_minmax.fit_transform(X_train_copy)
X_test_minmax = scaler_minmax.fit_transform(X_test_copy)

## Using MinMaxScaler

Fitting using Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Train logistic regression (iteration cap raised to 1000) on the min-max scaled features.
Lr = LogisticRegression(max_iter=1000).fit(X_train_minmax, y_train)

# Score with ROC AUC, using the predicted probability in column 1 of predict_proba.
y_pred = Lr.predict_proba(X_test_minmax)[:, 1]
prediction_score = roc_auc_score(y_test, y_pred)
print(f'Logistic regression prediction score for MinMaxScaler: {prediction_score}')

Logistic regression prediction score for MinMaxScaler: 0.9160021265284424


Fitting using Gradient Boosting

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

# Train gradient boosting on the min-max scaled features. The seed (the same
# value used for the train/test split) makes the fitted model, and therefore
# the reported score, reproducible across runs.
gb = GradientBoostingClassifier(random_state=109)
gb.fit(X_train_minmax, y_train)

# Score with ROC AUC, using the predicted probability in column 1 of predict_proba.
y_pred = gb.predict_proba(X_test_minmax)[:, 1]
prediction_score = roc_auc_score(y_test, y_pred)

print(f'Gradient Boosting prediction score: {prediction_score}')

Gradient Boosting prediction score: 0.8654970760233918


Fitting using Random Forest classifier

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the min-max scaled features. Random forests are
# stochastic (bootstrap samples, random feature subsets), so the seed (the same
# value used for the train/test split) is needed for a reproducible score.
rf = RandomForestClassifier(random_state=109)
rf.fit(X_train_minmax, y_train)

# Score with ROC AUC, using the predicted probability in column 1 of predict_proba.
y_pred = rf.predict_proba(X_test_minmax)[:, 1]
prediction_score = roc_auc_score(y_test, y_pred)

print(f'Random Forest prediction score: {prediction_score}')

Random Forest prediction score: 0.9032429558745347


Fitting using SVM classifier

In [14]:
# Import the class directly so that binding the fitted classifier to the name
# `svm` below does not overwrite an imported module of the same name.
from sklearn.svm import SVC

# Linear-kernel SVM on the min-max scaled features. probability=True enables
# predict_proba; the seed (the same value used for the train/test split) keeps
# those probability estimates reproducible across runs.
svm = SVC(kernel='linear', probability=True, random_state=109)
svm.fit(X_train_minmax, y_train)

# Score with ROC AUC, using the predicted probability in column 1 of predict_proba.
y_pred = svm.predict_proba(X_test_minmax)[:, 1]
prediction_score = roc_auc_score(y_test, y_pred)

print(f'SVM prediction score: {prediction_score}')

SVM prediction score: 0.8952684742158427


## Using StandardScaler

Fitting using Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Train logistic regression (iteration cap raised to 1000) on the standardized features.
Lr = LogisticRegression(max_iter=1000).fit(X_train_std, y_train)

# Score with ROC AUC, using the predicted probability in column 1 of predict_proba.
y_pred = Lr.predict_proba(X_test_std)[:, 1]
prediction_score = roc_auc_score(y_test, y_pred)
print(f'Logistic regression prediction score for StandardScaler: {prediction_score}')

Logistic regression prediction score for StandardScaler: 0.898989898989899


Fitting using Gradient Boosting

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

# Train gradient boosting on the standardized features. The seed (the same
# value used for the train/test split) makes the fitted model, and therefore
# the reported score, reproducible across runs.
gb = GradientBoostingClassifier(random_state=109)
gb.fit(X_train_std, y_train)

# Score with ROC AUC, using the predicted probability in column 1 of predict_proba.
y_pred = gb.predict_proba(X_test_std)[:, 1]
prediction_score = roc_auc_score(y_test, y_pred)

print(f'Gradient Boosting prediction score: {prediction_score}')

Gradient Boosting prediction score: 0.8841041998936735


Fitting using Random Forest Classifier

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the standardized features. Random forests are
# stochastic (bootstrap samples, random feature subsets), so the seed (the same
# value used for the train/test split) is needed for a reproducible score.
rf = RandomForestClassifier(random_state=109)
rf.fit(X_train_std, y_train)

# Score with ROC AUC, using the predicted probability in column 1 of predict_proba.
y_pred = rf.predict_proba(X_test_std)[:, 1]
prediction_score = roc_auc_score(y_test, y_pred)

print(f'Random Forest prediction score: {prediction_score}')

Random Forest prediction score: 0.9271664008506114


Fitting using SVM

In [18]:
# Import the class directly so that binding the fitted classifier to the name
# `svm` below does not overwrite an imported module of the same name.
from sklearn.svm import SVC

# Linear-kernel SVM on the standardized features. probability=True enables
# predict_proba; the seed (the same value used for the train/test split) keeps
# those probability estimates reproducible across runs.
svm = SVC(kernel='linear', probability=True, random_state=109)
svm.fit(X_train_std, y_train)

# Score with ROC AUC, using the predicted probability in column 1 of predict_proba.
y_pred = svm.predict_proba(X_test_std)[:, 1]
prediction_score = roc_auc_score(y_test, y_pred)

print(f'SVM prediction score: {prediction_score}')

SVM prediction score: 0.8926103136629452
