In [14]:
# Import libraries to be used.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import clone

In [15]:
# Define a random state value
RANDOM_STATE = 3030

# Preprocessing

In [16]:
# Define column names which will be used instead of the ones in the datasets.
column_names = [
    "age", "sex", "chest_pain_type", "rest_bp", "cholesterol",
    "fasting_bs", "resting_ecg", "max_hr", "exercise_angina", "st_exercise_depression",
    "st_exercise_slope", "major_vessels_fluorospy", "thal", "angiographic_disease"
]
na_values = ["?"] # Define what values will be interpreted as missing values, in this dataset the "?" character is used for this.

# Load all datasets using relative paths and combine into one dataframe.
df = pd.concat([
    pd.read_csv("../../data/processed.cleveland.data", names=column_names, na_values=na_values),
    pd.read_csv("../../data/processed.hungarian.data", names=column_names, na_values=na_values),
    pd.read_csv("../../data/processed.switzerland.data", names=column_names, na_values=na_values),
    pd.read_csv("../../data/processed.va.data", names=column_names, na_values=na_values)
]).drop(['st_exercise_slope', 'major_vessels_fluorospy', 'thal'], axis=1)

In [17]:
df[df["angiographic_disease"] > 1] = 1

In [18]:
df['age'].fillna((df['age'].mean()), inplace=True)
df['rest_bp'].fillna((df['rest_bp'].mean()), inplace=True)
df['cholesterol'].fillna((df['cholesterol'].mean()), inplace=True)
df['max_hr'].fillna((df['max_hr'].mean()), inplace=True)
df['st_exercise_depression'].fillna((df['st_exercise_depression'].mean()), inplace=True)
df = df.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [19]:
# Set category attributes
df["sex"] = df["sex"].astype("category")
df["chest_pain_type"] = df["chest_pain_type"].astype("category")
df["fasting_bs"] = df["fasting_bs"].astype("category")
df["resting_ecg"] = df["resting_ecg"].astype("category")
df["exercise_angina"] = df["exercise_angina"].astype("category")
df["angiographic_disease"] = df["angiographic_disease"].astype("category")

In [20]:
df["chest_pain_type"] = df["chest_pain_type"].map({1.0: "typical_angina", 2.0: "atypical_angina", 3.0: "non_angingal_pain", 4.0: "asymptomatic"})
df["resting_ecg"] = df["resting_ecg"].map({0.0: "resting_ecg_normal", 1.0: "resting_ecg_stt_abnormal", 2.0: "resting_ecg_left_ventricular_hypertrophy"})

ohe_chest_pain = pd.get_dummies(df["chest_pain_type"])
ohe_resting_ecg = pd.get_dummies(df["resting_ecg"])

df = pd.concat([df, ohe_chest_pain, ohe_resting_ecg], axis=1)
df.drop(["chest_pain_type"], axis=1, inplace=True)
df.drop(["resting_ecg"], axis=1, inplace=True)

In [21]:
df_X = df.drop(["angiographic_disease"], axis=1)
best_features = ['fasting_bs', 'exercise_angina', 'typical_angina', 'atypical_angina',
 'non_angingal_pain', 'resting_ecg_stt_abnormal']
df_X = df_X[best_features]

In [22]:
X = MinMaxScaler().fit_transform(df_X.to_numpy())
y = df["angiographic_disease"].to_numpy()

# Model implementation

In [23]:
clf = RandomForestClassifier(n_estimators = 5, criterion='gini', random_state=RANDOM_STATE)
clf.fit(X, y)

RandomForestClassifier(n_estimators=5, random_state=3030)

In [24]:
import pickle
with open('../dash/RF_model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [25]:
df_X['resting_ecg_stt_abnormal'].value_counts()

0    567
1    353
Name: resting_ecg_stt_abnormal, dtype: int64

In [26]:
df_X

Unnamed: 0,fasting_bs,exercise_angina,typical_angina,atypical_angina,non_angingal_pain,resting_ecg_stt_abnormal
0,1.0,0.0,1,0,0,0
1,1.0,1.0,1,0,0,1
2,0.0,1.0,0,0,0,0
3,0.0,0.0,0,0,1,0
4,0.0,0.0,0,1,0,0
...,...,...,...,...,...,...
195,1.0,0.0,0,0,0,1
196,0.0,1.0,1,0,0,1
197,1.0,1.0,1,0,0,1
198,1.0,1.0,0,0,0,0
