In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.preprocessing import LabelEncoder
import pickle

In [2]:
# getting heart attack data
h_a = pd.read_csv("./data/heart.csv")
# getting o2 saturation data
o2= pd.read_csv("./data/o2Saturation.csv")

In [3]:
#we dont have info about the columns oldpeak and slp so we delete them

h_a.drop(["oldpeak","slp", "thall"], axis=1, inplace=True) 


In [4]:
df=h_a.join(o2)

In [5]:
df.rename(columns = {'98.6':'o2saturation', 'output':'ha_prediction'}, inplace = True)

In [6]:
df.isnull().any()

age              False
sex              False
cp               False
trtbps           False
chol             False
fbs              False
restecg          False
thalachh         False
exng             False
caa              False
ha_prediction    False
o2saturation     False
dtype: bool

In [7]:
df.head(8)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,caa,ha_prediction,o2saturation
0,63,1,3,145,233,1,0,150,0,0,1,98.6
1,37,1,2,130,250,0,1,187,0,0,1,98.6
2,41,0,1,130,204,0,0,172,0,0,1,98.6
3,56,1,1,120,236,0,1,178,0,0,1,98.1
4,57,0,0,120,354,0,1,163,1,0,1,97.5
5,57,1,0,140,192,0,1,148,0,0,1,97.5
6,56,0,1,140,294,0,0,153,0,0,1,97.5
7,44,1,1,120,263,0,1,173,0,0,1,97.5


In [8]:
df_train=df[:242]
df_val=df[243:]

In [9]:
cat_variables=["sex", "cp", "fbs", "restecg", "exng", "caa"] 


In [12]:
#encode data, we create a dictionary with the name and the column and the ecndoer . for example {"sex":{"female":0, "man":1 }}
col_mapper = {}

for column in cat_variables:
    le = LabelEncoder()
    le.fit(df_train.loc[:, column]) #here we select the column on the DS to do the encoding
    class_names = le.classes_
    df_train.loc[:, column] = le.transform(df_train.loc[:, column])
    col_mapper.update({column: le})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train.loc[:, column] = le.transform(df_train.loc[:, column])


In [13]:
def pre_process_data(df, label_encoder_dict):
    
    for col in df.columns:
        if col in list(label_encoder_dict.keys()):
            column_le = label_encoder_dict[col]
            df.loc[:, col] = column_le.transform(df.loc[:, col])
        else:
            continue

    return df

In [14]:
# splitting into X and Y
x_train = df_train.drop("ha_prediction", axis=1)
y_train = df_train.loc[:, "ha_prediction"]

# fitting model
model = RandomForestClassifier(criterion = "gini", min_samples_split=20, n_estimators=200)
model.fit(x_train, y_train)

In [15]:
# split validation set 
x_val = df_val.drop("ha_prediction", axis=1)
y_val = df_val.loc[:, "ha_prediction"]

# predicting on validation
predictions = model.predict(x_val)
precision, recall, fscore, support = precision_recall_fscore_support(y_val, predictions)
accuracy = accuracy_score(y_val, predictions)
print(f"Validation accuracy is: {round(accuracy, 3)}")

Validation accuracy is: 0.6


  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# pickling mdl

pickler = open("heart_attack_prediction_model.pkl", "wb")
pickle.dump(model, pickler)
pickler.close()

In [17]:
# pickling label encoder dict 
pickler = open("heart_attack_prediction_label_encoders.pkl", "wb")
pickle.dump(col_mapper, pickler)
pickler.close()
