In [1]:
# Standard imports
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

## 1. Get the data ready 

In [3]:
df = pd.read_csv('heart-disease.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
X = df.drop("target",axis=1)

y = df["target"]

In [20]:
# Split the data into training and test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y)

#view data shapes
X_train.shape, X_test.shape, y_train.shape, y_test.shape

X_train.index
X_train.iloc[3]

age          61.0
sex           0.0
cp            0.0
trestbps    130.0
chol        330.0
fbs           0.0
restecg       0.0
thalach     169.0
exang         0.0
oldpeak       0.0
slope         2.0
ca            0.0
thal          2.0
Name: 182, dtype: float64

## 2. Choose the model/estimator


In [22]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

## 3. Fit the model to the data and use it to make a prodiction

In [25]:
model.fit(X_train,y_train)

In [27]:
y_preds = model.predict(X_test)
y_preds

array([0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 1], dtype=int64)

## 4. Evaluate the model 

In [30]:
# on the training set
model.score(X_train, y_train)

1.0

In [32]:
# On the test set
model.score(X_test,y_test)

0.868421052631579

# 5. Experiment to improve

In [35]:
np.random.seed(42)
for i in range(10,100,10):
    print(f"Trying model with {i} estimators...")
    model = RandomForestClassifier(n_estimators=i)
    model.fit(X_train,y_train)
    print(f"Model accuracy {model.score(X_test,y_test)}")
    print("")

Trying model with 10 estimators...
Model accuracy 0.8552631578947368

Trying model with 20 estimators...
Model accuracy 0.8289473684210527

Trying model with 30 estimators...
Model accuracy 0.9078947368421053

Trying model with 40 estimators...
Model accuracy 0.8289473684210527

Trying model with 50 estimators...
Model accuracy 0.8157894736842105

Trying model with 60 estimators...
Model accuracy 0.8421052631578947

Trying model with 70 estimators...
Model accuracy 0.8026315789473685

Trying model with 80 estimators...
Model accuracy 0.7894736842105263

Trying model with 90 estimators...
Model accuracy 0.8157894736842105



In [36]:
from sklearn.model_selection import cross_val_score
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    model = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model accruacy on test set: {model.score(X_test, y_test)}")
    print(f"Cross-validation score: {np.mean(cross_val_score(model, X, y, cv=5)) * 100}%")
    print("")

Trying model with 10 estimators...
Model accruacy on test set: 0.8552631578947368
Cross-validation score: 78.53551912568305%

Trying model with 20 estimators...
Model accruacy on test set: 0.8157894736842105
Cross-validation score: 79.84699453551912%

Trying model with 30 estimators...
Model accruacy on test set: 0.8421052631578947
Cross-validation score: 80.50819672131148%

Trying model with 40 estimators...
Model accruacy on test set: 0.7894736842105263
Cross-validation score: 82.15300546448088%

Trying model with 50 estimators...
Model accruacy on test set: 0.8026315789473685
Cross-validation score: 81.1639344262295%

Trying model with 60 estimators...
Model accruacy on test set: 0.8157894736842105
Cross-validation score: 83.47540983606557%

Trying model with 70 estimators...
Model accruacy on test set: 0.8289473684210527
Cross-validation score: 81.83060109289617%

Trying model with 80 estimators...
Model accruacy on test set: 0.8026315789473685
Cross-validation score: 82.8142076502

## 6. Save a model for later use 

In [38]:
import pickle

pickle.dump(model, open("random_forest_model_1.pkl","wb"))
print(X_test.index)

Index([180, 225, 156, 133, 219, 239, 241,  74, 242, 168, 208, 205, 267,  55,
       258, 167, 236, 297,  83, 185,  50, 203, 247,  59,  69,  89,  33, 273,
       260, 103, 231, 138, 271,  30,  36, 254, 173, 265, 204, 292,  63, 109,
       224, 268,  43, 227, 259,  40,  23, 206,  17, 282, 131, 136,  87, 216,
       140, 188, 220, 179,  78, 232,  90, 243, 144, 130, 240, 191, 290,  57,
        26,  15,  35, 264, 209, 146],
      dtype='int64')


In [39]:
# Load a saved model and make a prediction on a single example
loaded_model = pickle.load(open("random_forest_model_1.pkl", "rb"))
# loaded_model.predict(np.array(X_test.loc[181]).reshape(1, -1))


new_input = [[6.1, 0.0, 0.0, 4.7, 0.6, 2.1]]  # Example feature values
prediction = loaded_model.predict(new_input)
print(prediction)


age          61.0
sex           0.0
cp            0.0
trestbps    130.0
chol        330.0
fbs           0.0
restecg       0.0
thalach     169.0
exang         0.0
oldpeak       0.0
slope         2.0
ca            0.0
thal          2.0

KeyError: 181

In [None]:
## check with other sample
model.predict