# Fit the model/algorithm on our data and use it to make predictions


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn

In [3]:
# Load the heart-disease CSV into a DataFrame, then preview its first five rows
heart_disease = pd.read_csv(filepath_or_buffer="resources/heart-disease.csv")
heart_disease.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


## 1. Fitting the model to the data
Different names you will see for the model's inputs and outputs:

* X = features, feature variables, data
* y = labels, targets, target variables

In [5]:
# Import the RandomForestClassifier estimator class
from sklearn.ensemble import RandomForestClassifier
# Import train test split
from sklearn.model_selection import train_test_split

# Seed NumPy's global random number generator before any random step runs
np.random.seed(42)

# Features: every column except "target". Labels: the "target" column.
X = heart_disease.drop(columns="target")
y = heart_disease["target"]

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Build a 100-tree random forest and train it on the training split
# (fit() returns the estimator itself, so the call can be chained)
clf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)

# Score the trained model on the held-out test split
clf.score(X=X_test, y=y_test)

0.8524590163934426

In [7]:
# Preview the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [9]:
# Preview the last five labels
y.tail(n=5)

298    0
299    0
300    0
301    0
302    0
Name: target, dtype: int64

# Random Forest model deep dive
These resources will help you understand what's happening inside the Random Forest models we've been using.

* <a href="https://en.wikipedia.org/wiki/Random_forest">Random Forest Wikipedia</a>
* <a href="https://simple.wikipedia.org/wiki/Random_forest">Random Forest Wikipedia (simple version)</a>
* <a href="https://www.kdnuggets.com/2016/12/random-forests-python.html">Random Forests in Python</a>
* <a href="https://willkoehrsen.github.io/machine%20learning/tutorial/an-implementation-and-explanation-of-the-random-forest-in-python/">An Implementation and Explanation of the Random Forest in Python by Will Koehrsen</a>

## 2. Make predictions using a machine learning model
2 ways to make predictions:

* predict()
* predict_proba()

In [10]:
# Preview the first five rows of the held-out test features
X_test.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
179,57,1,0,150,276,0,0,112,1,0.6,1,1,1
228,59,1,3,170,288,0,0,159,0,0.2,1,0,3
111,57,1,2,150,126,1,1,173,0,0.2,2,1,3
246,56,0,0,134,409,0,0,150,1,1.9,1,2,3
60,71,0,2,110,265,1,0,130,0,0.0,2,1,2


### 2.1 Make predictions using predict function

In [11]:
# Predict a class label for every row of the test features
clf.predict(X=X_test)

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])

In [13]:
# Show the true test labels as a plain NumPy array, for side-by-side comparison with the predictions
y_test.to_numpy()

array([0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])

In [15]:
# Manual evaluation: predict on the test features, mark which predictions
# match the true labels, and take the fraction that match
y_preds = clf.predict(X_test)
is_correct = y_preds == y_test
is_correct.mean()

0.8524590163934426

In [17]:
# The manual result above is the same as the earlier score
clf.score(X=X_test, y=y_test)

0.8524590163934426

In [19]:
# A third way to get the same number: scikit-learn's accuracy_score metric
from sklearn.metrics import accuracy_score
accuracy_score(y_true=y_test, y_pred=y_preds)

0.8524590163934426

### 2.2 Make predictions using predict_proba function

* Make predictions with predict_proba() - use this if someone asks you "what's the probability your model is assigning to each prediction?"