# Making predictions using a machine learning model
(Classification)


There are two ways to make predictions with a trained classifier: `predict()` and `predict_proba()`.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the heart-disease dataset from a CSV file located relative to the
# notebook's working directory, then display the resulting DataFrame.
heart_disease = pd.read_csv('Heart_Disease_Prediction.csv')
heart_disease

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,1,3,172,199,1,0,162,0,0.5,1,0,7,Absence
266,44,1,2,120,263,0,0,173,0,0.0,1,0,7,Absence
267,56,0,2,140,294,0,2,153,0,1.3,2,0,3,Absence
268,57,1,4,140,192,0,0,148,0,0.4,2,0,6,Absence


In [6]:
# Rename the label column 'Heart Disease' to 'Target'; later cells refer to it by that name.
heart_disease = heart_disease.rename(columns = {'Heart Disease': 'Target'})

In [7]:
# Encode the string labels as integers: "Presence" -> 1, "Absence" -> 0.
# Values that are not keys of the mapping (e.g. labels that were already encoded
# by a previous run of this cell) pass through unchanged, so re-running the cell
# is safe. A plain dict .map() would turn already-encoded values into NaN.
target_codes = {"Presence": 1, "Absence": 0}
heart_disease['Target'] = heart_disease['Target'].map(
    lambda label: target_codes.get(label, label)
)
heart_disease

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Target
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,1
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,0
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,1
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,0
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,1,3,172,199,1,0,162,0,0.5,1,0,7,0
266,44,1,2,120,263,0,0,173,0,0.0,1,0,7,0
267,56,0,2,140,294,0,2,153,0,1.3,2,0,3,0
268,57,1,4,140,192,0,0,148,0,0.4,2,0,6,0


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Seed NumPy's global random generator so the split and the forest are
# reproducible when the notebook is run top to bottom.
np.random.seed(0)

# Feature matrix (every column except the label) and label vector.
X = heart_disease.drop(columns='Target')
y = heart_disease['Target']

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit a random forest with default hyperparameters on the training split.
clf = RandomForestClassifier().fit(X_train, y_train)

# Accuracy on the held-out test split.
clf.score(X_test, y_test)

0.7962962962962963

In [10]:
# (rows, columns) of the held-out test features.
X_test.shape

(54, 13)

# Make predictions with predict()

In [11]:
#use a trained model to make predictions
# predict() expects a 2D array; a 1D array is rejected with a ValueError.
# The error is the point of this cell, so catch it and print the message instead
# of letting the exception stop a "Restart & Run All".
try:
    clf.predict(np.array([1,4,6,3])) #this doesnt work
except ValueError as err:
    print(f"ValueError: {err}")



ValueError: Expected 2D array, got 1D array instead:
array=[1. 4. 6. 3.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [12]:
# Predicted class label (0 or 1) for every row of the test features.
clf.predict(X_test)

array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 0], dtype=int64)

In [13]:
# True labels of the test split as a plain NumPy array, for side-by-side
# comparison with the predictions.
y_test.to_numpy()

array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 1], dtype=int64)

In [14]:
# Evaluate the model by hand: the fraction of test rows whose predicted label
# equals the true label is the accuracy.
y_preds = clf.predict(X_test)
(y_preds == y_test).mean()

0.7962962962962963

In [15]:
# The classifier's built-in score; it yields the same value as the manual
# mean of (y_preds == y_test).
clf.score(X_test, y_test)

0.7962962962962963

In [16]:
# Third way to get the same accuracy: scikit-learn's metric function, called
# with the true labels first and the predicted labels second.
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_preds)

0.7962962962962963

# Make predictions with predict_proba()


`predict_proba()` returns, for each sample, the estimated probability of every class label rather than a single predicted label.

In [17]:
# One row per test sample; each row holds one probability per class.
clf.predict_proba(X_test)

array([[0.6 , 0.4 ],
       [0.89, 0.11],
       [0.16, 0.84],
       [0.22, 0.78],
       [0.77, 0.23],
       [0.73, 0.27],
       [0.71, 0.29],
       [0.77, 0.23],
       [0.6 , 0.4 ],
       [0.68, 0.32],
       [0.75, 0.25],
       [0.51, 0.49],
       [0.66, 0.34],
       [0.11, 0.89],
       [0.06, 0.94],
       [0.86, 0.14],
       [0.94, 0.06],
       [0.35, 0.65],
       [0.3 , 0.7 ],
       [0.76, 0.24],
       [0.03, 0.97],
       [0.94, 0.06],
       [0.04, 0.96],
       [0.49, 0.51],
       [0.38, 0.62],
       [0.8 , 0.2 ],
       [0.83, 0.17],
       [0.91, 0.09],
       [0.51, 0.49],
       [0.56, 0.44],
       [0.72, 0.28],
       [0.48, 0.52],
       [0.02, 0.98],
       [0.99, 0.01],
       [0.11, 0.89],
       [0.52, 0.48],
       [0.05, 0.95],
       [0.14, 0.86],
       [1.  , 0.  ],
       [0.7 , 0.3 ],
       [0.48, 0.52],
       [0.94, 0.06],
       [0.93, 0.07],
       [0.12, 0.88],
       [0.89, 0.11],
       [0.73, 0.27],
       [0.24, 0.76],
       [0.7 ,

In [18]:
# Class probabilities for the first five test rows:
# a 2D array with five rows of two numbers each.
clf.predict_proba(X_test.iloc[:5])

array([[0.6 , 0.4 ],
       [0.89, 0.11],
       [0.16, 0.84],
       [0.22, 0.78],
       [0.77, 0.23]])

In [20]:
# Predicted labels for the same first five test rows:
# a single 1D array of five numbers.
clf.predict(X_test.iloc[:5])

array([0, 0, 1, 1, 0], dtype=int64)

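Each row of the `predict_proba()` output holds the probability of class 0 followed by the probability of class 1. `predict()` returns the class with the higher probability: 0 when the first number is larger than the second, and 1 when the second number is larger.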

In [21]:
# Class balance: number of rows for each Target value (1 = "Presence", 0 = "Absence").
heart_disease['Target'].value_counts()

0    150
1    120
Name: Target, dtype: int64