In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import pandas as pd
import numpy as np

In [2]:
file_path = "pima-indians-diabetes.csv"
df =pd.read_csv('pima-indians-diabetes.csv',names=['Pregnancies','Glucose',
    'BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age', 'Class'])

In [3]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Class'],
      dtype='object')

### Input features (feature) & convert to Array

In [4]:
df.columns[0:8]

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [5]:
data = df[['Pregnancies','Glucose',
    'BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']].to_numpy()

### Output variable (the Class label) & convert to Array

In [6]:
df.columns[7]

'Age'

In [7]:
target = df["Class"].to_numpy()

In [8]:
X = data
Y = target

### Examine the data dimension

In [9]:
X.shape

(768, 8)

In [10]:
Y.shape

(768,)

## Build Classification Model using Random Forest

In [11]:
clf = RandomForestClassifier()

In [12]:
clf.fit(X, Y)

RandomForestClassifier()

## Feature Importance

In [13]:
print(clf.feature_importances_)

[0.08479541 0.24944367 0.08809044 0.06972875 0.07286709 0.17019002
 0.12495305 0.13993156]


## Make Prediction

In [14]:
X[0]

array([  6.   , 148.   ,  72.   ,  35.   ,   0.   ,  33.6  ,   0.627,
        50.   ])

In [15]:
print(clf.predict([[10,348,92,35,0,33.6,0.712,50]]))

[1]


In [16]:
print(clf.predict(X[[0]]))

[1]


In [17]:
print(clf.predict_proba(X[[0]]))

[[0.21 0.79]]


In [18]:
clf.fit(data, target)

RandomForestClassifier()

## Data split (80/20 ratio)

In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [20]:
X_train.shape, Y_train.shape

((614, 8), (614,))

In [21]:
X_test.shape, Y_test.shape

((154, 8), (154,))

## Rebuild the Random Forest Model

In [22]:
clf.fit(X_train, Y_train)

RandomForestClassifier()

### 9.1. Performs prediction on single sample from the data set

In [23]:
print(clf.predict([[10,348,92,35,0,33.6,0.712,50]]))

[1]


In [24]:
print(clf.predict_proba([[10,348,92,35,0,33.6,0.712,50]]))

[[0.24 0.76]]


### 9.2. Performs prediction on the test set

#### *Predicted class labels*

In [25]:
print(clf.predict(X_test))

[0 1 0 1 0 0 1 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0
 1 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 1 0 0
 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0
 1 1 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0
 0 0 0 0 0 1]


#### *Actual class labels*

In [26]:
print(Y_test)

[1 1 1 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 1 1 0
 1 0 0 1 1 0 0 1 1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 1 0 1 0 1 0 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 1 0 1 0 0 0 1 0 0 1 0 1 0 0 1 0 1
 0 1 0 1 0 1 0 0 1 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0
 0 0 0 1 1 1]


## 10. Model Performance

In [27]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train, Y_train)
print('Accuracy of RandomForest classifier on training set: {:.2f}'
     .format(clf.score(X_train, Y_train)))
print('Accuracy of classifier on training set: {:.2f}'
     .format(clf.score(X_test, Y_test)))

Accuracy of RandomForest classifier on training set: 1.00
Accuracy of classifier on training set: 0.74


In [28]:
print(clf.predict([[10,348,92,35,0,33.6,0.712,50]]))

[1]


In [29]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, Y_train)
print('Accuracy of LDA classifier on training set: {:.2f}'
     .format(lda.score(X_train, Y_train)))
print('Accuracy of LDA classifier on training set: {:.2f}'
     .format(lda.score(X_test, Y_test)))

Accuracy of LDA classifier on training set: 0.78
Accuracy of LDA classifier on training set: 0.75


In [30]:
print(lda.predict([[10,348,92,35,0,33.6,0.712,50]]))

[1]


In [31]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, Y_train)
print('Accuracy of GNB classifier on training set: {:.2f}'
     .format(gnb.score(X_train, Y_train)))
print('Accuracy of GNB classifier on training set: {:.2f}'
     .format(gnb.score(X_test, Y_test)))

Accuracy of GNB classifier on training set: 0.77
Accuracy of GNB classifier on training set: 0.73


In [32]:
print(gnb.predict([[10,348,92,35,0,33.6,0.712,50]]))

[1]


In [33]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train, Y_train)))
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_test, Y_test)))

Accuracy of K-NN classifier on training set: 0.81
Accuracy of K-NN classifier on training set: 0.70


In [34]:
print(knn.predict([[10,348,92,35,0,33.6,0.712,50]]))

[1]
