In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# Load the CSV; header=None means the file has no header row, so pandas
# labels the columns with integers.
data = pd.read_csv("pima-indians-diabetes.data.csv", header=None)

In [3]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Every column except the last is a feature; the last column is the label.
X = data.iloc[:, :-1]
# Slicing with -1: (rather than -1) keeps y as a one-column DataFrame.
y = data.iloc[:, -1:]
# Distinct label values, in order of first appearance.
classes = y.iloc[:, 0].unique()

In [5]:
# Preview the feature columns.
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [6]:
# Preview the label column.
y.head()

Unnamed: 0,8
0,1
1,0
2,1
3,0
4,1


In [7]:
def gaussianProbability(x, mean, std):
    """Evaluate the normal probability density at ``x``.

    Args:
        x: Point at which to evaluate the density.
        mean: Mean of the distribution.
        std: Standard deviation of the distribution (must be non-zero).

    Returns:
        The density ``exp(-(x - mean)^2 / (2 std^2)) / (sqrt(2 pi) std)``.
    """
    exponent = math.pow(x - mean, 2) / (2 * std * std)
    normaliser = math.sqrt(2 * math.pi) * std
    return 1 / normaliser * math.exp(-exponent)

In [8]:
# Number of samples (rows) in the feature frame.
n = X.shape[0]
n

768

In [9]:
# Split sizes: 70% of the rows (rounded up) for training, the remainder for testing.
train_len = math.ceil(n * 0.7)
test_len = n - train_len

In [10]:
# Training set: the first train_len rows, taken positionally (no shuffling).
X_train = data.head(train_len).iloc[:, :-1]
y_train = data.head(train_len).iloc[:, 8]

In [11]:
# Per-class training statistics used by the classifier:
#   'data'     - the training rows belonging to the class
#   'prior_pb' - fraction of training rows belonging to the class
#   'mean'     - per-feature mean of those rows
#   'std'      - per-feature sample standard deviation of those rows
classes_data = {}
for _class in classes:
    mask = y_train == _class
    features = X_train.loc[mask]
    # Use the row count for the prior. `features.count()[0]` counted only the
    # non-null values of the column labelled 0, which undercounts when that
    # column has missing values and fails when no column is labelled 0.
    prior_pb = len(features) / len(X_train)

    classes_data[_class] = {
        'data': features,
        'prior_pb': prior_pb,
        'mean': features.mean(),
        'std': features.std()
    }

In [12]:
def predict_class(classes_data, input_vec):
    """Return the most probable class label for one sample (Gaussian Naive Bayes).

    Each class is scored as ``log(prior) + sum_i log N(x_i | mean_i, std_i)``.
    Working in log space gives the same winner as multiplying the densities,
    but does not underflow to 0 for every class when there are many features
    or very unlikely values.

    Args:
        classes_data: Mapping ``label -> stats`` where ``stats`` has the keys
            ``'prior_pb'`` (scalar), ``'mean'`` and ``'std'`` (one value per
            feature, in the same order as ``input_vec``; Series, array or list).
        input_vec: Feature values of the sample to classify.

    Returns:
        The key of ``classes_data`` with the highest score. Labels may be any
        hashable value; they no longer need to be the integers ``0..k-1``.
        Returns ``None`` if ``classes_data`` is empty.

    Note:
        A zero standard deviation produces non-finite scores for that class.
    """
    x = np.asarray(input_vec, dtype=float)
    log_norm_const = -0.5 * math.log(2 * math.pi)

    best_label, best_score = None, -math.inf
    for label, stats in classes_data.items():
        mean = np.asarray(stats['mean'], dtype=float)
        std = np.asarray(stats['std'], dtype=float)

        # Sum of per-feature Gaussian log-densities.
        log_likelihood = np.sum(
            log_norm_const - np.log(std) - (x - mean) ** 2 / (2 * std ** 2)
        )
        score = np.log(stats['prior_pb']) + log_likelihood

        # The first label is always accepted so a valid key is returned even
        # when every score is -inf or NaN.
        if best_label is None or score > best_score:
            best_label, best_score = label, score

    return best_label

In [13]:
# Test set: every row after the first train_len rows.
X_test = data.iloc[train_len:].iloc[:, :-1]
y_test = data.iloc[train_len:].iloc[:, 8]

In [14]:
def predict_data(X):
    """Classify every row of ``X``.

    Each row is passed as a NumPy array to ``predict_class`` together with the
    module-level ``classes_data``.

    Args:
        X: DataFrame with one sample per row.

    Returns:
        List of predictions, one per row, in row order.
    """
    return [
        predict_class(classes_data, X.iloc[row].values)
        for row in range(len(X))
    ]

In [15]:
def accuracy(y, pred):
    """Return the percentage of predictions that match the true labels.

    The comparison is positional: both inputs are converted to flat arrays
    first. This means plain lists work, and two pandas objects with different
    indexes are compared element by element instead of raising on alignment.

    Args:
        y: True labels (list, array, Series or single-column DataFrame).
        pred: Predicted labels, same length as ``y``.

    Returns:
        Accuracy as a float in ``[0, 100]`` (NaN for empty input).

    Raises:
        ValueError: If ``y`` and ``pred`` do not have the same number of elements.
    """
    actual = np.asarray(y).ravel()
    predicted = np.asarray(pred).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            "y and pred must have the same length, got %d and %d"
            % (actual.size, predicted.size)
        )
    return (actual == predicted).mean() * 100

In [18]:
# Predict on both splits so training and test accuracy can be compared.
prediction_on_train = predict_data(X_train)
prediction_on_test = predict_data(X_test)

In [20]:
# Accuracy (in percent) on the rows the statistics were estimated from.
accuracy(y_train, prediction_on_train)

76.76579925650557

In [21]:
# Accuracy (in percent) on the held-out rows.
accuracy(y_test, prediction_on_test)

76.95652173913044