In [1]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from libsvm.commonutil import svm_read_problem

In [2]:
# load data
x,y = svm_read_problem('diabetes_scale.txt')
diabetes = pd.DataFrame.from_dict(y)

# fill missing values with mean column values
diabetes = diabetes.fillna(method = 'ffill')
diabetes

Unnamed: 0,1,2,3,4,5,6,7,8
0,-0.294118,0.487437,0.180328,-0.292929,-1.000000,0.001490,-0.531170,-0.033333
1,-0.882353,-0.145729,0.081967,-0.414141,-1.000000,-0.207153,-0.766866,-0.666667
2,-0.058824,0.839196,0.049180,-1.000000,-1.000000,-0.305514,-0.492741,-0.633333
3,-0.882353,-0.105528,0.081967,-0.535354,-0.777778,-0.162444,-0.923997,-1.000000
4,-1.000000,0.376884,-0.344262,-0.292929,-0.602837,0.284650,0.887276,-0.600000
...,...,...,...,...,...,...,...,...
763,0.176471,0.015075,0.245902,-0.030303,-0.574468,-0.019374,-0.920581,0.400000
764,-0.764706,0.226131,0.147541,-0.454545,-1.000000,0.096870,-0.776260,-0.800000
765,-0.411765,0.216080,0.180328,-0.535354,-0.735225,-0.219076,-0.857387,-0.700000
766,-0.882353,0.266332,-0.016393,-1.000000,-1.000000,-0.102832,-0.768574,-0.133333


In [3]:
# split the dataset into train and test
x_train, x_test, y_train, y_test = train_test_split(diabetes, x, test_size=0.2, random_state=42)

# check if there is any nan value in the dataset
np.isnan(x_train).any()

1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
dtype: bool

In [9]:
# Perceptron implementation from scratch 

class Perceptron:
    #constructor
    def __init__ (self, learning_rate=0.1, max_iter=1000):
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.activation = self.step_func
        self.weights = None
        self.bias = None
    
    # activation function: unit step
    def step_func(self, x):
        return np.where(x>=0, 1, -1)
    
    def fit(self, X, y):
        self.weights = np.zeros(X.shape[1])
        self.bias = 0 
        
        y_new = np.array([1 if i>0 else -1 for i in y])
        
        for i in range(self.max_iter):
            for j, x_i in enumerate(X):
                output = np.dot(x_i, self.weights) + self.bias
                y_pred = self.activation(output)
                
                update = self.learning_rate * (y_new[j] - y_pred)
                self.weights += update * x_i
                self.bias += update
    
    def predict(self, x):
        output = np.dot(x, self.weights) + self.bias
        y_pred = self.activation(output)
        return y_pred

    def accuracy_score(self, y_pred, y_true):
        sum_y = 0
        for i in range(len(y_pred)):
            if y_pred[i] == y_true[i]:
                sum_y += 1
        return sum_y/len(y_pred)

In [10]:
# implement cross validation
def cross_validation(X, y, k, model):
	n = len(X)
	fold_size = n // k
	accuracy = []
	for i in range(k):
		x_test = X[i*fold_size:(i+1)*fold_size]
		y_test = y[i*fold_size:(i+1)*fold_size]
		x_train = np.concatenate((X[:i*fold_size], X[(i+1)*fold_size:]), axis=0)
		y_train = np.concatenate((y[:i*fold_size], y[(i+1)*fold_size:]), axis=0)

		# fit model
		model.fit(x_train, y_train)
		y_pred = model.predict(x_test)
		accuracy.append(model.accuracy_score(y_pred, y_test))
	return accuracy, np.mean(accuracy)


In [11]:
# model initialisation
perceptron = Perceptron(learning_rate=0.01, max_iter=1000)

# apply k-fold cross-validation, select best K
for i in range(1, 10):
    accuracy, mean_accuracy = cross_validation(x_train, y_train, i, perceptron)
    print("K = ", i, "accuracy = ", accuracy, "mean accuracy = ", mean_accuracy)

#accuracy1 = cross_validation(x_train, y_train, 5, perceptron)
#print(accuracy)

K =  1 accuracy =  [0.6530944625407166] mean accuracy =  0.6530944625407166
K =  2 accuracy =  [0.7068403908794788, 0.762214983713355] mean accuracy =  0.7345276872964169
K =  3 accuracy =  [0.7794117647058824, 0.7696078431372549, 0.7450980392156863] mean accuracy =  0.7647058823529411
K =  4 accuracy =  [0.7712418300653595, 0.7189542483660131, 0.7777777777777778, 0.7450980392156863] mean accuracy =  0.7532679738562091
K =  5 accuracy =  [0.6311475409836066, 0.7377049180327869, 0.7540983606557377, 0.7377049180327869, 0.7213114754098361] mean accuracy =  0.7163934426229509
K =  6 accuracy =  [0.7058823529411765, 0.4803921568627451, 0.6568627450980392, 0.8431372549019608, 0.6862745098039216, 0.4803921568627451] mean accuracy =  0.6421568627450981
K =  7 accuracy =  [0.6551724137931034, 0.8390804597701149, 0.6896551724137931, 0.7701149425287356, 0.3793103448275862, 0.7471264367816092, 0.735632183908046] mean accuracy =  0.6880131362889983
K =  8 accuracy =  [0.6710526315789473, 0.69736842

In [7]:
# compare the result with sklearn Perceptron model

from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score

model = Perceptron(max_iter=1000)
model.fit(np.array(x_train), np.array(y_train))
y_pred = model.predict(np.array(x_test))

test_score = accuracy_score(y_pred, y_test)
test_score

0.7337662337662337