In [1]:
import numpy as np
import pandas as pd
from scipy.spatial import distance
import heapq as hq
import math
import sklearn
from sklearn import preprocessing 

### Data Import

In [2]:
def ConvertToLabel(df,pos):
    label_encoder = preprocessing.LabelEncoder() 
    df[pos]= label_encoder.fit_transform(df[pos]) 
    return df

def chunks(df, n):
    for i in range(0, len(df), n):
        yield df[i:i + n]

### Data Import

In [3]:
X_1 = pd.read_csv("project3_dataset1.txt",header=None, delimiter="\t")
XShape, YShape = X_1.shape
print('X Shape:' + str(XShape) + ' Y Shape: '+ str(YShape))
Y_1 = X_1[YShape-1]
X_1 = X_1.iloc[:, :YShape-1]
X_1 = X_1.to_numpy()
Y_1 = Y_1.to_numpy()
X_1 = sklearn.preprocessing.normalize(X_1)

X Shape:569 Y Shape: 31


In [4]:
X_2 = pd.read_csv("project3_dataset2.txt",header=None, delimiter="\t")
XShape, YShape = X_2.shape
print('X Shape:' + str(XShape) + ' Y Shape: '+ str(YShape))

Y_2 = X_2[YShape-1]
X_2 = ConvertToLabel(X_2,4)
X_2 = X_2.iloc[:, :YShape-1]
X_2 = X_2.to_numpy()
Y_2 = Y_2.to_numpy()
X_2 = sklearn.preprocessing.normalize(X_2)

X Shape:462 Y Shape: 10


## Library Implementation

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_1, Y_1, test_size=0.2)

In [7]:
knn = KNeighborsClassifier(n_neighbors = 6)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9385964912280702


Data Set 2

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_2, Y_2, test_size=0.3)

In [9]:
knn = KNeighborsClassifier(n_neighbors = 11)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6546762589928058


## Scratch Implementation

In [10]:
def knn(train_x, train_y, test_x, k=5):
    predicted = []
    for j,row in enumerate(test_x):
        h = []
        for i,pt in enumerate(train_x):
            d = distance.euclidean(pt,row)
            hq.heappush(h,(d,i))
        h = h[:k]
        labels = [train_y[l[1]] for l in h]
        predicted.append(0 if labels.count(0) > labels.count(1) else 1)
    return predicted

def findPredictedClass(predicted_labels,test_y):
    tp,tn,fp,fn = 0,0,0,0
    for i in range(len(predicted_labels)):
        if predicted_labels[i] == 1 and test_y[i] == 1:
            tp+=1    
        if predicted_labels[i] == 0 and test_y[i] == 0:
            tn+=1
        if predicted_labels[i] == 1 and test_y[i] == 0:
            fp+=1
        if predicted_labels[i] == 0 and test_y[i] == 1:
            fn+=1
    accuracy = (tp + tn)/(tp + tn + fp + fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    fscore = 2 * (precision * recall)/(precision + recall)
    return accuracy,precision,recall,fscore

In [11]:
def dataProcessing(X,Y):
    data = np.column_stack((X, Y))
    np.random.shuffle(data)
    RowCount = math.ceil(len(X)/10)
    output = list(chunks(data,RowCount))
    print('Output Count:'+str(len(output))+' Output X Shape:'+str(len(output[0]))+' Output Y Shape:'+str(len(output[0][0])))
    return output

In [12]:
def knnExecution(output, k):
    totalAccuracy, totalPrecision, totalRecall, totalFscore = [],[],[],[]
    for i in range(len(output)):
        test_x,test_y,train_x, train_y = [], [], [], []
        train_x = [row for j,lt in enumerate(output) if j!=i for row in lt]
        test_x = output[i]
        for arr in test_x:
            test_y.append(arr[-1])
        for arr in train_x:
            train_y.append(arr[-1])
        train_x = np.delete(train_x, np.s_[-1:], axis=1)
        test_x = np.delete(test_x, np.s_[-1:], axis=1)

        predicted_labels = knn(train_x, train_y, test_x, k)
        accuracy, precision, recall, fscore = findPredictedClass(predicted_labels,test_y)
        totalAccuracy.append(accuracy)
        totalPrecision.append(precision)
        totalRecall.append(recall)
        totalFscore.append(fscore)

    avgAccuracy  = np.mean(totalAccuracy)
    avgPrecision = np.mean(totalPrecision)
    avgRecall    = np.mean(totalRecall)
    avgFscore    = np.mean(totalFscore)  
    print('Avgerage Accuracy :'+ str(avgAccuracy))
    print('Avgerage Precision :'+ str(avgPrecision))
    print('Avgerage Recall :'+ str(avgRecall))
    print('Avgerage Fscore :'+ str(avgFscore))

In [18]:
output = dataProcessing(X_1, Y_1)
for i in range(3,10):
    print('i: '+str(i))
    knnExecution(output, i)
    print('-----------------------------------------------------')

Output Count:10 Output X Shape:57 Output Y Shape:31
i: 3
Avgerage Accuracy :0.9085213032581454
Avgerage Precision :0.9063637669706791
Avgerage Recall :0.8278067588729353
Avgerage Fscore :0.8636174084204195
-----------------------------------------------------
i: 4
Avgerage Accuracy :0.9085526315789474
Avgerage Precision :0.8683494842753156
Avgerage Recall :0.884617679433856
Avgerage Fscore :0.8744888414862981
-----------------------------------------------------
i: 5
Avgerage Accuracy :0.9208333333333334
Avgerage Precision :0.9216193633059632
Avgerage Recall :0.8512296322958088
Avgerage Fscore :0.88216263623435
-----------------------------------------------------
i: 6
Avgerage Accuracy :0.9085213032581455
Avgerage Precision :0.8819351073762837
Avgerage Recall :0.863451854518031
Avgerage Fscore :0.8699557274903567
-----------------------------------------------------
i: 7
Avgerage Accuracy :0.9190476190476191
Avgerage Precision :0.933073083778966
Avgerage Recall :0.8313151023812789
Avg

In [19]:
output_2 = dataProcessing(X_2, Y_2)
for i in range(2,15):
    print('i: '+str(i))
    knnExecution(output_2, i)
    print('-----------------------------------------------------')

Output Count:10 Output X Shape:47 Output Y Shape:10
i: 2
Avgerage Accuracy :0.5146753955264594
Avgerage Precision :0.3622828264680309
Avgerage Recall :0.5392763819427906
Avgerage Fscore :0.4301591290210216
-----------------------------------------------------
i: 3
Avgerage Accuracy :0.546644844517185
Avgerage Precision :0.28924408924408923
Avgerage Recall :0.22179027413934532
Avgerage Fscore :0.24691134427335987
-----------------------------------------------------
i: 4
Avgerage Accuracy :0.5576650300054556
Avgerage Precision :0.37443414322250634
Avgerage Recall :0.40808754068429
Avgerage Fscore :0.38371655045553477
-----------------------------------------------------
i: 5
Avgerage Accuracy :0.5858156028368795
Avgerage Precision :0.3452168746286393
Avgerage Recall :0.21806650392950702
Avgerage Fscore :0.2602050613348788
-----------------------------------------------------
i: 6
Avgerage Accuracy :0.5900709219858157
Avgerage Precision :0.4019994560822734
Avgerage Recall :0.379778088698

## Data Set 3

In [20]:
TrainData = pd.read_csv("project3_dataset3_train.txt",header=None, delimiter="\t")
XShape, YShape = TrainData.shape
print('X Shape:' + str(XShape) + ' Y Shape: '+ str(YShape))
TrainData_Y = TrainData[YShape-1]
TrainData_X = TrainData.iloc[:, :YShape-1]
TrainData_X = TrainData_X.to_numpy()
TrainData_Y = TrainData_Y.to_numpy()

X Shape:80 Y Shape: 5


In [21]:
TestData = pd.read_csv("project3_dataset3_test.txt",header=None, delimiter="\t")
XShape, YShape = TestData.shape
print('X Shape:' + str(XShape) + ' Y Shape: '+ str(YShape))
TestData_Y = TestData[YShape-1]
TestData_X = TestData.iloc[:, :YShape-1]
TestData_X = TestData_X.to_numpy()
TestData_Y = TestData_Y.to_numpy()

X Shape:20 Y Shape: 5


In [23]:
for i in range(2, 10):
    print('i: '+str(i))
    predicted_labels = knn(TrainData_X, TrainData_Y, TestData_X, i)
    accuracy, precision, recall, fscore = findPredictedClass(predicted_labels,TestData_Y)
    print(' Accuracy :'+ str(accuracy))
    print(' Precision :'+ str(precision))
    print(' Recall :'+ str(recall))
    print(' Fscore :'+ str(fscore))
    print('-----------------------------------------------------')

i: 2
 Accuracy :1.0
 Precision :1.0
 Recall :1.0
 Fscore :1.0
-----------------------------------------------------
i: 3
 Accuracy :1.0
 Precision :1.0
 Recall :1.0
 Fscore :1.0
-----------------------------------------------------
i: 4
 Accuracy :1.0
 Precision :1.0
 Recall :1.0
 Fscore :1.0
-----------------------------------------------------
i: 5
 Accuracy :0.95
 Precision :1.0
 Recall :0.9
 Fscore :0.9473684210526316
-----------------------------------------------------
i: 6
 Accuracy :0.95
 Precision :1.0
 Recall :0.9
 Fscore :0.9473684210526316
-----------------------------------------------------
i: 7
 Accuracy :0.95
 Precision :1.0
 Recall :0.9
 Fscore :0.9473684210526316
-----------------------------------------------------
i: 8
 Accuracy :1.0
 Precision :1.0
 Recall :1.0
 Fscore :1.0
-----------------------------------------------------
i: 9
 Accuracy :1.0
 Precision :1.0
 Recall :1.0
 Fscore :1.0
-----------------------------------------------------


## Data Set 4

In [24]:
Data4 = pd.read_csv("project3_dataset4.txt",header=None, delimiter="\t")

In [25]:
for i in range(0,4):
    Data4 = ConvertToLabel(Data4,i)

In [26]:
XShape, YShape = Data4.shape
print('X Shape:' + str(XShape) + ' Y Shape: '+ str(YShape))
Data4_Y = Data4[YShape-1]
Data4_X = Data4.iloc[:, :YShape-1]
Data4_X = Data4_X.to_numpy()
Data4_Y = Data4_Y.to_numpy()

X Shape:14 Y Shape: 5


In [30]:
X_train, X_test, y_train, y_test = train_test_split(Data4_X, Data4_Y, test_size=0.2)

In [31]:
for i in range(2, 10):
    print('i: '+str(i))
    predicted_labels = knn(X_train, y_train, X_test, i)
    accuracy, precision, recall, fscore = findPredictedClass(predicted_labels,y_test)
    print(' Accuracy :'+ str(accuracy))
    print(' Precision :'+ str(precision))
    print(' Recall :'+ str(recall))
    print(' Fscore :'+ str(fscore))
    print('-----------------------------------------------------')

i: 2
 Accuracy :0.6666666666666666
 Precision :0.6666666666666666
 Recall :1.0
 Fscore :0.8
-----------------------------------------------------
i: 3
 Accuracy :0.3333333333333333
 Precision :0.5
 Recall :0.5
 Fscore :0.5
-----------------------------------------------------
i: 4
 Accuracy :0.6666666666666666
 Precision :0.6666666666666666
 Recall :1.0
 Fscore :0.8
-----------------------------------------------------
i: 5
 Accuracy :0.6666666666666666
 Precision :0.6666666666666666
 Recall :1.0
 Fscore :0.8
-----------------------------------------------------
i: 6
 Accuracy :0.6666666666666666
 Precision :0.6666666666666666
 Recall :1.0
 Fscore :0.8
-----------------------------------------------------
i: 7
 Accuracy :0.6666666666666666
 Precision :0.6666666666666666
 Recall :1.0
 Fscore :0.8
-----------------------------------------------------
i: 8
 Accuracy :0.6666666666666666
 Precision :0.6666666666666666
 Recall :1.0
 Fscore :0.8
-----------------------------------------------