In [1]:
import numpy as np
import pandas as pd
from scipy.spatial import distance
import heapq as hq
import math
import sklearn
from sklearn import preprocessing 

### Data Import

In [2]:
def ConvertToLabel(df,pos):
    label_encoder = preprocessing.LabelEncoder() 
    df[pos]= label_encoder.fit_transform(df[pos]) 
    return df

def chunks(df, n):
    for i in range(0, len(df), n):
        yield df[i:i + n]

### Data Import

In [3]:
X_1 = pd.read_csv("project3_dataset1.txt",header=None, delimiter="\t")
XShape, YShape = X_1.shape
print('X Shape:' + str(XShape) + ' Y Shape: '+ str(YShape))
Y_1 = X_1[YShape-1]
X_1 = X_1.iloc[:, :YShape-1]
X_1 = X_1.to_numpy()
Y_1 = Y_1.to_numpy()
X_1 = sklearn.preprocessing.normalize(X_1)

X Shape:569 Y Shape: 31


In [4]:
X_2 = pd.read_csv("project3_dataset2.txt",header=None, delimiter="\t")
XShape, YShape = X_2.shape
print('X Shape:' + str(XShape) + ' Y Shape: '+ str(YShape))

Y_2 = X_2[YShape-1]
X_2 = ConvertToLabel(X_2,4)
X_2 = X_2.iloc[:, :YShape-1]
X_2 = X_2.to_numpy()
Y_2 = Y_2.to_numpy()
X_2 = sklearn.preprocessing.normalize(X_2)

X Shape:462 Y Shape: 10


## Library Implementation

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_1, Y_1, test_size=0.2)

In [7]:
knn = KNeighborsClassifier(n_neighbors = 6)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9210526315789473


Data Set 2

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_2, Y_2, test_size=0.3)

In [9]:
knn = KNeighborsClassifier(n_neighbors = 11)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6690647482014388


## Scratch Implementation

In [10]:
def knn(train_x, train_y, test_x, k=5):
    predicted = []
    for j,row in enumerate(test_x):
        h = []
        for i,pt in enumerate(train_x):
            d = distance.euclidean(pt,row)
            hq.heappush(h,(d,i))
        h = h[:k]
        labels = [train_y[l[1]] for l in h]
        predicted.append(0 if labels.count(0) > labels.count(1) else 1)
    return predicted

def findPredictedClass(predicted_labels,test_y):
    tp,tn,fp,fn = 0,0,0,0
    for i in range(len(predicted_labels)):
        if predicted_labels[i] == 1 and test_y[i] == 1:
            tp+=1    
        if predicted_labels[i] == 0 and test_y[i] == 0:
            tn+=1
        if predicted_labels[i] == 1 and test_y[i] == 0:
            fp+=1
        if predicted_labels[i] == 0 and test_y[i] == 1:
            fn+=1
    accuracy = (tp + tn)/(tp + tn + fp + fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    fscore = 2 * (precision * recall)/(precision + recall)
    return accuracy,precision,recall,fscore

In [11]:
def dataProcessing(X,Y):
    data = np.column_stack((X, Y))
    np.random.shuffle(data)
    RowCount = math.ceil(len(X)/10)
    output = list(chunks(data,RowCount))
    print('Output Count:'+str(len(output))+' Output X Shape:'+str(len(output[0]))+' Output Y Shape:'+str(len(output[0][0])))
    return output

In [12]:
def knnExecution(output, k):
    totalAccuracy, totalPrecision, totalRecall, totalFscore = [],[],[],[]
    for i in range(len(output)):
        test_x,test_y,train_x, train_y = [], [], [], []
        train_x = [row for j,lt in enumerate(output) if j!=i for row in lt]
        test_x = output[i]
        for arr in test_x:
            test_y.append(arr[-1])
        for arr in train_x:
            train_y.append(arr[-1])
        train_x = np.delete(train_x, np.s_[-1:], axis=1)
        test_x = np.delete(test_x, np.s_[-1:], axis=1)

        predicted_labels = knn(train_x, train_y, test_x, k)
        accuracy, precision, recall, fscore = findPredictedClass(predicted_labels,test_y)
        totalAccuracy.append(accuracy)
        totalPrecision.append(precision)
        totalRecall.append(recall)
        totalFscore.append(fscore)

    avgAccuracy  = np.mean(totalAccuracy)
    avgPrecision = np.mean(totalPrecision)
    avgRecall    = np.mean(totalRecall)
    avgFscore    = np.mean(totalFscore)  
    print('Avgerage Accuracy :'+ str(avgAccuracy))
    print('Avgerage Precision :'+ str(avgPrecision))
    print('Avgerage Recall :'+ str(avgRecall))
    print('Avgerage Fscore :'+ str(avgFscore))

In [13]:
output = dataProcessing(X_1, Y_1)
knnExecution(output, 5)

Output Count:10 Output X Shape:57 Output Y Shape:31
Avgerage Accuracy :0.9296679197994988
Avgerage Precision :0.9335805033173454
Avgerage Recall :0.8651590228567916
Avgerage Fscore :0.894920162322301


In [14]:
output_2 = dataProcessing(X_2, Y_2)
knnExecution(output_2, 11)

Output Count:10 Output X Shape:47 Output Y Shape:10
Avgerage Accuracy :0.6334424440807419
Avgerage Precision :0.4298448773448773
Avgerage Recall :0.2574431734416255
Avgerage Fscore :0.31146767408836373


## Data Set 3

In [33]:
TrainData = pd.read_csv("project3_dataset3_train.txt",header=None, delimiter="\t")
XShape, YShape = TrainData.shape
print('X Shape:' + str(XShape) + ' Y Shape: '+ str(YShape))
TrainData_Y = TrainData[YShape-1]
TrainData_X = TrainData.iloc[:, :YShape-1]
TrainData_X = TrainData_X.to_numpy()
TrainData_Y = TrainData_Y.to_numpy()
#TestData_X = sklearn.preprocessing.normalize(TestData_X)

X Shape:80 Y Shape: 5


In [34]:
TestData = pd.read_csv("project3_dataset3_test.txt",header=None, delimiter="\t")
XShape, YShape = TestData.shape
print('X Shape:' + str(XShape) + ' Y Shape: '+ str(YShape))
TestData_Y = TestData[YShape-1]
TestData_X = TestData.iloc[:, :YShape-1]
TestData_X = TestData_X.to_numpy()
TestData_Y = TestData_Y.to_numpy()

X Shape:20 Y Shape: 5


In [40]:
predicted_labels = knn(TrainData_X, TrainData_Y, TestData_X, 5)
accuracy, precision, recall, fscore = findPredictedClass(predicted_labels,TestData_Y)
print(' Accuracy :'+ str(accuracy))
print(' Precision :'+ str(precision))
print(' Recall :'+ str(recall))
print(' Fscore :'+ str(fscore))

 Accuracy :0.95
 Precision :1.0
 Recall :0.9
 Fscore :0.9473684210526316


## Data Set 4

In [45]:
Data4 = pd.read_csv("project3_dataset4.txt",header=None, delimiter="\t")

In [46]:
for i in range(0,4):
    Data4 = ConvertToLabel(Data4,i)

In [48]:
XShape, YShape = Data4.shape
print('X Shape:' + str(XShape) + ' Y Shape: '+ str(YShape))
Data4_Y = Data4[YShape-1]
Data4_X = Data4.iloc[:, :YShape-1]
Data4_X = Data4_X.to_numpy()
Data4_Y = Data4_Y.to_numpy()

X Shape:14 Y Shape: 5


In [53]:
X_train, X_test, y_train, y_test = train_test_split(Data4_X, Data4_Y, test_size=0.2)

In [54]:
predicted_labels = knn(X_train, y_train, X_test, 3)
accuracy, precision, recall, fscore = findPredictedClass(predicted_labels,y_test)
print(' Accuracy :'+ str(accuracy))
print(' Precision :'+ str(precision))
print(' Recall :'+ str(recall))
print(' Fscore :'+ str(fscore))

 Accuracy :0.6666666666666666
 Precision :1.0
 Recall :0.6666666666666666
 Fscore :0.8
