In [141]:
import pandas as pd
import numpy as np
import math
import operator
from sklearn import preprocessing

## Handling Categorical values

In [142]:
def handleCat(dataframe):
    for x in range(dataframe.shape[1]):
            if(dataframe[x].dtypes == 'object'):
                dataframe[x] = dataframe[x].astype('category')
                dataframe[x] = dataframe[x].cat.codes
    dataframe_orig = dataframe
    dataframe = pd.DataFrame(preprocessing.normalize(dataframe.iloc[:,0:len(dataframe.columns)-1]))
    dataframe[len(dataframe.columns)] = dataframe_orig.iloc[:,len(dataframe.columns):]
    return dataframe

In [143]:
dataframe.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,1
1,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244,1
2,11.26,19.96,73.72,394.1,0.0802,0.1181,0.09274,0.05588,0.2595,0.06233,...,22.33,78.27,437.6,0.1028,0.1843,0.1546,0.09314,0.2955,0.07009,0
3,11.43,15.39,73.06,399.8,0.09639,0.06889,0.03503,0.02875,0.1734,0.05865,...,22.02,79.93,462.0,0.119,0.1648,0.1399,0.08476,0.2676,0.06765,0
4,14.61,15.69,92.68,664.9,0.07618,0.03515,0.01447,0.01877,0.1632,0.05255,...,21.75,103.7,840.8,0.1011,0.07087,0.04746,0.05813,0.253,0.05695,0
5,15.34,14.26,102.5,704.4,0.1073,0.2135,0.2077,0.09756,0.2521,0.07032,...,19.08,125.1,980.9,0.139,0.5954,0.6305,0.2393,0.4667,0.09946,1
6,11.89,17.36,76.2,435.6,0.1225,0.0721,0.05929,0.07404,0.2015,0.05875,...,18.99,79.46,472.4,0.1359,0.08368,0.07153,0.08946,0.222,0.06033,0
7,13.75,23.77,88.54,590.0,0.08043,0.06807,0.04697,0.02344,0.1773,0.05429,...,26.34,98.0,706.0,0.09368,0.1442,0.1359,0.06106,0.2663,0.06321,0
8,15.08,25.74,98.0,716.6,0.1024,0.09769,0.1235,0.06553,0.1647,0.06464,...,33.22,121.2,1050.0,0.166,0.2356,0.4029,0.1526,0.2654,0.09438,1
9,19.27,26.47,127.9,1162.0,0.09401,0.1719,0.1657,0.07593,0.1853,0.06261,...,30.9,161.4,1813.0,0.1509,0.659,0.6091,0.1785,0.3672,0.1123,1


## Data Preparation

In [144]:
def prepareData(k,dataframe,fold_no):
    length = int(len(dataframe)/k)
    test = pd.DataFrame(dataframe,index=range(fold_no*length,fold_no*length + length))
    train = dataframe.loc[~dataframe.index.isin(test.index)]
    return train,test   

## Compute the distance 

In [145]:
def calcDist(testDataPoint,trainDataPoint):
    dist =0
    for x in range(len(trainDataPoint)-1):
        #print(len(trainDataPoint))
        #print(testDataPoint[x],trainDataPoint[x])
        dist += pow((testDataPoint[x] - trainDataPoint[x]),2)
    dist = math.sqrt(dist)
   # print(dist)
    return dist

## Get the neighbors

In [146]:
def getNeighbors(train, testDataPoint, k):
    distances = []
    for x in range(len(train)):
        dist = calcDist(testDataPoint, train[x])
        distances.append((train[x], dist))
#     print(distances)
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors

## Assign the neighbors

In [147]:
def getResponse(neighbors):
    assignZero = 0
    assignOne = 0
    for x in range(len(neighbors)):
        label = neighbors[x][-1]
        if (label == 0):
            assignZero += 1
        elif (label == 1):
            assignOne += 1              
    if (assignOne > assignZero):
        assignOne = 1
        return assignOne
    else:
        assignZero = 0
        return assignZero

## Build Confusion matrix

In [148]:
def buildConfusionMatrix(test, testSet):
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for x in range(len(test) - 1):
        if (test[x][-1] == testSet[x][-1]) and test[x][-1] == 1:
            TP += 1
        elif test[x][-1] == testSet[x][-1] and test[x][-1] == 0:
            TN += 1
        elif test[x][-1] != testSet[x][-1] and test[x][-1] == 1:
            FN += 1
        else:
            FP += 1
    return ([TP, TN, FP, FN])

## Calculating accuracy

In [149]:
def calcScores(TP,TN,FP,FN):
    accuracy = (float(TP) + TN) / (TP + TN + FP + FN)
    precision = (float(TP) / (TP + FP))
    f_score = (2.0 * TP) / (2 * TP + FP + FN)
    recall = (float(TP)) / (TP + FN)
    scores = [accuracy,precision,f_score,recall]
    print("Accuracy: " + str(accuracy) + "\tPrecision: " + str(precision))
    print("F-Score: " + str(f_score) + "\tRecall: " + str(recall)+ "\n")
    return scores

## Putting all together

In [150]:
def main():
#     trainSet = [[2, 2, 2, 0], [4, 4, 4, 1]]
#     testSet = [[5, 5, 5],[6, 6, 6]]
    demo =1 # 0 for the demo
    fold_no = 10 # 1 for the demo
    filename = 'project3_dataset2.txt'
    dataframe = pd.read_csv(filename,delimiter='\t',header=None)
    dataframe = handleCat(dataframe)
    folds_scores = dict()
    counter = 0 
    k = int(math.sqrt(dataframe.shape[1]-1))
    if (k%2 ==0):
        k=k+1
    print("K-Value is: "+ str(k))
    for x in range(fold_no):
        if(demo==0):
            filename1 = 'project3_dataset3_train.txt'
            filename2 = 'project3_dataset3_test.txt'
            trainSet = pd.read_csv(filename1,delimiter='\t',header=None)
            testSet = pd.read_csv(filename2,delimiter='\t',header=None)
            k = int(math.sqrt(trainSet.shape[1]-1))
            if (k%2 ==0):
                k=k+1
            print('Updated K-value is ' + str(k))
        else:
            trainSet,testSet = prepareData(fold_no,dataframe,x)        
        trainSet = trainSet.values.tolist()
        testSet = testSet.values.tolist()
        test = pd.DataFrame(testSet)
        test = test.values.tolist()
        print('Train set: ' + repr(len(trainSet)))
        print('Test set: ' + repr(len(testSet)) + '\n')

        for y in range(len(testSet)):
            neighbors = getNeighbors(trainSet, testSet[y], k)
            result = getResponse(neighbors)
#             print('testSet[x][-1]: ' + repr(testSet[x][-1]))
            testSet[y][-1]=result
#             print('testSet[x][-1]: ' + repr(testSet[x][-1]))
#             print('--------------')
        [TP, TN, FP, FN] = buildConfusionMatrix(test, testSet)
        print('Scores for fold ' + str(x+1) + ' are: \n')
        folds_scores[counter] = calcScores(TP,TN,FP,FN)
        counter += 1
    sumAccuracy = 0
    sumPrecision = 0
    sumFScore = 0 
    sumRecall = 0
    for z in range(counter):
        sumAccuracy += folds_scores[z][0]
        sumPrecision += folds_scores[z][1]
        sumFScore += folds_scores[z][2]
        sumRecall += folds_scores[z][3]
    print('Average accuracy is: ' + repr(sumAccuracy/fold_no))
    print('Average precision is: ' + repr(sumPrecision/fold_no))
    print('Average f-score is: ' + repr(sumFScore/fold_no))
    print('Average recall is: ' + repr(sumRecall/fold_no))
    return test,testSet

In [151]:
test,modifiedTest = main()

K-Value is: 3
Train set: 416
Test set: 46

Scores for fold 1 are: 

Accuracy: 0.4888888888888889	Precision: 0.42857142857142855
F-Score: 0.34285714285714286	Recall: 0.2857142857142857

Train set: 416
Test set: 46

Scores for fold 2 are: 

Accuracy: 0.6	Precision: 0.23076923076923078
F-Score: 0.25	Recall: 0.2727272727272727

Train set: 416
Test set: 46

Scores for fold 3 are: 

Accuracy: 0.6666666666666666	Precision: 0.6428571428571429
F-Score: 0.5454545454545454	Recall: 0.47368421052631576

Train set: 416
Test set: 46

Scores for fold 4 are: 

Accuracy: 0.6222222222222222	Precision: 0.42857142857142855
F-Score: 0.41379310344827586	Recall: 0.4

Train set: 416
Test set: 46

Scores for fold 5 are: 

Accuracy: 0.5777777777777777	Precision: 0.5555555555555556
F-Score: 0.3448275862068966	Recall: 0.25

Train set: 416
Test set: 46

Scores for fold 6 are: 

Accuracy: 0.35555555555555557	Precision: 0.07692307692307693
F-Score: 0.06451612903225806	Recall: 0.05555555555555555

Train set: 416
Test 