In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, HTML

########### Question 1 ###########
# Generate Data

def datagen(mvec,cov,n):
    return np.random.multivariate_normal(mvec,cov,n)
   

In [2]:
# load mean vectors + covariance matrix
data1 = pd.read_csv('hwk2_datasets/DS1_m_0.txt', sep=",", header=None)
data2 = pd.read_csv('hwk2_datasets/DS1_m_1.txt', sep=",", header=None)
data3 = pd.read_csv('hwk2_datasets/DS1_Cov.txt', sep=",", header=None)

#df1 = pd.DataFrame(data1)
#df2 = pd.DataFrame(data2)
#df3 = pd.DataFrame(data3)
#display(df1)
#display(df2)
#display(df3)

ms0 = data1.values
ms0 = np.delete(ms0,-1,1)
ms0 = np.squeeze(ms0)
ms1 = data2.values
ms1 = np.delete(ms1,-1,1)
ms1 = np.squeeze(ms1)
cov = data3.values
cov = np.delete(cov,-1,1)

negative = datagen(ms0,cov,2000)
positive = datagen(ms1,cov,2000)


# label as negative or positive (negative = -1000, positive = 1000)
nlabel = np.full((negative.shape[0],1),-1000)
plabel = np.full((positive.shape[0],1),1000)

negative = np.hstack((negative, nlabel))
positive = np.hstack((positive, plabel))

In [3]:
# separate training and test sets
np.random.shuffle(negative)
np.random.shuffle(positive)

DS1_train = np.concatenate((negative[600:],positive[600:]))
DS1_test = np.concatenate((negative[0:600],positive[0:600]))

np.savetxt("hwk2_datasets/DS1/DS1_train.csv",DS1_train,delimiter=",")
np.savetxt("hwk2_datasets/DS1/DS1_test.csv",DS1_test,delimiter=",")

In [4]:
########### Question 2 ###########
## LDA

# load data
data1 = pd.read_csv("hwk2_datasets/DS1/DS1_train.csv", header=None)
data2 = pd.read_csv("hwk2_datasets/DS1/DS1_test.csv", header=None)

train = data1.values
test = data2.values

np.random.shuffle(train)
np.random.shuffle(test)

# x and y values
x_train = train[:,0:-1]
y_train = train[:,-1]
x_test = test[:,0:-1]
y_test = test[:,-1]

In [5]:
# LDA analysis
# determine P(y1), P(y2), mean1, mean2, and covariance

# P(y)
pneg = y_train[y_train < 0].size / y_train.size
ppos = y_train[y_train > 0].size / y_train.size

# means
mneg = np.mean(x_train[np.where(y_train < 0)], axis=0)
mpos = np.mean(x_train[np.where(y_train > 0)], axis=0)

# covariance
cneg = np.divide(np.matmul(np.subtract(x_train[np.where(y_train < 0)], mneg).T,np.subtract(x_train[np.where(y_train < 0)], mneg)),(x_train.shape[0]))
cpos = np.divide(np.matmul(np.subtract(x_train[np.where(y_train > 0)], mpos).T,np.subtract(x_train[np.where(y_train > 0)], mpos)),(x_train.shape[0]))
tcov = np.add(cneg,cpos)

mresults_LDA = np.array((mneg,mpos))
df = pd.DataFrame(mresults_LDA)
df.index = ['Neg Mean', 'Pos Mean']
display(df)

cresults_LDA = np.array(tcov)
df = pd.DataFrame(cresults_LDA)
print('Covariance:')
display(df)

# save coefficients
name = 'hwk2_datasets/DS1/m0.csv'
np.savetxt(name, mneg, delimiter=',')

name = 'hwk2_datasets/DS1/m1.csv'
np.savetxt(name, mpos, delimiter=',')

name = 'hwk2_datasets/DS1/cov.csv'
np.savetxt(name, tcov, delimiter=',')

# decision on test dataset
decision_test = np.log(pneg) - np.log(ppos) - 0.5*np.matmul((np.matmul(mneg.T,np.linalg.inv(tcov))),mneg) + 0.5*np.matmul((np.matmul(mpos.T,np.linalg.inv(tcov))),mpos) + np.matmul(np.matmul(x_test,np.linalg.inv(tcov)),np.subtract(mneg,mpos))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Neg Mean,1.21746,1.210236,1.20092,1.216288,1.187325,1.224769,1.165722,1.232455,1.228645,1.166006,1.214454,1.232744,1.172405,1.243096,1.213131,1.180325,1.25389,1.222228,1.218916,1.180524
Pos Mean,2.116734,2.101175,2.039455,2.099158,2.138343,2.083632,2.076554,2.101983,2.046757,2.110409,2.103519,2.10661,2.083266,2.140545,2.092506,2.176062,2.081606,2.14419,2.117653,2.134117


Covariance:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,7.497198,5.300577,5.796607,4.914541,5.620579,5.818427,4.378803,5.071256,4.745825,4.964389,3.725209,5.053845,6.816327,5.683798,5.815961,5.697846,5.56902,5.463929,5.278768,5.734987
1,5.300577,6.615423,5.127035,4.219602,5.285787,5.291786,4.164979,3.722892,4.038139,4.832822,3.230749,4.475632,5.719167,4.879523,5.238374,5.028853,5.439381,4.936236,5.09147,5.183985
2,5.796607,5.127035,6.973305,4.684291,5.579782,6.265085,4.303052,4.505169,4.60457,4.807502,3.023433,4.464449,6.170026,4.868962,5.836574,5.660113,5.944674,4.744457,4.405508,4.954123
3,4.914541,4.219602,4.684291,5.633733,5.10348,4.363989,3.509713,4.153038,3.208623,4.020098,2.52845,4.053078,5.709109,4.53257,4.615258,4.877749,4.508401,4.402997,3.661073,5.685939
4,5.620579,5.285787,5.579782,5.10348,6.841359,5.188162,4.82534,4.091695,4.495342,4.88504,3.928982,4.836597,6.016318,5.563531,5.759613,5.91862,5.713484,4.952627,5.11534,5.595726
5,5.818427,5.291786,6.265085,4.363989,5.188162,6.4123,4.173129,4.636757,4.513193,5.074682,2.775842,4.635302,6.177343,4.720881,5.566605,5.751426,5.671995,4.764066,4.497831,5.087636
6,4.378803,4.164979,4.303052,3.509713,4.82534,4.173129,4.96794,3.528793,3.764045,4.084573,2.864728,4.119521,4.441454,3.812692,4.486338,4.988594,4.057955,3.404566,4.341035,3.74038
7,5.071256,3.722892,4.505169,4.153038,4.091695,4.636757,3.528793,5.683777,3.323426,4.607229,2.273842,4.655732,5.888773,4.871616,4.667461,5.90173,4.279551,4.706996,4.486241,4.353783
8,4.745825,4.038139,4.60457,3.208623,4.495342,4.513193,3.764045,3.323426,4.803368,4.484648,2.995611,3.923231,4.900663,4.459729,4.729241,4.910234,4.91277,3.902084,4.055295,3.938201
9,4.964389,4.832822,4.807502,4.020098,4.88504,5.074682,4.084573,4.607229,4.484648,6.791757,3.12704,4.535938,6.57393,4.911664,4.789787,5.728722,4.560987,5.066691,4.254275,5.16533


In [6]:
# test
decision_test[decision_test > 0] = 1
decision_test[decision_test < 0] = -1
decision_test[decision_test == 1] = -1000
decision_test[decision_test == -1] = 1000

y_tp = np.array(y_test)
y_tp[y_tp == 1000] = 5000
results_tp = y_tp - decision_test

y_fp = np.array(y_test)
y_fp[y_fp == 1000] = 0
results_fp = y_fp - decision_test

y_tn = np.array(y_test)
y_tn[y_tn == -1000] = -5000
results_tn = y_tn - decision_test

y_fn = np.array(y_test)
y_fn[y_fn == -1000] = 0
results_fn = y_fn - decision_test

In [7]:
# error measures

tp = (results_tp == 4000).sum(axis=0)
fp = (results_fp == -2000).sum(axis=0)
tn = (results_tn == -4000).sum(axis=0)
fn = (results_fn == 2000).sum(axis=0)

accuracy = (tp + tn) / (tp + fp + tn + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = fp / (fp + tn)

results = np.array((tp,fp,tn,fn,accuracy,precision, recall, f1))
df = pd.DataFrame(results)
df.columns = ['LDA']
df.index = ['tp','fp','tn','fn','accuracy','precision', 'recall','f1']
display(df)

Unnamed: 0,LDA
tp,572.0
fp,35.0
tn,565.0
fn,28.0
accuracy,0.9475
precision,0.942339
recall,0.953333
f1,0.058333


In [8]:
########### Question 3 ###########
## kNN

# load data
data1 = pd.read_csv("hwk2_datasets/DS1/DS1_train.csv", header=None)
data2 = pd.read_csv("hwk2_datasets/DS1/DS1_test.csv", header=None)

train = data1.values
test = data2.values

np.random.shuffle(train)
np.random.shuffle(test)

# x and y values
x_train = train[:,0:-1]
y_train = train[:,-1]
x_test = test[:,0:-1]
y_test = test[:,-1]

In [9]:
# distance function
def distance(x0, x):
    return np.sqrt(np.sum(np.square(x-x0)))

In [10]:
# kNN
knum_list = [1,3,5,15]  # k nearest neighbours list
test_results = np.empty(shape=(len(knum_list),1200))

for n in range(len(knum_list)):
    knum = knum_list[n]
    y_list = []
    for i in range(x_test.shape[0]):
        current = x_test[i]
        y_slist = []
        min_index = []
        index = 0

        for j in range(knum):
            dtemp = 1000
            for k in range(x_train.shape[0]):
                if k in min_index:
                    continue

                d = distance(current,x_train[k])

                if dtemp > d:
                    dtemp = d
                    index = k
                    continue

            min_index.append(index)
            y_slist.append(y_train[index])

        num = sum(y_slist)
        if num > 0:
            y_list.append(1000)
        else:
            y_list.append(-1000)
    
    y_list = np.array(y_list)
    test_results[n] = y_list

In [11]:
# test
results_tp = np.empty(shape=(len(knum_list),1200))
results_fp = np.empty(shape=(len(knum_list),1200))
results_tn = np.empty(shape=(len(knum_list),1200))
results_fn = np.empty(shape=(len(knum_list),1200))

for i in range(len(knum_list)):
    y_tp = np.array(y_test)
    y_tp[y_tp == 1000] = 5000
    results_tp[i] = y_tp - test_results[i]

    y_fp = np.array(y_test)
    y_fp[y_fp == 1000] = 0
    results_fp[i] = y_fp - test_results[i]

    y_tn = np.array(y_test)
    y_tn[y_tn == -1000] = -5000
    results_tn[i] = y_tn - test_results[i]

    y_fn = np.array(y_test)
    y_fn[y_fn == -1000] = 0
    results_fn[i] = y_fn - test_results[i]


In [12]:
# error measures
tp = []
fp = []
tn = []
fn = []

accuracy = []
precision = []
recall = []
f1 = []

for i in range(len(knum_list)):
    tp.append((results_tp[i] == 4000).sum(axis=0))
    fp.append((results_fp[i] == -2000).sum(axis=0))
    tn.append((results_tn[i] == -4000).sum(axis=0))
    fn.append((results_fn[i] == 2000).sum(axis=0))

    accuracy.append((tp[i] + tn[i]) / (tp[i] + fp[i] + tn[i] + fn[i]))
    precision.append(tp[i] / (tp[i] + fp[i]))
    recall.append(tp[i] / (tp[i] + fn[i]))
    f1.append(fp[i] / (fp[i] + tn[i]))

results = np.array((tp,fp,tn,fn,accuracy,precision,recall,f1))
df = pd.DataFrame(results)
df.columns=['k=1','k=3','k=5','k=15']
df.index=['tp','fp','tn','fn','accuracy','precision','recall','f1']
display(df)

Unnamed: 0,k=1,k=3,k=5,k=15
tp,321.0,304.0,329.0,340.0
fp,272.0,265.0,278.0,280.0
tn,328.0,335.0,322.0,320.0
fn,279.0,296.0,271.0,260.0
accuracy,0.540833,0.5325,0.5425,0.55
precision,0.541315,0.534271,0.54201,0.548387
recall,0.535,0.506667,0.548333,0.566667
f1,0.453333,0.441667,0.463333,0.466667


In [13]:
########### Question 4 ###########

# load data
data1 = pd.read_csv('hwk2_datasets/DS2_c1_m1.txt', sep=",", header=None)
data2 = pd.read_csv('hwk2_datasets/DS2_c1_m2.txt', sep=",", header=None)
data3 = pd.read_csv('hwk2_datasets/DS2_c1_m3.txt', sep=",", header=None)
data4 = pd.read_csv('hwk2_datasets/DS2_c2_m1.txt', sep=",", header=None)
data5 = pd.read_csv('hwk2_datasets/DS2_c2_m2.txt', sep=",", header=None)
data6 = pd.read_csv('hwk2_datasets/DS2_c2_m3.txt', sep=",", header=None)
data7 = pd.read_csv('hwk2_datasets/DS2_Cov1.txt', sep=",", header=None)
data8 = pd.read_csv('hwk2_datasets/DS2_Cov2.txt', sep=",", header=None)
data9 = pd.read_csv('hwk2_datasets/DS2_Cov3.txt', sep=",", header=None)

c1_m1 = data1.values
c1_m2 = data2.values
c1_m3 = data3.values
c2_m1 = data4.values
c2_m2 = data5.values
c2_m3 = data6.values
cov1 = data7.values
cov2 = data8.values
cov3 = data9.values

c1_m1 = np.delete(c1_m1,-1,1)
c1_m2 = np.delete(c1_m2,-1,1)
c1_m3 = np.delete(c1_m3,-1,1)
c2_m1 = np.delete(c2_m1,-1,1)
c2_m2 = np.delete(c2_m2,-1,1)
c2_m3 = np.delete(c2_m3,-1,1)
cov1 = np.delete(cov1,-1,1)
cov2 = np.delete(cov2,-1,1)
cov3 = np.delete(cov3,-1,1)

c1_m1 = np.squeeze(c1_m1)
c1_m2 = np.squeeze(c1_m2)
c1_m3 = np.squeeze(c1_m3)
c2_m1 = np.squeeze(c2_m1)
c2_m2 = np.squeeze(c2_m2)
c2_m3 = np.squeeze(c2_m3)
cov1 = np.squeeze(cov1)
cov2 = np.squeeze(cov2)
cov3 = np.squeeze(cov3)

In [14]:
# generate dataset 2
neg = np.empty(shape=(1,20))
pos = np.empty(shape=(1,20))

for i in range(2000):
    choice = np.random.choice(np.arange(1,4), p = [0.1, 0.42, 0.48])

    if choice == 1:
        neg = np.vstack((neg,datagen(c1_m1,cov1,1)))
    elif choice == 2:
        neg = np.vstack((neg,datagen(c1_m2,cov2,1)))
    elif choice == 3:
        neg = np.vstack((neg,datagen(c1_m3,cov3,1)))

for i in range(2000):
    choice = np.random.choice(np.arange(1,4), p = [0.1, 0.42, 0.48])
    
    if choice == 1:
        pos = np.vstack((pos,datagen(c2_m1,cov1,1)))
    elif choice == 2:
        pos = np.vstack((pos,datagen(c2_m2,cov2,1)))
    elif choice == 3:
        pos = np.vstack((pos,datagen(c2_m3,cov3,1)))

neg = np.delete(neg,(0),axis=0)
pos = np.delete(pos,(0),axis=0)

nlabel = np.full((neg.shape[0],1),-1000)
plabel = np.full((pos.shape[0],1),1000)

neg = np.hstack((neg, nlabel))
pos = np.hstack((pos, plabel))

In [15]:
# save data
np.random.shuffle(neg)
np.random.shuffle(pos)

DS2_train = np.concatenate((neg[600:],pos[600:]))
DS2_test = np.concatenate((neg[0:600],pos[0:600]))

np.savetxt("hwk2_datasets/DS2/DS2_train.csv",DS2_train,delimiter=",")
np.savetxt("hwk2_datasets/DS2/DS2_test.csv",DS2_test,delimiter=",")

In [16]:
########### Question 5 ###########

# load data
data3 = pd.read_csv("hwk2_datasets/DS2/DS2_train.csv", header=None)
data4 = pd.read_csv("hwk2_datasets/DS2/DS2_test.csv", header=None)

train = data3.values
test = data4.values

np.random.shuffle(train)
np.random.shuffle(test)

# x and y values
x_train = train[:,0:-1]
y_train = train[:,-1]
x_test = test[:,0:-1]
y_test = test[:,-1]

In [17]:
# LDA analysis
# determine P(y1), P(y2), mean1, mean2, and covariance

# P(y)
pneg2 = y_train[y_train < 0].size / y_train.size
ppos2 = y_train[y_train > 0].size / y_train.size

# means
mneg2 = np.mean(x_train[np.where(y_train < 0)], axis=0)
mpos2 = np.mean(x_train[np.where(y_train > 0)], axis=0)

# covariance
cneg2 = np.divide(np.matmul(np.subtract(x_train[np.where(y_train < 0)], mneg2).T,np.subtract(x_train[np.where(y_train < 0)], mneg2)),(x_train.shape[0]))
cpos2 = np.divide(np.matmul(np.subtract(x_train[np.where(y_train > 0)], mpos2).T,np.subtract(x_train[np.where(y_train > 0)], mpos2)),(x_train.shape[0]))
tcov2 = np.add(cneg2,cpos2)

mresults_LDA = np.array((mneg2,mpos2))
df = pd.DataFrame(mresults_LDA)
df.index = ['Neg Mean', 'Pos Mean']
display(df)

cresults_LDA = np.array(tcov2)
df = pd.DataFrame(cresults_LDA)
print('Covariance:')
display(df)

# save coefficients
name = 'hwk2_datasets/DS2/m0.csv'
np.savetxt(name, mneg2, delimiter=',')

name = 'hwk2_datasets/DS2/m1.csv'
np.savetxt(name, mpos2, delimiter=',')

name = 'hwk2_datasets/DS2/cov.csv'
np.savetxt(name, tcov2, delimiter=',')

# decision on test dataset
decision_test = np.log(pneg2) - np.log(ppos2) - 0.5*np.matmul((np.matmul(mneg2.T,np.linalg.inv(tcov2))),mneg2) + 0.5*np.matmul((np.matmul(mpos2.T,np.linalg.inv(tcov2))),mpos2) + np.matmul(np.matmul(x_test,np.linalg.inv(tcov2)),np.subtract(mneg2,mpos2))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Neg Mean,0.942604,0.976817,0.940859,1.007138,1.031264,0.946021,0.945092,1.009528,0.973279,0.938582,0.946268,0.929144,0.939061,0.89144,1.00686,0.957359,0.975721,1.043182,0.913677,0.92528
Pos Mean,1.178269,1.181999,1.160282,1.234148,1.188961,1.219658,1.195624,1.21147,1.226366,1.153725,1.155376,1.121287,1.143648,1.124954,1.183411,1.203071,1.159971,1.229718,1.150984,1.225583


Covariance:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,8.520658,5.81524,4.981714,5.494844,4.781764,6.048138,6.289465,6.152822,4.889211,5.656796,6.005879,5.492174,5.491505,6.677019,5.833352,6.219237,5.835791,5.941243,5.962839,6.288042
1,5.81524,7.342387,5.001812,5.309559,5.129068,6.119745,6.536888,5.526977,4.632696,5.245886,5.037282,5.235503,5.078498,6.317918,5.758881,6.142517,5.485016,5.122884,6.290456,5.832111
2,4.981714,5.001812,7.098356,5.455147,4.998182,4.947647,6.092538,4.779383,4.745134,5.008128,4.908801,5.160774,5.37296,6.28857,5.575614,5.11145,5.580422,4.59053,6.013179,5.94144
3,5.494844,5.309559,5.455147,6.793818,4.484308,5.665758,6.366383,5.567917,4.86252,5.610007,5.14554,5.068241,5.227627,5.776521,5.512147,5.632347,5.388848,5.332621,6.038238,5.841207
4,4.781764,5.129068,4.998182,4.484308,5.877234,4.90413,5.255423,4.72672,3.837084,4.279378,4.036249,4.380219,4.539239,5.113712,5.454472,5.150909,4.562021,4.703259,5.501267,5.147479
5,6.048138,6.119745,4.947647,5.665758,4.90413,7.818743,6.683635,5.438422,5.485375,5.617881,4.979661,5.084356,5.515525,6.540996,5.804777,6.137474,5.73127,5.56466,6.639065,6.213309
6,6.289465,6.536888,6.092538,6.366383,5.255423,6.683635,8.509064,6.141072,5.781604,6.264797,5.983772,5.927234,5.775556,7.178434,6.102029,6.483261,6.042004,5.719977,7.313271,6.400053
7,6.152822,5.526977,4.779383,5.567917,4.72672,5.438422,6.141072,6.734412,4.743928,5.220924,4.893286,5.073178,4.576804,5.804013,5.537887,5.590567,5.131131,5.877245,5.759308,5.613582
8,4.889211,4.632696,4.745134,4.86252,3.837084,5.485375,5.781604,4.743928,5.938263,4.669332,4.399786,4.42646,4.467539,5.185071,4.852374,4.538677,4.809399,4.632067,4.955056,4.918976
9,5.656796,5.245886,5.008128,5.610007,4.279378,5.617881,6.264797,5.220924,4.669332,6.853958,4.925302,4.828718,5.090276,5.929239,5.336908,5.36349,5.191316,4.718799,6.364283,5.419049


In [18]:
# test
decision_test[decision_test > 0] = 1
decision_test[decision_test < 0] = -1
decision_test[decision_test == 1] = -1000
decision_test[decision_test == -1] = 1000

y_tp = np.array(y_test)
y_tp[y_tp == 1000] = 5000
results_tp = y_tp - decision_test

y_fp = np.array(y_test)
y_fp[y_fp == 1000] = 0
results_fp = y_fp - decision_test

y_tn = np.array(y_test)
y_tn[y_tn == -1000] = -5000
results_tn = y_tn - decision_test

y_fn = np.array(y_test)
y_fn[y_fn == -1000] = 0
results_fn = y_fn - decision_test


In [19]:
# error measures
tp = (results_tp == 4000).sum(axis=0)
fp = (results_fp == -2000).sum(axis=0)
tn = (results_tn == -4000).sum(axis=0)
fn = (results_fn == 2000).sum(axis=0)

accuracy = (tp + tn) / (tp + fp + tn + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = fp / (fp + tn)

results = np.array((tp,fp,tn,fn,accuracy,precision, recall, f1))
df = pd.DataFrame(results)
df.columns = ['LDA']
df.index = ['tp','fp','tn','fn','accuracy','precision', 'recall', 'f1']
display(df)

Unnamed: 0,LDA
tp,297.0
fp,307.0
tn,293.0
fn,303.0
accuracy,0.491667
precision,0.491722
recall,0.495
f1,0.511667


In [20]:
# kNN
knum_list = [1,3,5,15]  # k nearest neighbours list
test_results = np.empty(shape=(len(knum_list),1200))

for n in range(len(knum_list)):
    knum = knum_list[n]
    y_list = []
    for i in range(x_test.shape[0]):
        current = x_test[i]
        y_slist = []
        min_index = []
        index = 0

        for j in range(knum):
            dtemp = 1000
            for k in range(x_train.shape[0]):
                if k in min_index:
                    continue

                d = distance(current,x_train[k])

                if dtemp > d:
                    dtemp = d
                    index = k
                    continue

            min_index.append(index)
            y_slist.append(y_train[index])

        num = sum(y_slist)
        if num > 0:
            y_list.append(1000)
        else:
            y_list.append(-1000)
    
    y_list = np.array(y_list)
    test_results[n] = y_list

In [21]:
# test
results_tp = np.empty(shape=(len(knum_list),1200))
results_fp = np.empty(shape=(len(knum_list),1200))
results_tn = np.empty(shape=(len(knum_list),1200))
results_fn = np.empty(shape=(len(knum_list),1200))

for i in range(len(knum_list)):
    y_tp = np.array(y_test)
    y_tp[y_tp == 1000] = 5000
    results_tp[i] = y_tp - test_results[i]

    y_fp = np.array(y_test)
    y_fp[y_fp == 1000] = 0
    results_fp[i] = y_fp - test_results[i]

    y_tn = np.array(y_test)
    y_tn[y_tn == -1000] = -5000
    results_tn[i] = y_tn - test_results[i]

    y_fn = np.array(y_test)
    y_fn[y_fn == -1000] = 0
    results_fn[i] = y_fn - test_results[i]


In [22]:
# error measures
tp = []
fp = []
tn = []
fn = []

accuracy = []
precision = []
recall = []
f1 = []

for i in range(len(knum_list)):
    tp.append((results_tp[i] == 4000).sum(axis=0))
    fp.append((results_fp[i] == -2000).sum(axis=0))
    tn.append((results_tn[i] == -4000).sum(axis=0))
    fn.append((results_fn[i] == 2000).sum(axis=0))

    accuracy.append((tp[i] + tn[i]) / (tp[i] + fp[i] + tn[i] + fn[i]))
    precision.append(tp[i] / (tp[i] + fp[i]))
    recall.append(tp[i] / (tp[i] + fn[i]))
    f1.append(fp[i] / (fp[i] + tn[i]))

results = np.array((tp,fp,tn,fn,accuracy,precision,recall,f1))
df = pd.DataFrame(results)
df.columns=['k=1','k=3','k=5','k=15']
df.index=['tp','fp','tn','fn','accuracy','precision','recall','f1']
display(df)

Unnamed: 0,k=1,k=3,k=5,k=15
tp,305.0,307.0,299.0,311.0
fp,272.0,271.0,256.0,283.0
tn,328.0,329.0,344.0,317.0
fn,295.0,293.0,301.0,289.0
accuracy,0.5275,0.53,0.535833,0.523333
precision,0.528596,0.531142,0.538739,0.523569
recall,0.508333,0.511667,0.498333,0.518333
f1,0.453333,0.451667,0.426667,0.471667
