# 这是作业1中KNN分类器的实现

In [2]:
# 获取 CIFAR-10 数据集，并且处理成为train,validation,test 三部分
import numpy as np
import torch
import pickle
# 加载 CIFAR-10 数据集
# 这里使用的是CIFAR-10 的 data_batch_1 共一万个数据，我自己划分为 train,validation,test 三部分
# 前 6000 为training set，接着3000为 validation set 最后1000为 test set

def load_train_data(filename):
    with open(filename,'rb') as f:
        datadict = pickle.load(f,encoding = 'latin1')
        X = datadict['data']
        Y = datadict['labels']
        X = X.reshape(10000,3,32,32)
        Y = np.array(Y)
        return X,Y
    
    
root = 'data_batch_2'
(data, labels) = load_train_data(root)

data = data.reshape(10000,3072)
train_set = data[0:3000]
train_labels = labels[0:3000]

val_set = data[6000:6500]
val_labels = labels[6000:6500]

test_set = data[9000:9500]
test_labels = labels[9000:9500]



In [9]:
# KNN 算法，这是一个10分类问题
# 这个KNN与往常的KNN不太一样，是稻草人算法
# k为其中的超参数，要通过 val_set 调参
class KNN(object):
    def __init__(self, f, k=1):
        self.k = k
        self.dis = f
    
    def train(self, train_set, train_label):
        self.X = train_set
        self.y = train_label
    
    def predict(self, X):
        Ypred = np.zeros(X.shape[0])
        distance = self.dis(X, self.X)
        for i in range(X.shape[0]):
            closest = self.y[np.argsort(distance[i])[:self.k]]
            Ypred[i]=np.argmax(np.bincount(closest))  
        return Ypred, distance
            
def distance(x, y):
    # L2 距离
    # 输出结果为一个矩阵，num_x 为行，num_y 为列
    # x, y 都是行向量
    num_x = x.shape[0]
    num_y = y.shape[0]  
    dis = np.zeros((num_x,num_y))
    d1 = np.sum(np.square(x),axis = 1,keepdims=True).expand(num_x,num_y)
    d2 = np.sum(np.square(y),axis = 1).expand(num_x,num_y)
    d3 = np.dot(x,y.T) * -2
    dis = np.sqrt(d1+d3+d2)
    #dis = np.sqrt(d1.reshape(-1,1)+d3+d2)
    return dis


In [139]:
# 调整超参数k的过程
K = [1,3,5,8,10,12,15,19]
k = 5
# for k in K:
p = KNN(distance, k)
p.train(train_set, train_labels)
(Ypred,dist) = p.predict(val_set)
print(np.shape(Ypred))
count = np.sum(Ypred == val_labels)
print ('k = %s accuracy in val_set: %s / %s == %s' % (k,count,len(Ypred),count / len(Ypred)))


(500,)
k = 5 accuracy in val_set: 54 / 500 == 0.108


In [172]:
A = np.array([[1,2,3],[1,2,9],[1,3,9]])
B = np.array([[1,5,6],[4,6,8],[7,8,9],[3,4,8]])
dist1 = distance(A,B)
print(dist1)
print(np.linalg.norm(A[0]-B[2]))

dist2 = distance(val_set,train_set)
print(val_set[0])
print(train_set[0])
print(dist2[0][0])
print(np.linalg.norm(train_set[2]-val_set[0], dtype))

[[ 4.24264069  7.07106781 10.39230485  5.74456265]
 [ 4.24264069  5.09901951  8.48528137  3.        ]
 [ 3.60555128  4.35889894  7.81024968  2.44948974]]
10.392304845413264
[41 65 64 ... 48 59 50]
[ 35  27  25 ... 169 168 168]
827.9547089062299
8731.501989921322


In [10]:
print(np.shape(val_set[0]))
a = val_set[0].reshape(1,3072)
b = train_set[0].reshape(1,3072)
print(np.shape(a))
print(distance(a,b))
a = val_set[0].reshape(3072,1)
b = train_set[0].reshape(3072,1)
c = a - b
d = b - a
print(np.linalg.norm(c,ord=2))
print(np.linalg.norm(d,ord=2))
print(np.linalg.norm(a - b,ord=2))
print(np.linalg.norm(b - a,ord=2))
print(np.shape(b))
print(np.shape(a))

(3072,)
(1, 3072)
[[827.95470891]]
2210392727680
2210361230960
9067.635910202835
7159.24053234699
9067.635910202835
7159.24053234699
(3072, 1)
(3072, 1)


In [12]:
a = np.ones((3,3))
b = np.array([2,3,4])
c = np.array([7,7,8])

print(a+b)
b = np.array([2,3,4]).reshape(-1,1)
print(a+b)

[[3. 4. 5.]
 [3. 4. 5.]
 [3. 4. 5.]]
[[3. 3. 3.]
 [4. 4. 4.]
 [5. 5. 5.]]


In [17]:
print(b)
print(c)

print(np.linalg.norm(b-c))
print(np.linalg.norm(c-b))
b = b.reshape((1,3))
c = c.reshape((1,3))
print(np.shape(b))
print(distance(b,c))



[[2 3 4]]
[[7 7 8]]
7.54983443527075
7.54983443527075
(1, 3)
[[7.54983444]]
