## 从零开始自己写一个KNN算法


In [1]:
from sklearn import datasets
from collections import Counter
from sklearn.model_selection import train_test_split
import numpy as np

# Load the iris dataset and pull out the feature matrix and class labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target
# Split into train and test parts; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2003,
)

In [2]:
def euc_dis(instance1, instance2):
    """Return the Euclidean (L2) distance between two feature vectors.

    Args:
        instance1: Array-like of numbers (NumPy array, list or tuple).
        instance2: Array-like of numbers whose shape is compatible with
            ``instance1``.

    Returns:
        The distance as a NumPy float when both inputs are 1-D vectors.
    """
    # Convert up front so plain lists/tuples are accepted and integer input
    # is squared in floating point.
    diff = np.asarray(instance1, dtype=float) - np.asarray(instance2, dtype=float)
    # Reduce along the first axis: this yields a scalar for 1-D vectors.
    return np.sqrt(np.sum(diff ** 2, axis=0))

def knn_classify(X, y, testInstance, k):
    """Classify ``testInstance`` by majority vote of its k nearest neighbours.

    Args:
        X: Training samples, array-like of shape (n_samples, n_features).
        y: Training labels, array-like of length n_samples.
        testInstance: Feature vector to classify, array-like of length
            n_features.
        k: Number of nearest neighbours that take part in the vote; must be
            at least 1.

    Returns:
        The most frequent label among the k nearest training samples. When
        several labels share the top count, the one met first while walking
        the neighbours from nearest to farthest wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or the training set is empty.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)  # fancy indexing below needs an array, not a list
    if k < 1:
        raise ValueError("k must be a positive integer")
    if len(X) == 0:
        raise ValueError("training set is empty")

    # Time complexity O(N * d), N = number of samples: Euclidean distance from
    # every training sample to the query, computed in one vectorised pass.
    diff = X - np.asarray(testInstance, dtype=float)
    distance = np.sqrt(np.sum(diff ** 2, axis=1))
    # O(N log N) full sort. Possible optimisation: a priority queue would
    # bring this down to O(N log k).
    kneighbors = np.argsort(distance)[:k]
    count = Counter(y[kneighbors])
    return count.most_common(1)[0][0]

In [3]:
# Classify every test sample with the hand-written 3-nearest-neighbour routine.
predictions = [
    knn_classify(X_train, y_train, data, 3)
    for data in X_test
]
# Element-wise comparison against the true labels; count the matches.
matches = np.asarray(predictions) == y_test
correct = np.count_nonzero(matches)
accuracy = correct / len(X_test)
print('Accuracy is: {:.3f}'.format(accuracy))

Accuracy is: 0.921


## 调用KNN函数实现分类

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Same 3-neighbour setup, this time using scikit-learn's classifier.
clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Accuracy on the held-out split; as the last expression of the cell its
# value is what the notebook displays.
test_predictions = clf.predict(X_test)
accuracy_score(y_test, test_predictions)

0.9210526315789473

## KNN用于回归

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read data.csv (path relative to the working directory) into a DataFrame.
df = pd.read_csv('data.csv')
# Bare expression so the notebook renders the frame as the cell output.
df


In [None]:
# One-hot encode the categorical Color and Type columns; the prefixes keep the
# generated indicator columns recognisable.
df_colors = df.Color.str.get_dummies().add_prefix('Color: ')
# Type is converted to str first so the .str accessor can be used
# (presumably the column may hold non-string values; verify against the data).
df_type = df.Type.apply(str).str.get_dummies().add_prefix('Type: ')
df = pd.concat([df, df_colors, df_type], axis=1)
# Remove the raw categorical columns now that the indicator columns exist;
# Brand is dropped without being encoded.
df = df.drop(['Brand', 'Type', 'Color'], axis=1)
df


In [None]:
# Pairwise correlation between the columns of df.
matrix = df.corr()
# plt.subplots (plural) creates the figure and its axes together and accepts
# figsize; it returns the (figure, axes) pair unpacked here.
f, ax = plt.subplots(figsize=(8, 6))
# Draw on the axes created above and title that same axes explicitly, so the
# result does not depend on which axes is "current" after the colour bar is added.
sns.heatmap(matrix, square=True, ax=ax)
ax.set_title('Car Price Variables')


In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Features and target. The target is reshaped to a single column because the
# scaler below is fitted on 2-D input.
X = df[['Construction Year', 'Days Until MOT', 'Odometer']]
y = df['Ask Price'].values.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=41)

X_normalizer = StandardScaler()  # standardise the features
X_train = X_normalizer.fit_transform(X_train)  # the scaler is fitted on the training data only
X_test = X_normalizer.transform(X_test)  # test data must go through the same transform as the training data

y_normalizer = StandardScaler()
y_train = y_normalizer.fit_transform(y_train)
y_test = y_normalizer.transform(y_test)

knn = KNeighborsRegressor(n_neighbors=2)
knn.fit(X_train, y_train.ravel())

y_pred = knn.predict(X_test)
# The model was fitted on a flattened target, so predict() yields a 1-D array.
# inverse_transform expects a 2-D column; reshape for the call and flatten the
# result so y_pred_inv stays 1-D for plotting and printing.
y_pred_inv = y_normalizer.inverse_transform(y_pred.reshape(-1, 1)).ravel()  # map standardised values back to the original scale
y_test_inv = y_normalizer.inverse_transform(y_test)

plt.scatter(y_pred_inv, y_test_inv)
# Reference line y = x over a fixed 500-1500 range; points on it are perfect predictions.
diagonal = np.linspace(500, 1500, 100)
plt.plot(diagonal, diagonal, '-r')
plt.xlabel('Predicted ask price')
plt.ylabel('Ask price')
plt.show()

print(y_pred_inv)
