# NN, k-NN

Raw implementation

In [1]:
import pandas as pd
import numpy as np
import random
from matplotlib import pyplot as plt
import math
from sklearn import linear_model
from sklearn.model_selection import KFold

dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

## Load data

In [2]:
train_data = pd.read_csv("kc_house_data_small_train.csv")
val_data = pd.read_csv("kc_house_data_validation.csv")
test_data = pd.read_csv("kc_house_data_small_test.csv")

## Preprocess the data

#### Converting to nump

In [3]:
def get_numpy_data(df, features, output):
    feature_matrix = np.array(df[features])
    output_array = np.array(df[output])
    return (feature_matrix, output_array)

#### Normalizing features

In [4]:
def normalize_features(features):
    norms = np.linalg.norm(features, axis=0)
    normalized_features = features / norms
    return (normalized_features, norms)

In [5]:
features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 
            'view', 'condition', 'grade', 'sqft_above', 'sqft_basement','yr_built', 
            'yr_renovated','zipcode','lat','long','sqft_living15','sqft_lot15']

output = ['price']

In [6]:

X_train, Y_train = get_numpy_data(train_data, features, output)
X_val, Y_val = get_numpy_data(val_data, features, output)
X_test, Y_test = get_numpy_data(test_data, features, output)


X_train, norms = normalize_features(X_train)
X_val = X_val / norms
X_test = X_test / norms


print(X_test[0])
print(X_train[9])

[ 0.01551285  0.01807473  0.01759212  0.00160518  0.017059    0.
  0.05102365  0.0116321   0.01564352  0.01362084  0.02481682  0.01350306
  0.          0.01345762  0.01345387 -0.01346922  0.01375926  0.0016225 ]
[ 0.01163464  0.00602491  0.0083488   0.00050756  0.01279425  0.
  0.          0.01938684  0.01390535  0.0096309   0.          0.01302544
  0.          0.0134557   0.01346821 -0.01346251  0.01195898  0.00156612]


# 1-Nearest Neighbour

In [7]:
#Euclidean distance between the query house and the 10th house of the training set?
np.sqrt(np.sum((X_test[0] - X_train[9])**2))

0.05972359374484452

In [8]:
# Among the first 10 training houses, which house is the closest to the query house?
l = []
for i in range(10):
    l.append(np.sqrt(np.sum((X_test[0] - X_train[i])**2)))
print(np.argmin(np.array(l)))

8


#### Function for computing distances

In [9]:
def compute_distances(X_train, query):
    distances = np.sqrt(np.sum(((X_train - query)**2),axis=1))
    return distances

In [10]:
#Take the query house to be third house of the test set (features_test[2]). What is the index of the house in the training set that is closest to this query house?

dists = compute_distances(X_train, X_test[2])
np.argmin(np.array(dists))

382

In [11]:
#What is the predicted value of the query house based on 1-nearest neighbor regression?

print(Y_train[382])

[249000]


# K-Nearest Neighbour

In [13]:
def k_nearest_neighbors(k, X_train, query):
    dists = compute_distances(X_train, query)
    neighbors = np.argsort(dists)[:k]
    return neighbors

In [14]:
#Take the query house to be third house of the test set (features_test[2]). What are the indices of the 4 training houses closest to the query house?

k_nearest_neighbors(4, X_train, X_test[2])

array([ 382, 1149, 4087, 3142], dtype=int64)

In [15]:
"""
Now that we know how to find the k-nearest neighbors, write a function that predicts the value of a given query house. 
For simplicity, take the average of the prices of the k nearest neighbors in the training set. The function should have 
the following parameters:
"""
def predict_output_of_query(k, X_train, Y_train, query):
    neighbors = k_nearest_neighbors(k, X_train, query)
    prediction = np.mean(Y_train[neighbors])
    return prediction



predict_output_of_query(4, X_train, Y_train, X_test[2])

413987.5

### write a function to predict the value of each and every house in a query set

In [18]:
def predict_output(k, X_train, Y_train, query_set):
    predictions = []
    for query in query_set:
        prediction = predict_output_of_query(k,X_train,Y_train, query)
        predictions.append(prediction)
    return predictions



# Make predictions for the first 10 houses in the test set, using k=10. 
# What is the index of the house in this query set that has the lowest predicted value? 
# What is the predicted value of this house?


print(predict_output(10, X_train, Y_train, X_test[:10]))


[881300.0, 431860.0, 460595.0, 430200.0, 766750.0, 667420.0, 350032.0, 512800.7, 484000.0, 457235.0]


'\n[881300.0,\n 431860.0,\n 460595.0,\n 430200.0,\n 766750.0,\n 667420.0,\n 350032.0, --> min, index = 6\n 512800.7,\n 484000.0,\n 457235.0]\n'

In [19]:
RSS = []

for k in range(1,16):
    val_pred = predict_output(k, X_train, Y_train, X_val)
    rss = np.sum((Y_val - val_pred)**2)
    RSS.append(rss)    
    
print(np.argmin(RSS))
#14  so k = 15

14


In [21]:
test_pred = predict_output(15, X_train, Y_train, X_test)
rss = np.sum((Y_test - test_pred)**2)
rss
#6.811729487234961e+17

6.811729487234961e+17