In [103]:
import numpy as np
import pandas as pd
from math import sqrt
import matplotlib.pyplot as plt
import sklearn.linear_model as skllm
%matplotlib inline

In [104]:
sales = pd.read_csv("kc_house_data_small.csv")
train_sales = pd.read_csv("kc_house_data_small_train.csv")
test_sales = pd.read_csv("kc_house_data_small_test.csv")
valid_sales = pd.read_csv("kc_house_data_validation.csv")

In [105]:
def get_features_matrix(dataframe, features, output):
  dataframe['constant'] = 1
  features = ['constant'] + features
  #print(features)
  features_matrix = dataframe[features].to_numpy()
  output_array = dataframe[output].to_numpy()
  #print(features_matrix)
  return (features_matrix,output_array)

In [122]:
def norms(features_matrix):
    norms_matrix = np.array([]) 
    normalized_features = np.array([])
    for i in range(len(features_matrix[0])):
        norms_matrix = np.append(norms_matrix, np.sqrt(np.dot(features_matrix[:,i],features_matrix[:,i])))
        print(norms_matrix.shape)
    normalized_features = features_matrix/norms_matrix
    #print(normalized_features.shape)
    #print(norms_matrix)
    #print(normalized_features.shape)
    return(normalized_features, norms_matrix)

In [123]:
feature_list = ['bedrooms',  
                'bathrooms',  
                'sqft_living',  
                'sqft_lot',  
                'floors',
                'waterfront',  
                'view',  
                'condition',  
                'grade',  
                'sqft_above',  
                'sqft_basement',
                'yr_built',  
                'yr_renovated',  
                'lat',  
                'long',  
                'sqft_living15',  
                'sqft_lot15']
target = 'price'

In [124]:
(features_train, output_train) = get_features_matrix(train_sales, feature_list, target)
(features_test, output_test) = get_features_matrix(test_sales, feature_list, target)
(features_valid, output_valid) = get_features_matrix(valid_sales, feature_list, target)

In [125]:
#to ensure consistent normalization, divide test and 
#valid features matrix with
#the same norm_matrix retrieved from the training set
(normalized_train, norm_matrix) = norms(features_train)
normalized_test = features_test/norm_matrix
normalized_valid = features_valid/norm_matrix
norm_matrix

(1,)
(2,)
(3,)
(4,)
(5,)
(6,)
(7,)
(8,)
(9,)
(10,)
(11,)
(12,)
(13,)
(14,)
(15,)
(16,)
(17,)
(18,)


array([7.43437960e+01, 2.57850732e+02, 1.65977596e+02, 1.67688695e+05,
       3.11491224e+06, 1.17240138e+02, 6.32455532e+00, 5.87962584e+01,
       2.57906960e+02, 5.75318173e+02, 1.45365513e+05, 3.90863965e+04,
       1.46559338e+05, 2.99361461e+04, 3.53588887e+03, 9.08552952e+03,
       1.55531640e+05, 2.46533053e+06])

In [110]:
query_house = normalized_test[0]
tenth_house = normalized_train[9]

euclidean distance is: 
if p1, p2, p3,...pn are elements of the first vector 
and q1, q2, q3, ..., qn are elements of the second vector

where all elements are points on ther real number line 
the distance between every element is = |pn - qn| 
which when squared and summed, then taken a sqrt root of leaves all positive numbers unchanged 
but replaces negative distances with their absolute value 

In [111]:
euclidean_distance = np.sqrt(np.sum((query_house-tenth_house)**2))
euclidean_distance

0.05972359371398078

now we will build a function that loops through every observation to find the nearest neighbor house this can be done by vectorization 

In [112]:
results = np.array([])
for i in range(3):
    results = normalized_train[i] - normalized_test[0]
    print(results-results)
    
#vectorization works 


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [113]:
difference = normalized_train - normalized_test[0]
difference
#difference is element by element wise difference 
#to turn this into euclidean distance 
#we have to first square the difference, sum over all elements in 
#the difference vector, then take the square root of the sum

euclidean_distance_all = np.sqrt(np.sum(difference**2, axis=1))
euclidean_distance_all[100]

0.023708232416678195

In [114]:
def compute_distance(features_instances, features_query):
    difference = features_instances - features_query
    euclidean_distance = np.sqrt(np.sum(difference**2, axis=1))
    return(euclidean_distance)
    

In [115]:
third_house_distance = compute_distance(normalized_train, normalized_test[2])
third_house_distance.argsort()[0]
print(third_house_distance.argsort()[0], min(third_house_distance))


382 0.0028604955575117085


In [116]:
print(third_house_distance.argsort()[:4])

[ 382 1149 4087 3142]


In [117]:
#the value of the 382th house would be the 'predicted output' for the query house
output_train[382]

249000

In [118]:
def compute_k_nearest_neighbors(k,features_instances, features_query):
    distances = compute_distance(features_instances, features_query)
    return np.argsort(distances, axis = 0)[:k]

In [119]:
knn_for_third_house = compute_k_nearest_neighbors(4, normalized_train, normalized_test[2])
knn_for_third_house

array([ 382, 1149, 4087, 3142], dtype=int64)

In [120]:
def compute_distances_k_average(k, features_instances, output_values, features_query):
    k_neighbors = compute_k_nearest_neighbors(k,features_instances, features_query)
    knn_average = np.mean(output_values[k_neighbors])
    return(knn_average)

In [121]:
fourth_nearest_neighbor_average = compute_distances_k_average(4, normalized_train, output_train, normalized_test[2])
fourth_nearest_neighbor_average

413987.5