# Machine Learning: Regression (Module 2, week 6) - Kernel & KNN Regression
Keywords: Kernel Regression, KNN Regression

In [7]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model

In [1]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [3]:
def get_numpy_data(data, features, output):
    ones = np.ones(data.shape[0])
    data['constant'] = ones
    all_features = ['constant']+features
    features_matrix = data.as_matrix(all_features)
    output_array = data.as_matrix([output])[:,0]
    return(features_matrix, output_array)

In [4]:
def get_numpy_data(data, features, output):
    ones = np.ones(data.shape[0])
    data['constant'] = ones
    all_features = ['constant']+features
    features_matrix = data.as_matrix(all_features)
    output_array = data.as_matrix([output])[:,0]
    return(features_matrix, output_array)

In [14]:
def normalize_features(features):
    norms = np.linalg.norm(features, axis=0)
    normalized_features = features / norms    
    return (normalized_features, norms)

In [15]:
sales = pd.read_csv('kc_house_data_small.csv', dtype=dtype_dict)
train = pd.read_csv('kc_house_data_small_train.csv',dtype=dtype_dict)
test = pd.read_csv('kc_house_data_small_test.csv',dtype=dtype_dict)
validation = pd.read_csv('kc_house_data_validation.csv',dtype=dtype_dict)

In [16]:
feature_list = ['bedrooms',  
                'bathrooms',  
                'sqft_living',  
                'sqft_lot',  
                'floors',
                'waterfront',  
                'view',  
                'condition',  
                'grade',  
                'sqft_above',  
                'sqft_basement',
                'yr_built',  
                'yr_renovated',  
                'lat',  
                'long',  
                'sqft_living15',  
                'sqft_lot15']
features_train, output_train = get_numpy_data(train, feature_list, 'price')
features_test, output_test = get_numpy_data(test, feature_list, 'price')
features_valid, output_valid = get_numpy_data(validation, feature_list, 'price')

In [17]:
features_train, output_train = get_numpy_data(train, feature_list, 'price')

In [18]:
features_train, norms = normalize_features(features_train)
features_test = features_test / norms
features_valid = features_valid / norms

In [19]:
print features_test[0]
print features_train[9]

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059    0.
  0.05102365  0.0116321   0.01564352  0.01362084  0.02481682  0.01350306
  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]
[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


### Quiz Question: What is the Euclidean distance between the query house and the 10th house of the training set?

In [29]:
def euclidian_distance_one_row(first, second):
    return np.sqrt(np.sum((first-second)**2))

In [30]:
euclidian_distance_one_row(features_train[9], features_test[0])

0.059723593713980783

### Quiz Question: Among the first 10 training houses, which house is the closest to the query house?

In [32]:
for training_feature in features_train[0:10]:
    print euclidian_distance_one_row(training_feature, features_test[0])

0.060274709163
0.0854688114764
0.0614994643528
0.0534027397929
0.0584448406017
0.0598792150981
0.0546314049678
0.0554310832361
0.0523836278402
0.059723593714


#### Vectorizing the distances calculation

The 9th house is the closest

In [53]:
def euclidian_distance(first, second):
    return np.sqrt(np.sum((first-second)**2, axis=1))

### Quiz Question: Take the query house to be third house of the test set (features_test[2]). What is the index of the house in the training set that is closest to this query house?

In [54]:
np.argmin(euclidian_distance(features_train, features_test[2]))

382

### Quiz Question: What is the predicted value of the query house based on 1-nearest neighbor regression?

In [59]:
train['price'][382]

249000.0

In [60]:
def k_nearest_neighbors(k, feature_train, features_query):
    distances = euclidian_distance(features_train, features_query)
    neighbors = np.argsort(distances)
    return neighbors[0:k]

### Quiz Question: Take the query house to be third house of the test set (features_test[2]). What are the indices of the 4 training houses closest to the query house?

In [62]:
k_nearest_neighbors(4, features_train, features_test[2])

array([ 382, 1149, 4087, 3142])

In [73]:
def predict_output_of_query(k, features_train, output_train, features_query):
    neighbors = k_nearest_neighbors(k, features_train, features_query)
    prediction = np.average(output_train[neighbors])
    return prediction

### Quiz Question: Again taking the query house to be third house of the test set (features_test[2]), predict the value of the query house using k-nearest neighbors with k=4 and the simple averaging method described and implemented above.

In [75]:
k=4
features_query = features_test[2]
predict_output_of_query(k, features_train, output_train, features_query)

413987.5

In [76]:
def predict_output(k, features_train, output_train, features_query):
    predictions = []
    for query in features_query:
        predictions.append(predict_output_of_query(k, features_train, output_train, query))
    return predictions

### Quiz Question: Make predictions for the first 10 houses in the test set, using k=10. What is the index of the house in this query set that has the lowest predicted value? What is the predicted value of this house?

In [78]:
predict_output(k, features_train, output_train, features_test[:10])

[923750.0,
 434400.0,
 413987.5,
 552750.0,
 869625.0,
 683237.5,
 332830.0,
 577500.0,
 436250.0,
 454975.0]

In [85]:
for k in range(1,16):
    predictions = predict_output(k, features_train, output_train, features_valid)
    print "k = ", k, ", RSS = ", np.sum( (output_valid - predictions)**2) # RSS on the validation data

k =  1 , RSS =  1.05453830252e+14
k =  2 , RSS =  8.3445073504e+13
k =  3 , RSS =  7.26920960192e+13
k =  4 , RSS =  7.19467216521e+13
k =  5 , RSS =  6.98465174197e+13
k =  6 , RSS =  6.88995443532e+13
k =  7 , RSS =  6.83419734501e+13
k =  8 , RSS =  6.73616787355e+13
k =  9 , RSS =  6.8372727959e+13
k =  10 , RSS =  6.93350486686e+13
k =  11 , RSS =  6.95238552156e+13
k =  12 , RSS =  6.90499695872e+13
k =  13 , RSS =  7.00112545083e+13
k =  14 , RSS =  7.0908698869e+13
k =  15 , RSS =  7.11069283859e+13


### Quiz Question: What is the RSS on the TEST data using the value of k found above? To be clear, sum over all houses in the TEST set.

In [86]:
predictions = predict_output(k, features_train, output_train, features_test)
print "RSS = ", np.sum( (output_test - predictions)**2) # RSS on the validation data

RSS =  1.34342939295e+14
