## Week three - Ridge Regression

<p>First, import necessary libraries.</p>

In [2]:
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
import math

%matplotlib inline

In [3]:
# Column name -> dtype mapping handed to pd.read_csv when the CSV files are loaded.
dtype_dict = {
    # floating-point columns
    'bathrooms': float,
    'sqft_living15': float,
    'price': float,
    'bedrooms': float,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'lat': float,
    # integer columns
    'waterfront': int,
    'sqft_above': int,
    'grade': int,
    'yr_renovated': int,
    'condition': int,
    'sqft_basement': int,
    'yr_built': int,
    'sqft_lot': int,
    'view': int,
    # columns read as strings
    'zipcode': str,
    'date': str,
    'id': str,
}

### Convert a dataframe into a numpy feature matrix and output array

In [33]:
def get_numpy_data(data_frame, features, output):
    """Convert a DataFrame into a (feature matrix, output array) pair.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Source data. NOTE: a 'constant' column (all ones) is written into this
        frame as a side effect of the call.
    features : sequence of str
        Names of the feature columns. Any iterable of names is accepted
        (list, tuple, ...); 'constant' is prepended automatically.
    output : str
        Name of the target column.

    Returns
    -------
    (features_matrix, output_array)
        features_matrix has one row per row of data_frame and the columns
        ['constant'] + features, in that order; output_array holds the values
        of the `output` column.
    """
    # Add the intercept column to the frame.
    data_frame['constant'] = 1
    # list(...) lets callers pass a tuple or other iterable without a TypeError
    # on the concatenation.
    feature_columns = ['constant'] + list(features)
    # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0;
    # `.values` is available in both old and current pandas.
    features_matrix = data_frame[feature_columns].values
    output_array = data_frame[output].values
    return (features_matrix, output_array)

### Load the necessary data

In [34]:
# Read the four CSV files (full small set, test, train, validation) in that
# order, applying the same column dtypes to each.
sales, test, train, valid = (
    pd.read_csv(csv_name, dtype=dtype_dict)
    for csv_name in (
        'kc_house_data_small.csv',
        'kc_house_data_small_test.csv',
        'kc_house_data_small_train.csv',
        'kc_house_data_validation.csv',
    )
)

### Normalize Features

In [56]:
def normalize_features(features):
    """Scale every column of a 2-D feature matrix to unit Euclidean (L2) norm.

    Parameters
    ----------
    features : array-like, 2-D
        One row per observation, one column per feature.

    Returns
    -------
    (normalized_features, norms)
        normalized_features is a float array of the same shape as `features`
        with each column divided by its norm. norms is a list with one scaling
        factor per column, so other data can be scaled identically with
        `other_features / norms`.

    A column whose norm is 0 (all zeros) is left as zeros and reported with a
    scaling factor of 1.0. Dividing by the raw norm would fill the column with
    NaN here and produce inf/NaN when `norms` is reused on other data.
    """
    features = np.asarray(features)
    norms = []
    normalized_features = np.empty(np.shape(features))
    for index in range(features.shape[1]):
        norm = np.linalg.norm(features[:, index], axis=0)
        if norm == 0:
            # All-zero column: nothing to scale, avoid 0/0.
            norm = 1.0
        norms.append(norm)
        normalized_features[:, index] = features[:, index] / norm
    return (normalized_features, norms)

### Get the numpy data and normalize it

In [91]:
# Feature columns used for the nearest-neighbour search.
all_feats = [
    'bathrooms', 'waterfront', 'sqft_above', 'sqft_living15', 'grade',
    'yr_renovated', 'bedrooms', 'long', 'sqft_lot15', 'sqft_living',
    'floors', 'condition', 'lat', 'sqft_basement', 'yr_built', 'sqft_lot',
    'view',
]

# Extract (feature matrix, price array) for each split.
features_train, train_price = get_numpy_data(train, all_feats, 'price')
features_test, test_price = get_numpy_data(test, all_feats, 'price')
features_valid, valid_price = get_numpy_data(valid, all_feats, 'price')

# Normalise the training features, then scale the test and validation
# features by the norms computed on the training set.
features_train, norms = normalize_features(features_train)
features_test = features_test / norms
features_valid = features_valid / norms

In [92]:
# Show the first test house (the query) and the 10th training house.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(features_test[0])
print(features_train[9])

# Euclidean distance between the two normalised feature vectors.
distance_euclid = np.sqrt(np.sum((features_test[0] - features_train[9])**2))
print(distance_euclid)

[ 0.01345102  0.01807473  0.          0.01362084  0.01375926  0.01564352
  0.          0.01551285 -0.01346922  0.0016225   0.01759212  0.017059
  0.0116321   0.01345387  0.02481682  0.01350306  0.00160518  0.05102365]
[ 0.01345102  0.00602491  0.          0.0096309   0.01195898  0.01390535
  0.          0.01163464 -0.01346251  0.00156612  0.0083488   0.01279425
  0.01938684  0.01346821  0.          0.01302544  0.00050756  0.        ]
0.059723593714


### Now loop to see the distance between multiple houses

In [93]:
# Distance from the first test house to each of the first 10 training houses.
for index in range(10):
    distance = np.sqrt(np.sum((features_test[0] - features_train[index])**2))
    # Single-argument print(...) runs unchanged on Python 2 and Python 3.
    print('House ' + str(index) + ' ' + str(distance))

House 0 0.060274709163
House 1 0.0854688114764
House 2 0.0614994643528
House 3 0.0534027397929
House 4 0.0584448406017
House 5 0.0598792150981
House 6 0.0546314049678
House 7 0.0554310832361
House 8 0.0523836278402
House 9 0.059723593714


In [94]:
# Broadcasting subtracts the query row from every training row at once.
diff = features_train - features_test[0]
# Sum of the feature differences for the last training house.
# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print(sum(diff[-1]))

-0.0934339987465


### Now get Euclidean distances from all points in train to query

In [95]:
# Row-wise Euclidean norm of the differences: one distance per training house.
distances = np.sqrt(np.sum((diff**2), axis=1))
# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print(distances[100])

0.0237082324167


### Abstract the distance computation into a function

In [96]:
def compute_distances(features_instances, features_query):
    """Return the Euclidean distance from every instance row to the query.

    `features_query` is subtracted from `features_instances` with numpy
    broadcasting; the squared differences are summed along axis 1 and the
    square root of each sum is returned, giving one distance per row.
    """
    squared_offsets = np.square(features_instances - features_query)
    return np.sqrt(np.sum(squared_offsets, axis=1))

In [130]:
# Find the training house closest to the third test house.
distances = compute_distances(features_train, features_test[2])
nearest_distance = distances.min()
# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print(nearest_distance)
print(np.where(distances == nearest_distance))
# Look the nearest index up with argmin instead of hard-coding a number copied
# from an earlier run's output, so the cell stays correct if the data changes.
print(train_price[np.argmin(distances)])

0.00286049555751
(array([382]),)
249000.0


In [143]:
def k_nearest_neighbors(k, features_instances, features_query):
    """Return the row indices of the k instances closest to the query.

    Distances are Euclidean. The indices are ordered from nearest to farthest;
    if k exceeds the number of rows, every row index is returned.
    """
    offsets = features_instances - features_query
    euclidean = np.sqrt(np.sum(np.square(offsets), axis=1))
    ranked = np.argsort(euclidean)
    return ranked[:k]

In [144]:
# Indices of the 4 training houses nearest to the third test house.
# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print(k_nearest_neighbors(4, features_train, features_test[2]))

[ 382 1149 4087 3142]


### Now build our prediction function

In [145]:
def predict_output_of_query(k, features_train, output_train, features_query):
    """Predict the output for one query as the mean output of its nearest neighbors.

    Parameters
    ----------
    k : int
        Number of neighbors to average over.
    features_train : 2-D array
        Training feature matrix, one row per training instance.
    output_train : 1-D array
        Training outputs, aligned with the rows of `features_train`.
    features_query : 1-D array
        Feature vector of the query point.

    Returns
    -------
    The arithmetic mean of the outputs of the neighbors returned by
    k_nearest_neighbors.
    """
    indices = k_nearest_neighbors(k, features_train, features_query)
    prices = output_train[indices]
    # Average over the neighbors actually found: when k exceeds the number of
    # training rows fewer than k indices come back, and dividing by k would
    # bias the prediction low. float(...) forces true division on Python 2
    # even when the outputs are integers.
    prediction = sum(prices) / float(len(prices))
    return prediction

In [146]:
# Predicted price of the third test house from its 4 nearest training houses.
# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print(predict_output_of_query(4, features_train, train_price, features_test[2]))

413987.5


In [147]:
def predict_output(k, features_train, output_train, features_query):
    """Predict an output for every row of `features_query`.

    Returns a dict that maps each query row's position (0-based, in iteration
    order) to the value predict_output_of_query gives for that row.
    """
    return {
        position: predict_output_of_query(k, features_train, output_train, query_row)
        for position, query_row in enumerate(features_query)
    }

In [148]:
# 10-nearest-neighbour price predictions for the first 10 test houses.
predictions = predict_output(10, features_train, train_price, features_test[:10])
# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print(predictions)

{0: 881300.0, 1: 431860.0, 2: 460595.0, 3: 430200.0, 4: 766750.0, 5: 667420.0, 6: 350032.0, 7: 512800.70000000001, 8: 484000.0, 9: 457235.0}
