In [1]:
import pandas as pd

In [4]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [5]:
train = pd.read_csv('kc_house_data_small_train.csv', dtype=dtype_dict)
test = pd.read_csv('kc_house_data_small_test.csv', dtype=dtype_dict)
valid = pd.read_csv('kc_house_data_validation.csv', dtype=dtype_dict)

In [6]:
import numpy as np

In [7]:
def get_numpy_data(data, features, output):
    data['constant'] = 1 # add a constant column to an SFrame
    # prepend variable 'constant' to the features list
    features = ['constant'] + features

    # select the columns of data_SFrame given by the ‘features’ list into the SFrame ‘features_sframe’
    feature_matrix = data[features].as_matrix(columns=None)

    # this will convert the features_sframe into a numpy matrix with GraphLab Create >= 1.7!!
    # features_matrix = features
    # assign the column of data_sframe associated with the target to the variable ‘output_sarray’

    # this will convert the SArray into a numpy array:
    output_array = data[output].as_matrix(columns=None) # GraphLab Create>= 1.7!!
    return (feature_matrix, output_array)

In [8]:
def normalize_features(features):
    norms = np.linalg.norm(features, axis=0)
    normalized_features = features / norms
    return (normalized_features, norms)

In [9]:
feature_list = ['bedrooms',  
                'bathrooms',  
                'sqft_living',  
                'sqft_lot',  
                'floors',
                'waterfront',  
                'view',  
                'condition',  
                'grade',  
                'sqft_above',  
                'sqft_basement',
                'yr_built',  
                'yr_renovated',  
                'lat',  
                'long',  
                'sqft_living15',  
                'sqft_lot15']
train_feature_matrix, train_output_array = get_numpy_data(train, feature_list, 'price')
test_feature_matrix, test_output_array = get_numpy_data(test, feature_list, 'price')
valid_feature_matrix, valid_output_array = get_numpy_data(valid, feature_list, 'price')

In [11]:
train_feature_matrix, norms = normalize_features(train_feature_matrix) # normalize training set features (columns)
test_feature_matrix = test_feature_matrix / norms # normalize test set by training set norms
valid_feature_matrix = valid_feature_matrix / norms # normalize validation set by training set norms

# Compute a single distance

To start, let's just explore computing the "distance" between two given houses.  We will take our **query house** to be the first house of the test set and look at the distance between this house and the 10th house of the training set.

To see the features associated with the query house, print the first row (index 0) of the test feature matrix. You should get an 18-dimensional vector whose components are between 0 and 1.

In [12]:
print test_feature_matrix[0]
print train_feature_matrix[9]

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059    0.
  0.05102365  0.0116321   0.01564352  0.01362084  0.02481682  0.01350306
  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]
[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


In [14]:
np.sqrt(np.sum((train_feature_matrix[9] - test_feature_matrix[0])**2))


0.059723593713980783

In [15]:
houses = train_feature_matrix[0:9]

In [21]:
for i in 1:10
    print np.sqrt(np.sum((train_feature_matrix[i] - test_feature_matrix[0]) ** 2))

IndentationError: unexpected indent (<ipython-input-21-48b88eff6df4>, line 2)

In [24]:
distance = {}
for i in range(10):
    distance[i] = np.sqrt(np.sum((train_feature_matrix[i] - test_feature_matrix[0])**2))
print distance

{0: 0.060274709162955922, 1: 0.085468811476437465, 2: 0.061499464352793153, 3: 0.053402739792943632, 4: 0.05844484060170442, 5: 0.059879215098128345, 6: 0.054631404967754607, 7: 0.055431083236146074, 8: 0.052383627840220305, 9: 0.059723593713980783}


In [26]:
for i in xrange(3):
    print train_feature_matrix[i]-test_feature_matrix[0]
    # should print 3 vectors of length 18

[  0.00000000e+00  -3.87821276e-03  -1.20498190e-02  -1.05552733e-02
   2.08673616e-04  -8.52950206e-03   0.00000000e+00  -5.10236549e-02
   0.00000000e+00  -3.47633726e-03  -5.50336860e-03  -2.48168183e-02
  -1.63756198e-04   0.00000000e+00  -1.70254220e-05   1.29876855e-05
  -5.14364795e-03   6.69281453e-04]
[  0.00000000e+00  -3.87821276e-03  -4.51868214e-03  -2.26610387e-03
   7.19763456e-04   0.00000000e+00   0.00000000e+00  -5.10236549e-02
   0.00000000e+00  -3.47633726e-03   1.30705004e-03  -1.45830788e-02
  -1.91048898e-04   6.65082271e-02   4.23090220e-05   6.16364736e-06
  -2.89330197e-03   1.47606982e-03]
[  0.00000000e+00  -7.75642553e-03  -1.20498190e-02  -1.30002801e-02
   1.60518166e-03  -8.52950206e-03   0.00000000e+00  -5.10236549e-02
   0.00000000e+00  -5.21450589e-03  -8.32384500e-03  -2.48168183e-02
  -3.13866046e-04   0.00000000e+00   4.70885840e-05   1.56292487e-05
   3.72914476e-03   1.64764925e-03]


In [28]:
print train_feature_matrix[0:3] - test_feature_matrix[0]

[[  0.00000000e+00  -3.87821276e-03  -1.20498190e-02  -1.05552733e-02
    2.08673616e-04  -8.52950206e-03   0.00000000e+00  -5.10236549e-02
    0.00000000e+00  -3.47633726e-03  -5.50336860e-03  -2.48168183e-02
   -1.63756198e-04   0.00000000e+00  -1.70254220e-05   1.29876855e-05
   -5.14364795e-03   6.69281453e-04]
 [  0.00000000e+00  -3.87821276e-03  -4.51868214e-03  -2.26610387e-03
    7.19763456e-04   0.00000000e+00   0.00000000e+00  -5.10236549e-02
    0.00000000e+00  -3.47633726e-03   1.30705004e-03  -1.45830788e-02
   -1.91048898e-04   6.65082271e-02   4.23090220e-05   6.16364736e-06
   -2.89330197e-03   1.47606982e-03]
 [  0.00000000e+00  -7.75642553e-03  -1.20498190e-02  -1.30002801e-02
    1.60518166e-03  -8.52950206e-03   0.00000000e+00  -5.10236549e-02
    0.00000000e+00  -5.21450589e-03  -8.32384500e-03  -2.48168183e-02
   -3.13866046e-04   0.00000000e+00   4.70885840e-05   1.56292487e-05
    3.72914476e-03   1.64764925e-03]]


In [29]:
# verify that vectorization works
results = train_feature_matrix[0:3] - test_feature_matrix[0]
print results[0] - (train_feature_matrix[0]-test_feature_matrix[0])
# should print all 0's if results[0] == (features_train[0]-features_test[0])
print results[1] - (train_feature_matrix[1]-test_feature_matrix[0])
# should print all 0's if results[1] == (features_train[1]-features_test[0])
print results[2] - (train_feature_matrix[2]-test_feature_matrix[0])
# should print all 0's if results[2] == (features_train[2]-features_test[0])

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [31]:
diff = train_feature_matrix[:] - test_feature_matrix[0]
diff[-1].sum()

-0.093433998746546426

In [32]:
np.sum(diff**2, axis=1)[15]

0.0033070590284564457

In [33]:
np.sum(diff[15]**2)

0.0033070590284564453

In [41]:
np.sqrt(sum(diff[100]**2))


0.023708232416678195

In [63]:
def compute_distances(feature_matrix, features_query):
    diff = feature_matrix - features_query
    distances = np.sqrt(np.sum(diff**2, axis=1))
    return distances


In [64]:
dist = compute_distances(train_feature_matrix, test_feature_matrix[2])

In [65]:
min(dist)

0.0028604955575117085

In [66]:
np.argmin(dist)


382

In [50]:
train_output_array[382]


249000.0

In [67]:
def k_nearest_neighbors(k, feature_matrix, feat_query):
    distance = compute_distances(feature_matrix, feat_query)
#     print np.sort(distance)[:k]
    return np.argsort(distance)[0:k]

In [68]:
k_nearest_neighbors(4,train_feature_matrix, test_feature_matrix[2])


array([ 382, 1149, 4087, 3142])

In [69]:
train_output_array[[382, 1149, 4087, 3142]]

array([ 249000.,  477000.,  499950.,  430000.])

In [61]:
(249000. + 477000. + 499950. + 430000.) / 4.

413987.5

In [71]:
def predict_output_of_query(k, features_train, output_train, features_query):
    prediction = np.sum(output_train[k_nearest_neighbors(k,features_train, features_query)])/k
    return prediction




In [72]:
for m in range(10):
    print m, predict_output_of_query(10, train_feature_matrix, train_output_array, test_feature_matrix[m])


0 881300.0
1 431860.0
2 460595.0
3 430200.0
4 766750.0
5 667420.0
6 350032.0
7 512800.7
8 484000.0
9 457235.0
