In [1]:
import pandas as pd
import numpy as np

In [2]:
# Declared dtype for each column of the house-sales CSV files.
# The three `str` entries (zipcode, date, id) are non-numeric labels; every
# other column is declared as int or float.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [3]:
# Load the full data set plus its train / test / validation splits.
# dtype_dict is passed explicitly so every column is parsed with its declared
# type (e.g. 'id' and 'zipcode' stay strings) instead of whatever pandas infers.
sales = pd.read_csv('../kc_house_data_small.csv', dtype=dtype_dict)
train_data = pd.read_csv('../kc_house_data_small_train.csv', dtype=dtype_dict)
test_data = pd.read_csv('../kc_house_data_small_test.csv', dtype=dtype_dict)
validate = pd.read_csv('../kc_house_data_validation.csv', dtype=dtype_dict)

In [4]:
def get_numpy_data(dataframe, features, output):
    """Build a feature matrix (with intercept column) and an output array.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source table. It is NOT modified by this function.
    features : iterable of str
        Names of the columns to use as features.
    output : str
        Name of the column holding the target values.

    Returns
    -------
    (feature_matrix, output_array)
        ``feature_matrix`` has a leading column of ones named 'constant'
        followed by the requested feature columns; ``output_array`` holds the
        values of the ``output`` column.
    """
    columns = ['constant', *features]
    # assign() returns a new frame, so the caller's dataframe does not gain a
    # 'constant' column as a side effect of calling this function.
    feature_matrix = dataframe.assign(constant=1)[columns].values
    output_array = dataframe[output].values
    return feature_matrix, output_array

In [5]:
def normalize_features(features):
    """Scale every column of ``features`` by its Euclidean (L2) norm.

    Returns a ``(normalized, norms)`` pair: the column-wise scaled matrix and
    the per-column norms that were used as divisors.
    """
    column_norms = np.sqrt(np.sum(features ** 2, axis=0))
    return features / column_norms, column_norms

In [19]:
# Candidate feature columns: every dtype_dict entry that is not declared as str.
cols_names = [column for column in dtype_dict if dtype_dict[column] != str]

In [20]:
# Drop the target column and sort the remaining feature names.
# The list is rebuilt instead of mutated with remove()/sort() so the cell can
# be re-run: list.remove raises ValueError once 'price' is already gone.
cols_names = sorted(name for name in cols_names if name != 'price')
len(cols_names)

17

In [21]:
# Convert each split into a (feature matrix, price array) pair, in the order
# train, test, validation.
(
    (features_train, output_train),
    (features_test, output_test),
    (features_valid, output_valid),
) = (
    get_numpy_data(split, cols_names, 'price')
    for split in (train_data, test_data, validate)
)

In [22]:
# Norms are computed from the training features only; the test and validation
# features are divided by those same norms so all three share one scale.
features_train_n, norms = normalize_features(features_train)
features_test_n, features_valid_n = (
    split / norms for split in (features_test, features_valid)
)

In [23]:
# Show the first normalized test row and the tenth normalized training row.
print(features_test_n[0], features_train_n[9], sep='\n')

[ 0.01345102  0.01807473  0.01551285  0.0116321   0.017059    0.01564352
  0.01345387 -0.01346922  0.01362084  0.02481682  0.01759212  0.01375926
  0.00160518  0.0016225   0.05102365  0.          0.01350306  0.        ]
[ 0.01345102  0.00602491  0.01163464  0.01938684  0.01279425  0.01390535
  0.01346821 -0.01346251  0.0096309   0.          0.0083488   0.01195898
  0.00050756  0.00156612  0.          0.          0.01302544  0.        ]


In [24]:
def compute_distance(point1, point2):
    """Return the square root of the squared differences summed along axis 0.

    For two 1-D vectors of the same length this is their Euclidean distance.
    """
    delta = point1 - point2
    squared = delta ** 2
    return np.sqrt(squared.sum(axis=0))

In [27]:
# Distance between the first normalized test row and the tenth normalized training row.
compute_distance(features_test_n[0], features_train_n[9])

0.05972359371398078

In [28]:
# Distances from the first test row to each of the first 10 training rows.
dis_list = [
    compute_distance(features_test_n[0], features_train_n[row])
    for row in range(10)
]

In [29]:
# Display the 10 distances computed above.
dis_list

[0.06027470916295592,
 0.08546881147643746,
 0.06149946435279315,
 0.05340273979294363,
 0.05844484060170442,
 0.059879215098128345,
 0.05463140496775461,
 0.055431083236146074,
 0.052383627840220305,
 0.05972359371398078]

In [30]:
# Position (0-based) of the smallest distance in dis_list.
min(enumerate(dis_list), key=lambda pair: pair[1])[0]

8

In [31]:
# Subtract the first test row from every training row (numpy broadcasts the
# single row across all rows of the training matrix).
diff = np.subtract(features_train_n, features_test_n[0])

In [32]:
# Sum of the feature-wise differences for the last training row.
diff[-1].sum()

-0.09343399874654643

In [33]:
# Squared Euclidean distance between training row 15 and the first test row.
(diff[15] ** 2).sum(axis=0)

0.0033070590284564453

In [34]:
def compute_distance_1(point1, point2):
    """Return the square root of the squared differences summed along axis 1.

    With one 1-D query vector and a 2-D matrix of points (in either argument
    position) the result holds one Euclidean distance per matrix row.
    """
    delta = point1 - point2
    squared = delta ** 2
    return np.sqrt(squared.sum(axis=1))

In [39]:
# Distances from the first test row to every training row.
d = compute_distance_1(features_test_n[0],features_train_n)

In [40]:
# Distance from the first test row to training row 100.
d[100]

0.023708232416678195

In [42]:
# Distances from the third test row (index 2) to every training row.
d2 = compute_distance_1(features_test_n[2],features_train_n)

In [43]:
# Index of the training row closest to the third test row.
d2.argmin(axis=0)

382

In [44]:
# Price of training row 382; the index is hard-coded from the argmin result
# displayed by the previous cell.
output_train[382]

249000

In [45]:
def k_nearest_neighbors(k, feature_train, features_query):
    """Return the indices of the ``k`` training rows closest to the query.

    Rows are ranked by squared Euclidean distance (the square root is skipped
    because it does not change the ordering); the first ``k`` indices of that
    ranking are returned.
    """
    squared_dist = ((feature_train - features_query) ** 2).sum(axis=1)
    return np.argsort(squared_dist)[:k]

In [47]:
# Average price of the 4 nearest training neighbours of the third test row.
# mean() takes its divisor from the neighbours actually returned, so it cannot
# drift out of sync with k the way a separately hard-coded `/4` can.
a = k_nearest_neighbors(4, features_train_n, features_test_n[2])
output_train[a].mean()

413987.5

In [48]:
def predict(k, feature_train, output_train, features_query):
    """Predict the output for one query as the mean over its k nearest neighbours.

    Parameters
    ----------
    k : int
        Number of neighbours to average.
    feature_train : 2-D array
        Training feature matrix, one row per training example.
    output_train : 1-D array
        Training outputs aligned with the rows of ``feature_train``.
    features_query : 1-D array
        Feature vector of the single query point.

    Returns
    -------
    The mean of ``output_train`` over the k nearest training rows.
    """
    # Reuse the shared neighbour search instead of duplicating its body here.
    neighbors = k_nearest_neighbors(k, feature_train, features_query)
    return output_train[neighbors].mean()

In [49]:
# 4-nearest-neighbour prediction for the first test row.
predict(4,features_train_n,output_train,features_test_n[0])

923750.0

In [50]:
def predict_output(k, feature_train, output_train, features_query):
    """Predict outputs for a batch of query points with k-nearest-neighbours.

    Parameters
    ----------
    k : int
        Number of neighbours to average for each query.
    feature_train : 2-D array
        Training feature matrix.
    output_train : 1-D array
        Training outputs aligned with the rows of ``feature_train``.
    features_query : array-like
        One query per row. A single 1-D query vector is also accepted and is
        treated as a batch of one.

    Returns
    -------
    list
        One prediction per query row, in input order.
    """
    # atleast_2d leaves a 2-D batch untouched and promotes a lone 1-D query to
    # a single-row batch, so the row iteration below is always well-defined.
    queries = np.atleast_2d(features_query)
    return [predict(k, feature_train, output_train, query) for query in queries]

In [52]:
# 10-nearest-neighbour predictions for a leading slice of the test set.
# NOTE(review): [0:9] selects 9 rows (indices 0-8); if the first 10 test rows
# were intended, the slice should be [0:10] — confirm.
output_10 = predict_output(10,features_train_n,output_train,features_test_n[0:9])

In [53]:
# Lowest predicted value among the queries in output_10.
min(output_10)

350032.0

In [57]:
# Validation-set residual sum of squares for k = 1..15, kept as (rss, k) pairs.
rsslist = []
for i in range(1, 16):
    prep = predict_output(i, features_train_n, output_train, features_valid_n)
    # predict_output returns a list; convert it before the element-wise subtraction
    rss = np.square(np.asarray(prep) - output_valid).sum()
    rsslist.append((rss, i))
print(rsslist)

[(105453830251561.0, 1), (83445073504025.5, 2), (72692096019202.56, 3), (71946721652091.69, 4), (69846517419718.6, 5), (68899544353180.836, 6), (68341973450051.09, 7), (67361678735491.5, 8), (68372727958976.09, 9), (69335048668556.74, 10), (69523855215598.83, 11), (69049969587246.17, 12), (70011254508263.69, 13), (70908698869034.34, 14), (71106928385945.16, 15)]


In [58]:
# Position of the smallest (rss, k) pair in rsslist. This is a 0-based list
# index, not k itself: k started at 1, so the best k is this value + 1.
rsslist.index(min(rsslist))

7

In [59]:
# Predict the whole test set with k=8. The 8 is hard-coded: it is the k with
# the lowest validation RSS in the list printed above (list index 7).
pre_test = predict_output(8,features_train_n,output_train,features_test_n)

In [60]:
# Residual sum of squares of the k=8 predictions on the test set.
rss_test = np.square(np.asarray(pre_test) - output_test).sum()

In [61]:
# Display the test-set RSS.
rss_test

133118823551516.81