# Project 6: KNN Algorithm

* Find the k-nearest neighbors of a given query input
* Predict the output for the query input using the k-nearest neighbors
* Choose the best value of k using a validation set

In [1]:
import numpy as np

In [2]:
import pandas as pd

# Load in house sales data

In [3]:
df = pd.read_csv('merged.csv')

In [4]:
# Transform the numerical type to categorical type for some columns

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
labelencoder = LabelEncoder()

In [7]:
df['CITY'] = df.CITY.str.upper()

In [8]:
df['CITY_cat'] = labelencoder.fit_transform(df['CITY'].astype(str))

In [9]:
df['PROPERTY TYPE_cat'] = labelencoder.fit_transform(df['PROPERTY TYPE'].astype(str))

In [10]:
df['ZIP OR POSTAL CODE'] = labelencoder.fit_transform(df['ZIP OR POSTAL CODE'])

In [11]:
# for feature we have as follows:

In [12]:
all_features = ['SQUARE FEET', 'LOT SIZE', 
                'BEDS', 'BATHS', 'CITY_cat', 
                'PROPERTY TYPE_cat', 'ZIP OR POSTAL CODE', 
                'YEAR BUILT']

In [13]:
my_output = 'PRICE'

In [14]:
df2 = df[all_features+['PRICE']].dropna()

# Normalize the feature_matrix

In [15]:
# Copy normalizae function from project 5

In [16]:
def get_numpy_data(data, features, output):
    data['Constant'] = 1
    features = ['Constant'] + features
    feature_matrix = data[features].values
    output_array = data[output].values
    return(feature_matrix, output_array)

In [17]:
def normalize_features(feature_matrix):
    norms = np.linalg.norm(feature_matrix, axis=0)
    features = feature_matrix / norms
    
    return(features,norms)

# Split the data into training, validation and test set

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
training_validation,testing = train_test_split(df2,test_size = 0.1)

In [20]:
training,validation = train_test_split(training_validation,test_size = 0.5)

# Transform numpy data to each set

In [21]:
feature_matrix_train, output_array_train = get_numpy_data(training, all_features, my_output)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [22]:
feature_matrix_vali, output_array_vali = get_numpy_data(validation, all_features, my_output)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [23]:
feature_matrix_test, output_array_test = get_numpy_data(testing, all_features, my_output)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# Normalize each set

In [24]:
feature_train,norm_train = normalize_features(feature_matrix_train)

In [25]:
feature_vali,norm_vali = normalize_features(feature_matrix_vali)

In [26]:
feature_test,norm_test = normalize_features(feature_matrix_test)

In [27]:
feature_test[0]

array([0.07832604, 0.06182494, 0.09006516, 0.04708816, 0.03072925,
       0.10116032, 0.08155185, 0.08436529, 0.07755679])

# How KNN Algorithm works
Calculate the Euclidean distance between the new point and the existing points

Formula of Euclidean distance:

Euclidean distance = sqrt [(x1-x2)^2+(y1-y2)^2+(z1-z2)^2+.....+(n1-n2)^2)]

After getting the distance, we need to find the number of smallest distance.

K equals to how many smallest distance you'd like to choose.

And then compare these numbers and make predictions.

#
First I would like to try to find two observation's distance in training set.

I randomly select two numbers from training set and calculate the distance.

In [28]:
a = np.random.randint(0,100,1)
b = np.random.randint(0,100,1)

In [29]:
a_1 = feature_train[a]
b_1 = feature_train[b]

In [32]:
from math import sqrt

In [33]:
diff_square = (a_1 - b_1) * (a_1 - b_1)
sum_diff_square = diff_square.sum()
distance = sqrt(sum_diff_square)
print(distance)

0.0467216450029469


In [31]:
# Then this two points difference is 0.0467216450029469

In [34]:
# After calculating two points distance, try to calculating one point to multiple points.

In [None]:
# Set a random point as c, and multiple points as d.

In [36]:
c = np.random.randint(0,100,1)
c_1 = feature_train[c]

In [39]:
d = feature_train[0:15]

In [81]:
d_v = output_array_train[0:15]
d_v

array([ 720000.,  878888.,  865000.,  240000.,  267500.,  354540.,
        342000.,  505000., 1025000., 2475000., 1410000.,  304545.,
        475000., 1545000.,  849000.])

In [40]:
multi_diff_square = (c_1-d) * (c_1-d)
multi_sum_diff_square = multi_diff_square.sum()
multi_distance = sqrt(multi_sum_diff_square)
print(multi_distance)

0.1585340043558751


In [82]:
ind = []
dist = []
value = []
dataf = pd.DataFrame()
for i in range(len(d)):
    multi_sum_diff_square = multi_diff_square[i].sum()
    multi_distance = sqrt(multi_sum_diff_square)
    value.append(output_array_train[i])
    dist.append(multi_distance)
    ind.append(i)
dataf['index'] = ind
dataf['distance'] = dist
dataf['value'] = value
dataf

Unnamed: 0,index,distance,value
0,0,0.061107,720000.0
1,1,0.054506,878888.0
2,2,0.047017,865000.0
3,3,0.037986,240000.0
4,4,0.02308,267500.0
5,5,0.037532,354540.0
6,6,0.019364,342000.0
7,7,0.052047,505000.0
8,8,0.051272,1025000.0
9,9,0.023355,2475000.0


In [61]:
# After getting these results, we can creat two lists and make them to a data frame.
# It will be convenient to see the smallest distance.

In [62]:
dataf['distance'].min()

0.016314446350458343

# Create a function to calculate the distance.

In [83]:
def distance(point,matrix_feature,value_matrix):
    ind = []
    dist = []
    value = []
    dataf_1 = pd.DataFrame()
    diff_square_1 = (point-matrix_feature) * (point-matrix_feature)
    for i in range(len(matrix_feature)):
        sum_diff_square_1 = diff_square_1[i].sum()
        distance_1 = sqrt(sum_diff_square_1)
        value.append(value_matrix[i])
        dist.append(distance_1)
        ind.append(i)
    dataf_1['index'] = ind
    dataf_1['distance'] = dist
    dataf_1['value'] = value
    return dataf_1

In [65]:
# Run a test on previous one data points and multiple data points

In [84]:
print(distance(c_1,d,output_array_train[0:15]))

    index  distance      value
0       0  0.061107   720000.0
1       1  0.054506   878888.0
2       2  0.047017   865000.0
3       3  0.037986   240000.0
4       4  0.023080   267500.0
5       5  0.037532   354540.0
6       6  0.019364   342000.0
7       7  0.052047   505000.0
8       8  0.051272  1025000.0
9       9  0.023355  2475000.0
10     10  0.032648  1410000.0
11     11  0.016314   304545.0
12     12  0.025607   475000.0
13     13  0.042176  1545000.0
14     14  0.052996   849000.0


In [86]:
# Then, grab a data from test set and calculate distance from data from train set.
# Using K=1 to predict the price.

In [89]:
f = np.random.randint(0,100,1)
f_1 = feature_test[f]

In [116]:
f_2 = output_array_test[f]

In [91]:
x = distance(f_1,feature_train,output_array_train)

In [95]:
x.iloc[x['distance'].idxmin()]

index       9.300000e+01
distance    1.240239e-01
value       1.950000e+06
Name: 93, dtype: float64

In [96]:
# Then, we may see that this point has the smallest distance, and its price might be
# 1.950000e+06.

# Choose the K value

In my opinion, k-value means how many smallest value that may be helpful to see their
features and make a better decision on which 'Label' this number might be.

Before doing that, we need to sort the dataframe I have created in the previous function based on the "Distance" column.

And also, with the help of dataframe, we can see the first k values using dataframe.

In [108]:
def distance_sort(point,matrix_feature,value_matrix,k):
    ind = []
    dist = []
    value = []
    dataf_1 = pd.DataFrame()
    diff_square_1 = (point-matrix_feature) * (point-matrix_feature)
    for i in range(len(matrix_feature)):
        sum_diff_square_1 = diff_square_1[i].sum()
        distance_1 = sqrt(sum_diff_square_1)
        value.append(value_matrix[i])
        dist.append(distance_1)
        ind.append(i)
    dataf_1['index'] = ind
    dataf_1['distance'] = dist
    dataf_1['value'] = value
    dataf_2 = dataf_1.sort_values(by='distance',ascending=True)
    pre = dataf_2.head(k)
    return pre

In [110]:
y = distance_sort(f_1,feature_train,output_array_train,k = 6)
y

Unnamed: 0,index,distance,value
93,93,0.124024,1950000.0
453,453,0.126418,2000000.0
348,348,0.129389,480000.0
728,728,0.131364,2150000.0
339,339,0.131479,1017500.0
589,589,0.133527,1590000.0


In [113]:
distance_sort(f_1,feature_train,output_array_train,k = 6)['value'].min()

480000.0

To choose the best value K, we need to calculate the RSS for each value k.

To make the test faster, make a range of K.

Set k is from 1 to 10.

Besides, I think after getting the first k results, it is better to use the mean values than use the smallest value. In this way, the price will be more balanced and no to absolute.

Make the test_point's index is a random number. Then, we can get its feature and actual value.

In [127]:
e = np.random.randint(1,100,1)
e_point = feature_test[e]
e_actual = output_array_test[e]

In [119]:
ss = []
ints = []
dataf4 = pd.DataFrame()
for k in range(1,6):
    prediction = distance_sort(f_1,feature_train,output_array_train,k)['value'].mean()
    square_error = (prediction - f_2)**2
    ss.append(square_error)
    ints.append(k)
dataf4['SS'] = ss
dataf4['K'] = ints
print(dataf4)

                    SS  K
0      [40000000000.0]  1
1      [50625000000.0]  2
2  [74711111111.11107]  3
3      [11025000000.0]  4
4      [53130250000.0]  5


In [136]:
def ss_print(point,matrix_feature,value_matrix,k,actual_value):
    ss = []
    ints = []
    dataf5 = pd.DataFrame()
    for i in range(1,k+1):
        prediction = distance_sort(point,matrix_feature,value_matrix,i)['value'].mean()
        square_error = (prediction - actual_value)**2
        ss.append(square_error)
        ints.append(i)
    dataf5['SS'] = ss
    dataf5['K'] = ints
    return dataf5

In [130]:
# Using the points and value to calculate the rss.

In [143]:
ew = ss_print(feature_test[50],feature_train,output_array_train,10,output_array_test[50])
ew

Unnamed: 0,SS,K
0,211600000000.0,1
1,50625000000.0,2
2,56011110000.0,3
3,33306250000.0,4
4,27225000000.0,5
5,26950690000.0,6
6,15625000000.0,7
7,15190560000.0,8
8,40938780000.0,9
9,38848410000.0,10


In [144]:
ew.iloc[ew['SS'].idxmin()]

SS    1.519056e+10
K     8.000000e+00
Name: 7, dtype: float64

In [145]:
# We can see, if K = 8, the SS is the lowest.