In [1]:
import pandas as pd

# Load the pre-processed housing data set and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="data/HousingData_processed.csv")
data.head(n=5)

Unnamed: 0,OverallQual,YearBuilt,TotalBsmtSF,GrLivArea,GarageCars,FullBath,TotalArea,AreaPerRoom,SalePrice
0,7,2003,856,1710,2,2,2566,213.75,208500
1,6,1976,1262,1262,2,2,2524,210.333333,181500
2,7,2001,920,1786,2,2,2706,297.666667,223500
3,7,1915,756,1717,3,1,2473,245.285714,140000
4,8,2000,1145,2198,3,2,3343,244.222222,250000


In [2]:
# Hold out a single house as the test case.
# Indexing with a list of labels keeps the result a one-row DataFrame.
test_index = 321
test = data.loc[[test_index]]
test

Unnamed: 0,OverallQual,YearBuilt,TotalBsmtSF,GrLivArea,GarageCars,FullBath,TotalArea,AreaPerRoom,SalePrice
321,8,2004,1136,2468,3,2,3604,246.8,354000


In [3]:
# Take the held-out test row out of the main data frame, so that the
# remaining rows are the houses we search for neighbors.
data = data.drop(index=[test_index])
# Note: data.loc[test_index, :] would now raise a KeyError, because the row
# with that label has been removed.

In [4]:
# Look at the house with label 0 as a one-row DataFrame.
data.loc[[0]]

Unnamed: 0,OverallQual,YearBuilt,TotalBsmtSF,GrLivArea,GarageCars,FullBath,TotalArea,AreaPerRoom,SalePrice
0,7,2003,856,1710,2,2,2566,213.75,208500


In [5]:
# We will predict the price of the test case with a method called
# "k-nearest-neighbor": find the houses most similar to it and use their prices.
# To find similar houses, we first need a similarity measure.
# As a second example, look at the house with label 1.
data.loc[[1]]

Unnamed: 0,OverallQual,YearBuilt,TotalBsmtSF,GrLivArea,GarageCars,FullBath,TotalArea,AreaPerRoom,SalePrice
1,6,1976,1262,1262,2,2,2524,210.333333,181500


In [6]:
import numpy as np
# The similarity measure should combine the differences from all features
def similarity(case1, case2, weights,
               features=('OverallQual', 'YearBuilt', 'TotalBsmtSF', 'GrLivArea',
                         'GarageCars', 'FullBath', 'TotalArea', 'AreaPerRoom')):
    """
    Measure the difference between two cases from the housing data.

    Despite the function's name, the result behaves like a distance: it is the
    weighted sum of the absolute per-feature differences, so 0 means the two
    cases agree on every feature and larger values mean they are less alike.

    Input:
    case1, case2: two rows from the housing data frame. Anything that can be
        indexed by feature name works (e.g. a pandas Series or a dict).
    weights: one weight per feature, in the same order as `features`.
    features: names of the features to compare. Defaults to the 8 features
        of the housing data, so existing three-argument calls are unchanged.

    Output: the difference measure

    Raises:
    ValueError: if the number of weights differs from the number of features.
    """
    if len(weights) != len(features):
        raise ValueError(
            f"expected {len(features)} weights (one per feature), "
            f"got {len(weights)}"
        )

    # np.abs() calculates the absolute value.
    # The terms are added in feature order, starting from 0, which gives the
    # same floating-point result as writing the eight products out by hand.
    return sum(weight * np.abs(case1[name] - case2[name])
               for weight, name in zip(weights, features))

In [7]:
# Use the reciprocal of each feature's standard deviation as its weight, so
# that every feature's difference is measured relative to that feature's spread.
weights = [
    1 / np.std(data[column])
    for column in ('OverallQual', 'YearBuilt', 'TotalBsmtSF', 'GrLivArea',
                   'GarageCars', 'FullBath', 'TotalArea', 'AreaPerRoom')
]
print("Weights:\n", weights)

# Try the measure on the test case and the house with label 1.
case1 = test.loc[test_index, :]
case2 = data.loc[1, :]
similarity(case1, case2, weights)

Weights:
 [0.7270285292925576, 0.033132847895239285, 0.00240977469137096, 0.001971382707351958, 1.339756662571655, 1.818935172697677, 0.0012891461270902026, 0.02270961745899479]


8.62307448566477

In [8]:
# Calculate the difference between the test house and each house in the data.

# Attempt 1: Use a loop over the row labels (written as a list comprehension).
case1 = test.loc[test_index, :]
similarity_column = [
    similarity(case1, data.loc[ind, :], weights)
    for ind in data.index.values
]
# Store the differences as a new column of the data frame.
data['Similarity'] = similarity_column
data.head()

Unnamed: 0,OverallQual,YearBuilt,TotalBsmtSF,GrLivArea,GarageCars,FullBath,TotalArea,AreaPerRoom,SalePrice,Similarity
0,7,2003,856,1710,2,2,2566,213.75,208500,6.35765
1,6,1976,1262,1262,2,2,2524,210.333333,181500,8.623074
2,7,2001,920,1786,2,2,2706,297.666667,223500,6.343994
3,7,1915,756,1717,3,1,2473,245.285714,140000,9.383423
4,8,2000,1145,2198,3,2,3343,244.222222,250000,1.0815


In [9]:
# Attempt 2: Let pandas do the looping with the apply mechanism.
# With axis=1 each row is handed to `similarity` as `case1`; the test case and
# the weights are forwarded to it as the keyword arguments `case2` and `weights`.
data['Similarity'] = data.apply(
    similarity,
    axis=1,
    case2=test.loc[test_index, :],
    weights=weights,
)
data.head()

Unnamed: 0,OverallQual,YearBuilt,TotalBsmtSF,GrLivArea,GarageCars,FullBath,TotalArea,AreaPerRoom,SalePrice,Similarity
0,7,2003,856,1710,2,2,2566,213.75,208500,6.35765
1,6,1976,1262,1262,2,2,2524,210.333333,181500,8.623074
2,7,2001,920,1786,2,2,2706,297.666667,223500,6.343994
3,7,1915,756,1717,3,1,2473,245.285714,140000,9.383423
4,8,2000,1145,2198,3,2,3343,244.222222,250000,1.0815


In [10]:
# Sort by the difference measure (smallest first) and keep the 5 closest houses.
closest_neighbors = data.sort_values(by="Similarity", ascending=True).head(5)

In [11]:
# Display the held-out test case again.
test

Unnamed: 0,OverallQual,YearBuilt,TotalBsmtSF,GrLivArea,GarageCars,FullBath,TotalArea,AreaPerRoom,SalePrice
321,8,2004,1136,2468,3,2,3604,246.8,354000


In [12]:
# The prediction is the mean sale price of the 5 closest houses.
prediction = closest_neighbors.loc[:, 'SalePrice'].mean()
print("Prediction:", prediction)

Prediction: 307026.4


In [13]:
# What is the actual sale price for the test case?
# test['SalePrice'] is a one-row Series, so printing it also shows the index
# label, the name and the dtype. Select the single value with .loc instead.
print("Actual price:", test.loc[test_index, 'SalePrice'])

Actual price: 321    354000
Name: SalePrice, dtype: int64
