# KC housing price prediction

### Model: Multiple Regression

#### Importing the libraries and loading the data

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Explicit column dtypes handed to pandas.read_csv for both splits,
# grouped here by target type for readability.
dtype_dict = {
    # Read as strings.
    # NOTE(review): 'floors' is loaded as str although the displayed values
    # look numeric -- confirm this is intended before using it as a feature.
    'id': str,
    'date': str,
    'zipcode': str,
    'floors': str,
    # Read as floats.
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'lat': float,
    'long': float,
    # Read as integers.
    'sqft_lot': int,
    'sqft_above': int,
    'sqft_basement': int,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'yr_built': int,
    'yr_renovated': int,
}

# Load the training and test splits with the same dtype mapping.
df_train, df_test = (
    pd.read_csv(csv_name, dtype=dtype_dict)
    for csv_name in ('kc_house_train_data.csv', 'kc_house_test_data.csv')
)

In [2]:
# Preview the first rows of the training frame as a sanity check on the load.
df_train.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


### Preprocessing

In [3]:
def add_engineered_features(frame):
    """Add the four derived regression features to ``frame`` in place.

    Columns written:
      * ``bedrooms_squared`` = bedrooms * bedrooms
      * ``bed_bath_rooms``   = bedrooms * bathrooms
      * ``log_sqft_living``  = log(sqft_living)
      * ``lat_plus_long``    = lat + long

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'bedrooms', 'bathrooms', 'sqft_living',
        'lat' and 'long'. It is modified in place.

    Returns
    -------
    None
        Nothing is returned, so calling this as the last statement of a cell
        does not echo the whole frame.
    """
    frame['bedrooms_squared'] = frame['bedrooms'] ** 2
    frame['bed_bath_rooms'] = frame['bedrooms'] * frame['bathrooms']
    # Natural log; assumes sqft_living is strictly positive -- TODO confirm.
    frame['log_sqft_living'] = np.log(frame['sqft_living'])
    frame['lat_plus_long'] = frame['lat'] + frame['long']


# Apply the same transformation to both splits so that the train and test
# feature definitions cannot drift apart.
add_engineered_features(df_train)
add_engineered_features(df_test)


In [5]:
# Re-display the head of the training frame; the output should now include the
# bedrooms_squared, bed_bath_rooms, log_sqft_living and lat_plus_long columns.
df_train.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_squared,bed_bath_rooms,log_sqft_living,lat_plus_long
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,0,98178,47.5112,-122.257,1340.0,5650.0,9.0,3.0,7.07327,-74.7458
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,1991,98125,47.721,-122.319,1690.0,7639.0,9.0,6.75,7.851661,-74.598
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,0,98028,47.7379,-122.233,2720.0,8062.0,4.0,2.0,6.646391,-74.4951
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,0,98136,47.5208,-122.393,1360.0,5000.0,16.0,12.0,7.5807,-74.8722
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,0,98074,47.6168,-122.045,1800.0,7503.0,9.0,6.0,7.426549,-74.4282


In [9]:
# Mean of each engineered feature on the test split, in the order
# bedrooms_squared, bed_bath_rooms, log_sqft_living, lat_plus_long.
mean1, mean2, mean3, mean4 = (
    df_test[feature].mean()
    for feature in (
        'bedrooms_squared',
        'bed_bath_rooms',
        'log_sqft_living',
        'lat_plus_long',
    )
)
print(mean1, '\n', mean2, '\n', mean3, '\n', mean4)

12.4466777015843 
 7.5039016315913925 
 7.550274679645921 
 -74.65333355403185


### Regression function to create the model

In [10]:
def simple_linear_regression(input_feature, output):
    """Fit a ``LinearRegression`` estimator and return it.

    Parameters
    ----------
    input_feature
        Feature data passed as the first argument to ``LinearRegression.fit``.
    output
        Target values passed as the second argument to ``LinearRegression.fit``.

    Returns
    -------
    LinearRegression
        The estimator after fitting on the supplied data.
    """
    # ``fit`` returns the estimator itself, so the fitted model can be
    # returned directly without an intermediate variable.
    return LinearRegression().fit(input_feature, output)

### Model 1

In [11]:
# Model 1: the five raw features only (living area, room counts, location).
out1 = df_train['price']
inp1 = df_train.loc[:, ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']]

model1 = simple_linear_regression(input_feature=inp1, output=out1)

### Model 2

In [12]:
# Model 2: the Model 1 features plus the bedrooms x bathrooms interaction term.
out2 = df_train['price']
inp2 = df_train.loc[
    :,
    ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms'],
]

model2 = simple_linear_regression(input_feature=inp2, output=out2)

### Model 3

In [13]:
# Model 3: the Model 2 features plus the remaining engineered columns.
# NOTE(review): lat_plus_long is computed as lat + long, so it is linearly
# dependent on two other inputs of this model; the individual coefficients of
# those three columns are therefore not uniquely determined.
out3 = df_train['price']
inp3 = df_train.loc[
    :,
    [
        'sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long',
        'bed_bath_rooms', 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long',
    ],
]

model3 = simple_linear_regression(input_feature=inp3, output=out3)

### Some analysis of the models above

In [14]:
# Show each fitted Model 1 coefficient next to the feature it belongs to.
print([(weight, feature) for weight, feature in zip(model1.coef_, inp1.columns)])

[(312.25864627320277, 'sqft_living'), (-59586.53315361201, 'bedrooms'), (15706.742082734634, 'bathrooms'), (658619.2639305175, 'lat'), (-309374.35126823327, 'long')]


In [15]:
# Show each fitted Model 2 coefficient next to the feature it belongs to.
print([(weight, feature) for weight, feature in zip(model2.coef_, inp2.columns)])

[(306.61005345899537, 'sqft_living'), (-113446.36807020313, 'bedrooms'), (-71461.30829275966, 'bathrooms'), (654844.629503303, 'lat'), (-294298.96913811855, 'long'), (25579.652000752154, 'bed_bath_rooms')]


#### RSS on training data

In [20]:
# Residual sum of squares of each model on the data it was fitted to:
# the sum over rows of (actual price - predicted price) squared.
RSS1, RSS2, RSS3 = (
    np.sum((target - fitted.predict(features)) ** 2)
    for fitted, features, target in (
        (model1, inp1, out1),
        (model2, inp2, out2),
        (model3, inp3, out3),
    )
)

In [21]:
# Training RSS for models 1, 2 and 3, in that order, gathered for display.
l = [RSS1,RSS2,RSS3]
l

[967879963049545.9, 958419635074068.5, 903436455050477.6]

#### RSS on test data

In [22]:
# Test-split feature matrices, using the same column lists each model was
# trained on.
inpt1 = df_test.loc[:, ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']]
inpt2 = df_test.loc[
    :,
    ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms'],
]
inpt3 = df_test.loc[
    :,
    [
        'sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long',
        'bed_bath_rooms', 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long',
    ],
]

# Residual sum of squares of each fitted model against the test prices.
RSSt1, RSSt2, RSSt3 = (
    np.sum((df_test['price'] - fitted.predict(features)) ** 2)
    for fitted, features in (
        (model1, inpt1),
        (model2, inpt2),
        (model3, inpt3),
    )
)

In [23]:
# Test-set RSS for models 1, 2 and 3, in that order.
# NOTE(review): in the recorded outputs model 3 has the lowest training RSS
# but the highest test RSS, which is consistent with overfitting.
RSSt1, RSSt2, RSSt3

(225500469795490.2, 223377462976467.06, 259236319207179.84)