In [1]:
## Housing Price Prediction

In [2]:
# import libraries 
import pandas as pd
import numpy as np
import csv

In [3]:
## Naive data processing: binarizing all fields

In [4]:
# Load 'my_train.csv' with the csv module, keeping every field as a raw string.
# newline='' is what the csv documentation asks for when a file object is handed
# to csv.reader, so that newlines embedded in quoted fields are parsed correctly.
with open('my_train.csv', newline='') as file:

    reader = csv.reader(file, delimiter = ',')

    # the first row holds the column names; consuming it here leaves the
    # reader positioned on the first data row
    headings = next(reader)

    # new_train: one list of string fields per remaining data row
    new_train = list(reader)

In [5]:
## Training data Binarization with numpy - All fields 

In [6]:
# Feature rows: every field of each training row except column 0 (Id)
# and column 80 (the target, SalePrice)
train_data = [
    [value for col, value in enumerate(record) if col != 0 and col != 80]
    for record in new_train
]

# mapping:  (column position, raw string value) -> index of its one-hot column
# new_data: for every training row, the one-hot column indices that are switched on
mapping = {}
new_data = []
for record in train_data:
    # setdefault hands out the next free index the first time a pair is seen
    # and returns the already-assigned index afterwards
    encoded = [mapping.setdefault((col, value), len(mapping))
               for col, value in enumerate(record)]
    new_data.append(encoded)

# one row per training example, one column per distinct (column, value) pair
bin_train = np.zeros((len(new_train), len(mapping)))

# translating to numpy: switch on the listed columns of each row
for row_idx, active in enumerate(new_data):
    for col_idx in active:
        bin_train[row_idx, col_idx] = 1

train_data_bin = bin_train.astype('float')

# total number of binary features
train_data_bin.shape[1]

7227

In [7]:
# Load 'my_dev.csv' with the csv module, keeping every field as a raw string.
# newline='' is what the csv documentation asks for when a file object is handed
# to csv.reader, so that newlines embedded in quoted fields are parsed correctly.
with open('my_dev.csv', newline='') as file:

    reader = csv.reader(file, delimiter = ',')

    # the first row holds the column names; skip past it
    headings = next(reader)

    # new_dev: one list of string fields per remaining data row
    new_dev = list(reader)

In [8]:
## Dev data Binarization with numpy - All fields 

In [9]:
# Feature rows: every field of each dev row except column 0 (Id)
# and column 80 (the target, SalePrice)
dev_data = [
    [value for col, value in enumerate(record) if col != 0 and col != 80]
    for record in new_dev
]

# Look each (column position, value) pair up in the mapping built from the
# training rows; pairs that never occurred there are simply left out
new_data = [
    [mapping[(col, value)] for col, value in enumerate(record) if (col, value) in mapping]
    for record in dev_data
]

# same column layout as the training matrix: one column per mapping entry
bin_dev = np.zeros((len(new_dev), len(mapping)))

# translating to numpy: switch on the listed columns of each row
for row_idx, active in enumerate(new_data):
    for col_idx in active:
        bin_dev[row_idx, col_idx] = 1

dev_data_bin = bin_dev.astype('float')

#dev_data_bin.shape

In [10]:
## Take the logarithm of the target field y (SalePrice)

In [11]:
# Import only the two math functions the notebook calls instead of `import *`,
# which would pull every math name into the notebook namespace.
# log is used for the labels here and in the dev-label cell; exp is used by the
# later cell that evaluates exp(0.15).
from math import exp, log

# SalePrice is column 80 of each raw row; the regression target is its natural log.
# Indexing the column directly means a row that is too short raises an IndexError
# instead of silently yielding no label and shifting every label after it.
# train_labels keeps one single-element list per row; train_label is the flat list.
train_labels = [[log(float(line[80]))] for line in new_train]
train_label = [per_row[0] for per_row in train_labels]

train_target = np.array(train_label)
train_target.shape

(1314,)

In [12]:
# Natural log of SalePrice (column 80) for every dev row; the one-element slice
# yields an empty list for a row that has no column 80
dev_labels = [[log(float(price)) for price in line[80:81]] for line in new_dev]

# flatten the per-row lists into a single list of floats
dev_label = []
for per_row in dev_labels:
    dev_label.extend(per_row)

dev_target = np.array(dev_label)
dev_target.shape

(146,)

In [13]:
## Root Mean Squared Log Error (RMSLE)

In [14]:
from sklearn.linear_model import LinearRegression

# Features: the one-hot training matrix; target: log(SalePrice) per training row
X, y = train_data_bin, train_label

# Fit a linear regression on the log prices, then predict log prices for the
# dev rows. reg_mod is whatever fit() returns and is what later cells predict with.
reg = LinearRegression()
reg_mod = reg.fit(X, y)
y_pred = reg_mod.predict(dev_data_bin)

In [15]:
from sklearn.metrics import mean_squared_error

# Both arrays hold log prices, so the RMSE between them is the RMSLE of the
# price predictions on the dev set.
# The square root is taken explicitly: the `squared=False` shortcut was deprecated
# in scikit-learn 1.4 and removed in 1.6, so it raises a TypeError on current releases.
np.sqrt(mean_squared_error(dev_target, y_pred))

0.15436305811398643

In [16]:
exp(0.15)
# 0.15 is the dev RMSLE from the previous cell, rounded. e^0.15 is about 1.16, so a
# typical prediction is off from the actual price by a factor of about 1.16
# (roughly 16% too high or too low).

1.161834242728283

In [17]:
## Predict on test.csv 

In [18]:
# Load 'test.csv' with the csv module, keeping every field as a raw string.
# newline='' is what the csv documentation asks for when a file object is handed
# to csv.reader, so that newlines embedded in quoted fields are parsed correctly.
with open('test.csv', newline='') as file:

    reader = csv.reader(file, delimiter = ',')

    # the first row holds the column names; skip past it
    headings = next(reader)

    # new_test: one list of string fields per remaining data row
    new_test = list(reader)

In [19]:
# Binarization - test data

# Only column 0 (Id) is stripped here; unlike the train/dev cells, index 80 is
# kept -- presumably because test.csv carries no SalePrice column (TODO confirm)
test_data = [record[1:] for record in new_test]

# feature mapping: translate each (column position, value) pair to its one-hot
# column; pairs missing from the mapping are skipped
new_data = []
for record in test_data:
    active = []
    for position, value in enumerate(record):
        column = mapping.get((position, value))
        if column is not None:
            active.append(column)
    new_data.append(active)

# same column layout as the training matrix: one column per mapping entry
bin_test = np.zeros((len(new_test), len(mapping)))

# switch on the listed columns of each row
for row_idx, active in enumerate(new_data):
    for col_idx in active:
        bin_test[row_idx, col_idx] = 1

test_data_bin = bin_test.astype('float')

In [20]:
# Linear regression model and predictions on the test set

# reg_mod is the regression fitted in an earlier cell on log(SalePrice) targets,
# so these predictions are log prices
y_pred_test = reg_mod.predict(test_data_bin)
# exponentiate the log-scale predictions to get back actual sale prices
final_pred = np.exp(y_pred_test)
final_pred

array([124265.49879929, 142763.3743117 , 181297.83739769, ...,
       150612.49308502, 116702.81848392, 229859.46950773])

In [21]:
# Submission file for Kaggle (naive binarization model)

# Id is the first field of every raw test row; only that field is collected,
# rather than converting the whole table to a 2-D string array first
numbers = np.array([row[0] for row in new_test])

# The frame is built directly from the ids and the predicted prices, so no
# sample-submission file has to be present on disk.
output = pd.DataFrame({'Id': numbers, 'SalePrice': final_pred})
output.to_csv('submission1.csv', index=False)

In [22]:
# My Kaggle score (RMSLE) for this submission is 0.16031.

In [23]:
## Now let's try smarter binarization: only binarizing categorical features

In [24]:
# Load the train, dev and test files again, this time as pandas DataFrames
my_train, my_dev, my_test = (
    pd.read_csv(path) for path in ("my_train.csv", "my_dev.csv", "test.csv")
)

In [25]:
# Train and dev binarization with pandas

# Stack train on top of dev so that get_dummies sees the categories of both
# splits and produces one shared set of columns
concatenate_all = pd.concat((my_train, my_dev))

# Drop the columns that are not features (the target and the Id) and replace
# missing values with 0
data_drop = concatenate_all.drop(["SalePrice", "Id"], axis = 1).fillna(0)
# get_dummies one-hot encodes the non-numeric columns and leaves numeric ones as-is
data_cat_bin = pd.get_dummies(data_drop)

# Split back by position using the actual length of the training frame instead
# of literal row counts, so the cell keeps working if the split sizes change
bin_train = data_cat_bin.iloc[:len(my_train)]
bin_dev = data_cat_bin.iloc[len(my_train):]

In [26]:
# Linear regression on the pandas-encoded features - dev RMSLE

# Features: encoded training rows; target: log(SalePrice) per training row
X = bin_train
y = train_label

reg = LinearRegression()
reg_mod = reg.fit(X,y)
y_pred = reg_mod.predict(bin_dev)

# RMSE between log prices = RMSLE of the price predictions.
# The square root is taken explicitly: the `squared=False` shortcut was deprecated
# in scikit-learn 1.4 and removed in 1.6, so it raises a TypeError on current releases.
np.sqrt(mean_squared_error(dev_target, y_pred))

0.12449790838380112

In [27]:
# Test binarization with pandas - only categorical features

# Stack train on top of test so that get_dummies produces one shared set of
# columns for both
concatenate_test = pd.concat((my_train, my_test))

# Drop the non-feature columns (target and Id) and replace missing values with 0
data_drop_t = concatenate_test.drop(["SalePrice", "Id"], axis = 1).fillna(0)
data_cat_bin = pd.get_dummies(data_drop_t)

# Split back by position using the actual length of the training frame instead
# of literal row counts
bin_train_t = data_cat_bin.iloc[:len(my_train)]

bin_test = data_cat_bin.iloc[len(my_train):]

In [28]:
# Linear regression refit on the training rows that were encoded together with
# the test rows, followed by prediction on the test rows

X, y = bin_train_t, train_label

reg = LinearRegression()
reg_mod = reg.fit(X, y)

# the targets were log prices, so exponentiate the predictions to get sale prices
y_pred = reg_mod.predict(bin_test)
final_pred = np.exp(y_pred)
final_pred

array([121136.7609415 , 167899.24212243, 182402.83579875, ...,
       167361.47596195, 113077.67404363, 216829.92834187])

In [29]:
# Second submission file for Kaggle (smarter binarization model)

# Take the ids from my_test, the same frame whose rows the predictions in
# final_pred were computed from, so ids and predictions line up row for row
numbers = my_test["Id"].to_numpy()

# The frame is built directly from the ids and the predicted prices, so no
# sample-submission file has to be present on disk.
output = pd.DataFrame({'Id': numbers, 'SalePrice': final_pred})
output.to_csv('submission2.csv', index=False)

In [30]:
# My Kaggle score (RMSLE) with the smarter binarization is 0.15304.
# This is better than the naive binarization (0.16031) at predicting housing sale prices.