## Load datasets

SUMMARY OF CHANGES FROM BASELINE
- Data Cleaning and Prep (Outliers Removed etc.)
- Categorical Data Used as Predictors Using Dummy Matrices
- Log Price Transformation Used to Improve Predictive Accuracy
- Rows with Lots of Missing Data Removed

In [1]:
import math
import os
import pickle

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy as patsy
import plotly as pl
import seaborn as sns
import sklearn as sk
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import functions as f ## imports all the functions that are in a separate file to keep this file cleaner

## Directory that holds train.csv and test.csv.
## Defaults to the original author's local checkout so existing runs are unchanged;
## set the HOUSE_PRICING_DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = os.environ.get(
    "HOUSE_PRICING_DATA_DIR",
    "/Users/andrewtobin/College/final_year/FYP/House_pricing_dataset-master/dataset_csv",
)

## Both CSV files are indexed by their 'ad_id' column.
dftrain = pd.read_csv(os.path.join(DATA_DIR, "train.csv"), index_col='ad_id')
dftest = pd.read_csv(os.path.join(DATA_DIR, "test.csv"), index_col='ad_id')

## Data Cleaning and Prep

In [2]:
## Prepare the training frame: drops any unnecessary columns and cleans any messy data
dftrain = dftrain.pipe(f.prep_data)

In [3]:
## Strip outlier rows from the training frame before fitting
dftrain = dftrain.pipe(f.remove_outliers)

In [4]:
## Encode the categorical columns as dummy matrices and merge them into the dataset
dftrain = dftrain.pipe(f.get_dummies)

In [5]:
## Split the prepared frame into the feature matrix and the regression target.
## The model is fitted on log_price, so neither price column may remain a feature.
X_train = dftrain.drop(columns=['price', 'log_price'])

## Re-run safety: the prediction cell further down appears to add a
## 'predicted_price' column to dftrain in place (its return value is discarded and
## the metrics cell reads that column back). If this cell is executed again after
## that, the predictions must not leak into the features. errors='ignore' leaves
## the first, fresh-kernel run unchanged because the column does not exist yet.
X_train = X_train.drop(columns=['predicted_price'], errors='ignore')

Y_train = dftrain['log_price']

In [6]:
## Peek at the first five rows of the feature matrix
X_train.head(n=5)

Unnamed: 0_level_0,area_Adamstown,area_Ard Na Greine,area_Artane,area_Ashtown,area_Baldoyle,area_Balgriffin,area_Ballinteer,area_Ballsbridge,area_Ballybough,area_Ballyboughal,...,beds,latitude,longitude,surface,_Alarm,_Gas Fired Central Heating,_Oil Fired Central Heating,_Parking,_Wheelchair Access,_Wired for Cable Television
ad_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11161103,0,0,0,0,0,0,0,0,0,0,...,1.0,53.353395,-6.458716,319.038224,0,0,0,1,0,0
11185670,0,0,0,0,0,0,0,0,0,0,...,5.0,53.375467,-6.062246,202.0,0,1,0,1,0,1
11421934,0,0,0,0,0,0,0,0,0,0,...,3.0,53.246732,-6.174917,150.0,0,0,0,0,0,0
11487466,0,0,0,0,0,0,0,0,0,0,...,3.0,53.403799,-6.294608,319.038224,0,0,0,0,0,0
11489764,0,0,0,0,0,0,0,0,0,0,...,2.0,53.402663,-6.294619,319.038224,0,0,0,0,0,0


In [7]:
## Peek at the first five values of the log-price target
Y_train.head(n=5)

ad_id
11161103    13.304685
11185670    13.795308
11421934    13.652992
11487466    12.345835
11489764    12.154516
Name: log_price, dtype: float64

## Train Model

In [8]:
## Ordinary least-squares fit of log_price on the prepared feature matrix
reg = LinearRegression()
reg.fit(X_train, Y_train);  ## trailing ';' keeps the cell output-free

## Test Model

In [9]:
# Predictions for the training data, using the fitted model `reg` on X_train.
# NOTE(review): the return value is discarded and the next cell reads
# dftrain['predicted_price'], so this helper presumably writes that column into
# dftrain in place — confirm against functions.make_predictions.
f.make_predictions(dftrain, X_train, reg)

### Compute MdAPE and MAPE: median and mean absolute percentage error (MdAPE is less sensitive to outliers than MAPE)

In [10]:
## Report both error metrics on the training set, comparing the actual price
## with the model's predicted price.
actual_prices = dftrain['price']
predicted_prices = dftrain['predicted_price']
for label, metric in (('Training MdAPE: ', f.MdAPE), ('Training MAPE: ', f.MAPE)):
    print(label, metric(actual_prices, predicted_prices), '%')

Training MdAPE:  9.62 %
Training MAPE:  13.04 %


In [11]:
#importance = reg.coef_
# summarize feature importance
#for i,v in enumerate(importance):
	#print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
#plt.bar([x for x in range(len(importance))], importance)
#plt.show()
#importance = -np.sort(-importance)
#top_50 = importance[0:50]
#for i, v in enumerate(top_50):
#	print('Feature: %0d, Score: %.5f' % (i,v))
#plt.bar([x for x in range(len(top_50))], top_50)
#plt.show()
