In [20]:
# imports 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Read the house-price table from HousePricesClean.csv.
houses = pd.read_csv('HousePricesClean.csv')

# Baseline: predict price from a hand-picked subset of the columns.
X = houses.loc[:, ['bedrooms', 'bathrooms', 'year_built', 'post_code',
                   'condition', 'lot_area', 'house_area']]
y = houses['price']

# Hold out a test set; the fixed random_state keeps the split the same on every run.
# No test_size is given, so scikit-learn's default proportion applies.
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=0)

In [2]:
# Baseline model: a random forest fitted on the training split.
# random_state is fixed so repeated runs build the same forest.
model = RandomForestRegressor(random_state=1)
model.fit(train_X, train_y)

# Score on the held-out split. scikit-learn's signature is
# mean_absolute_error(y_true, y_pred), so the true prices go first.
predictions = model.predict(test_X)
mae = int(mean_absolute_error(test_y, predictions))  # int() drops the fractional part
print(mae)

106903


In [7]:
# Let's try improving the model by including more features.

# Columns left out of the feature matrix:
#   'year_renovated', 'basement_area' - noted by the author as having missing data
#   'price'                           - the target, assigned to y below
#   'id'                              - presumably just a row identifier; TODO confirm
excluded_features = ['year_renovated', 'basement_area', 'price', 'id']

y = houses['price']
X = houses.drop(excluded_features, axis='columns')
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=0)


In [8]:
# Random forest with the same fixed random_state as the baseline cell,
# fitted on the current train/test split (the wider feature set).
model_more_features = RandomForestRegressor(random_state=1)
model_more_features.fit(train_X, train_y)

# mean_absolute_error expects (y_true, y_pred), so the true prices go first.
predictions = model_more_features.predict(test_X)
mae = int(mean_absolute_error(test_y, predictions))
print(mae)

# Author's note: the model takes a lot longer to fit with the extra features.
# Judge the improvement from the printed MAE versus the baseline cell's output;
# a figure hard-coded in a comment goes stale when the cell is re-run.

68030


In [22]:
def is_renovated(year):
    """Collapse a numeric value into a 0/1 flag.

    Returns 0 when ``year`` equals 0 and 1 for every other value.
    This notebook applies it to ``year_renovated`` and also to
    ``basement_area``.

    NOTE(review): a NaN input compares unequal to 0 and therefore yields 1 -
    confirm that is the intended handling of missing values.
    """
    return int(year != 0)

# Turn each raw column into a 0/1 indicator (0 where the raw value is 0, else 1)
# and drop the raw column afterwards.
# This cell rebinds `houses`, so on a re-run the raw columns are already gone.
# Converting only the raw columns that are still present makes a re-run a no-op
# instead of an AttributeError.
for flag, raw in (('renovated', 'year_renovated'), ('basement', 'basement_area')):
    if raw in houses.columns:
        houses = houses.assign(**{flag: houses[raw].apply(is_renovated)}).drop(columns=[raw])
    elif flag not in houses.columns:
        # Neither the raw column nor its indicator exists: the data is not what
        # this cell expects, so fail loudly rather than skip silently.
        raise KeyError(f"neither {raw!r} nor {flag!r} is present in houses")

In [24]:
# Features: every column of `houses` except the target ('price') and 'id'.
# Once the preceding cell has run, that includes the 0/1 'renovated' and
# 'basement' flags in place of the raw columns they were derived from.
X = houses.drop(columns=['price', 'id'])
y = houses.price

# Same fixed seeds as the earlier cells for the split and the forest.
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=0)
model = RandomForestRegressor(random_state=1)
model.fit(train_X, train_y)

# mean_absolute_error expects (y_true, y_pred), so the true prices go first.
predictions = model.predict(test_X)
mae = int(mean_absolute_error(test_y, predictions))
print(mae)

67876
