# House Prices: Advanced Regression Techniques

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Data Exploration

### Train data.

In [7]:
# Read the labelled training split from disk.
train_csv_path = './dataset/train.csv'
train = pd.read_csv(train_csv_path)

# Bare last expression: the notebook renders a preview of the frame.
train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


### Test data.

In [8]:
# Read the test split from disk.
test_csv_path = './dataset/test.csv'
test = pd.read_csv(test_csv_path)

# Bare last expression: the notebook renders a preview of the frame.
test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


## Data Wrangling

In [9]:
from sklearn.preprocessing import LabelEncoder

def fillna(data):
    """Return a copy of ``data`` with its missing values imputed.

    Numeric columns are filled with their column mean. Whatever is still
    missing afterwards (i.e. the non-numeric columns) is filled with the
    most frequent value of its column. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that may contain missing values.

    Returns
    -------
    pandas.DataFrame
        A new frame with the imputed values.
    """
    # Fill missing values of numerical fields with the column mean.
    # numeric_only=True is required: on pandas >= 2.0 a plain mean() raises
    # TypeError as soon as the frame contains a string column.
    data = data.fillna(data.mean(numeric_only=True))

    # Fill missing values of the remaining fields with the most frequent
    # value. mode() has no rows for an empty frame, so guard the row lookup.
    modes = data.mode()
    if not modes.empty:
        data = data.fillna(modes.iloc[0])

    return data

# Impute missing values in both splits.
train, test = fillna(train), fillna(test)

# Label-encode every text column. Each encoder is fitted on the values of
# both splits together so that a category present in only one of them still
# receives a code.
categorical_cols = test.select_dtypes('object').columns
for col in categorical_cols:
    encoder = LabelEncoder()
    combined_values = np.concatenate((np.array(train[col]), np.array(test[col])))
    encoder.fit(combined_values.tolist())

    train[col] = encoder.transform(train[col])
    test[col] = encoder.transform(test[col])

# Features / target for training; 'Id' is dropped from the features.
train_x = train.drop(columns=['Id', 'SalePrice'])
train_y = train['SalePrice']

# Test-set features use the same layout (the test frame has no 'SalePrice').
test_x = test.drop(columns=['Id'])

## Model Evaluation

1. Linear Regression

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error

# Hold out part of the training data for validation. NOTE: no random_state is
# passed, so the split (and every score computed from it) differs between
# runs.
x1, x2, y1, y2 = train_test_split(train_x, train_y)

# Baseline: ordinary least squares fitted on the training part of the split.
lr = LinearRegression()
lr.fit(x1, y1)

# scikit-learn metrics take (y_true, y_pred), in that order.
mean_squared_log_error(y2, lr.predict(x2))

0.02558832113780661

2. Random Forest

In [11]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters, fitted on the training part of
# the existing x1/y1 - x2/y2 split.
rfr = RandomForestRegressor()
rfr.fit(x1, y1)

# scikit-learn metrics take (y_true, y_pred), in that order.
mean_squared_log_error(y2, rfr.predict(x2))

0.021790794692242276

3. Gradient Tree Boosting

In [12]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees with default hyperparameters, fitted on the training
# part of the existing x1/y1 - x2/y2 split.
gbr = GradientBoostingRegressor()
gbr.fit(x1, y1)

# scikit-learn metrics take (y_true, y_pred), in that order.
mean_squared_log_error(y2, gbr.predict(x2))

0.01935084521997323

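## Final Model: Gradient Tree Boosting

Gradient tree boosting had the lowest mean squared log error of the three models in the runs recorded above, so it is refit here on the full training set and used to predict the test set.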

In [13]:
# Refit on the complete training set (no hold-out); fit() returns the
# estimator itself, so construction and fitting can be chained.
gbr = GradientBoostingRegressor().fit(train_x, train_y)

# Pair every test Id with its predicted sale price.
predicted_prices = gbr.predict(test_x)
prediction = pd.DataFrame({'Id': test['Id'], 'SalePrice': predicted_prices})

prediction

Unnamed: 0,Id,SalePrice
0,1461,120393.640368
1,1462,164004.416289
2,1463,176928.920795
3,1464,181931.467083
4,1465,201782.103750
...,...,...
1454,2915,82219.919489
1455,2916,83947.207156
1456,2917,162217.719779
1457,2918,118679.584692


## Save Result

In [14]:
# Write the Id / SalePrice table to disk; index=False keeps the DataFrame
# index out of the file.
prediction.to_csv(path_or_buf='./dataset/submission.csv', index=False)