# House Prices: Advanced Regression Techniques

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm

## Data Explore

There are `1460` instances in training set.

In [2]:
train_raw = pd.read_csv('./dataset/train.csv')

train_raw

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


And there are `1450` instances in test set.

In [3]:
test_raw = pd.read_csv('./dataset/test.csv')

test_raw

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


## Data Imputation

First, I use KNN to imputate missing values.

In [4]:
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

def imputate(dataset, n_neighbors = 10):
    
    # Exclude feature `Id` and `SalePrice`.
    df = dataset.copy().loc[:, (dataset.columns != 'Id') & (dataset.columns != 'SalePrice')]
    
    numerical_features = [col for col in df.columns if pd.api.types.is_numeric_dtype(df.dtypes[col])]
    categorical_features = [col for col in df.columns if not pd.api.types.is_numeric_dtype(df.dtypes[col])]
    
    for col in tqdm(df.columns):
        
        if df[col].isna().any():
            
            samples = pd.DataFrame()

            for feature in numerical_features:
                if feature != col:
                    samples[feature] = df[feature].fillna(df[feature].mean())

            for feature in categorical_features:
                if feature != col:
                    samples[feature],_ = pd.factorize(df[feature])
                    
            samples_x = samples.loc[df[col].notna(), :]
            samples_y = df.loc[df[col].notna(), col]

            predict_x = samples.loc[df[col].isna(), :]
            
            model = None
            
            if col in numerical_features:
                model = KNeighborsRegressor(n_neighbors = min(n_neighbors, samples_x.shape[0]))
            else:
                model = KNeighborsClassifier(n_neighbors = min(n_neighbors, samples_x.shape[0]))
            
            model.fit(samples_x, samples_y)
            
            predict_y = model.predict(predict_x)
            
            df.loc[df[col].isna(), col] = predict_y
    
    df['Id'] = dataset['Id']
    
    for col in categorical_features:
        
        df[col],_ = pd.factorize(df[col])
    
    if 'SalePrice' in dataset.columns:
        df['SalePrice'] = dataset['SalePrice']
        
    return df

In [5]:
train_raw = imputate(train_raw)

train_raw

100%|██████████| 79/79 [00:01<00:00, 62.52it/s]


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Id,SalePrice
0,60,0,65.0,8450,0,0,0,0,0,0,...,0,0,0,0,2,2008,0,0,1,208500
1,20,0,80.0,9600,0,0,0,0,0,1,...,0,0,0,0,5,2007,0,0,2,181500
2,60,0,68.0,11250,0,0,1,0,0,0,...,0,1,0,0,9,2008,0,0,3,223500
3,70,0,60.0,9550,0,0,1,0,0,2,...,0,0,0,0,2,2006,0,1,4,140000
4,60,0,84.0,14260,0,0,1,0,0,1,...,0,2,0,0,12,2008,0,0,5,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,0,62.0,7917,0,0,0,0,0,0,...,0,0,0,0,8,2007,0,0,1456,175000
1456,20,0,85.0,13175,0,0,0,0,0,0,...,0,0,0,0,2,2010,0,0,1457,210000
1457,70,0,66.0,9042,0,0,0,0,0,0,...,0,2,0,2500,5,2010,0,0,1458,266500
1458,20,0,68.0,9717,0,0,0,0,0,0,...,0,0,0,0,4,2010,0,0,1459,142125


In [6]:
test_raw = imputate(test_raw)

test_raw

100%|██████████| 79/79 [00:01<00:00, 39.54it/s]


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Id
0,20,0,80.0,11622,0,0,0,0,0,0,...,0,0,0,0,0,6,2010,0,0,1461
1,20,1,81.0,14267,0,0,1,0,0,1,...,0,0,0,1,12500,6,2010,0,0,1462
2,60,1,74.0,13830,0,0,1,0,0,0,...,0,0,0,0,0,3,2010,0,0,1463
3,60,1,78.0,9978,0,0,1,0,0,0,...,0,0,0,0,0,6,2010,0,0,1464
4,120,1,43.0,5005,0,0,1,1,0,0,...,0,0,0,0,0,1,2010,0,0,1465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,2,21.0,1936,0,1,0,0,0,0,...,0,0,1,0,0,6,2006,0,0,2915
1455,160,2,21.0,1894,0,1,0,0,0,0,...,0,0,1,0,0,4,2006,0,2,2916
1456,20,1,160.0,20000,0,0,0,0,0,0,...,0,0,2,0,0,9,2006,0,2,2917
1457,85,1,62.0,10441,0,0,0,0,0,0,...,0,0,0,0,700,7,2006,0,0,2918


## Model Selection

I use `80%` of training instances for training and other `20%` for validation.

In [7]:
from sklearn.model_selection import train_test_split

train_features, test_features, train_target, test_target = train_test_split(train_raw.loc[:, (train_raw.columns != 'Id') & (train_raw.columns != 'SalePrice')], train_raw['SalePrice'])

### Random Forest

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

rfr = RandomForestRegressor()

rfr.fit(train_features, train_target)

mean_squared_log_error(rfr.predict(test_features), test_target)

0.021146216108228588

### Gradient Tree Boosting

In [18]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_log_error

gbr = GradientBoostingRegressor()

gbr.fit(train_features, train_target)

mean_squared_log_error(gbr.predict(test_features), test_target)

0.02007608031534709

## Data Prediction

In [23]:
submission = pd.DataFrame({
    'Id': test_raw['Id'],
    'SalePrice': gbr.predict(test_raw.loc[:, test_raw.columns != 'Id'])
})

submission.to_csv('./dataset/submission.csv', index = False)

submission

Unnamed: 0,Id,SalePrice
0,1461,135763.349593
1,1462,160635.354775
2,1463,179604.540520
3,1464,188109.653559
4,1465,197566.265244
...,...,...
1454,2915,86151.674060
1455,2916,97442.629178
1456,2917,173980.023813
1457,2918,126588.059844
