# House Prices: Advanced Regression Techniques

Predict sales prices and practice feature engineering, RFs, and gradient boosting

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
#from Ipython.display import display

from sklearn import metrics

In [3]:
PATH = 'data/prices/'

In [4]:
!ls {PATH}

models	test.csv  tmp  train.csv


## The data

Dependent variable is SalePrice

In [5]:
# import the data
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False)

In [6]:
# show a frame/series without pandas truncating its rows or columns
def display_all(df, max_rows=1000, max_columns=1000):
    """Display `df` with pandas' truncation limits temporarily raised.

    The limits are applied through `pd.option_context`, so they only hold
    for the duration of this call and the previous settings are restored
    afterwards.

    Parameters
    ----------
    df : object to pass to `display` (e.g. a DataFrame or Series).
    max_rows : value used for the "display.max_rows" option (default 1000).
    max_columns : value used for the "display.max_columns" option (default 1000).
    """
    with pd.option_context("display.max_rows", max_rows,
                           "display.max_columns", max_columns):
        display(df)

In [None]:
display_all(df_raw.tail().T)

## Initial Processing

**Removing outliers**

In [7]:
# Remove two groups of outlying sales and renumber the remaining rows 0..n-1:
#   * OverallQual below 5 but SalePrice above 200000
#   * GrLivArea above 4000 but SalePrice below 300000
# One boolean mask builds a new frame instead of mutating df_raw in place
# several times; the thresholds apply to the raw (not yet logged) SalePrice.
is_outlier = (
    ((df_raw['OverallQual'] < 5) & (df_raw['SalePrice'] > 200000))
    | ((df_raw['GrLivArea'] > 4000) & (df_raw['SalePrice'] < 300000))
)
df_raw = df_raw[~is_outlier].reset_index(drop=True)

In [8]:
# replace SalePrice with its natural log, so every RMSE computed below is an
# RMSE on log prices
# NOTE(review): presumably chosen because the competition is scored on the
# RMSE of log prices — confirm against the competition's evaluation page
df_raw['SalePrice'] = np.log(df_raw['SalePrice'])

In [None]:
# there are no dates so wouldn't use add_datepart

In [9]:
# run fastai's train_cats on the training frame; its return value is not used,
# so it presumably modifies df_raw in place
# NOTE(review): looks like this turns string columns into pandas categoricals,
# with the numeric codes only substituted later by proc_df — confirm
train_cats(df_raw)

In [10]:
# fraction of missing values in each column, listed alphabetically by column
null_counts = df_raw.isnull().sum()
missing_fraction = (null_counts / len(df_raw)).sort_index()
display_all(missing_fraction)

1stFlrSF         0.000000
2ndFlrSF         0.000000
3SsnPorch        0.000000
Alley            0.937543
BedroomAbvGr     0.000000
BldgType         0.000000
BsmtCond         0.025395
BsmtExposure     0.026081
BsmtFinSF1       0.000000
BsmtFinSF2       0.000000
BsmtFinType1     0.025395
BsmtFinType2     0.026081
BsmtFullBath     0.000000
BsmtHalfBath     0.000000
BsmtQual         0.025395
BsmtUnfSF        0.000000
CentralAir       0.000000
Condition1       0.000000
Condition2       0.000000
Electrical       0.000686
EnclosedPorch    0.000000
ExterCond        0.000000
ExterQual        0.000000
Exterior1st      0.000000
Exterior2nd      0.000000
Fence            0.807138
FireplaceQu      0.473576
Fireplaces       0.000000
Foundation       0.000000
FullBath         0.000000
Functional       0.000000
GarageArea       0.000000
GarageCars       0.000000
GarageCond       0.055594
GarageFinish     0.055594
GarageQual       0.055594
GarageType       0.055594
GarageYrBlt      0.055594
GrLivArea   

save progress in feather format

In [11]:
# checkpoint the processed frame so it can be reloaded without repeating the
# steps above; note that 'tmp' is created relative to the current working
# directory, not under PATH
os.makedirs('tmp', exist_ok=True)
df_raw.to_feather('tmp/prices-raw')

## Pre-processing

In [12]:
# read data back from feather format
df_raw = pd.read_feather('tmp/prices-raw')

Replace categories with their numeric codes, handle missing continuous values and split the dependent variable into a separate variable

First RandomForest model

In [13]:
# proc_df splits off 'SalePrice' as the target: `df` holds the features, `y`
# the target, and `nas` is passed back in as na_dict when the test set is
# processed further down
df, y, nas = proc_df(df_raw, 'SalePrice')

In [14]:
# quick baseline: the forest is scored on the very rows it was fitted on,
# so this figure is optimistic
m = RandomForestRegressor(n_jobs=-1).fit(df, y)
m.score(df, y)

0.9750074271241405

Get separate training and validation sets

In [15]:
# hold out a random 20% of the rows for validation; the fixed seed keeps the
# split, and every score derived from it, reproducible across kernel restarts
X_valid = df.sample(frac=0.2, random_state=42)
# y is indexed with df's row labels here, which only works while those labels
# equal row positions (assumes df still has a default 0..n-1 index — confirm)
y_valid = y[X_valid.index]

# everything that was not sampled above becomes the training set
X_train = df.drop(X_valid.index)
y_train = np.delete(y, X_valid.index)

# the two splits must be aligned with their targets and partition df exactly
assert len(X_valid) == len(y_valid)
assert len(X_train) == len(y_train)
assert len(X_train) + len(X_valid) == len(df)

X_valid.shape

(291, 83)

Metrics

In [16]:
def rmse(x, y):
    """Return the root-mean-square difference between `x` and `y` as a float.

    `x - y` must support `** 2` and `.mean()` (e.g. NumPy arrays).
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

def print_score(m):
    """Print a list of scores for the fitted model `m`.

    The list holds, in order: RMSE on the training split, RMSE on the
    validation split, `m.score` on the training split, `m.score` on the
    validation split (presumably R^2 for this regressor) and, when the model
    exposes `oob_score_`, that value as a fifth entry.

    Reads the notebook-level X_train, y_train, X_valid and y_valid.
    """
    scores = [
        rmse(m.predict(X_train), y_train),
        rmse(m.predict(X_valid), y_valid),
        m.score(X_train, y_train),
        m.score(X_valid, y_valid),
    ]
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)

# def print_score(m):
#     res = [rmse(m.predict(df), y),
#                 m.score(df, y)]
#     if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
#     print(res)

In [17]:
# baseline forest (only n_jobs is set), fitted on the training split alone
m = RandomForestRegressor(n_jobs=-1).fit(X_train, y_train)
print_score(m)

[0.06292229303713714, 0.13893673983882518, 0.97555169126438, 0.8715083280010713]


## Bagging

Increase number of trees in the forest to 20

In [18]:
# same model with the forest size set explicitly to 20 trees
m = RandomForestRegressor(n_estimators=20, n_jobs=-1).fit(X_train, y_train)
print_score(m)

[0.055799730077240504, 0.13975783666072927, 0.9807733360536077, 0.8699851043967722]


Going to 20 trees lowers the training error (0.063 → 0.056), although the validation error barely moves (0.139 → 0.140). Let's add more trees.

In [19]:
# grow the forest to 100 trees
m = RandomForestRegressor(n_estimators=100, n_jobs=-1).fit(X_train, y_train)
print_score(m)

[0.05219969524224256, 0.13609378153992627, 0.9831742018637448, 0.8767129854858792]


The validation error is still decreasing (0.1398 → 0.1361) and $R^2$ on the validation set is still increasing (0.870 → 0.877).

### use oob_score

In [20]:
# with oob_score=True the fitted model exposes oob_score_, which print_score
# appends as a fifth value
m = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    oob_score=True,
).fit(X_train, y_train)
print_score(m)

[0.05133289970432175, 0.13496443094383373, 0.9837283597341363, 0.878750647399507, 0.8792426348322353]


### min_samples_leaf

In [21]:
# require at least 3 samples in every leaf
m = RandomForestRegressor(
    n_estimators=100,
    min_samples_leaf=3,
    n_jobs=-1,
    oob_score=True,
).fit(X_train, y_train)
print_score(m)

[0.07217107641142397, 0.1341059845828085, 0.9678362924379547, 0.880288164256582, 0.876223729292975]


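min_samples_leaf = 3 increased the training error (0.051 → 0.072) but slightly reduced the validation error (0.1350 → 0.1341). Let's compare against min_samples_leaf = 1.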

In [22]:
# comparison run with min_samples_leaf=1, all other settings unchanged
m = RandomForestRegressor(
    n_estimators=100,
    min_samples_leaf=1,
    n_jobs=-1,
    oob_score=True,
).fit(X_train, y_train)
print_score(m)

[0.05341106917005483, 0.13525890144589428, 0.9823842035803997, 0.8782209773517149, 0.8772780308432091]


With min_samples_leaf = 1 the validation error is slightly worse (0.1353 vs 0.1341), so min_samples_leaf = 3 seems to be helping.

### max_features

In [24]:
# keep min_samples_leaf=3 and additionally set max_features=0.5
m = RandomForestRegressor(
    n_estimators=100,
    min_samples_leaf=3,
    max_features=0.5,
    n_jobs=-1,
    oob_score=True,
).fit(X_train, y_train)
print_score(m)

[0.07321201318926157, 0.12790018633404987, 0.9669017952767918, 0.8911112212472324, 0.8849335934053256]


Use the entire dataset to train with the same min_samples_leaf and max_features, but with 1000 trees instead of 100

In [25]:
# final model: min_samples_leaf and max_features as in the previous experiment,
# but with 1000 trees instead of 100, fitted on every row of df (training and
# validation rows together)
m = RandomForestRegressor(n_estimators=1000, min_samples_leaf=3,  max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(df, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
           oob_score=True, random_state=None, verbose=0, warm_start=False)

Use model to make predictions on test set

In [26]:
df_test = pd.read_csv(f'{PATH}test.csv', low_memory=False)

In [27]:
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [28]:
# run fastai's apply_cats on the test frame, using the training frame (df_raw)
# as the reference; the return value is not used, so it presumably modifies
# df_test in place
# NOTE(review): looks like this gives df_test the same categorical encoding as
# df_raw rather than converting strings to numbers directly — confirm
apply_cats(df_test, df_raw)

In [29]:
# proc_df is told to split off 'SalePrice', which the test set does not have,
# so add a placeholder column of zeros before calling it
df_test['SalePrice'] = 0
# pass the training na_dict (`nas`) so the test set is processed with it.
# The returned placeholder target and na_dict are bound to separate names so
# that the training `y` and `nas` are not overwritten; otherwise re-running
# any earlier cell that fits on `y` would silently train on zeros.
df_final, y_test_placeholder, nas_test = proc_df(df_test, 'SalePrice', na_dict=nas)

In [30]:
# predictions on the test set
preds = m.predict(df_final)

In [31]:
# pair every test Id with its predicted price; np.exp undoes the log that was
# applied to SalePrice before training
test_ids = df_test['Id']
submission_df = dict(Id=test_ids, SalePrice=np.exp(preds))
submission = pd.DataFrame(submission_df)
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,125125.491039
1,1462,152352.648625
2,1463,178538.153619
3,1464,184085.324464
4,1465,195421.240499


Use to_csv to save the dataframe to a csv file

make sure the index parameter is set to False, otherwise we will add an extra column to our CSV

In [32]:
# save the dataframe to a csv file
submission.to_csv("submission.csv",index=False)