- Load Data
- One-hot encoding of Categorical variables
- Test and train split
- RFR Model (Random Forest Regressor)
- 

In [1]:
import pandas as pd 
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


In [2]:
data = pd.read_pickle('data/train_data_cat_missing_handled.pkl')
data.describe()

Unnamed: 0,Id,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,69.863699,10516.828082,103.117123,443.639726,46.549315,567.240411,1057.429452,1162.626712,346.992466,...,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,180921.19589
std,421.610009,22.027677,9981.264932,180.731373,456.098091,161.319273,441.866955,438.705324,386.587738,436.528436,...,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,79442.502883
min,1.0,21.0,1300.0,0.0,0.0,0.0,0.0,0.0,334.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,34900.0
25%,365.75,60.0,7553.5,0.0,0.0,0.0,223.0,795.75,882.0,0.0,...,334.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,129975.0
50%,730.5,69.0,9478.5,0.0,383.5,0.0,477.5,991.5,1087.0,0.0,...,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,163000.0
75%,1095.25,79.0,11601.5,164.25,712.25,0.0,808.0,1298.25,1391.25,728.0,...,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,214000.0
max,1460.0,313.0,215245.0,1600.0,5644.0,1474.0,2336.0,6110.0,4692.0,2065.0,...,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,755000.0


In [3]:
data.select_dtypes('category').describe()

Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,YrSold,SaleType,SaleCondition
count,1460,1460,1460,91,1460,1460,1460,1460,1460,1235,...,1379,1379,1379,1460,7,281,54,1460,1460,1460
unique,15,5,2,2,4,4,2,5,3,24,...,3,5,5,3,3,4,4,5,9,6
top,20,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,Unf,TA,TA,Y,Gd,MnPrv,Shed,2009,WD,Normal
freq,536,1151,1454,50,925,1311,1459,1052,1382,150,...,605,1311,1326,1340,3,157,49,338,1267,1198


**Data** 
- 50 categorical columns
- 31 numerical 
    - exclude Id column
    - exclude target column 'SalePrice'
    
X = 50 categorical (with missing values)  + 32 numerical     
y = numerical 

In [4]:
# data.info()

In [5]:
X = data.iloc[:, 1: -1]
y = data.iloc[:, -1]

X.shape, y.shape

((1460, 79), (1460,))

## One-hot encoding for categorical v.


In [6]:
'''
    Convert given categorical columns to one-hot encoded columns
    drop 'NAN' column to avoid dummy variable trap
    drop categorical columns after 1-hot encoding 
'''

def get_one_hot_encoded_X(X, cat_colums):

    for col in cat_colums:
        temp = pd.get_dummies(X[col], prefix=col, dummy_na=True)
        final_col = [col for col in temp.columns if not col.endswith('nan')]
        X = X.join(temp[final_col])
    X = X.drop(cat_colums, axis =1)   
    return X  

In [7]:
cat_cols = X.select_dtypes('category').columns
X_new = get_one_hot_encoded_X(X, cat_cols)
X_new.head()

Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,...,SaleType_ConLw,SaleType_ConLI,SaleType_ConLD,SaleType_Oth,SaleCondition_Normal,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Partial
0,65.0,8450,196.0,706,0,150,856,856,854,0,...,0,0,0,0,1,0,0,0,0,0
1,80.0,9600,0.0,978,0,284,1262,1262,0,0,...,0,0,0,0,1,0,0,0,0,0
2,68.0,11250,162.0,486,0,434,920,920,866,0,...,0,0,0,0,1,0,0,0,0,0
3,60.0,9550,0.0,216,0,540,756,961,756,0,...,0,0,0,0,0,1,0,0,0,0
4,84.0,14260,350.0,655,0,490,1145,1145,1053,0,...,0,0,0,0,1,0,0,0,0,0


## Test and train split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=0)

## Random Forest Model

In [9]:
model = RandomForestRegressor(n_estimators=20, random_state =0, verbose=2)

In [10]:
model.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


building tree 1 of 20
building tree 2 of 20
building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.5s finished


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=20, n_jobs=None, oob_score=False,
                      random_state=0, verbose=2, warm_start=False)

In [11]:
y_pred = model.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.0s finished


## Model Evaluation
- mse
- R^2

In [12]:
#r_2 = 1 - ssr/sst
r2_score(y_test,y_pred)

0.8383277413003822

In [13]:
print( 'MSE: {0}, RMSE: {1}'.format(mean_squared_error(y_test, y_pred), np.sqrt(mean_squared_error(y_test, y_pred))))

MSE: 1116484361.92113, RMSE: 33413.83488797911


In [14]:
pd.DataFrame({'y': y_test,
             'y_hat': y_pred,
             'mse': np.sqrt(mean_squared_error(y_test, y_pred))})

Unnamed: 0,y,y_hat,mse
529,200624,256584.30,33413.834888
491,133000,150857.50,33413.834888
459,110000,121752.50,33413.834888
279,192000,194345.00,33413.834888
655,88000,86215.00,33413.834888
...,...,...,...
326,324000,284049.30,33413.834888
440,555000,447447.85,33413.834888
1387,136000,167122.50,33413.834888
1323,82500,81605.00,33413.834888


## Validation on Test Data

In [15]:
test_data = pd.read_pickle('data/test_data_cat_handled.pkl')
test_data.describe()


Unnamed: 0,Id,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold
count,1459.0,1232.0,1459.0,1444.0,1458.0,1458.0,1458.0,1458.0,1459.0,1459.0,...,1458.0,1458.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,2190.0,68.580357,9819.161069,100.709141,439.203704,52.619342,554.294925,1046.11797,1156.534613,325.967786,...,1.766118,472.768861,93.174777,48.313914,24.243317,1.79438,17.064428,1.744345,58.167923,6.104181
std,421.321334,22.376841,4955.517327,177.6259,455.268042,176.753926,437.260486,442.898624,398.16582,420.610226,...,0.775945,217.048611,127.744882,68.883364,67.227765,20.207842,56.609763,30.491646,630.806978,2.722432
min,1461.0,21.0,1470.0,0.0,0.0,0.0,0.0,0.0,407.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,1825.5,58.0,7391.0,0.0,0.0,0.0,219.25,784.0,873.5,0.0,...,1.0,318.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
50%,2190.0,67.0,9399.0,0.0,350.5,0.0,460.0,988.0,1079.0,0.0,...,2.0,480.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,6.0
75%,2554.5,80.0,11517.5,164.0,753.5,0.0,797.75,1305.0,1382.5,676.0,...,2.0,576.0,168.0,72.0,0.0,0.0,0.0,0.0,0.0,8.0
max,2919.0,200.0,56600.0,1290.0,4010.0,1526.0,2140.0,5095.0,5095.0,1862.0,...,5.0,1488.0,1424.0,742.0,1012.0,360.0,576.0,800.0,17000.0,12.0


In [16]:
test_data.select_dtypes('category').describe()

Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,YrSold,SaleType,SaleCondition
count,1459,1455,1459,107,1459,1459,1457,1459,1459,1241,...,1381,1381,1381,1459,3,290,51,1459,1458,1459
unique,16,5,2,2,4,4,1,5,3,24,...,3,4,5,3,2,4,3,5,9,6
top,20,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,OldTown,...,Unf,TA,TA,Y,Ex,MnPrv,Shed,2007,WD,Normal
freq,543,1114,1453,70,934,1311,1457,1081,1396,126,...,625,1293,1328,1301,2,172,46,363,1258,1204


In [17]:
# removing Id column 
X_test_data = test_data.iloc[:, 1: ]
X_test_data.head(2)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal


In [18]:
X_test_data_n = get_one_hot_encoded_X(X_test_data, cat_cols)
X_test_data_n.head(2)

Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,...,SaleType_ConLw,SaleType_ConLI,SaleType_ConLD,SaleType_Oth,SaleCondition_Normal,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Partial
0,80.0,11622,0.0,468.0,144.0,270.0,882.0,896,0,0,...,0,0,0,0,1,0,0,0,0,0
1,81.0,14267,108.0,923.0,0.0,406.0,1329.0,1329,0,0,...,0,0,0,0,1,0,0,0,0,0


In [19]:
# columns in model but not in  test data
missing_cols_in_test = [c for c in X_new.columns if c not in X_test_data_n.columns]
len(missing_cols_in_test)

18

In [20]:
missing_cols_in_test

['YearBuilt_1872.0',
 'YearBuilt_1875.0',
 'YearBuilt_1882.0',
 'YearBuilt_1885.0',
 'YearBuilt_1892.0',
 'YearBuilt_1893.0',
 'YearBuilt_1898.0',
 'YearBuilt_1904.0',
 'YearBuilt_1906.0',
 'YearBuilt_1908.0',
 'YearBuilt_1911.0',
 'YearBuilt_1913.0',
 'GarageYrBlt_1906.0',
 'GarageYrBlt_1908.0',
 'GarageYrBlt_1914.0',
 'GarageYrBlt_1929.0',
 'GarageYrBlt_1931.0',
 'GarageYrBlt_1933.0']

In [21]:
# columns NOT in the model but are in test data
missing_cols_in_model = [c for c in X_test_data_n.columns if c not in X_new.columns]
len(missing_cols_in_model)

12

In [22]:
missing_cols_in_model

['YearBuilt_1879.0',
 'YearBuilt_1895.0',
 'YearBuilt_1896.0',
 'YearBuilt_1901.0',
 'YearBuilt_1902.0',
 'YearBuilt_1907.0',
 'GarageYrBlt_1895.0',
 'GarageYrBlt_1896.0',
 'GarageYrBlt_1917.0',
 'GarageYrBlt_1919.0',
 'GarageYrBlt_1943.0',
 'GarageYrBlt_2207.0']

As we can see, the issue is one-hot encoded year columns. This must be binned. 

For now, 
- remove the missing_cols_in_model from test data because our model is not aware of these columns [This will lead to poor result]
- add missing_cols_in_test to the test data because model is aware of them and needs at input time. Fill them with zero.

In [23]:
#remove the missing_cols_in_model from test data 
X_test_data_nn = X_test_data_n.drop(missing_cols_in_model, axis = 1)
X_test_data_nn.shape

(1459, 592)

In [24]:
#add missing_cols_in_test to the test data 
for col in missing_cols_in_test:
    X_test_data_nn[col] = pd.Series(np.zeros(X_test_data_nn.shape[0]), index=X_test_data_nn.index)
    

X_test_data_nn.shape

(1459, 610)

In [25]:
#validate again
print([c for c in X_new.columns if c not in X_test_data_nn.columns])
print([c for c in X_test_data_nn.columns if c not in X_new.columns])

[]
[]


In [26]:
X_test_data_nn.isnull().sum() 

LotFrontage           227
LotArea                 0
MasVnrArea             15
BsmtFinSF1              1
BsmtFinSF2              1
                     ... 
GarageYrBlt_1908.0      0
GarageYrBlt_1914.0      0
GarageYrBlt_1929.0      0
GarageYrBlt_1931.0      0
GarageYrBlt_1933.0      0
Length: 610, dtype: int64

In [27]:
X_test_data_nn[X_test_data_nn['LotFrontage'].isnull()]

Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,...,YearBuilt_1906.0,YearBuilt_1908.0,YearBuilt_1911.0,YearBuilt_1913.0,GarageYrBlt_1906.0,GarageYrBlt_1908.0,GarageYrBlt_1914.0,GarageYrBlt_1929.0,GarageYrBlt_1931.0,GarageYrBlt_1933.0
6,,7980,0.0,935.0,0.0,233.0,1168.0,1187,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,,2980,1159.0,466.0,0.0,290.0,756.0,756,756,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41,,2403,0.0,244.0,0.0,286.0,530.0,530,550,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45,,10456,120.0,506.0,0.0,1323.0,1829.0,1829,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47,,18837,0.0,687.0,46.0,491.0,1224.0,1287,604,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,,11088,0.0,872.0,0.0,476.0,1348.0,1358,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1390,,21533,0.0,0.0,0.0,1065.0,1065.0,1065,984,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1440,,50102,0.0,909.0,0.0,723.0,1632.0,1650,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1441,,8098,0.0,1136.0,116.0,129.0,1381.0,1403,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
y_pred_t = model.predict(X_test_data_nn)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').