- Load Data
- One-hot encoding of Categorical variables
- Test and train split
- RFR Model (Random Forest Regressor)
- Validation on Test Data

In [1]:
import pandas as pd 
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


In [2]:
data = pd.read_pickle('data/train_data_cat_missing_binning_handled.pkl')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Data columns (total 74 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Id                1460 non-null   int64   
 1   MSSubClass        1460 non-null   category
 2   MSZoning          1460 non-null   category
 3   LotFrontage       1460 non-null   float64 
 4   LotArea           1460 non-null   int64   
 5   Street            1460 non-null   category
 6   Alley             91 non-null     category
 7   LotShape          1460 non-null   category
 8   LandContour       1460 non-null   category
 9   Utilities         1460 non-null   category
 10  LotConfig         1460 non-null   category
 11  LandSlope         1460 non-null   category
 12  Neighborhood      1235 non-null   category
 13  Condition1        1460 non-null   category
 14  Condition2        1460 non-null   category
 15  BldgType          1334 non-null   category
 16  HouseStyle        1460 n

In [3]:
data.describe()

Unnamed: 0,Id,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,GarageArea,WoodDeckSF,OpenPorchSF,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,69.686301,10516.828082,103.117123,443.639726,567.240411,1057.429452,1162.626712,346.992466,1515.463699,472.980137,94.244521,46.660274,180921.19589
std,421.610009,22.03795,9981.264932,180.731373,456.098091,441.866955,438.705324,386.587738,436.528436,525.480383,213.804841,125.338794,66.256028,79442.502883
min,1.0,21.0,1300.0,0.0,0.0,0.0,0.0,334.0,0.0,334.0,0.0,0.0,0.0,34900.0
25%,365.75,60.0,7553.5,0.0,0.0,223.0,795.75,882.0,0.0,1129.5,334.5,0.0,0.0,129975.0
50%,730.5,68.0,9478.5,0.0,383.5,477.5,991.5,1087.0,0.0,1464.0,480.0,0.0,25.0,163000.0
75%,1095.25,79.0,11601.5,164.25,712.25,808.0,1298.25,1391.25,728.0,1776.75,576.0,168.0,68.0,214000.0
max,1460.0,313.0,215245.0,1600.0,5644.0,2336.0,6110.0,4692.0,2065.0,5642.0,1418.0,857.0,547.0,755000.0


In [4]:
data.select_dtypes('category').describe()

Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,PoolQC,Fence,MiscFeature,SaleType,SaleCondition,bin_yr_built,bin_yr_remod,bin_garage_built,bin_yr_sold,bin_mo_sold
count,1460,1460,1460,91,1460,1460,1460,1460,1460,1235,...,7,281,54,1460,1460,1459,1282,1379,1146,1402
unique,15,5,2,2,4,4,2,5,3,24,...,3,4,4,9,6,6,4,5,4,4
top,20,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,Gd,MnPrv,Shed,WD,Normal,"(2000, 2010]","(2000, 2010]","(1950, 1980]","(2008, 2009]","(4, 6]"
freq,536,1151,1454,50,925,1311,1459,1052,1382,150,...,3,157,49,1267,1198,364,497,513,338,457


**Data** 
- 60 categorical columns
- 14 numerical 
    - exclude Id column
    - exclude target column 'SalePrice'
    
X = 60 categorical (with missing values)  + 12 numerical     
y = numerical 

In [5]:
data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtUnfSF',
       'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
       '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'PoolQC', 'Fence',
       'MiscFeature', 'SaleTyp

In [6]:
X = data[['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtUnfSF',
       'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
       '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'PoolQC', 'Fence',
       'MiscFeature', 'SaleType', 'SaleCondition', 'bin_yr_built',
       'bin_yr_remod', 'bin_garage_built', 'bin_yr_sold', 'bin_mo_sold']]
y = data['SalePrice']

X.shape, y.shape

((1460, 72), (1460,))

## One-hot encoding for categorical v.


In [7]:
'''
    Convert given categorical columns to one-hot encoded columns
    drop 'NAN' column to avoid dummy variable trap
    drop categorical columns after 1-hot encoding 
'''

def get_one_hot_encoded_X(X, cat_colums):

    for col in cat_colums:
        temp = pd.get_dummies(X[col], prefix=col, dummy_na=True)
        final_col = [col for col in temp.columns if not col.endswith('nan')]
        X = X.join(temp[final_col])
    X = X.drop(cat_colums, axis =1)   
    return X  

In [8]:
cat_cols = X.select_dtypes('category').columns
X_new = get_one_hot_encoded_X(X, cat_cols)
X_new.head()

Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,GarageArea,...,"bin_garage_built_(1990, 2000]","bin_garage_built_(2000, 2207]","bin_yr_sold_(2006, 2007]","bin_yr_sold_(2007, 2008]","bin_yr_sold_(2008, 2009]","bin_yr_sold_(2009, 2010]","bin_mo_sold_(1, 4]","bin_mo_sold_(4, 6]","bin_mo_sold_(6, 9]","bin_mo_sold_(9, 12]"
0,65.0,8450,196.0,706.0,150.0,856.0,856,854,1710,548.0,...,0,1,0,1,0,0,1,0,0,0
1,80.0,9600,0.0,978.0,284.0,1262.0,1262,0,1262,460.0,...,0,0,1,0,0,0,0,1,0,0
2,68.0,11250,162.0,486.0,434.0,920.0,920,866,1786,608.0,...,0,1,0,1,0,0,0,0,1,0
3,60.0,9550,0.0,216.0,540.0,756.0,961,756,1717,642.0,...,1,0,0,0,0,0,1,0,0,0
4,84.0,14260,350.0,655.0,490.0,1145.0,1145,1053,2198,836.0,...,1,0,0,1,0,0,0,0,0,1


## Test and train split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=0)

## Random Forest Model

In [10]:
model = RandomForestRegressor(n_estimators=20, random_state =0, verbose=2)

In [11]:
model.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


building tree 1 of 20
building tree 2 of 20
building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.4s finished


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=20, n_jobs=None, oob_score=False,
                      random_state=0, verbose=2, warm_start=False)

In [12]:
y_pred = model.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.0s finished


## Model Evaluation
- mse
- R^2

In [13]:
#r_2 = 1 - ssr/sst
r2_score(y_test,y_pred)

0.8486043586245207

In [14]:
print( 'MSE: {0}, RMSE: {1}'.format(mean_squared_error(y_test, y_pred), np.sqrt(mean_squared_error(y_test, y_pred))))

MSE: 1045515584.5431505, RMSE: 32334.433419238238


In [15]:
pd.DataFrame({'y': y_test,
             'y_hat': y_pred,
             'mse': np.sqrt(mean_squared_error(y_test, y_pred))})

Unnamed: 0,y,y_hat,mse
529,200624,261167.45,32334.433419
491,133000,151385.00,32334.433419
459,110000,119550.00,32334.433419
279,192000,195400.00,32334.433419
655,88000,86900.00,32334.433419
...,...,...,...
326,324000,293670.00,32334.433419
440,555000,474781.65,32334.433419
1387,136000,175615.00,32334.433419
1323,82500,76420.00,32334.433419


## Validation on Test Data

In [16]:
test_data = pd.read_pickle('data/test_data_cat_missing_binning_handled.pkl')
test_data.describe()


Unnamed: 0,Id,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,GarageArea,WoodDeckSF,OpenPorchSF
count,1459.0,1232.0,1459.0,1444.0,1458.0,1458.0,1458.0,1459.0,1459.0,1459.0,1458.0,1459.0,1459.0
mean,2190.0,68.580357,9819.161069,100.709141,439.203704,554.294925,1046.11797,1156.534613,325.967786,1486.045922,472.768861,93.174777,48.313914
std,421.321334,22.376841,4955.517327,177.6259,455.268042,437.260486,442.898624,398.16582,420.610226,485.566099,217.048611,127.744882,68.883364
min,1461.0,21.0,1470.0,0.0,0.0,0.0,0.0,407.0,0.0,407.0,0.0,0.0,0.0
25%,1825.5,58.0,7391.0,0.0,0.0,219.25,784.0,873.5,0.0,1117.5,318.0,0.0,0.0
50%,2190.0,67.0,9399.0,0.0,350.5,460.0,988.0,1079.0,0.0,1432.0,480.0,0.0,28.0
75%,2554.5,80.0,11517.5,164.0,753.5,797.75,1305.0,1382.5,676.0,1721.0,576.0,168.0,72.0
max,2919.0,200.0,56600.0,1290.0,4010.0,2140.0,5095.0,5095.0,1862.0,5095.0,1488.0,1424.0,742.0


In [17]:
test_data.select_dtypes('category').describe()

Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,PoolQC,Fence,MiscFeature,SaleType,SaleCondition,bin_yr_built,bin_yr_remod,bin_garage_built,bin_yr_sold,bin_mo_sold
count,1459,1455,1459,107,1459,1459,1457,1459,1459,1241,...,3,290,51,1458,1459,1459,1276,1380,1154,1395
unique,16,5,2,2,4,4,1,5,3,24,...,2,4,3,9,6,6,4,5,4,4
top,20,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,OldTown,...,Ex,MnPrv,Shed,WD,Normal,"(2000, 2010]","(2000, 2010]","(1950, 1980]","(2006, 2007]","(4, 6]"
freq,543,1114,1453,70,934,1311,1457,1081,1396,126,...,2,172,46,1258,1204,371,479,517,363,440


In [18]:
# removing Id column 
X_test_data = test_data.iloc[:, 1: ]
X_test_data.head(2)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolQC,Fence,MiscFeature,SaleType,SaleCondition,bin_yr_built,bin_yr_remod,bin_garage_built,bin_yr_sold,bin_mo_sold
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,,MnPrv,,WD,Normal,"(1960, 1980]","(1950, 1980]","(1950, 1980]","(2009, 2010]","(4, 6]"
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,,,Gar2,WD,Normal,"(1950, 1960]","(1950, 1980]","(1950, 1980]","(2009, 2010]","(4, 6]"


In [19]:
X_test_data_n = get_one_hot_encoded_X(X_test_data, cat_cols)
X_test_data_n.head(2)

Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,GarageArea,...,"bin_garage_built_(1990, 2000]","bin_garage_built_(2000, 2207]","bin_yr_sold_(2006, 2007]","bin_yr_sold_(2007, 2008]","bin_yr_sold_(2008, 2009]","bin_yr_sold_(2009, 2010]","bin_mo_sold_(1, 4]","bin_mo_sold_(4, 6]","bin_mo_sold_(6, 9]","bin_mo_sold_(9, 12]"
0,80.0,11622,0.0,468.0,270.0,882.0,896,0,896,730.0,...,0,0,0,0,0,1,0,1,0,0
1,81.0,14267,108.0,923.0,406.0,1329.0,1329,0,1329,312.0,...,0,0,0,0,0,1,0,1,0,0


In [20]:
# columns in model but not in  test data
missing_cols_in_test = [c for c in X_new.columns if c not in X_test_data_n.columns]
len(missing_cols_in_test)

0

In [21]:
missing_cols_in_test

[]

In [22]:
# columns NOT in the model but are in test data
missing_cols_in_model = [c for c in X_test_data_n.columns if c not in X_new.columns]
len(missing_cols_in_model)

0

In [23]:
missing_cols_in_model

[]

In [24]:
X_test_data_n.isnull().sum()

LotFrontage                 227
LotArea                       0
MasVnrArea                   15
BsmtFinSF1                    1
BsmtUnfSF                     1
                           ... 
bin_yr_sold_(2009, 2010]      0
bin_mo_sold_(1, 4]            0
bin_mo_sold_(4, 6]            0
bin_mo_sold_(6, 9]            0
bin_mo_sold_(9, 12]           0
Length: 396, dtype: int64

In [25]:
X_test_data_n.isnull().columns

Index(['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF',
       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea',
       ...
       'bin_garage_built_(1990, 2000]', 'bin_garage_built_(2000, 2207]',
       'bin_yr_sold_(2006, 2007]', 'bin_yr_sold_(2007, 2008]',
       'bin_yr_sold_(2008, 2009]', 'bin_yr_sold_(2009, 2010]',
       'bin_mo_sold_(1, 4]', 'bin_mo_sold_(4, 6]', 'bin_mo_sold_(6, 9]',
       'bin_mo_sold_(9, 12]'],
      dtype='object', length=396)

In [26]:
#Let's fill nan in test data with 0:
# if its numeric field than 0 can work 
# if its one-hot-encoded then 0 means no value is there for thsi column
X_test_data_n.fillna(0, inplace = True)

In [27]:
X_test_data_n.isnull().sum().sum()

0

In [28]:
y_pred_t = model.predict(X_test_data_n)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.0s finished


### Join the prediction with Id and submit to kaggle

In [29]:
submission_df = pd.DataFrame({'Id': test_data['Id'], 
                             'SalePrice': y_pred_t})

In [30]:
submission_df

Unnamed: 0,Id,SalePrice
0,1461,132672.50
1,1462,156326.25
2,1463,186630.00
3,1464,182095.00
4,1465,178944.05
...,...,...
1454,2915,86720.00
1455,2916,90090.00
1456,2917,156075.00
1457,2918,109400.00


In [31]:
submission_df.to_csv('data/ah_submission_3.csv', index=False)