In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score 


In [2]:
data = pd.read_pickle('data/final_train_data_l1_reg_v1.pkl')
data.shape

(1430, 39)

In [3]:
data.head()

Unnamed: 0,2ndFlrSF,GarageArea,LotArea_log,OpenPorchSF_log,EnclosedPorch_log,total_sqft_house_log,MSZoning_C (all),MSZoning_RM,Neighborhood_Crawfor,Condition1_Norm,...,KitchenQual_Ex,Functional_Typ,Fireplaces_2,GarageFinish_Fin,GarageCond_TA,PavedDrive_Y,SaleCondition_Abnorml,has_fireplace_1,"bin_yr_remod_(1950, 1980]",SalePrice_log
0,854,548,9.041922,4.110876,-9.21034,8.043663,0,0,0,1,...,0,1,0,0,1,1,0,0,0,12.247694
1,0,460,9.169518,-9.21034,-9.21034,8.00102,0,0,0,0,...,0,1,0,0,1,1,0,1,1,12.109011
2,866,608,9.328123,3.737672,-9.21034,8.105911,0,0,0,1,...,0,1,0,0,1,1,0,1,0,12.317167
3,756,642,9.164296,3.555351,5.605802,7.917536,0,0,1,1,...,0,1,0,0,1,1,1,1,1,11.849398
4,1053,836,9.565214,4.430818,-9.21034,8.337827,0,0,0,1,...,0,1,0,0,1,1,0,1,0,12.429216


In [4]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

X.shape, y.shape

((1430, 38), (1430,))

In [5]:
X.head()

Unnamed: 0,2ndFlrSF,GarageArea,LotArea_log,OpenPorchSF_log,EnclosedPorch_log,total_sqft_house_log,MSZoning_C (all),MSZoning_RM,Neighborhood_Crawfor,Condition1_Norm,...,KitchenQual_TA,KitchenQual_Ex,Functional_Typ,Fireplaces_2,GarageFinish_Fin,GarageCond_TA,PavedDrive_Y,SaleCondition_Abnorml,has_fireplace_1,"bin_yr_remod_(1950, 1980]"
0,854,548,9.041922,4.110876,-9.21034,8.043663,0,0,0,1,...,0,0,1,0,0,1,1,0,0,0
1,0,460,9.169518,-9.21034,-9.21034,8.00102,0,0,0,0,...,1,0,1,0,0,1,1,0,1,1
2,866,608,9.328123,3.737672,-9.21034,8.105911,0,0,0,1,...,0,0,1,0,0,1,1,0,1,0
3,756,642,9.164296,3.555351,5.605802,7.917536,0,0,1,1,...,0,0,1,0,0,1,1,1,1,1
4,1053,836,9.565214,4.430818,-9.21034,8.337827,0,0,0,1,...,0,0,1,0,0,1,1,0,1,0


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [7]:
model = RandomForestRegressor()

In [8]:
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [9]:
y_pred = model.predict(X_test)

In [10]:
r2_score(y_test, y_pred)

0.8258320002086206

In [11]:
print( 'MSE: {0}, RMSE: {1}'.format(mean_squared_error(y_test, y_pred), np.sqrt(mean_squared_error(y_test, y_pred))))

MSE: 0.022045160859169542, RMSE: 0.1484761289203404


## Validation on Test Data

- need to apply transformation 
- select the final cols 

In [12]:
test_data = pd.read_pickle('data/data_test_l1_1.pkl')
test_data.head()

Unnamed: 0,Id,2ndFlrSF,GarageArea,LotArea_log,OpenPorchSF_log,EnclosedPorch_log,total_sqft_house_log,MSZoning_C (all),MSZoning_RM,Neighborhood_Crawfor,...,KitchenQual_TA,KitchenQual_Ex,Functional_Typ,Fireplaces_2,GarageFinish_Fin,GarageCond_TA,PavedDrive_Y,SaleCondition_Abnorml,has_fireplace_1,"bin_yr_remod_(1950, 1980]"
0,1461,0,730.0,9.360655,-9.21034,-9.21034,7.827241,0,0,0,...,1,0,1,0,0,1,1,0,0,1
1,1462,0,312.0,9.565704,3.583522,-9.21034,7.996317,0,0,0,...,0,0,1,0,0,1,1,0,0,1
2,1463,701,482.0,9.534595,3.526363,-9.21034,8.019284,0,0,0,...,1,0,1,0,1,1,1,0,1,0
3,1464,678,470.0,9.208138,3.583522,-9.21034,8.006368,0,0,0,...,0,0,1,0,1,1,1,0,1,0
4,1465,0,506.0,8.518193,4.40672,-9.21034,8.028129,0,0,0,...,0,0,1,0,0,1,1,0,0,0


In [13]:
test_data.isnull().sum().sum()

0

In [14]:
test_X = test_data.iloc[:, 1:]
Id_col = test_data.iloc[:, 0]

In [15]:
y_pred_test = model.predict(test_X)

In [16]:
submission = pd.DataFrame({'Id': Id_col, 
             'SalePrice': y_pred_test})
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,11.818787
1,1462,11.972064
2,1463,12.145397
3,1464,12.102947
4,1465,12.283235


In [17]:
submission.to_csv('data/ah_submission_4.csv', index=False)