In [43]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
%matplotlib inline

In [2]:
# Importing the train and test data:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [5]:
print(train.shape, test.shape)

(1460, 81) (1459, 80)


In [3]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


As we can see there are many categorical features and hence they have to be encoded before we can feed them into a model

In [118]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0


In [4]:
train.isna().sum()

Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
                 ... 
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         0
TotRmsAbvGrd        0
Functional          0
Fireplaces          0
FireplaceQu       690
GarageType         81
GarageYrBlt        81
GarageFinish       81
GarageCars          0
GarageArea          0
GarageQual         81
GarageCond         81
PavedDrive

We can also see that there are features with NaN values in them. We cannot delete those samples from our data as there are certain features like "PoolQC" which only have 7 non-NaN values. We have to check if ignoring the features with NaN values in them is a viable option as we've a lot of other features and if the value addition because of these isn't much. Otherwise, we will have to follow one of the multiple available options to fill the NaN values by mean, median, highest, mode or zeros based on the feature individually.

#### Before that we'll have to make a dev set from the available train data in order to compare and evaluate the performance of the models we make and use the same for hyper-tuning different models.

In [9]:
def train_val_split(input_data, split = 0.9):
    
    train = input_data.sample(frac = split)
    train_index = list(train.index)
    val = input_data[(input_data.index.isin(train_index) == False)]

    train.reset_index(drop = True, inplace = True)
    val.reset_index(drop = True, inplace = True)
    
    return train, val

In [10]:
x_train, x_val = train_val_split(train, 0.8)

In [11]:
x_train.shape, x_val.shape

((1168, 81), (292, 81))

In [12]:
x_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1205,20,RL,78.0,10140,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,7,2006,WD,Normal,153500
1,1174,50,RL,138.0,18030,Pave,,IR1,Bnk,AllPub,...,0,,MnPrv,,0,3,2007,WD,Normal,200500
2,600,160,RM,24.0,1950,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,,0,7,2008,COD,Normal,151000
3,998,20,RL,,11717,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2009,WD,Normal,185000
4,1382,20,RL,,12925,Pave,,IR1,Lvl,AllPub,...,0,,,,0,5,2008,WD,Normal,237500


### Baseline

Predicting average sale price of the train data as baseline

In [295]:
baseline_pred = np.mean(x_train['SalePrice'])

In [296]:
baseline_pred

180844.84674657535

In [297]:
baseline_preds = [baseline_pred for i in range(len(x_val))]

In [298]:
log_rmse(x_val, baseline_preds, x_val[['Id', 'SalePrice']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


0.4017191315145741

### Linear Regression on select Basic Numeric Features

Trying to fit a Linear Regression Model on the numerical only and important features, based on intuition and description of the feature, to see the performance of the model

In [69]:
from sklearn.linear_model import LinearRegression

In [13]:
lr = LinearRegression()

In [14]:
lr.fit(x_train[['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'PoolArea', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold', 'YrSold']], x_train['SalePrice'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

#### Evaluation:

Evaluation method chosen for this problem, as per the Kaggle Competition, is log RMSE. The log RMSE is RMSE between the logarithm of the predicted value and the logarithm of the observed sales price. (Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally.)

In [15]:
def log_rmse(dataset, preds, true):
    preds_ = pd.concat([dataset['Id'], pd.Series(preds)], axis = 1)
    preds_.rename(str, columns = {0:'SalePrice'}, inplace = True)
    true['log_pred'] = np.log(true['SalePrice'])
    preds_['log_pred'] = np.log(preds_['SalePrice'])
    eval = pd.merge(preds_, true, how = 'inner', on = 'Id')
    eval['log_error'] = eval['log_pred_x'] - eval['log_pred_y']
    
    return np.sqrt(np.sum(eval['log_error']**2)/len(eval['log_error']))

In [16]:
print('The train Log RMSE loss is {}'.format(log_rmse(x_train, lr.predict(x_train[['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'PoolArea', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold', 'YrSold']]), x_train[['Id', 'SalePrice']])))
print('The validation Log RMSE loss is {}'.format(log_rmse(x_val, lr.predict(x_val[['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'PoolArea', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold', 'YrSold']]), x_val[['Id', 'SalePrice']])))        

The train Log RMSE loss is 0.244807986935202
The validation Log RMSE loss is 0.2543075163842803


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


This is an improvement from the baseline model, but we can clearly see that there is a lot of scope for improvement. By including the categorical features that can help better determine 'SalePrice' and removing the features with less importance we can improve the performance of the model without increasing the complexity much.

### Decision Tree on select Basic Numeric Features

In [22]:
from sklearn.tree import DecisionTreeRegressor

In [23]:
dt = DecisionTreeRegressor()

In [24]:
dt.fit(x_train[['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'PoolArea', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold', 'YrSold']], x_train['SalePrice'])

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [25]:
print('The train Log RMSE loss is {}'.format(log_rmse(x_train, dt.predict(x_train[['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'PoolArea', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold', 'YrSold']]), x_train[['Id', 'SalePrice']])))   
print('The validation Log RMSE loss is {}'.format(log_rmse(x_val, dt.predict(x_val[['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'PoolArea', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold', 'YrSold']]), x_val[['Id', 'SalePrice']])))

The train Log RMSE loss is 0.0011836958451535025
The validation Log RMSE loss is 0.27086688195595976


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


From the above log RMSE values we can understand that the model is suffering from overfitting. As the train log RMSE is close to zero and the validation error is 0.27, it is clear that the model has been overfitted to the train data and hence the model has to be tuned for the right hyper-parameters so that it doesn't overfit the train data

#### Hyper-Parameter tuning of the model to avoid overfitting

In [26]:
min_split = [2, 5, 10, 20, 50, 75, 100, 200, 300, 500, 750, 1000, 2000, 5000]
min_leaves = [1, 2, 5, 10, 20, 50, 75, 100, 200, 300, 500, 750, 1000, 2000, 5000]

In [27]:
dt_train_acc = []
dt_val_acc = []
for i in min_split:
    for j in min_leaves:
        dt = DecisionTreeRegressor(min_samples_split = i, min_samples_leaf = j)
        dt.fit(x_train[['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'PoolArea', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold', 'YrSold']], x_train['SalePrice'])
        dt_train_acc.append(log_rmse(x_train, dt.predict(x_train[['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'PoolArea', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold', 'YrSold']]), x_train[['Id', 'SalePrice']]))
        dt_val_acc.append(log_rmse(x_val, dt.predict(x_val[['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'PoolArea', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold', 'YrSold']]), x_val[['Id', 'SalePrice']]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in

In [31]:
dt_train_acc[np.argmin(dt_val_acc)], np.min(dt_val_acc)

(0.17881359345610576, 0.22596608415074426)

The model performance on validation data has improved and the difference between the model's performance on train dataset and validation dataset has reduced indicating that the model isn't overfitting the train dataset. Although, the difference between train and validation error is high indicating that the model is still suffering from high variance. Also, the train error is also high now which means that the model is suffering from the bias issue as well. To improve the model's performance and reduce the bias and variance present in the model, we need to increase the qualitative features and data and opt for a better model that is more flexible and can learn the patterns in the data

### Random Forest on select Basic Numeric Features

In [32]:
from sklearn.ensemble import RandomForestRegressor

In [36]:
rf = RandomForestRegressor()

In [37]:
rf.fit(x_train[['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'PoolArea', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold', 'YrSold']], x_train['SalePrice'])



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [38]:
print('The train Log RMSE loss is {}'.format(log_rmse(x_train, rf.predict(x_train[['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'PoolArea', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold', 'YrSold']]), x_train[['Id', 'SalePrice']])))
print('The validation Log RMSE loss is {}'.format(log_rmse(x_val, rf.predict(x_val[['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'PoolArea', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold', 'YrSold']]), x_val[['Id', 'SalePrice']])))  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


(0.08552741900686368, 0.20812832080082125)

We can see that the model performance has improved by choosing a more complex and flexible model that can better discern the trends inherent to the data. But again, we have the problem of the overfitting and hence we've to tune the model to reduce the overfitting of the model to the train dataset to improve the validation dataset performance.

First we perform coarse grid search and then we can further narrow down on the hyper-parameters, if we want to, to find the ideal optimal parameters

In [41]:
from sklearn.model_selection import GridSearchCV

In [58]:
clf = RandomForestRegressor(n_estimators=20)

param_grid = {"max_depth": [3, 5, None],
              "max_features": [1, 3, 10, None],
              "min_samples_split": [2, 50, 100, 500, 1000],
              "bootstrap": [True, False],
              "n_estimators": [100, 500, 1000, 3000]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=3, scoring= 'neg_mean_squared_log_error')
start = time()
grid_search.fit(x_train[['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'PoolArea', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold', 'YrSold']], x_train['SalePrice'])

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))

GridSearchCV took 1217.41 seconds for 480 candidate parameter settings.


In [59]:
grid_search.best_score_, grid_search.best_params_

(-0.03466070799254006,
 {'bootstrap': True,
  'max_depth': None,
  'max_features': 10,
  'min_samples_split': 2,
  'n_estimators': 500})

In [60]:
rf = RandomForestRegressor(**grid_search.best_params_)

In [61]:
rf.fit(x_train[['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'PoolArea', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold', 'YrSold']], x_train['SalePrice'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=10, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=None, oob_score=False,
           random_state=None, verbose=0, warm_start=False)

In [62]:
log_rmse(x_train, rf.predict(x_train[['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'PoolArea', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold', 'YrSold']]), x_train[['Id', 'SalePrice']]), log_rmse(x_val, rf.predict(x_val[['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'PoolArea', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold', 'YrSold']]), x_val[['Id', 'SalePrice']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


(0.07290128250864045, 0.19866393034206925)

In [84]:
(len(x_train) - x_train.count()).reset_index()[(len(x_train) - x_train.count()).reset_index()[0] != 0]

Unnamed: 0,index,0
3,LotFrontage,208
6,Alley,1093
25,MasVnrType,8
26,MasVnrArea,8
30,BsmtQual,30
31,BsmtCond,30
32,BsmtExposure,31
33,BsmtFinType1,30
35,BsmtFinType2,31
42,Electrical,1


Here's a brief version of the above features and what they actually are:

* LotFrontage: Linear feet of street connected to property
* Alley: Type of alley access
* MasVnrType: Masonry veneer type
* MasVnrArea: Masonry veneer area in square feet
* BsmtQual: Height of the basement
* BsmtCond: General condition of the basement
* BsmtExposure: Walkout or garden level basement walls
* BsmtFinType1: Quality of basement finished area
* BsmtFinType2: Quality of second finished area (if present)
* Electrical: Electrical system
* FireplaceQu: Fireplace quality
* GarageType: Garage location
* GarageYrBlt: Year garage was built
* GarageFinish: Interior finish of the garage
* GarageQual: Garage quality
* GarageCond: Garage condition
* PoolQC: Pool quality
* Fence: Fence quality
* MiscFeature: Miscellaneous feature not covered in other categories

### Encoding all categorical Features

As we've seen that the model still has a bias issue, we try to add more features from the available data to improve the model. Adding more features implies more data and more complexity to the model. It's upto us to decide whether the increased complexity and effort is worth the improvement in the model's performance.

In [64]:
train_encoded = pd.get_dummies(train)

In [65]:
train_encoded.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0


As we can see that encoding all the categorical features into numerical dummies has significantly increased the data to be fed into the model and now the dataset has 290 features instead of earlier 14 features.

In [66]:
x_train_encoded, x_val_encoded = train_val_split(train_encoded, 0.8)

In [67]:
x_train_encoded.shape, x_val_encoded.shape

((1168, 290), (292, 290))

Unnamed: 0,index,0
2,LotFrontage,213
8,MasVnrArea,6
25,GarageYrBlt,64


In [68]:
x_train_encoded.dropna(axis=1, how='any').shape

(1168, 287)

### Random Forest on all encoded features

In [307]:
rf = RandomForestRegressor()#max_depth=200, n_estimators=100)

In [308]:
rf.fit(x_train_encoded.dropna(axis=1, how='any').drop(['Id', 'SalePrice'], axis = 1), x_train_encoded.dropna(axis=1, how='any')['SalePrice'])



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [311]:
log_rmse(x_train_encoded, rf.predict(x_train_encoded.dropna(axis=1, how='any').drop(['Id', 'SalePrice'], axis = 1)), x_train_encoded.dropna(axis=1, how='any')[['Id', 'SalePrice']])

0.06828580227366929

In [316]:
log_rmse(x_val_encoded, rf.predict(x_val_encoded.dropna(axis=1, how='any').drop(['Id', 'SalePrice'], axis = 1)), x_val_encoded.dropna(axis=1, how='any')[['Id', 'SalePrice']])

ValueError: Number of features of the model must match the input. Model n_features is 285 and input n_features is 286 

In [306]:
pd.DataFrame(rf.feature_importances_, x_train_encoded.drop('SalePrice', axis = 1).columns).reset_index(inplace = True)

ValueError: Shape of passed values is (1, 285), indices imply (1, 289)

#Make evaluation set and try to tune on it

check for more features and make it better

implement more models