# Notes
- tried to remove rows with outliers in Saleprice. It made the model overfit.
- Need to focus on Exploration and make it agile. Not making it rigid.

# Load Data

In [1]:
import pandas as pd, numpy as np, seaborn as sns

df = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
df.keys()

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [2]:
df.drop(columns=['Id'], inplace=True)

df.dtypes.value_counts()

object     43
int64      34
float64     3
Name: count, dtype: int64

In [3]:
num = [x for x in df.columns if df.dtypes[x] in ('int64', 'float')]
cat = [x for x in df.columns if df.dtypes[x] == 'object']
target = 'SalePrice'
num.remove(target)

In [4]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=.2, random_state=0)

# Explore

In [43]:
train['YearBuiltRemodel'] = train['YearBuilt'] * train['YearRemodAdd']

In [39]:
train.corr(numeric_only=True)[target].sort_values(ascending=False)

SalePrice           1.000000
OverallQual         0.790636
GrLivArea           0.721707
GarageCars          0.660927
GarageArea          0.646603
TotalBsmtSF         0.635535
1stFlrSF            0.620740
YearBuiltRemodel    0.579258
FullBath            0.578134
TotRmsAbvGrd        0.542658
YearBuilt           0.521242
YearRemodAdd        0.517822
GarageYrBlt         0.494154
MasVnrArea          0.493482
Fireplaces          0.470320
BsmtFinSF1          0.401390
LotFrontage         0.360274
OpenPorchSF         0.320193
2ndFlrSF            0.320141
WoodDeckSF          0.311813
HalfBath            0.283762
LotArea             0.252766
BsmtFullBath        0.226432
BsmtUnfSF           0.206736
BedroomAbvGr        0.182258
ScreenPorch         0.096740
3SsnPorch           0.057561
MoSold              0.041369
PoolArea            0.027441
BsmtHalfBath       -0.010994
BsmtFinSF2         -0.020622
LowQualFinSF       -0.023128
MiscVal            -0.024750
YrSold             -0.045628
OverallCond   

In [None]:
train

# Pipeline

In [44]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [45]:
from sklearn.base import BaseEstimator, TransformerMixin
class DataSelect(BaseEstimator, TransformerMixin):
    def __init__(self, attributes):
        self.attributes = attributes
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X[self.attributes]

In [49]:
eng = ['YearBuilt', 'YearRemodAdd']

In [68]:
class Engineer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X.loc[:, ['YearBuiltRemodel']] = X['YearBuilt'] * X['YearRemodAdd']
        return X

In [69]:
eng_pp = Pipeline([
    ('dataselect', DataSelect(eng))
    , ('engineer', Engineer())
])

num_pp = Pipeline([
    ('dataselect', DataSelect(num))
    , ('impute', SimpleImputer(strategy='median'))
    , ('scaler', StandardScaler())
])

cat_pp = Pipeline([
    ('dataselect', DataSelect(cat))
    , ('encoder', OneHotEncoder(sparse_output=False))
])

pipe = FeatureUnion([
    ('numeric', num_pp)
    , ('engineer', eng_pp)
#     , ('categorical', cat_pp)
])

pipe.fit(train)
train_pre = pipe.transform(train)
test_pre = pipe.transform(test)

# Model Selection

In [70]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [71]:
lin = LinearRegression()
lin.fit(train_pre, train[target])
np.sqrt(-cross_val_score(lin, train_pre, train[target], scoring='neg_mean_squared_error')).mean()

31807.142058718367

In [72]:
dtr = DecisionTreeRegressor(random_state=0)
dtr.fit(train_pre, train[target])
np.sqrt(-cross_val_score(dtr, train_pre, train[target], scoring='neg_mean_squared_error')).mean()

40988.03524904093

In [73]:
rfr = RandomForestRegressor(random_state=0)
rfr.fit(train_pre, train[target])
np.sqrt(-cross_val_score(rfr, train_pre, train[target], scoring='neg_mean_squared_error')).mean()

29349.154409152503

In [74]:
model_base = rfr

# Tune Model

In [75]:
model_base.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [76]:
from sklearn.model_selection import GridSearchCV
grd = GridSearchCV(model_base
                         , param_grid={ 'random_state': [0]
                                       , 'n_estimators': [100, 200, 300]
                                       , 'max_features': [2, 4, 8, 10]
                                      }
                         , scoring='neg_mean_squared_error')
# grd.fit(train_pre, train[target])
# grd.best_params_

In [77]:
model_tune = RandomForestRegressor(
    random_state=0
    , n_estimators=300
    , max_features=10
#     , max_leaf_nodes=None
#     , max_samples=None
)
model_tune.fit(train_pre, train[target]);

# Validation

In [80]:
print('base',
    np.sqrt(-cross_val_score(model_base, train_pre, train[target], scoring='neg_mean_squared_error')).mean(),
    np.sqrt(-cross_val_score(model_base, test_pre, test[target], scoring='neg_mean_squared_error')).mean()
)

print('tune',
    np.sqrt(-cross_val_score(model_tune, train_pre, train[target], scoring='neg_mean_squared_error')).mean(),
    np.sqrt(-cross_val_score(model_tune, test_pre, test[target], scoring='neg_mean_squared_error')).mean()
)

base 29349.154409152503 40226.526639703676
tune 28313.172877800593 39594.55051914004


    ## Multiplying YearBuilt * YearRemodAdd
    29305.91019299344 39817.936855094296
    28593.345346409456 39726.29547575848
    
    ## Baseline
    29701.66889770482 38888.041253125855
    28576.013401852346 38899.51112575973

# Submission

In [81]:
samp = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

pd.DataFrame({
    'Id': samp['Id']
    , 'SalePrice': model_tune.predict(pipe.transform(samp))
}).to_csv('submission.csv', index=False)