In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import seaborn as sns


pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000
#hide
import warnings
warnings.simplefilter('ignore', FutureWarning)

# Loading data

In [3]:
df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Data cleaning + Feature Engineering

In [4]:
df.head().T

In [5]:
[print(f"{col:25}\t{df[col].dtype}\t{df[col].nunique()}") for col in df.columns];

In [6]:
# Combine the ["MoSold","YrSold"] columns into a single datetime column ['DateSold'] 
# and drop the original columns; make a bunch of new columns out of the new ['DateSold'] col

df['DateSold'] = pd.to_datetime(df.YrSold.astype(str) + '/' + df.MoSold.astype(str) + '/01')
df.drop(['MoSold', 'YrSold'], axis=1, inplace=True)
add_datepart(df, 'DateSold')

In [7]:
dep_var = 'SalePrice' # dependent variable
df[dep_var] = np.log(df[dep_var]) # evaluation criterion is the RMSE of the logarithm

In [8]:
# separate continuous and categorial variables based on cardinality
cont,cat = cont_cat_split(df, 25, dep_var=dep_var) # the highest categorical data had cardinality = 25

In [9]:
print(cont)
print(cat)

In [10]:
# Categorify: https://docs.fast.ai/tabular.core.html#Categorify
# FillMissing: https://docs.fast.ai/tabular.core.html#FillMissing
# Normalize: https://docs.fast.ai/data.transforms.html#Normalize
procs = [Categorify, FillMissing, Normalize]

In [11]:
# This is not an extrapolation problem; the test set does not contain points 
# boyond the region of validity of the training set; hence, we do a random 80-20 split
# of the training set into train + valid

splits = RandomSplitter(valid_pct=0.3, seed=0)(range_of(df))

In [12]:
# TabularPandas: https://docs.fast.ai/tabular.core.html#TabularPandas
to = TabularPandas(df, procs, cat, cont, dep_var, splits=splits)

In [13]:
print(cont)
print(cat)

In [14]:
fig, axs = plt.subplots(ncols=len(cat), nrows=1, figsize=(len(cat)*5, 5))

for idx, col in enumerate(cat):
    to.train.xs[col].value_counts(sort=False).plot.barh(ax=axs[idx], label=col)
    axs[idx].legend()

In [15]:
fig, axs = plt.subplots(ncols=len(cont), nrows=2, figsize=(len(cont)*5, 10))

for idx, col in enumerate(cont):
    sns.boxplot(y=to.train.xs[col], ax=axs[0,idx])
    sns.violinplot(y=to.train.xs[col], ax=axs[1,idx])
    axs[1,idx].set_xlabel(col)

OUTLIERS?

In [16]:
# Metrics for evaluation: Root-Mean-Squared-Error (RMSE) between the logarithm of the 
# predicted value and the logarithm of the observed sales price
# https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview/evaluation

def r_mse(pred,y): return round(math.sqrt(((pred-y)**2).mean()), 6)
def m_rmse(m, xs, y): return r_mse(m.predict(xs), y)

In [17]:
# Split into train + valid

xs,y = to.train.xs,to.train.y
valid_xs,valid_y = to.valid.xs,to.valid.y
len(to.train),len(to.valid)

# Model Building

## Decision Tree Regressor

In [18]:
dtr = DecisionTreeRegressor()
dtr.fit(xs, y)
m_rmse(dtr, xs, y), m_rmse(dtr, valid_xs, valid_y)

This gives a 0.0 training set error (exact predictions), but generalises badly to the validation set. Hence we do some hyperparameter tuning.

In [19]:
# Hyperparameter tuning

from sklearn.model_selection import GridSearchCV

param_grid = {"splitter": ["best", "random"],
              "min_samples_split": [10, 20, 40],
              "max_depth": [2, 6, 8],
              "min_samples_leaf": [20, 40, 100],
              "max_leaf_nodes": [5, 20, 100],
              "min_weight_fraction_leaf": [0.1, 0.3, 0.5],
              }

dtr = DecisionTreeRegressor()

clf = GridSearchCV(estimator=dtr, 
                   param_grid=param_grid,
                   scoring='neg_root_mean_squared_error',
                   n_jobs=-1)

clf.fit(xs, y)
print("Best parameters:", clf.best_params_)
print("Lowest RMSE: ", (-clf.best_score_))

In [20]:
# Ouptut: Best parameters: {'max_depth': 6, 'max_leaf_nodes': 20, 'min_samples_leaf': 20, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.1, 'splitter': 'best'}
# Lowest RMSE:  0.2349359993399057

# Saving the output so that we don't run the above cell multiple times

In [21]:
dtr = DecisionTreeRegressor(max_depth=6, max_leaf_nodes=20, 
                          min_samples_leaf=20, min_samples_split=10,
                         min_weight_fraction_leaf=0.1)
dtr.fit(xs, y)
m_rmse(dtr, xs, y), m_rmse(dtr, valid_xs, valid_y)

## Random Forest Regressor

In [22]:
rfr = RandomForestRegressor()
rfr.fit(xs, y)
m_rmse(rfr, xs, y), m_rmse(rfr, valid_xs, valid_y)

In [23]:
from sklearn.model_selection import RandomizedSearchCV

random_grid = {'n_estimators': [5,20,50,100],
               'max_features': ['auto', 'sqrt'],
               'max_depth': [int(x) for x in np.linspace(10, 120, num = 12)],
               'min_samples_split': [2, 6, 10],
               'min_samples_leaf': [1, 3, 4],
               'bootstrap': [True, False]}


rfr = RandomForestRegressor()

clf = RandomizedSearchCV(estimator=rfr, 
                   param_distributions=random_grid,
                   scoring='neg_root_mean_squared_error',
                   n_jobs=-1)

clf.fit(xs, y)
print("Best parameters:", clf.best_params_)
print("Lowest RMSE: ", (-clf.best_score_))

In [24]:
# Best parameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 70, 'bootstrap': True}
# Lowest RMSE:  0.1509913147378329

In [25]:
def rf(xs, y, n_estimators=100, min_samples_split=6, min_samples_leaf=4,
       max_features='sqrt',max_depth=20, bootstrap=True, **kwargs):
    return RandomForestRegressor(n_jobs=-1, n_estimators=n_estimators,
                                 min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf,
                                 max_features=max_features,
                                 max_depth=max_depth,
                                 oob_score=True).fit(xs, y)

In [26]:
rfr = rf(xs, y)
m_rmse(rfr, xs, y), m_rmse(rfr, valid_xs, valid_y)

### Plotting feature importance

In [27]:
def rf_feat_importance(rfr, df):
    return pd.DataFrame({'cols':df.columns, 'imp':rfr.feature_importances_}
                       ).sort_values('imp', ascending=False)

In [28]:
fi = rf_feat_importance(rfr, xs)
fi[:10]

In [29]:
def plot_fi(fi):
    return fi.plot('cols', 'imp', 'barh', figsize=(12,7), legend=False)

plot_fi(fi[:30]);

In [30]:
to_keep = fi[fi.imp>0.005].cols
len(to_keep)

In [31]:
xs_imp = xs[to_keep]
valid_xs_imp = valid_xs[to_keep]

In [32]:
rfr = rf(xs_imp, y)
m_rmse(rfr, xs_imp, y), m_rmse(rfr, valid_xs_imp, valid_y)

In [33]:
plot_fi(rf_feat_importance(rfr, xs_imp));

In [34]:
len(xs.columns), len(xs_imp.columns)

### Removing Redundant Features

In [35]:
!pip install -Uqq fastbook
import fastbook
from fastbook import *

In [36]:
cluster_columns(xs_imp)

In [37]:
def get_oob(df):
    rfr = RandomForestRegressor(n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=15,
       max_features='sqrt',max_depth=20, bootstrap=True, oob_score=True)
    rfr.fit(df, y)
    return rfr.oob_score_

In [38]:
get_oob(xs_imp)

In [39]:
{c:get_oob(xs_imp.drop(c, axis=1)) for c in (
    'FireplaceQu','Fireplaces',
    'GrLivArea', 'TotRmsAbvGrd', 
    'GarageArea', 'GarageCars',
    'GarageYrBlt', 'YearBuilt',
    '1stFlrSF', 'TotalBsmtSF')}

In [40]:
to_drop = ['FireplaceQu', 'TotRmsAbvGrd', 'GarageArea', 'GarageYrBlt', '1stFlrSF']
get_oob(xs_imp.drop(to_drop, axis=1))

In [41]:
xs_final = xs_imp.drop(to_drop, axis=1)
valid_xs_final = valid_xs_imp.drop(to_drop, axis=1)

In [42]:
# saving final versions

save_pickle('/kaggle/working/xs_final.pkl', xs_final)
save_pickle('/kaggle/working/valid_xs_final.pkl', valid_xs_final)

In [43]:
# loading them back

xs_final = load_pickle('/kaggle/working/xs_final.pkl')
valid_xs_final = load_pickle('/kaggle/working/valid_xs_final.pkl')

In [44]:
rfr = rf(xs_final, y)
m_rmse(rfr, xs_final, y), m_rmse(rfr, valid_xs_final, valid_y)

### Partial Dependence

As we've seen, the two most important predictors are `GrLivArea` and `OverallQual`. We'd like to understand the relationship between these predictors and sale price. It's a good idea to first check the count of values per category (provided by the Pandas value_counts method), to see how common each category is:

In [45]:
p = valid_xs_final['OverallQual'].value_counts(sort=False).plot.barh()
c = to.classes['OverallQual']
plt.yticks(range(len(c)), c);

#na#: label fastai applies to missing values.

In [46]:
ax = valid_xs_final['GrLivArea'].hist()

In [47]:
from sklearn.inspection import plot_partial_dependence

fig,ax = plt.subplots(figsize=(12, 4))
plot_partial_dependence(rfr, valid_xs_final, ['GrLivArea','OverallQual'],
                        grid_resolution=20, ax=ax);

### Test set predictions with RFR

In [48]:
df_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

df_test['DateSold'] = pd.to_datetime(df_test.YrSold.astype(str) + '/' + df_test.MoSold.astype(str) + '/01')
add_datepart(df_test, 'DateSold')

cont_test,cat_test = cont_cat_split(df_test, max_card=25, dep_var=None)
procs_test = [Categorify, FillMissing, Normalize]
to_test = TabularPandas(df_test, procs_test, cat_test, cont_test, y_names=None)

xs_test = to_test.train.xs[list(xs_final.columns)]

predicted_prices = rfr.predict(xs_test)

my_submission = pd.DataFrame({'Id': df_test.Id, 'SalePrice': np.exp(predicted_prices)})

my_submission.to_csv('submission_rfr.csv', index=False)

## XGBoost

In [49]:
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

In [50]:
xgr = XGBRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(xgr, xs, y, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)
scores = np.absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()) )

In [51]:
xgr = XGBRegressor().fit(xs, y)
m_rmse(xgr, xs, y), m_rmse(xgr, valid_xs, valid_y)

In [52]:
# Start with original training and validation set: (xs,y), (valid_xs,valid_y) 
# and perform a hyperparameter-tuning

"""
from xgboost import XGBRegressor
from xgboost import plot_importance


params = {"learning_rate"    : [ 0.001, 0.01, 0.1, 1.0 ],
          "max_depth"        : [ 3, 6, 9 ],
          "n_estimators"     : [ 50, 100, 500 ]}

xgbr = XGBRegressor(seed = 20)
clf = GridSearchCV(estimator=xgbr, 
                   param_grid=params,
                   scoring='neg_root_mean_squared_error',
                   n_jobs=-1)

clf.fit(xs, y)
print("Best parameters:", clf.best_params_)
print("Lowest RMSE: ", (-clf.best_score_))
"""

In [53]:
# Saving the output so that we don't run the above cell multiple times

#Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500}
#Lowest RMSE:  0.13391363620758057

In [54]:
xgr = XGBRegressor(learning_rate=0.1, max_depth=3, n_estimators=500).fit(xs, y)
m_rmse(xgr, xs, y), m_rmse(xgr, valid_xs, valid_y)

### Performing feature importance and dropping redundant featurse

In [55]:
plot_importance(xgr, max_num_features=10)

In [56]:
# Model selection

# Credit: https://machinelearningmastery.com/feature-importance-and-feature-selection-with-xgboost-in-python/

import warnings
warnings.filterwarnings('ignore')

# use feature importance for feature selection
from numpy import sort
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectFromModel

# split data into train and test sets
#X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=7)
# fit model on all training data
xgr = XGBRegressor(learning_rate=0.1, max_depth=3, n_estimators=500)
xgr.fit(xs, y)
# make predictions for test data and evaluate
y_pred = xgr.predict(valid_xs)
predictions = [round(value) for value in y_pred]
rmse_score = np.sqrt(mean_squared_error(valid_y, predictions))
print("RMSE: %.2f" % (rmse_score))
# Fit model using each importance as a threshold
thresholds = sort(xgr.feature_importances_)
for thresh in thresholds[::-1][0:60]:
	# select features using threshold
	selection = SelectFromModel(xgr, threshold=thresh, prefit=True)
	select_X_train = selection.transform(xs)
	# train model
	selection_model = XGBRegressor(learning_rate=0.1, max_depth=3, n_estimators=500)
	selection_model.fit(select_X_train, y)
	# eval model
	select_X_test = selection.transform(valid_xs)
	y_pred = selection_model.predict(select_X_test)
	predictions = [round(value) for value in y_pred]
	rmse_score = np.sqrt(mean_squared_error(valid_y, predictions))
	print("Thresh=%.3f, n=%d, RMSE: %.2f" % (thresh, select_X_train.shape[1], rmse_score))

In [57]:
to_keep = fi[fi.imp>0.01].cols
print(len(to_keep))

xs_xgr_final = xs[to_keep]

xgr = XGBRegressor(learning_rate=0.1, max_depth=3, n_estimators=500)

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(xgr, xs_xgr_final, y, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)
scores = np.absolute(scores)
print('Mean RMSE: %.3f (%.3f)' % (scores.mean(), scores.std()))

### Test set predictions with XGBoost

In [58]:
xgr = XGBRegressor(learning_rate=0.1, max_depth=3, n_estimators=500).fit(xs_xgr_final, y)
m_rmse(xgr, xs_xgr_final, y), m_rmse(xgr, valid_xs[list(xs_xgr_final.columns)], valid_y)

In [59]:
df_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

df_test['DateSold'] = pd.to_datetime(df_test.YrSold.astype(str) + '/' + df_test.MoSold.astype(str) + '/01')
add_datepart(df_test, 'DateSold')

cont_test,cat_test = cont_cat_split(df_test, max_card=25, dep_var=None)
procs_test = [Categorify, FillMissing, Normalize]
to_test = TabularPandas(df_test, procs_test, cat_test, cont_test, y_names=None)

xs_test = to_test.train.xs[list(xs_xgr_final.columns)]

predicted_prices = xgr.predict(xs_test)

my_submission = pd.DataFrame({'Id': df_test.Id, 'SalePrice': np.exp(predicted_prices)})

my_submission.to_csv('submission_xgr.csv', index=False)

## Neural Network [WIP]

In [60]:
df_nn = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')

df_nn['DateSold'] = pd.to_datetime(df_nn.YrSold.astype(str) + '/' + df_nn.MoSold.astype(str) + '/01')
df_nn.drop(['MoSold', 'YrSold'], axis=1, inplace=True)
add_datepart(df_nn, 'DateSold')

df_nn[dep_var] = np.log(df_nn[dep_var])

cont_nn,cat_nn = cont_cat_split(df_nn, max_card=25, dep_var=dep_var)
procs_nn = [Categorify, FillMissing, Normalize]
to_nn = TabularPandas(df_nn, procs_nn, cat_nn, cont_nn,
                      splits=splits, y_names=dep_var)

In [61]:
dls = to_nn.dataloaders(bs=64)

In [62]:
dls.show_batch()

In [63]:
# It's a good idea to set y_range for regression models, so let's find the min and max of our dependent variable:

y = to_nn.train.y
y.min(),y.max()

In [64]:
learn = tabular_learner(dls, y_range=(10,14),
                        n_out=1, loss_func=F.mse_loss)

In [65]:
learn.lr_find()

In [66]:
learn.fit_one_cycle(5, 1e-2)

In [67]:
dl = learn.dls.test_dl(valid_xs)
pred = learn.get_preds(dl=dl)[0].numpy()
#rmse(pred.T.flatten(),valid_y.values)
#round(math.sqrt(((pred-y)**2).mean()), 6)
math.sqrt(((pred.T.flatten() - valid_y.values)**2.).mean())

In [68]:
df_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

df_test['DateSold'] = pd.to_datetime(df_test.YrSold.astype(str) + '/' + df_test.MoSold.astype(str) + '/01')
add_datepart(df_test, 'DateSold')

cont_test,cat_test = cont_cat_split(df_test, max_card=25, dep_var=None)
procs_test = [Categorify, FillMissing, Normalize]
to_test = TabularPandas(df_test, procs_test, cat_test, cont_test, y_names=None)

xs_test = to_test.train.xs#[list(xs_xgr_final.columns)]

In [69]:
dl = learn.dls.test_dl(xs_test)
learn.get_preds(dl=dl)

In [70]:
predicted_prices = learn.get_preds(dl=dl)

In [71]:
np.shape(predicted_prices[0].numpy().flatten())

In [72]:
my_submission = pd.DataFrame({'Id': df_test.Id, 'SalePrice': np.exp(predicted_prices[0].numpy().flatten())})
my_submission.to_csv('submission_nn.csv', index=False)

In [73]:
learn.save('nn')

| Model | Training Score | Valid Score | Test Score |
|--|--|--|--|
| DecisionTreeRegressor | 0.215144 | 0.234927 | not submitted |
| RandomForestRegressor | 0.105759 | 0.141392 | 0.17211 |
| XGBoost | 0.047828 | 0.131914 | 0.14819 |
| Neural Network (WIP) | XX | 2.0703435931479985 | 2.05518 | 