# Home Data for ML Course

https://www.kaggle.com/competitions/home-data-for-ml-course

## Setup

In [1]:
import os
import glob
from pathlib import Path
import random
from PIL import Image

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [2]:
IS_KAGGLE = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

COMP_NAME = 'home-data-for-ml-course'
if COMP_NAME is None:
    raise NameError('COMP_NAME has not been initialized')

DATA_PATH = Path('../input/' + COMP_NAME) if IS_KAGGLE else Path('./data')

RANDOM_SEED = 42

In [3]:
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

## Load Data

In [4]:
path = Path('./data')
if not DATA_PATH.exists():
    import zipfile, kaggle
    kaggle.api.competition_download_cli(COMP_NAME)
    zipfile.ZipFile(f'{COMP_NAME}.zip').extractall(DATA_PATH)

In [5]:
train_data = pd.read_csv(DATA_PATH / 'train.csv')
test_data = pd.read_csv(DATA_PATH / 'test.csv')

print('train:', train_data.shape)
print('test:', test_data.shape)

train: (1460, 81)
test: (1459, 80)


## Data Exploration

In [6]:
train_data.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [7]:
test_data.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [8]:
num_cols = train_data.select_dtypes(exclude='object')
print('Numerical columns:\n', num_cols.columns)

Numerical columns:
 Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')


In [9]:
cat_cols = train_data.select_dtypes(exclude=['int64', 'float64'])
print('Categorical columns:\n', cat_cols.columns)

Categorical columns:
 Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')


In [10]:
missing_counts = train_data.isnull().sum()
missing_counts = missing_counts[missing_counts > 0].sort_values(ascending=False)
print(missing_counts)

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
BsmtFinType2      38
BsmtExposure      38
BsmtFinType1      37
BsmtCond          37
BsmtQual          37
MasVnrArea         8
Electrical         1
dtype: int64


## Prepare Training/Validation Data

In [11]:
X = train_data.dropna(axis=0, subset=['SalePrice'])
y = train_data['SalePrice']

In [19]:
features = [
    'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 
    'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold'
]

X = X[features]

print(X.shape)

(1460, 33)


In [13]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, random_state=RANDOM_SEED)

print('train:', 'X =', X_train.shape, ', y=', y_train.shape)
print('valid:', 'X =', X_valid.shape, ', y=', y_valid.shape)

train: X = (1168, 33) , y= (1168,)
valid: X = (292, 33) , y= (292,)


## Decision Tree Regressor

In [14]:
model = DecisionTreeRegressor(random_state=RANDOM_SEED)

grid_params = [{
    'max_depth':[5,10,15,20,25,50],
}]
grid = GridSearchCV(estimator=model, param_grid=grid_params, cv=10, n_jobs=-1, verbose=True)
grid.fit(X_train, y_train)

print('Best accuracy:', grid.best_score_)
print('Best parameters:', grid.best_params_)

y_valid_preds = grid.predict(X_valid)

print('\nMean absolute error:', mean_absolute_error(y_valid_preds, y_valid))
print('R2 score:', r2_score(y_valid_preds, y_valid))

Fitting 10 folds for each of 6 candidates, totalling 60 fits
Best accuracy: 0.7013232648031689
Best parameters: {'max_depth': 20}

Mean absolute error: 26550.97978500761
R2 score: 0.7577377135969902


## Random Forest Regressor

In [15]:
model = RandomForestRegressor(random_state=RANDOM_SEED)

grid_params = [{
    'n_estimators': [5, 10, 25, 50, 100, 200],
    'max_depth': [5, 10, 15, 20, 25, 50],
}]
grid = GridSearchCV(estimator=model, param_grid=grid_params, cv=10, n_jobs=-1, verbose=True)
grid.fit(X_train, y_train)

print('Best accuracy:', grid.best_score_)
print('Best parameters:', grid.best_params_)

y_valid_preds = grid.predict(X_valid)

print('\nMean absolute error:', mean_absolute_error(y_valid_preds, y_valid))
print('R2 score:', r2_score(y_valid_preds, y_valid))

Fitting 10 folds for each of 36 candidates, totalling 360 fits
Best accuracy: 0.8410186573292183
Best parameters: {'max_depth': 20, 'n_estimators': 200}

Mean absolute error: 17909.12533672568
R2 score: 0.8502395726095415


## XGBRegressor

In [16]:
model = XGBRegressor(random_state=RANDOM_SEED)

grid_params = [{
    'learning_rate': [0.03, 0.05, 0.07],
    'max_depth': [5, 6, 7],
    'n_estimators': [500],
}]
grid = GridSearchCV(estimator=model, param_grid=grid_params, cv=10, n_jobs=-1, verbose=True)
grid.fit(X_train, y_train)

print('Best accuracy:', grid.best_score_)
print('Best parameters:', grid.best_params_)

y_valid_preds = grid.predict(X_valid)

print('Mean absolute error:', mean_absolute_error(y_valid_preds, y_valid))
print('R2 score:', r2_score(y_valid_preds, y_valid))

Fitting 10 folds for each of 9 candidates, totalling 90 fits
Best accuracy: 0.847079088260198
Best parameters: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 500}
Mean absolute error: 16631.136825770547
R2 score: 0.845765620501494


## Submission

In [17]:
model = XGBRegressor(random_state=RANDOM_SEED, learning_rate=0.05, max_depth=5, n_estimators=500)
model.fit(X_train, y_train)
y_valid_preds = grid.predict(X_valid)

print('Mean absolute error:', mean_absolute_error(y_valid_preds, y_valid))
print('R2 score:', r2_score(y_valid_preds, y_valid))

X_test = test_data[features]
y_test_preds = model.predict(X_test)

Mean absolute error: 16631.136825770547
R2 score: 0.845765620501494


In [18]:
submission = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': list(y_test_preds)})
submission.to_csv('submission.csv', index=False)