# House Price EDA and Model Training 

In [1]:
import sys
import os
# Project root = parent of the directory that contains this notebook.
# __vsc_ipynb_file__ is not a Python builtin (presumably injected by the VS Code
# Jupyter extension -- TODO confirm), so fall back to the working directory when
# it is missing instead of failing with NameError. The fallback assumes the
# kernel's working directory is the notebook's own folder.
_notebook_file = globals().get("__vsc_ipynb_file__")
if _notebook_file:
    rootpath = os.path.dirname(os.path.dirname(_notebook_file))
else:
    rootpath = os.path.dirname(os.getcwd())

In [2]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Lift pandas' display truncation: None means "no limit" for rows and columns
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [3]:
# Training CSV location, given relative to the project root
path = "data/house-prices-advanced-regression-techniques/train.csv"
# Resolve against rootpath so the read does not depend on the working directory
df = pd.read_csv(filepath_or_buffer=os.path.join(rootpath, path))

In [4]:
# Row count of the frame as loaded, before any filtering
df.shape[0]

1460

In [5]:
# Will be removed in the next commit

# Columns that either had low feature importance or whose values were mostly NA

# df.drop(columns=["Id","PoolQC","Fence","MiscFeature","Alley","MasVnrType","FireplaceQu"],inplace=True)

# # Unnecessary features - based on feature importance from catboost
# unnecessary_feats = ['MSZoning', 'Street', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtFinSF2', 'Heating', 'HeatingQC', 'Electrical', 'LowQualFinSF', 'BsmtHalfBath', 'KitchenAbvGr', 'KitchenQual', 'Functional', 'GarageType', 'GarageQual', 'GarageCond', 'PavedDrive', 'MiscVal', 'SaleType', 'SaleCondition']

# df.drop(columns=unnecessary_feats,inplace=True)

# df[df.columns[df.isna().sum() > 0]].isna().sum()

In [6]:
# Final columns used to train the CatBoost model
final_cols = ['MSSubClass', 'LotFrontage', 'LotArea', 'LotShape', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'ExterQual',
       'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', 'CentralAir', '1stFlrSF',
       '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MoSold',
       'YrSold']

# Training frame columns = the selected features plus the regression target
train_cols = [*final_cols, "SalePrice"]

# Select the columns and drop every row with a missing value in any of them.
# Chained (non in-place) dropna avoids mutating a column-subset selection,
# which can trigger pandas' SettingWithCopyWarning.
df = df[train_cols].dropna(how="any")
df.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,LotShape,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,CentralAir,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,FullBath,HalfBath,BedroomAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageFinish,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MoSold,YrSold,SalePrice
0,60,65.0,8450,Reg,7,5,2003,2003,196.0,Gd,706,150,856,Y,856,854,1710,1,2,1,3,8,0,2003.0,RFn,2,548,0,61,0,0,0,0,2,2008,208500
1,20,80.0,9600,Reg,6,8,1976,1976,0.0,TA,978,284,1262,Y,1262,0,1262,0,2,0,3,6,1,1976.0,RFn,2,460,298,0,0,0,0,0,5,2007,181500
2,60,68.0,11250,IR1,7,5,2001,2002,162.0,Gd,486,434,920,Y,920,866,1786,1,2,1,3,6,1,2001.0,RFn,2,608,0,42,0,0,0,0,9,2008,223500
3,70,60.0,9550,IR1,7,5,1915,1970,0.0,TA,216,540,756,Y,961,756,1717,1,1,0,3,7,1,1998.0,Unf,3,642,0,35,272,0,0,0,2,2006,140000
4,60,84.0,14260,IR1,8,5,2000,2000,350.0,Gd,655,490,1145,Y,1145,1053,2198,1,2,1,4,9,1,2000.0,RFn,3,836,192,84,0,0,0,0,12,2008,250000


In [7]:
# Row count after column selection and removal of incomplete rows
df.shape[0]

1121

In [8]:
# Generate test data by picking random values from columns
def generate_sample_houseprice_points(df, num_samples=100, random_state=None):
    """Build synthetic rows by drawing each column's value independently.

    Every column is sampled on its own (with replacement) from the values
    present in ``df``, so a generated row generally does not correspond to
    any single real row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column values are used as the sampling pool.
    num_samples : int, default 100
        Number of synthetic rows to generate. Values <= 0 yield an empty
        frame that still carries ``df``'s columns.
    random_state : int, numpy.random.Generator or None, default None
        Seed (or generator) for reproducible output. ``None`` draws fresh
        entropy on every call, as before.

    Returns
    -------
    pandas.DataFrame
        ``num_samples`` rows with the same columns as ``df`` and a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If ``df`` has no rows while ``num_samples`` is positive.
    """
    # Local import: the import cell visible in this notebook does not bring in numpy.
    import numpy as np

    if num_samples <= 0:
        # Keep the column layout so callers can still drop/select columns.
        return df.iloc[:0].reset_index(drop=True)
    if len(df) == 0:
        raise ValueError("Cannot sample from an empty DataFrame")

    # One shared generator: each column gets its own independent draw of row
    # positions, while a fixed seed still reproduces the whole frame.
    rng = np.random.default_rng(random_state)
    row_count = len(df)
    sampled_columns = {
        col: df[col].to_numpy()[rng.integers(0, row_count, size=num_samples)]
        for col in df.columns
    }
    return pd.DataFrame(sampled_columns)

In [9]:
# Get training features and labels

from catboost import CatBoostRegressor
# Initialize data

# Split the frame into the regression target and the remaining feature columns
train_labels = df["SalePrice"]
train_data = df.drop(columns=["SalePrice"])


In [10]:
# Names of the non-numeric columns; passed to CatBoost as categorical features
non_numeric_columns = list(df.select_dtypes(exclude="number").columns)

In [11]:
# Configure the CatBoost regressor; non-numeric columns are declared categorical.
# NOTE(review): learning_rate=1 with depth=16 looks aggressive -- the logged
# "learn" metric drops from ~44.9k to ~1.3k within 19 iterations and no held-out
# set is evaluated here; confirm these values are intended.
model = CatBoostRegressor(
    iterations=100,
    learning_rate=1,
    depth=16,
    cat_features=non_numeric_columns,
    verbose=True,
)

# Train on the full training frame; as the cell's last expression, the value
# returned by fit() is what the notebook displays.
model.fit(train_data, train_labels)

0:	learn: 44903.4193547	total: 158ms	remaining: 15.6s
1:	learn: 33287.6717132	total: 1.67s	remaining: 1m 21s
2:	learn: 24691.5437296	total: 2.86s	remaining: 1m 32s
3:	learn: 19782.1199954	total: 3.92s	remaining: 1m 34s
4:	learn: 16126.3881764	total: 4.88s	remaining: 1m 32s
5:	learn: 13459.0207176	total: 5.85s	remaining: 1m 31s
6:	learn: 11668.7880929	total: 6.89s	remaining: 1m 31s
7:	learn: 9886.3056548	total: 7.86s	remaining: 1m 30s
8:	learn: 8392.8186455	total: 8.83s	remaining: 1m 29s
9:	learn: 6961.5414271	total: 9.65s	remaining: 1m 26s
10:	learn: 5621.9163685	total: 10.4s	remaining: 1m 24s
11:	learn: 4502.0066381	total: 11.2s	remaining: 1m 21s
12:	learn: 3604.4603126	total: 11.9s	remaining: 1m 19s
13:	learn: 3052.8621748	total: 12.5s	remaining: 1m 16s
14:	learn: 2510.4611690	total: 13.3s	remaining: 1m 15s
15:	learn: 2192.8592207	total: 14s	remaining: 1m 13s
16:	learn: 1821.5382861	total: 14.6s	remaining: 1m 11s
17:	learn: 1573.7421641	total: 15.3s	remaining: 1m 9s
18:	learn: 1325.3

<catboost.core.CatBoostRegressor at 0x1594171c0>

In [12]:
# Check feature with low importance

# print([colname for colname,val in list(zip(train_data.columns,model.feature_importances_)) if val < 0.01])

In [13]:
# Checkpoint location relative to the project root (stray double slash removed)
model_path = 'models/checkpoints/housing_price_model.cbm'

In [14]:
# Save checkpoint
checkpoint_file = os.path.join(rootpath, model_path)
checkpoint_dir = os.path.dirname(checkpoint_file)
# Create the target directory first so saving also works when it does not exist yet
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)
model.save_model(checkpoint_file)
print(f"Model saved to {model_path}")

Model saved to models/checkpoints//housing_price_model.cbm


In [15]:
# Build a 10-row synthetic evaluation frame and strip the target column
eval_data = generate_sample_houseprice_points(df, num_samples=10).drop(columns=["SalePrice"])

# Restore the regressor from the checkpoint file under the project root
loaded_model = CatBoostRegressor()
loaded_model.load_model(os.path.join(rootpath, model_path))
print("Model loaded successfully")

Model loaded successfully


In [16]:
# Check evaluation results
# Score only the feature columns: when this cell is re-run, eval_data already
# carries "Predicted" from the previous run, and that column must not be fed
# back into the model as an extra input. errors="ignore" covers the first run.
preds = loaded_model.predict(eval_data.drop(columns="Predicted", errors="ignore"))
eval_data["Predicted"] = preds
eval_data

Unnamed: 0,MSSubClass,LotFrontage,LotArea,LotShape,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,CentralAir,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,FullBath,HalfBath,BedroomAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageFinish,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MoSold,YrSold,Predicted
0,120,50.0,8160,IR1,7,5,1990,1953,0.0,TA,0,276,1337,Y,1212,0,1682,1,2,1,4,5,0,1993.0,RFn,2,288,0,32,0,0,385,0,8,2007,176824.222113
1,30,93.0,11367,IR1,5,5,1973,1950,0.0,TA,0,184,616,Y,765,1093,1456,0,2,1,3,6,1,2005.0,Fin,3,308,0,50,116,0,0,0,6,2008,189646.107116
2,60,120.0,10793,Reg,4,6,1954,1953,450.0,TA,48,1158,912,Y,1423,920,1456,0,2,1,3,8,1,2003.0,RFn,2,528,218,0,0,0,0,0,5,2009,156754.820637
3,60,66.0,9020,IR1,7,8,1930,1950,0.0,TA,0,742,874,Y,1316,252,1961,0,2,0,2,8,0,2002.0,RFn,2,264,0,0,0,0,0,0,8,2006,190696.440263
4,20,69.0,8190,Reg,6,7,2004,1950,0.0,Ex,777,328,738,Y,864,866,936,0,1,0,3,7,1,1986.0,Unf,2,719,239,0,0,0,0,0,9,2010,152075.452432
5,60,70.0,10186,IR1,7,5,1940,1950,183.0,Gd,523,1652,1069,Y,1055,0,1500,0,2,1,3,6,1,2003.0,Fin,2,810,120,24,0,0,0,0,1,2006,171037.03681
6,120,75.0,8197,IR1,5,5,2004,1963,0.0,Gd,483,322,547,Y,1593,252,924,0,2,0,4,7,1,1939.0,Unf,2,500,100,36,0,0,0,0,5,2007,153098.442548
7,60,60.0,7332,Reg,6,5,1970,2005,0.0,TA,0,1571,1522,Y,649,0,1626,0,2,1,1,7,0,1940.0,Unf,1,673,120,16,0,0,0,0,10,2007,171083.125586
8,45,52.0,9187,Reg,6,5,1954,1977,0.0,TA,194,1008,1165,Y,768,0,864,0,1,0,3,9,1,1989.0,Fin,1,473,147,0,0,0,0,0,7,2006,156115.039474
9,20,53.0,16321,IR1,8,5,1941,1974,74.0,Gd,1196,876,732,Y,1746,0,1214,0,2,0,3,5,1,1953.0,Fin,2,660,96,0,0,0,0,0,7,2008,235894.014109
