In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import tensorflow as tf
print(tf.__version__)

2.12.0


# Load Dataset

In [21]:
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Preprocessing

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [23]:
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [24]:
df.drop(columns=["Id"], inplace=True)
# df.isna().sum().sort_values(ascending=False)[:20]
df_test.drop(columns=["Id"], inplace=True)


In [25]:
df.drop(columns=["PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu"], inplace=True)
df_test.drop(columns=["PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu"], inplace=True)

for i in df_test.columns:
    if df_test[i].isna().any():
        if df_test[i].dtypes == 'object':
            df_test[i].fillna(df_test[i].mode()[0], inplace=True)
        else:
            df_test[i].fillna(df_test[i].median(), inplace=True)

for i in df.columns:
    if df[i].isna().any():
        if df[i].dtypes == 'object':
            df[i].fillna(df[i].mode()[0], inplace=True)
        else:
            df[i].fillna(df[i].median(), inplace=True)
            

In [26]:
combined = pd.concat([df, df_test], axis=0)
combined = pd.get_dummies(combined, drop_first=True)

train_encoded = combined.iloc[:len(df), :]
test_encoded = combined.iloc[len(df):, :]

## Data Splitting

In [27]:
X_cleaned = train_encoded.drop("SalePrice", axis=1)
y_cleaned = train_encoded["SalePrice"]

X_tested = test_encoded.drop("SalePrice", axis=1)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y_cleaned, test_size=.25, random_state=1111)

# X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(X_cleaned, y_cleaned, test_size=.25, random_state=1111)

In [29]:
def metrics(prediction, y_test):
    MAE = mean_absolute_error(y_test, prediction)
    MSE = mean_squared_error(y_test, prediction)
    RMSE = np.sqrt(MSE)
    R_squared = r2_score(y_test, prediction)

    print('MAE: ' + str(MAE))
    print('MSE: ' + str(MSE))
    print('RMSE: ' + str(RMSE))
    print('R_squared: ' + str(R_squared))

# Model Training

In [30]:
gb = HistGradientBoostingRegressor()

gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)

metrics(y_pred, y_test)

MAE: 17472.71993085875
MSE: 786836936.9612526
RMSE: 28050.613842860064
R_squared: 0.870621266090276


In [31]:
rf = RandomForestRegressor()

rf.fit(X_train, y_train)

rf_pred = rf.predict(X_test)
metrics(rf_pred, y_test)

MAE: 17548.37273972603
MSE: 755877302.5433844
RMSE: 27493.222847519795
R_squared: 0.8757119247962093


# Hyperparameter Tuning

In [32]:
param_gb = {
    'loss':['squared_error', 'absolute_error', 'gamma', 'poisson', 'quantile'],
    'learning_rate': [0.5, 0.1, 0.15],
    'max_depth': [None, 10, 20, 30],
}

paran_rf = {
    'criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split':[1, 2, 3],
}



In [33]:
rfs = RandomForestRegressor()
rlfs = GridSearchCV(estimator=rfs, param_grid=paran_rf, cv=5)
rlfs.fit(X_train, y_train)

In [34]:
gbs = HistGradientBoostingRegressor()
gbfs = GridSearchCV(estimator = gbs, param_grid=param_gb, cv=5)
gbfs.fit(X_train, y_train)

120 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ABIN\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ABIN\anaconda3\lib\site-packages\sklearn\ensemble\_hist_gradient_boosting\gradient_boosting.py", line 353, in fit
    self._validate_params()
  File "c:\Users\ABIN\anaconda3\lib\site-packages\sklearn\base.py", line 581, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\ABIN\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_cons

In [35]:
rfmod = RandomForestRegressor(**rlfs.best_params_)
rfmod.fit(X_train, y_train)
y_rf = rfmod.predict(X_test)

metrics(y_rf, y_test)


MAE: 17345.161904109587
MSE: 737526437.5102847
RMSE: 27157.437977656962
R_squared: 0.878729337391632


In [36]:
gbmod = HistGradientBoostingRegressor(**gbfs.best_params_)
gbmod.fit(X_train, y_train)
y_gb = gbmod.predict(X_test)

metrics(y_gb, y_test)

MAE: 16979.848794355774
MSE: 740491796.5364147
RMSE: 27211.978916212887
R_squared: 0.8782417466617533


In [37]:
result_rf = rfmod.predict(X_tested)
result_gb = gbmod.predict(X_tested)

In [38]:
temp_test = pd.read_csv('test.csv')

docrf = pd.DataFrame(temp_test["Id"])
docgb = pd.DataFrame(temp_test["Id"])

docrf["SalePrice"] = result_rf.flatten().tolist()
docgb["SalePrice"] = result_gb.flatten().tolist()

In [39]:
docrf.to_csv("result_rf.csv", index=False)
docgb.to_csv("result_gb.csv", index=False)