In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import catboost as cat
import numpy as np
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv("../../datasets/house_prices_dataset/house_prices_train.csv")
del df['Id']

target = "SalePrice"
num_features = list(df.select_dtypes(exclude='object'))
num_features.remove(target)
cat_features = list(df.select_dtypes(include='object'))

In [3]:
df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

# Dataset preprocessing

In [4]:
df.loc[:, num_features] = df.loc[:, num_features].fillna(0)
df.loc[:, cat_features] = df.loc[:, cat_features].fillna('Other')

In [5]:
enc = OneHotEncoder(sparse_output=False, drop='first')
encoded_cat_df = enc.fit_transform(df[cat_features])
encoded_cat_df = pd.DataFrame(encoded_cat_df, columns=enc.get_feature_names_out(cat_features)).astype('Int16')
encoded_cat_features = list(encoded_cat_df.columns)
encoded_cat_df.shape

(1460, 224)

In [6]:
encoded_df = pd.concat([df, encoded_cat_df], axis=1)

In [7]:
encoded_df['class'] = encoded_df[target].apply(lambda x: 1 if x >= 250_000 else 0)
clf_target = 'class'
encoded_df['class'].value_counts()

class
0    1235
1     225
Name: count, dtype: int64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    encoded_df[num_features+cat_features+encoded_cat_features], 
    encoded_df[[target, clf_target]], 
    test_size=0.33, random_state=2025)

df_train = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
df_test = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

In [9]:
linear_features = num_features+encoded_cat_features

In [10]:
scaler = MinMaxScaler()
scaled_df_train = scaler.fit_transform(df_train[linear_features])
scaled_df_train = pd.DataFrame(scaled_df_train, columns=linear_features)
scaled_df_train = pd.concat([scaled_df_train, df_train[[target, clf_target]]], axis=1)

scaled_df_test = scaler.transform(df_test[linear_features])
scaled_df_test = pd.DataFrame(scaled_df_test, columns=linear_features)
scaled_df_test = pd.concat([scaled_df_test, df_test[[target, clf_target]]], axis=1)

# Simple Linear Regression

In [11]:
result_metrics = pd.DataFrame(columns=[
     'algorithm',
     'dataset_type',
     'R2',
     'MSE',
     'MAE',
     'MAPE'
])

In [12]:
def get_metrics(y_true, y_pred, algorithm, dataset_type, res_df=None):
    res_df = res_df.copy()
    
    r2 = metrics.r2_score(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mape = metrics.mean_absolute_percentage_error(y_true, y_pred)

    print(f"{algorithm}, -- Type Dataset - {dataset_type}")
    print(f"R2 = {r2}")
    print(f"MSE = {mse}")
    print(f"MAE = {mae}")
    print(f"MAPE = {mape}")
    
    curr_res = [algorithm, dataset_type] + [r2, mse, mae, mape]
    res_df = pd.concat([res_df,
                        pd.DataFrame([curr_res], columns=['algorithm',
                                                            'dataset_type',
                                                            'R2',
                                                            'MSE',
                                                            'MAE',
                                                            'MAPE'])])
    return res_df

In [13]:
simple_model = LinearRegression()
simple_model.fit(scaled_df_train[linear_features], scaled_df_train[target])

In [14]:
preds_train = simple_model.predict(scaled_df_train[linear_features])
preds_test = simple_model.predict(scaled_df_test[linear_features])

In [15]:
result_metrics = get_metrics(scaled_df_train[target], preds_train, 
                              algorithm='simple_regression', dataset_type='train', 
                              res_df=result_metrics)

simple_regression, -- Type Dataset - train
R2 = 0.9342844366357325
MSE = 397886427.41475236
MAE = 12432.844048246428
MAPE = 0.07180052924794227


  res_df = pd.concat([res_df,


In [16]:
result_metrics = get_metrics(scaled_df_test[target], preds_test, 
                              algorithm='simple_regression', dataset_type='test', 
                              res_df=result_metrics)

simple_regression, -- Type Dataset - test
R2 = 0.7636131923354911
MSE = 1611764410.0001333
MAE = 19483.623690172844
MAPE = 0.11750337980686873


In [17]:
result_metrics

Unnamed: 0,algorithm,dataset_type,R2,MSE,MAE,MAPE
0,simple_regression,train,0.934284,397886400.0,12432.844048,0.071801
0,simple_regression,test,0.763613,1611764000.0,19483.62369,0.117503


# Combine with classifier

## fit clf

In [18]:
clf = RandomForestClassifier(random_state=2025)
clf.fit(scaled_df_train[linear_features], scaled_df_train[clf_target])

clf_pred_train = clf.predict(scaled_df_train[linear_features])
clf_pred_test = clf.predict(scaled_df_test[linear_features])

In [19]:
print(metrics.classification_report(scaled_df_train[clf_target], clf_pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       822
           1       1.00      1.00      1.00       156

    accuracy                           1.00       978
   macro avg       1.00      1.00      1.00       978
weighted avg       1.00      1.00      1.00       978



In [20]:
print(metrics.classification_report(scaled_df_test[clf_target], clf_pred_test))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98       413
           1       0.93      0.78      0.85        69

    accuracy                           0.96       482
   macro avg       0.95      0.89      0.91       482
weighted avg       0.96      0.96      0.96       482



## fit regression

In [21]:
scaled_df_train_0 = scaled_df_train[scaled_df_train[clf_target] == 0]
scaled_df_train_1 = scaled_df_train[scaled_df_train[clf_target] == 1]

clf_pred_test = clf.predict(scaled_df_test[linear_features])
scaled_df_test['clf_pred'] = clf_pred_test

scaled_df_test_0 = scaled_df_test[scaled_df_test['clf_pred'] == 0]
scaled_df_test_1 = scaled_df_test[scaled_df_test['clf_pred'] == 1]

In [22]:
scaled_df_train_0.shape, scaled_df_train_1.shape

((822, 262), (156, 262))

In [23]:
scaled_df_test_0.shape, scaled_df_test_1.shape

((424, 263), (58, 263))

### lr for class 0

In [24]:
lr_0 = LinearRegression()
lr_0.fit(scaled_df_train_0[linear_features], scaled_df_train_0[target])
pred_train_0 = lr_0.predict(scaled_df_train_0[linear_features])
pred_test_0 = lr_0.predict(scaled_df_test_0[linear_features])

### lr for class 1

In [25]:
lr_1 = LinearRegression()
lr_1.fit(scaled_df_train_1[linear_features], scaled_df_train_1[target])
pred_train_1 = lr_1.predict(scaled_df_train_1[linear_features])
pred_test_1 = lr_1.predict(scaled_df_test_1[linear_features])

### collect all

In [26]:
result_metrics = get_metrics(pd.concat([scaled_df_train_0[target], scaled_df_train_1[target]]),
                             np.hstack([pred_train_0, pred_train_1]),
                              algorithm='segm_expert_regression', dataset_type='train', 
                              res_df=result_metrics)

segm_expert_regression, -- Type Dataset - train
R2 = 0.9793499663967853
MSE = 125029257.54183368
MAE = 7893.84701122459
MAPE = 0.05125567547666112


In [27]:
result_metrics = get_metrics(pd.concat([scaled_df_test_0[target], scaled_df_test_1[target]]),
                             np.hstack([pred_test_0, pred_test_1]),
                              algorithm='segm_expert_regression', dataset_type='test', 
                              res_df=result_metrics)

segm_expert_regression, -- Type Dataset - test
R2 = -0.7342943315423969
MSE = 11824999489.870808
MAE = 31738.506285292486
MAPE = 0.1582408356581977


In [28]:
result_metrics.drop_duplicates()

Unnamed: 0,algorithm,dataset_type,R2,MSE,MAE,MAPE
0,simple_regression,train,0.934284,397886400.0,12432.844048,0.071801
0,simple_regression,test,0.763613,1611764000.0,19483.62369,0.117503
0,segm_expert_regression,train,0.97935,125029300.0,7893.847011,0.051256
0,segm_expert_regression,test,-0.734294,11825000000.0,31738.506285,0.158241


# Catboost

In [29]:
catboost_features = num_features+cat_features

In [30]:
catboost_features

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'F

In [31]:
cat_model = cat.CatBoostRegressor(random_seed = 2025, cat_features=cat_features)
cat_model.fit(df_train[catboost_features], df_train[target])

Learning rate set to 0.040799
0:	learn: 75779.1065966	total: 74.5ms	remaining: 1m 14s
1:	learn: 73901.6120488	total: 94ms	remaining: 46.9s
2:	learn: 72103.5644285	total: 115ms	remaining: 38.3s
3:	learn: 70488.2855799	total: 136ms	remaining: 33.8s
4:	learn: 68796.7691366	total: 147ms	remaining: 29.2s
5:	learn: 67174.1154661	total: 153ms	remaining: 25.3s
6:	learn: 65681.9153493	total: 160ms	remaining: 22.7s
7:	learn: 64150.4496420	total: 166ms	remaining: 20.6s
8:	learn: 62717.7761873	total: 171ms	remaining: 18.9s
9:	learn: 61309.6153231	total: 177ms	remaining: 17.5s
10:	learn: 59808.2130685	total: 183ms	remaining: 16.5s
11:	learn: 58422.1564772	total: 189ms	remaining: 15.6s
12:	learn: 57203.9015213	total: 195ms	remaining: 14.8s
13:	learn: 55979.2313161	total: 202ms	remaining: 14.2s
14:	learn: 54693.3139259	total: 207ms	remaining: 13.6s
15:	learn: 53509.3379826	total: 215ms	remaining: 13.2s
16:	learn: 52467.5732884	total: 221ms	remaining: 12.8s
17:	learn: 51395.9217353	total: 226ms	remain

<catboost.core.CatBoostRegressor at 0x14a417510>

In [32]:
y_train_pred = cat_model.predict(df_train[catboost_features])
y_test_pred = cat_model.predict(df_test[catboost_features])

In [33]:
result_metrics = get_metrics(df_train[target], y_train_pred,
                              algorithm='catboost_regression', dataset_type='train', 
                              res_df=result_metrics)

catboost_regression, -- Type Dataset - train
R2 = 0.9826189666310685
MSE = 105236521.12067415
MAE = 7074.099107043198
MAPE = 0.04287479062542724


In [34]:
result_metrics = get_metrics(df_test[target], y_test_pred,
                              algorithm='catboost_regression', dataset_type='test', 
                              res_df=result_metrics)

catboost_regression, -- Type Dataset - test
R2 = 0.8925282018563128
MSE = 732778707.230372
MAE = 15651.013002386228
MAPE = 0.09401566317537778


In [35]:
result_metrics 

Unnamed: 0,algorithm,dataset_type,R2,MSE,MAE,MAPE
0,simple_regression,train,0.934284,397886400.0,12432.844048,0.071801
0,simple_regression,test,0.763613,1611764000.0,19483.62369,0.117503
0,segm_expert_regression,train,0.97935,125029300.0,7893.847011,0.051256
0,segm_expert_regression,test,-0.734294,11825000000.0,31738.506285,0.158241
0,catboost_regression,train,0.982619,105236500.0,7074.099107,0.042875
0,catboost_regression,test,0.892528,732778700.0,15651.013002,0.094016
