In [67]:
import pandas as pd
import numpy as np
from sklearn.inspection import permutation_importance
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,OrdinalEncoder,TargetEncoder,PowerTransformer
from sklearn.model_selection import train_test_split,KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error,r2_score
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
import optuna

In [2]:
df=pd.read_csv('cleaned.csv')

In [None]:
df

# Baseline Model

In [3]:
yeo_johnson_transformer = PowerTransformer(method='yeo-johnson')
df['price'] = yeo_johnson_transformer.fit_transform(df[['price']])

In [4]:
X=df.drop(columns='price')
y=df['price']

In [5]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,shuffle=True,random_state=42)

In [6]:
num_cols=[]
cat_cols=[]
def dtype_check(x):
    if (df[x].dtype=='float64') | (df[x].dtype=='int64'):
        num_cols.append(x)
    else:
        cat_cols.append(x)
for i in X_train.columns:
    dtype_check(i)

In [7]:
num_cols

['bedRoom',
 'bathroom',
 'balcony',
 'floorNum',
 'Servant_Room',
 'Study_Room',
 'Pooja_Room',
 'Store_Room',
 'Built_Up_Area']

In [8]:
cat_cols

['agePossession', 'Location', 'furnishing_type', 'luxury_category']

In [9]:
lux_categories = ['Not Luxurious', 'Semi Luxurious', 'Luxurious']
fur_categories=['Unfurnished','Semi Furnished','Fully Furnished']

In [10]:
transformer=ColumnTransformer(transformers=[
    ('scaling',MinMaxScaler(),num_cols),
    ('encoding1',OrdinalEncoder(categories=[lux_categories]),['luxury_category']),
    ('encoding2',OrdinalEncoder(categories=[fur_categories]),['furnishing_type']),
    ('ohe',OneHotEncoder(drop='first'),['agePossession']),
    ('target',TargetEncoder(),['Location']),
    ('transformation',PowerTransformer(method='yeo-johnson'),['Built_Up_Area'])
])

In [11]:
pipeline=Pipeline([
    ('transformer',transformer),
    ('svm',SVR())
])

In [12]:
pipeline.fit(X_train,y_train)

In [13]:
y_pred = pipeline.predict(X_test)

mean_absolute_error(y_test,y_pred)

0.2836753189087019

In [14]:
r2_score(y_pred,y_test)

0.7980802012825567

# Feature Selection

### Permutation Importance

In [57]:
pi=permutation_importance(pipeline,X_train,y_train,n_repeats=10,random_state=29)

In [58]:
mean=pi.importances_mean
pi=pd.DataFrame({
    'feature':X_train.columns,
    'importance':mean
})

In [59]:
pi.sort_values(by='importance',ascending=False)

Unnamed: 0,feature,importance
10,Built_Up_Area,0.433222
5,Location,0.349472
4,agePossession,0.040243
11,furnishing_type,0.022399
2,balcony,0.016128
0,bedRoom,0.015572
6,Servant_Room,0.013738
12,luxury_category,0.0132
1,bathroom,0.010414
3,floorNum,0.009494


### Recursive Feature Elimination

In [60]:
scaler = MinMaxScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

object_cols = ['luxury_category', 'furnishing_type','agePossession']
encoder = OrdinalEncoder()
encoder.fit(X_train[object_cols])
encoded_cols = list(encoder.get_feature_names_out(object_cols))
X_train[encoded_cols] = encoder.transform(X_train[object_cols])

encoder = TargetEncoder()
encoder.fit(X_train[['Location']], y_train)
encoded_cols = list(encoder.get_feature_names_out(['Location']))
X_train[encoded_cols] = encoder.transform(X_train[['Location']])

In [61]:
model=RandomForestRegressor()
rfe=RFE(model,n_features_to_select=10)
rfe.fit(X_train,y_train)

In [62]:
feature_importance = pd.DataFrame({'Feature': X_train.columns, 'Ranking': rfe.ranking_})
feature_importance = feature_importance.sort_values(by='Ranking')
feature_importance

Unnamed: 0,Feature,Ranking
0,bedRoom,1
1,bathroom,1
2,balcony,1
3,floorNum,1
4,agePossession,1
5,Location,1
6,Servant_Room,1
10,Built_Up_Area,1
11,furnishing_type,1
12,luxury_category,1


### Feature Importances

In [63]:
model=RandomForestRegressor()
model.fit(X_train,y_train)

In [65]:
importances=model.feature_importances_
rf=pd.DataFrame({
    'feature':X_train.columns,
    'importance':importances
})
rf.sort_values(by='importance',ascending=False)

Unnamed: 0,feature,importance
10,Built_Up_Area,0.589775
5,Location,0.273427
1,bathroom,0.050303
3,floorNum,0.026943
0,bedRoom,0.015001
4,agePossession,0.012555
2,balcony,0.010584
6,Servant_Room,0.00566
12,luxury_category,0.005286
11,furnishing_type,0.005206


* Since all these techniques show that the columns Store, Study and Pooja Room are not that important, we can remove them

# Model Selection

In [15]:
df.drop(columns=['Study_Room','Pooja_Room','Store_Room'],inplace=True)

In [16]:
X=df.drop(columns='price')
y=df['price']

In [17]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,shuffle=True,random_state=42)

In [18]:
num_cols=[]
cat_cols=[]
def dtype_check(x):
    if (df[x].dtype=='float64') | (df[x].dtype=='int64'):
        num_cols.append(x)
    else:
        cat_cols.append(x)
for i in X_train.columns:
    dtype_check(i)

In [19]:
transformer=ColumnTransformer(transformers=[
    ('scaling',MinMaxScaler(),num_cols),
    ('encoding1',OrdinalEncoder(categories=[lux_categories]),['luxury_category']),
    ('encoding2',OrdinalEncoder(categories=[fur_categories]),['furnishing_type']),
    ('ohe',OneHotEncoder(drop='first'),['agePossession']),
    ('target',TargetEncoder(),['Location']),
    ('transformation',PowerTransformer(method='yeo-johnson'),['Built_Up_Area'])
])

In [20]:
pipeline=Pipeline([
    ('transformer',transformer),
    ('svm',SVR())
])
pipeline.fit(X_train,y_train)

In [21]:
y_pred = pipeline.predict(X_test)

mean_absolute_error(y_test,y_pred)

0.2827458977340318

In [22]:
r2_score(y_pred,y_test) # slightly improved

0.8025221405735069

In [62]:
def model_selection(x):
    pipeline=Pipeline([
    ('transformer',transformer),
    ('model',x())
])
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='r2')
    mean_score = scores.mean()
    return mean_score

In [47]:
r2_lr=model_selection(LinearRegression)
print(r2_lr)

0.7989145142129459


In [53]:
r2_svm=model_selection(SVR)
print(r2_svm)

0.8258709504803766


In [48]:
r2_rf=model_selection(RandomForestRegressor)
print(r2_rf)

0.8465143165761368


In [68]:
r2_xgb=model_selection(XGBRegressor)
print(r2_xgb)

0.8355492367651248


In [50]:
r2_gb=model_selection(GradientBoostingRegressor)
print(r2_gb)

0.847729990826345


In [51]:
r2_dt=model_selection(DecisionTreeRegressor)
print(r2_dt)

0.7035832552765533


In [52]:
r2_nn=model_selection(MLPRegressor)
print(r2_nn)



0.8254866317638226




Random Forest and Boosting models have outperformed
Will be proceeding with Hyper-parameter tuning of these models

# Hyper-paramter tuning

In [60]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    }
    
    model = GradientBoostingRegressor(**params)
    pipeline = Pipeline([
        ('transformer', transformer),
        ('model', model)
    ])
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='r2', n_jobs=-1)
    mean_score = scores.mean()
    
    return mean_score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, timeout=600)

print("Best hyperparameters: ", study.best_params)
print("Best R² score: ", study.best_value)

[I 2024-06-04 22:44:45,365] A new study created in memory with name: no-name-21ab0466-e560-42fa-939a-dea00d2a019a
[I 2024-06-04 22:44:47,026] Trial 0 finished with value: 0.8539814009213507 and parameters: {'n_estimators': 150, 'learning_rate': 0.06165418798243923, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 2, 'subsample': 0.572726295945431, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8539814009213507.
[I 2024-06-04 22:44:47,769] Trial 1 finished with value: 0.8438582327096356 and parameters: {'n_estimators': 104, 'learning_rate': 0.1624308914888317, 'max_depth': 5, 'min_samples_split': 13, 'min_samples_leaf': 11, 'subsample': 0.5255540547001747, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8539814009213507.
[I 2024-06-04 22:44:49,698] Trial 2 finished with value: 0.8327314462307582 and parameters: {'n_estimators': 238, 'learning_rate': 0.23392596859808845, 'max_depth': 9, 'min_samples_split': 13, 'min_samples_leaf': 3, 'subsample': 0.609760643501

[I 2024-06-04 22:45:27,388] Trial 26 finished with value: 0.8530897956586125 and parameters: {'n_estimators': 224, 'learning_rate': 0.1392135461503504, 'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 13, 'subsample': 0.8190244287640936, 'max_features': 'log2'}. Best is trial 21 with value: 0.8558485860698666.
[I 2024-06-04 22:45:30,033] Trial 27 finished with value: 0.8561384845368295 and parameters: {'n_estimators': 256, 'learning_rate': 0.03087772487547283, 'max_depth': 6, 'min_samples_split': 13, 'min_samples_leaf': 5, 'subsample': 0.8751926217750264, 'max_features': 'sqrt'}. Best is trial 27 with value: 0.8561384845368295.
[I 2024-06-04 22:45:32,691] Trial 28 finished with value: 0.8573090512119294 and parameters: {'n_estimators': 261, 'learning_rate': 0.031363701813485974, 'max_depth': 6, 'min_samples_split': 14, 'min_samples_leaf': 4, 'subsample': 0.8901735427070294, 'max_features': 'sqrt'}. Best is trial 28 with value: 0.8573090512119294.
[I 2024-06-04 22:45:34,448] 

[I 2024-06-04 22:46:18,938] Trial 52 finished with value: 0.855623319172133 and parameters: {'n_estimators': 130, 'learning_rate': 0.045342402680509784, 'max_depth': 8, 'min_samples_split': 9, 'min_samples_leaf': 3, 'subsample': 0.9430156228065001, 'max_features': 'log2'}. Best is trial 44 with value: 0.8575736309553494.
[I 2024-06-04 22:46:20,918] Trial 53 finished with value: 0.8518751024284494 and parameters: {'n_estimators': 201, 'learning_rate': 0.07418835548456756, 'max_depth': 7, 'min_samples_split': 11, 'min_samples_leaf': 1, 'subsample': 0.9227337294165905, 'max_features': 'log2'}. Best is trial 44 with value: 0.8575736309553494.
[I 2024-06-04 22:46:22,540] Trial 54 finished with value: 0.8449314080437222 and parameters: {'n_estimators': 138, 'learning_rate': 0.1635625076269948, 'max_depth': 8, 'min_samples_split': 10, 'min_samples_leaf': 2, 'subsample': 0.9470019942222953, 'max_features': 'log2'}. Best is trial 44 with value: 0.8575736309553494.
[I 2024-06-04 22:46:24,437] Tr

[I 2024-06-04 22:47:09,960] Trial 78 finished with value: 0.8332728010157817 and parameters: {'n_estimators': 193, 'learning_rate': 0.26657625186399714, 'max_depth': 8, 'min_samples_split': 11, 'min_samples_leaf': 8, 'subsample': 0.7684458964529838, 'max_features': 'log2'}. Best is trial 55 with value: 0.8577727429065763.
[I 2024-06-04 22:47:12,314] Trial 79 finished with value: 0.8571143338381267 and parameters: {'n_estimators': 225, 'learning_rate': 0.02594726865155941, 'max_depth': 7, 'min_samples_split': 12, 'min_samples_leaf': 1, 'subsample': 0.9322916845560355, 'max_features': 'sqrt'}. Best is trial 55 with value: 0.8577727429065763.
[I 2024-06-04 22:47:15,243] Trial 80 finished with value: 0.8576939525733309 and parameters: {'n_estimators': 239, 'learning_rate': 0.01729713022709082, 'max_depth': 8, 'min_samples_split': 17, 'min_samples_leaf': 1, 'subsample': 0.8933115377096335, 'max_features': 'sqrt'}. Best is trial 55 with value: 0.8577727429065763.
[I 2024-06-04 22:47:18,106] 

Best hyperparameters:  {'n_estimators': 209, 'learning_rate': 0.03099601210969703, 'max_depth': 7, 'min_samples_split': 11, 'min_samples_leaf': 2, 'subsample': 0.9123892030188259, 'max_features': 'sqrt'}
Best R² score:  0.8586658389591


In [69]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 2.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 2.0)
    }
    
    model = XGBRegressor(**params)
    pipeline = Pipeline([
        ('transformer', transformer),
        ('model', model)
    ])
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='r2', n_jobs=-1)
    mean_score = scores.mean()
    
    return mean_score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, timeout=600)

print("Best hyperparameters: ", study.best_params)
print("Best R² score: ", study.best_value)

[I 2024-06-04 22:53:29,046] A new study created in memory with name: no-name-06fa315c-9cc5-4bd0-970e-5112d30eaa07
[I 2024-06-04 22:53:38,005] Trial 0 finished with value: 0.8555820951363413 and parameters: {'n_estimators': 256, 'learning_rate': 0.02812652719410702, 'max_depth': 8, 'subsample': 0.5195817593672145, 'colsample_bytree': 0.7330324420470189, 'reg_alpha': 0.6668451520319831, 'reg_lambda': 0.9274174562235726}. Best is trial 0 with value: 0.8555820951363413.
[I 2024-06-04 22:53:38,576] Trial 1 finished with value: 0.841502354799761 and parameters: {'n_estimators': 51, 'learning_rate': 0.2332595522158302, 'max_depth': 9, 'subsample': 0.5881991946529761, 'colsample_bytree': 0.5548714361470284, 'reg_alpha': 0.2981126415502957, 'reg_lambda': 1.3472118589996285}. Best is trial 0 with value: 0.8555820951363413.
[I 2024-06-04 22:53:41,663] Trial 2 finished with value: 0.8441687322216671 and parameters: {'n_estimators': 297, 'learning_rate': 0.10936588721796299, 'max_depth': 7, 'subsam

[I 2024-06-04 22:54:20,054] Trial 23 finished with value: 0.8564540271698118 and parameters: {'n_estimators': 188, 'learning_rate': 0.05480066600055642, 'max_depth': 8, 'subsample': 0.5536386038150866, 'colsample_bytree': 0.6207263231791272, 'reg_alpha': 0.35944457495296833, 'reg_lambda': 0.7343678948388254}. Best is trial 23 with value: 0.8564540271698118.
[I 2024-06-04 22:54:21,551] Trial 24 finished with value: 0.8536900348842323 and parameters: {'n_estimators': 187, 'learning_rate': 0.047738531561125494, 'max_depth': 8, 'subsample': 0.5642557271945277, 'colsample_bytree': 0.6944694176168216, 'reg_alpha': 0.47305685337263714, 'reg_lambda': 0.7051525775273542}. Best is trial 23 with value: 0.8564540271698118.
[I 2024-06-04 22:54:22,535] Trial 25 finished with value: 0.8510816765846199 and parameters: {'n_estimators': 96, 'learning_rate': 0.04795101254756347, 'max_depth': 7, 'subsample': 0.6254980323523275, 'colsample_bytree': 0.6188990862043188, 'reg_alpha': 0.4359408336965017, 'reg_

[I 2024-06-04 22:54:44,042] Trial 46 finished with value: 0.8555945654408724 and parameters: {'n_estimators': 127, 'learning_rate': 0.07619395203093421, 'max_depth': 5, 'subsample': 0.9548383953114776, 'colsample_bytree': 0.5002203451315059, 'reg_alpha': 0.8095947761260671, 'reg_lambda': 0.5177889265996869}. Best is trial 23 with value: 0.8564540271698118.
[I 2024-06-04 22:54:44,559] Trial 47 finished with value: 0.8514636674155728 and parameters: {'n_estimators': 112, 'learning_rate': 0.0768286650683509, 'max_depth': 5, 'subsample': 0.9548650700236145, 'colsample_bytree': 0.5173726859090773, 'reg_alpha': 0.8402767769568606, 'reg_lambda': 0.20108019806289046}. Best is trial 23 with value: 0.8564540271698118.
[I 2024-06-04 22:54:45,028] Trial 48 finished with value: 0.841116553760705 and parameters: {'n_estimators': 100, 'learning_rate': 0.03872606172104666, 'max_depth': 4, 'subsample': 0.8867517470742601, 'colsample_bytree': 0.573618314841791, 'reg_alpha': 0.9444973116554759, 'reg_lamb

[I 2024-06-04 22:55:06,281] Trial 69 finished with value: 0.8439226237820613 and parameters: {'n_estimators': 280, 'learning_rate': 0.24918330079825646, 'max_depth': 7, 'subsample': 0.785461886637937, 'colsample_bytree': 0.5264239433742671, 'reg_alpha': 0.717297676987795, 'reg_lambda': 0.9740827812253889}. Best is trial 58 with value: 0.8584571149563169.
[I 2024-06-04 22:55:07,297] Trial 70 finished with value: 0.8539398872635917 and parameters: {'n_estimators': 157, 'learning_rate': 0.09073191498410038, 'max_depth': 6, 'subsample': 0.7320514842976364, 'colsample_bytree': 0.6020927490593931, 'reg_alpha': 0.5684457548927251, 'reg_lambda': 0.39242812590066767}. Best is trial 58 with value: 0.8584571149563169.
[I 2024-06-04 22:55:08,335] Trial 71 finished with value: 0.8563720601135996 and parameters: {'n_estimators': 181, 'learning_rate': 0.04430288092707478, 'max_depth': 6, 'subsample': 0.5530975668249652, 'colsample_bytree': 0.8080506567982301, 'reg_alpha': 0.4515582110659725, 'reg_lam

[I 2024-06-04 22:55:29,749] Trial 92 finished with value: 0.8543189987128713 and parameters: {'n_estimators': 144, 'learning_rate': 0.06368723825985623, 'max_depth': 6, 'subsample': 0.564596730228216, 'colsample_bytree': 0.8385638773510431, 'reg_alpha': 0.1667922254432611, 'reg_lambda': 0.5559978894889317}. Best is trial 58 with value: 0.8584571149563169.
[I 2024-06-04 22:55:30,683] Trial 93 finished with value: 0.8538343069851786 and parameters: {'n_estimators': 163, 'learning_rate': 0.07504288531035172, 'max_depth': 6, 'subsample': 0.5964751064871248, 'colsample_bytree': 0.8482220819507533, 'reg_alpha': 0.6105078465809404, 'reg_lambda': 0.8098746383593987}. Best is trial 58 with value: 0.8584571149563169.
[I 2024-06-04 22:55:31,347] Trial 94 finished with value: 0.8545062041129728 and parameters: {'n_estimators': 132, 'learning_rate': 0.058772037775137814, 'max_depth': 5, 'subsample': 0.5522199374877862, 'colsample_bytree': 0.8218982147679179, 'reg_alpha': 0.2710272894996909, 'reg_la

Best hyperparameters:  {'n_estimators': 289, 'learning_rate': 0.04922782681150982, 'max_depth': 5, 'subsample': 0.8179937232840969, 'colsample_bytree': 0.5800252561667454, 'reg_alpha': 0.4738156679000969, 'reg_lambda': 0.8875763169039237}
Best R² score:  0.8584571149563169


In [70]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    }
    
    model = RandomForestRegressor(**params)
    pipeline = Pipeline([
        ('transformer', transformer),
        ('model', model)
    ])
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='r2', n_jobs=-1)
    mean_score = scores.mean()
    
    return mean_score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, timeout=600)

print("Best hyperparameters: ", study.best_params)
print("Best R² score: ", study.best_value)

[I 2024-06-04 22:55:45,213] A new study created in memory with name: no-name-5c4a0790-8238-4164-9df7-0183a07d5bf5
[I 2024-06-04 22:55:47,868] Trial 0 finished with value: 0.7893359446946475 and parameters: {'n_estimators': 277, 'max_depth': 5, 'min_samples_split': 14, 'min_samples_leaf': 14, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7893359446946475.
[I 2024-06-04 22:55:50,216] Trial 1 finished with value: 0.8207123221719707 and parameters: {'n_estimators': 137, 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 10, 'max_features': None}. Best is trial 1 with value: 0.8207123221719707.
[I 2024-06-04 22:55:54,820] Trial 2 finished with value: 0.825593176037955 and parameters: {'n_estimators': 259, 'max_depth': 8, 'min_samples_split': 12, 'min_samples_leaf': 11, 'max_features': None}. Best is trial 2 with value: 0.825593176037955.
[I 2024-06-04 22:55:57,147] Trial 3 finished with value: 0.8368566227127616 and parameters: {'n_estimators': 237, 'max_depth': 8, 'min_sa

[I 2024-06-04 22:57:15,890] Trial 33 finished with value: 0.8429896384535731 and parameters: {'n_estimators': 81, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 31 with value: 0.8433680657000122.
[I 2024-06-04 22:57:16,798] Trial 34 finished with value: 0.8358719354207939 and parameters: {'n_estimators': 76, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 31 with value: 0.8433680657000122.
[I 2024-06-04 22:57:17,765] Trial 35 finished with value: 0.8413968054234076 and parameters: {'n_estimators': 79, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 31 with value: 0.8433680657000122.
[I 2024-06-04 22:57:18,480] Trial 36 finished with value: 0.8239993883546536 and parameters: {'n_estimators': 63, 'max_depth': 8, 'min_samples_split': 3, 'min_samples_leaf': 6, 'max_features': 'log2'}. Best is trial 31 with value: 0.84336806570

[I 2024-06-04 22:58:01,319] Trial 66 finished with value: 0.8309938175070866 and parameters: {'n_estimators': 70, 'max_depth': 9, 'min_samples_split': 12, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 64 with value: 0.8443909507359153.
[I 2024-06-04 22:58:03,738] Trial 67 finished with value: 0.8218685501242493 and parameters: {'n_estimators': 277, 'max_depth': 7, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 64 with value: 0.8443909507359153.
[I 2024-06-04 22:58:04,480] Trial 68 finished with value: 0.8386948786260386 and parameters: {'n_estimators': 54, 'max_depth': 9, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 64 with value: 0.8443909507359153.
[I 2024-06-04 22:58:06,640] Trial 69 finished with value: 0.8323874195522979 and parameters: {'n_estimators': 105, 'max_depth': 8, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': None}. Best is trial 64 with value: 0.8443909507359

[I 2024-06-04 22:59:39,973] Trial 99 finished with value: 0.8399896873485198 and parameters: {'n_estimators': 165, 'max_depth': 9, 'min_samples_split': 15, 'min_samples_leaf': 1, 'max_features': None}. Best is trial 78 with value: 0.8475195897318452.


Best hyperparameters:  {'n_estimators': 128, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None}
Best R² score:  0.8475195897318452


XGBoost Regressor has perfromed the best, will be going ahead with using this as the final model

# Final Model

In [73]:
pipeline = Pipeline([
    ('transformer', transformer),
    ('xgb', XGBRegressor(
        n_estimators=289,
        learning_rate=0.04922782681150982,
        max_depth=5,
        subsample=0.8179937232840969,
        colsample_bytree=0.5800252561667454,
        reg_alpha=0.4738156679000969,
        reg_lambda=0.8875763169039237
    ))
])
pipeline.fit(X_train,y_train)

In [74]:
y_pred=pipeline.predict(X_test)
mean_absolute_error(y_pred,y_test)

0.25936045303883337

In [75]:
r2_score(y_pred,y_test)

0.8402045299794927

In [76]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [77]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [79]:
# Note, I had dropped a few columns from my original cleaned dataset which I felt was not important before feature selection
# However, I feel that the society column which had a lot of categories could somehow be used, by using clustering or PCA
# Can try that out later, however an r2 score of 84 with mae 26 lac is not bad for the current state of the project