In [55]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.model_selection import train_test_split , GridSearchCV , RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score , mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from xgboost import XGBRegressor

In [56]:
df = pd.read_csv('cleaned_data.csv')

In [57]:
X = df.drop('price', axis=1)
Y = df['price']

In [58]:
X_train , x_test , y_train , y_test = train_test_split(X , Y , test_size=0.3, random_state= 69)

In [59]:
num_features = X.select_dtypes(include=['int', 'float64']).columns
cat_features = X.select_dtypes(include=['object', 'category']).columns

In [60]:
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_features),
     ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
])

In [61]:
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb)
])


In [62]:
param_dist = {
    'model__n_estimators': [100, 200, 300, 400, 500],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'model__max_depth': [3, 5, 7, 9],
    'model__subsample': [0.6, 0.8, 1.0],
    'model__colsample_bytree': [0.6, 0.8, 1.0]
}

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

random_search.fit(X_train, y_train)

print("Best Parameters (Randomized):", random_search.best_params_)
print("Best CV Score:", random_search.best_score_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters (Randomized): {'model__subsample': 0.8, 'model__n_estimators': 500, 'model__max_depth': 5, 'model__learning_rate': 0.2, 'model__colsample_bytree': 0.8}
Best CV Score: 0.9455512370302148


In [63]:
param_grid = {
    'model__n_estimators': [random_search.best_params_['model__n_estimators'] - 50,
                            random_search.best_params_['model__n_estimators'],
                            random_search.best_params_['model__n_estimators'] + 50],
    'model__learning_rate': [0.05, 0.1, 0.15],
    'model__max_depth': [random_search.best_params_['model__max_depth'] - 1,
                         random_search.best_params_['model__max_depth'],
                         random_search.best_params_['model__max_depth'] + 1]
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)

print("Best Parameters (Grid):", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Parameters (Grid): {'model__learning_rate': 0.15, 'model__max_depth': 6, 'model__n_estimators': 550}
Best CV Score: 0.9459618776477523


In [66]:
best_model = grid_search.best_estimator_

y_pred = best_model.predict(x_test)

print("R2 Score:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("MAPE:", mean_absolute_percentage_error(y_test, y_pred))


R2 Score: 0.9129127422511554
MAE: 0.15389707036119185
RMSE: 0.2948502221731438
MAPE: 0.009383275376785679


# A simple note that the price and area_sqft are in np.log1p form already in the dataset as i set them after cleaning the data in notebook one.