In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.model_selection import train_test_split , GridSearchCV , RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score , mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from xgboost import XGBRegressor

In [3]:
# Load the dataset from the CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer='cleaned_data.csv')

In [4]:
# 'price' is the regression target; every other column is kept as a predictor.
Y = df['price']
X = df.drop(columns='price')

In [6]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=69,
)

In [7]:
# Split the predictor columns by dtype so each group can get its own preprocessing.
#
# 'number' matches every numeric dtype regardless of width. A narrower filter such
# as ['int', 'float64'] skips columns stored as e.g. float32 or int16; those columns
# would end up in neither list, and a ColumnTransformer left at its default
# remainder='drop' would then discard them without any warning.
num_features = X.select_dtypes(include='number').columns

# Text-like predictors: plain object columns and pandas categoricals.
cat_features = X.select_dtypes(include=['object', 'category']).columns

In [9]:
# Column-wise preprocessing:
#   * 'num' - standardise the numeric columns
#   * 'cat' - one-hot encode the categorical columns; categories that were not
#             seen during fit are ignored at transform time instead of raising
preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ]
)

In [15]:
# Gradient-boosted regressor trained on squared error, seeded for repeatability.
xgb = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
)

# Chain preprocessing and the model so both are fitted together on the same data.
# The step names ('preprocessor', 'model') are what hyper-parameter keys are
# prefixed with, e.g. 'model__max_depth'.
pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', xgb),
    ]
)


In [16]:
# Candidate hyper-parameter values for the pipeline's 'model' step.
# Each keyword is '<step name>__<parameter name>'.
param_dist = dict(
    model__n_estimators=[100, 200, 300, 400, 500],
    model__learning_rate=[0.01, 0.05, 0.1, 0.2],
    model__max_depth=[3, 5, 7, 9],
    model__subsample=[0.6, 0.8, 1.0],
    model__colsample_bytree=[0.6, 0.8, 1.0],
)

# Randomized search: 20 sampled combinations, each scored by 5-fold
# cross-validated R^2. n_jobs=-1 runs the fits in parallel on all cores and the
# seed fixes which combinations are drawn.
random_search = RandomizedSearchCV(
    pipeline,
    param_dist,
    n_iter=20,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Fit on the training split only; the held-out split is not touched here.
random_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print("Best Parameters (Randomized):", random_search.best_params_)
print("Best CV Score:", random_search.best_score_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters (Randomized): {'model__subsample': 0.8, 'model__n_estimators': 500, 'model__max_depth': 5, 'model__learning_rate': 0.2, 'model__colsample_bytree': 0.8}
Best CV Score: 0.9455512370302148
