In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.model_selection import train_test_split , GridSearchCV , RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score , mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from xgboost import XGBRegressor

In [3]:
df = pd.read_csv('/workspaces/Ahmedabad-Flat-Price-/dataset/cleaned_data.csv')

In [4]:
X = df.drop('price', axis=1)
Y = df['price']

In [5]:
X_train , x_test , y_train , y_test = train_test_split(X , Y , test_size=0.3, random_state= 69)

In [6]:
num_features = X.select_dtypes(include=['int', 'float64']).columns
cat_features = X.select_dtypes(include=['object', 'category']).columns

In [7]:
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_features),
     ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
])

In [8]:
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb)
])


In [9]:
param_dist = {
    'model__n_estimators': [100, 200, 300, 400, 500],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'model__max_depth': [3, 5, 7, 9],
    'model__subsample': [0.6, 0.8, 1.0],
    'model__colsample_bytree': [0.6, 0.8, 1.0]
}

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

random_search.fit(X_train, y_train)

print("Best Parameters (Randomized):", random_search.best_params_)
print("Best CV Score:", random_search.best_score_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[CV] END model__colsample_bytree=0.8, model__learning_rate=0.05, model__max_depth=7, model__n_estimators=400, model__subsample=0.8; total time=   1.4s
[CV] END model__colsample_bytree=0.8, model__learning_rate=0.05, model__max_depth=7, model__n_estimators=400, model__subsample=0.8; total time=   1.4s
[CV] END model__colsample_bytree=0.8, model__learning_rate=0.05, model__max_depth=7, model__n_estimators=400, model__subsample=0.8; total time=   1.2s
[CV] END model__colsample_bytree=0.8, model__learning_rate=0.05, model__max_depth=7, model__n_estimators=400, model__subsample=0.8; total time=   1.3s
[CV] END model__colsample_bytree=0.8, model__learning_rate=0.05, model__max_depth=7, model__n_estimators=400, model__subsample=0.8; total time=   1.3s
[CV] END model__colsample_bytree=0.8, model__learning_rate=0.01, model__max_depth=9, model__n_estimators=200, model__subsample=1.0; total time=   2.5s
[CV] END model__colsample_bytree=0.8, model__learning_rate=0.01, model__max_depth=9, model__n_

In [10]:
param_grid = {
    'model__n_estimators': [random_search.best_params_['model__n_estimators'] - 50,
                            random_search.best_params_['model__n_estimators'],
                            random_search.best_params_['model__n_estimators'] + 50],
    'model__learning_rate': [0.05, 0.1, 0.15],
    'model__max_depth': [random_search.best_params_['model__max_depth'] - 1,
                         random_search.best_params_['model__max_depth'],
                         random_search.best_params_['model__max_depth'] + 1]
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)

print("Best Parameters (Grid):", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)


Fitting 5 folds for each of 27 candidates, totalling 135 fits


[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=450; total time=   0.6s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=450; total time=   0.7s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=450; total time=   0.7s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=450; total time=   0.7s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=450; total time=   0.6s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=500; total time=   0.6s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=500; total time=   0.8s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=500; total time=   0.8s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=500; total time=   0.6s
[CV] END model__learning_rate=0.05, model__max_depth=4, model__n_estimators=500; total time=   0.7s


In [11]:
best_model = grid_search.best_estimator_

y_pred = best_model.predict(x_test)

print("R2 Score:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("MAPE:", mean_absolute_percentage_error(y_test, y_pred))


R2 Score: 0.9120366580390529
MAE: 0.1551508210733535
RMSE: 0.2961413082372654
MAPE: 0.009460337290196007


In [12]:
#  original predicted value in INR

print("original predicted value of price column :")
print(list(np.expm1(y_pred)))

original predicted value of price column :
[np.float32(15359181.0), np.float32(4957698.5), np.float32(5552364.5), np.float32(29460660.0), np.float32(2659768.0), np.float32(6839432.5), np.float32(28158246.0), np.float32(33838976.0), np.float32(33823170.0), np.float32(12608576.0), np.float32(4716195.5), np.float32(5043884.0), np.float32(3977670.0), np.float32(51613040.0), np.float32(38366676.0), np.float32(8480721.0), np.float32(7508803.5), np.float32(4651716.0), np.float32(26718692.0), np.float32(9185934.0), np.float32(35865096.0), np.float32(9946373.0), np.float32(10591434.0), np.float32(5042532.5), np.float32(65053388.0), np.float32(12897440.0), np.float32(43234920.0), np.float32(6178500.0), np.float32(28164100.0), np.float32(5618896.5), np.float32(2447478.8), np.float32(5135362.0), np.float32(23738544.0), np.float32(56051400.0), np.float32(43548176.0), np.float32(4928029.5), np.float32(78023280.0), np.float32(44826880.0), np.float32(4753602.5), np.float32(5068234.5), np.float32(48906

# A simple note that the price and area_sqft column are in np.log1p form already in the dataset as i set them after cleaning the data in notebook one.

In [13]:
import joblib

joblib.dump(best_model, 'xgboost_pipeline_model.pkl')


['xgboost_pipeline_model.pkl']