In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/predicting-car-selling-price/y_train.csv
/kaggle/input/predicting-car-selling-price/X_train.csv
/kaggle/input/predicting-car-selling-price/__results__.html
/kaggle/input/predicting-car-selling-price/__notebook__.ipynb
/kaggle/input/predicting-car-selling-price/__output__.json
/kaggle/input/predicting-car-selling-price/car_processed.csv
/kaggle/input/predicting-car-selling-price/custom.css
/kaggle/input/predicting-car-selling-price/__results___files/__results___47_0.png
/kaggle/input/predicting-car-selling-price/__results___files/__results___45_0.png
/kaggle/input/predicting-car-selling-price/__results___files/__results___53_0.png
/kaggle/input/predicting-car-selling-price/__results___files/__results___56_0.png
/kaggle/input/predicting-car-selling-price/__results___files/__results___78_0.png
/kaggle/input/predicting-car-selling-price/__results___files/__results___58_0.png
/kaggle/input/predicting-car-selling-price/__results___files/__results___51_0.png
/kaggle/input/predic

In [2]:
!pip install -q xgboost
!pip install -q lightgbm

# Hyperparameter tuning of the best models for car price prediction

In [3]:
import warnings
import random

# Scikit-learn Components
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.compose import TransformedTargetRegressor

# Model Algorithms
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Training features are read as a DataFrame.
X_train = pd.read_csv('/kaggle/input/predicting-car-selling-price/X_train.csv')

# The target file is read as a DataFrame and then squeezed, which collapses
# any length-1 axis (a single-column frame becomes a Series).
y_train_frame = pd.read_csv('/kaggle/input/predicting-car-selling-price/y_train.csv')
y_train = y_train_frame.squeeze()

## Setting global seed

In [4]:
def set_seed(seed=42):
    """
    Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    then writes the seed to the ``PYTHONHASHSEED`` environment variable and
    prints a confirmation message.

    Parameters
    ----------
    seed : int, default 42
        Value passed to every seeding call.

    Notes
    -----
    ``PYTHONHASHSEED`` is read when an interpreter starts, so assigning it
    here does not change string hashing in the already-running kernel; it
    is only visible to processes that inherit this environment afterwards.
    """
    # Seed the in-process generators first, in the same order as before.
    for seed_generator in (random.seed, np.random.seed):
        seed_generator(seed)

    # Environment values must be strings.
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to {seed}")

set_seed(42)

Random seed set to 42


## Distinguishing features

In [5]:
# Columns that are one-hot encoded by the preprocessor.
categorical_features = ['fuel', 'seller_type', 'transmission', 'owner', 'brand', 'model']

# Names of every column in X_train whose dtype is numeric.
numeric_columns = X_train.select_dtypes(include=np.number).columns
numerical_features = numeric_columns.tolist()

## Preprocessor

In [6]:
# Predefine the categories for the one-hot encoder: fit a throwaway encoder on
# the categorical columns of X_train and keep the category lists it learned.
ohe = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False).fit(
    X_train[categorical_features]
)
known_categories = ohe.categories_

# Encoder used inside the pipeline: it is handed the category lists explicitly
# instead of learning them from whatever data it is fitted on.
categorical_encoder = OneHotEncoder(
    categories=known_categories,
    handle_unknown='ignore',
    drop='first',
    sparse_output=False,
)

# One-hot encode the categorical columns; every other column is passed
# through unchanged.
preprocessor_tree = ColumnTransformer(
    transformers=[('cat', categorical_encoder, categorical_features)],
    remainder='passthrough',
)

## Search Space

In [7]:
# Search spaces for RandomizedSearchCV. Every key carries the
# 'regressor__model__' prefix because the estimator being tuned is the 'model'
# step of a Pipeline that is itself the 'regressor' of a
# TransformedTargetRegressor.

# -- RandomForest --
rf_param_grid = {
    # 100, 200, ..., 1200 trees
    'regressor__model__n_estimators': list(range(100, 1201, 100)),
    'regressor__model__max_features': ['sqrt', 'log2', 1.0],
    # 10, 20, ..., 110, plus None
    'regressor__model__max_depth': list(range(10, 111, 10)) + [None],
    'regressor__model__min_samples_split': [2, 5, 10],
    'regressor__model__min_samples_leaf': [1, 2, 4],
}

# -- XGBoost --
xgb_param_grid = {
    # 100, 200, ..., 1000 boosting rounds
    'regressor__model__n_estimators': list(range(100, 1001, 100)),
    'regressor__model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'regressor__model__max_depth': [3, 4, 5, 6, 8],
    'regressor__model__subsample': [0.7, 0.8, 0.9, 1.0],
    'regressor__model__colsample_bytree': [0.7, 0.8, 0.9, 1.0],
}

# -- LightGBM --
lgbm_param_grid = {
    # 100, 200, ..., 1000 boosting rounds
    'regressor__model__n_estimators': list(range(100, 1001, 100)),
    'regressor__model__learning_rate': [0.01, 0.05, 0.1],
    'regressor__model__num_leaves': [20, 31, 40, 50],
    'regressor__model__max_depth': [-1, 5, 10, 15],
    'regressor__model__colsample_bytree': [0.7, 0.8, 0.9, 1.0],
}

# Registry of candidates: display name -> (unfitted estimator, search space).
# Insertion order is the order in which the models are tuned and reported.
models_to_tune = dict([
    ('RandomForest', (RandomForestRegressor(random_state=42), rf_param_grid)),
    ('XGBoost', (XGBRegressor(random_state=42), xgb_param_grid)),
    ('LightGBM', (LGBMRegressor(random_state=42, verbosity=-1), lgbm_param_grid)),
])

## Run Hyperparameter tuning

In [8]:
# Maps each model name to the best parameter setting found by its search.
best_params_found = {}
banner = "=" * 30

for name, (model, param_grid) in models_to_tune.items():
    print("\n" + banner)
    print(f"RUNNING TUNING FOR: {name}")
    print(banner)

    # Feature selection driven by a small ExtraTrees model.
    # NOTE(review): with the default threshold, max_features=35 acts as an
    # upper bound, so fewer than 35 features may be kept. Confirm whether
    # exactly the top 35 were intended (that would need threshold=-np.inf).
    feature_selector = SelectFromModel(
        ExtraTreesRegressor(n_estimators=50, random_state=42),
        max_features=35,
    )

    # Full pipeline: encode -> select features -> candidate model.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor_tree),
        ('selector', feature_selector),
        ('model', model),
    ])

    # Fit on log1p(y) and map predictions back with expm1.
    final_pipeline = TransformedTargetRegressor(
        regressor=pipeline,
        func=np.log1p,
        inverse_func=np.expm1,
    )

    # Randomized search: 50 sampled settings, each scored by 5-fold CV on R².
    random_search = RandomizedSearchCV(
        estimator=final_pipeline,
        param_distributions=param_grid,
        n_iter=50,
        cv=5,
        verbose=2,
        random_state=42,
        n_jobs=-1,
        scoring='r2',
    )
    random_search.fit(X_train, y_train)

    # Record and report the winning setting for this model.
    tuned_params = random_search.best_params_
    best_params_found[name] = tuned_params
    print(f"\nBest parameters for {name}:")
    print(tuned_params)
    print(f"Best R² score found: {random_search.best_score_:.4f}")



RUNNING TUNING FOR: RandomForest
Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END regressor__model__max_depth=None, regressor__model__max_features=1.0, regressor__model__min_samples_leaf=4, regressor__model__min_samples_split=5, regressor__model__n_estimators=100; total time=  41.8s
[CV] END regressor__model__max_depth=80, regressor__model__max_features=sqrt, regressor__model__min_samples_leaf=2, regressor__model__min_samples_split=5, regressor__model__n_estimators=300; total time=  40.8s
[CV] END regressor__model__max_depth=80, regressor__model__max_features=sqrt, regressor__model__min_samples_leaf=2, regressor__model__min_samples_split=5, regressor__model__n_estimators=300; total time=  40.7s
[CV] END regressor__model__max_depth=80, regressor__model__max_features=log2, regressor__model__min_samples_leaf=2, regressor__model__min_samples_split=5, regressor__model__n_estimators=600; total time=  42.1s
[CV] END regressor__model__max_depth=80, regressor__model__max_

## Summary

In [9]:
# Recap: print each model's name followed by its best parameter dict.
print("\n\n--- SUMMARY OF BEST PARAMETERS ---")
for name in best_params_found:
    params = best_params_found[name]
    print(f"\n{name}:", params, sep="\n")



--- SUMMARY OF BEST PARAMETERS ---

RandomForest:
{'regressor__model__n_estimators': 500, 'regressor__model__min_samples_split': 2, 'regressor__model__min_samples_leaf': 1, 'regressor__model__max_features': 1.0, 'regressor__model__max_depth': 20}

XGBoost:
{'regressor__model__subsample': 0.8, 'regressor__model__n_estimators': 500, 'regressor__model__max_depth': 4, 'regressor__model__learning_rate': 0.2, 'regressor__model__colsample_bytree': 0.8}

LightGBM:
{'regressor__model__num_leaves': 40, 'regressor__model__n_estimators': 1000, 'regressor__model__max_depth': -1, 'regressor__model__learning_rate': 0.01, 'regressor__model__colsample_bytree': 0.8}
