In [1]:
#!pip install -q catboost optuna

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import optuna

# Load the Dataset

In [3]:
cars = pd.read_csv('/content/quikr_car.csv')

# Data Cleaning

In [4]:
cars = cars[~cars['fuel_type'].isna()]
cars['kms_driven'] = cars['kms_driven'].str.split().str.get(0).str.replace(',', '').astype(float)
cars = cars[cars['Price'] != 'Ask For Price']
cars['Price'] = cars['Price'].str.replace(',', '').astype(float)
cars = cars[cars['year'].str.isnumeric()]
cars['year'] = cars['year'].astype(int)
cars['name'] = cars['name'].str.split().str.slice(0, 3).str.join(' ')
cars = cars[cars['Price'] < 6000000].reset_index(drop=True)

In [5]:
cars.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000.0,45000.0,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000.0,40.0,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000.0,28000.0,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000.0,36000.0,Diesel
4,Ford Figo,Ford,2012,175000.0,41000.0,Diesel


# Feature Engineering

In [6]:
cars['car_age'] = 2025 - cars['year']
cars['kms_per_year'] = cars['kms_driven'] / (cars['car_age'] + 1) #This is to avoid getting divided by zero.
cars['is_premium_brand'] = cars['company'].isin(['Audi', 'BMW', 'Mercedes', 'Jaguar', 'Volvo']).astype(int)

In [7]:
cars.head(1)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type,car_age,kms_per_year,is_premium_brand
0,Hyundai Santro Xing,Hyundai,2007,80000.0,45000.0,Petrol,18,2368.421053,0


# Features and target

In [8]:
X = cars[['name', 'company', 'fuel_type', 'car_age', 'kms_per_year', 'is_premium_brand']]
y = cars['Price']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipeline

In [10]:
column_trans = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), ['name', 'company', 'fuel_type']),  # CatBoost handles categorical features
        ('num', StandardScaler(), ['car_age', 'kms_per_year', 'is_premium_brand'])
    ],
    remainder='passthrough'
)

# Model Training and Evaluation

In [11]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2 = r2_score(true, predicted)
    return mae, rmse, r2

In [12]:
models = {
    'Ridge': Ridge(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42, verbose=-1),
    'CatBoost': CatBoostRegressor(cat_features=['name', 'company', 'fuel_type'], verbose=0, random_state=42)
}

# Hyperparameter tuning with Optuna

In [13]:
def objective(trial, model_name, X_train, y_train):
    if model_name == 'Ridge':
        params = {'alpha': trial.suggest_float('alpha', 0.01, 100.0, log=True)}
        model = Ridge(**params)
    elif model_name == 'Random Forest':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 5, 20),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10)
        }
        model = RandomForestRegressor(**params, random_state=42)
    elif model_name == 'Gradient Boosting':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 10)
        }
        model = GradientBoostingRegressor(**params, random_state=42)
    elif model_name == 'XGBoost':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 10)
        }
        model = XGBRegressor(**params, random_state=42)
    elif model_name == 'LightGBM':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 20, 50),
            'max_depth': trial.suggest_int('max_depth', 3, 10)
        }
        model = LGBMRegressor(**params, random_state=42, verbose=-1)
    elif model_name == 'CatBoost':
        params = {
            'iterations': trial.suggest_int('iterations', 50, 300),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'depth': trial.suggest_int('depth', 4, 10)
        }
        model = CatBoostRegressor(**params, cat_features=['name', 'company', 'fuel_type'], verbose=0, random_state=42)

    if model_name in ['Ridge', 'Random Forest', 'Gradient Boosting', 'XGBoost', 'LightGBM']:
        pipe = make_pipeline(column_trans, model)
        pipe.fit(X_train, y_train)
        score = cross_val_score(pipe, X_train, y_train, cv=5, scoring='r2').mean()
    else:  # CatBoost
        model.fit(X_train, y_train)
        score = cross_val_score(model, X_train, y_train, cv=5, scoring='r2').mean()

    return score

# Store results

In [14]:
results = {}
best_params = {}

In [15]:
# Train and evaluate models
for name in models:
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, name, X_train, y_train), n_trials=50)
    best_params[name] = study.best_params

    # Train with best params
    if name in ['Ridge', 'Random Forest', 'Gradient Boosting', 'XGBoost', 'LightGBM']:
        model = models[name].set_params(**best_params[name])
        pipe = make_pipeline(column_trans, model)
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
    else:  # CatBoost
        model = CatBoostRegressor(**best_params[name], cat_features=['name', 'company', 'fuel_type'], verbose=0, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    # Evaluate
    mae, rmse, r2 = evaluate_model(y_test, y_pred)
    cv_scores = cross_val_score(model if name == 'CatBoost' else pipe, X_train, y_train, cv=5, scoring='r2')

    results[name] = {
        'Test MAE': mae,
        'Test RMSE': rmse,
        'Test R2': r2,
        'CV R2 Mean': cv_scores.mean(),
        'CV R2 Std': cv_scores.std() * 2
    }

    print(f"{name}")
    print('----------------------------------')
    print('Model performance for Test set')
    print(f"- Root Mean Squared Error: {rmse:.4f}")
    print(f"- Mean Absolute Error: {mae:.4f}")
    print(f"- R2 Score: {r2:.4f}")
    print(f"Cross-Validation R2: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print(f"Best Parameters: {best_params[name]}")
    print('='*35)
    print('\n')


[I 2025-06-27 09:26:19,686] A new study created in memory with name: no-name-a5157a23-9b41-4371-8403-6a4aa14b9139
[I 2025-06-27 09:26:20,048] Trial 0 finished with value: 0.4575161590434778 and parameters: {'alpha': 0.2659945982928723}. Best is trial 0 with value: 0.4575161590434778.
[I 2025-06-27 09:26:20,400] Trial 1 finished with value: 0.5561273523546855 and parameters: {'alpha': 0.6637271404965887}. Best is trial 1 with value: 0.5561273523546855.
[I 2025-06-27 09:26:20,936] Trial 2 finished with value: 0.5546299961137056 and parameters: {'alpha': 0.6494720921992566}. Best is trial 1 with value: 0.5561273523546855.
[I 2025-06-27 09:26:21,372] Trial 3 finished with value: 0.5754420709982974 and parameters: {'alpha': 4.027347812229777}. Best is trial 3 with value: 0.5754420709982974.
[I 2025-06-27 09:26:21,881] Trial 4 finished with value: 0.4454353741066585 and parameters: {'alpha': 26.459289051005474}. Best is trial 3 with value: 0.5754420709982974.
[I 2025-06-27 09:26:22,425] Tria

Ridge
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 245576.3729
- Mean Absolute Error: 123235.1584
- R2 Score: 0.7039
Cross-Validation R2: 0.5929 (+/- 0.5889)
Best Parameters: {'alpha': 1.8464783371396272}




[I 2025-06-27 09:26:48,666] Trial 0 finished with value: 0.5984512057730145 and parameters: {'n_estimators': 124, 'max_depth': 19, 'min_samples_split': 5}. Best is trial 0 with value: 0.5984512057730145.
[I 2025-06-27 09:26:56,884] Trial 1 finished with value: 0.602660168725356 and parameters: {'n_estimators': 293, 'max_depth': 15, 'min_samples_split': 9}. Best is trial 1 with value: 0.602660168725356.
[I 2025-06-27 09:27:02,504] Trial 2 finished with value: 0.5594111246186122 and parameters: {'n_estimators': 271, 'max_depth': 9, 'min_samples_split': 8}. Best is trial 1 with value: 0.602660168725356.
[I 2025-06-27 09:27:04,354] Trial 3 finished with value: 0.5816953536296774 and parameters: {'n_estimators': 52, 'max_depth': 15, 'min_samples_split': 4}. Best is trial 1 with value: 0.602660168725356.
[I 2025-06-27 09:27:11,072] Trial 4 finished with value: 0.5858064036283059 and parameters: {'n_estimators': 230, 'max_depth': 12, 'min_samples_split': 9}. Best is trial 1 with value: 0.6026

Random Forest
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 243833.4298
- Mean Absolute Error: 112156.3796
- R2 Score: 0.7080
Cross-Validation R2: 0.6339 (+/- 0.4044)
Best Parameters: {'n_estimators': 53, 'max_depth': 19, 'min_samples_split': 8}




[I 2025-06-27 09:29:46,629] Trial 0 finished with value: 0.548722174651341 and parameters: {'n_estimators': 127, 'learning_rate': 0.018596836581538226, 'max_depth': 7}. Best is trial 0 with value: 0.548722174651341.
[I 2025-06-27 09:29:47,839] Trial 1 finished with value: 0.5288025166734622 and parameters: {'n_estimators': 102, 'learning_rate': 0.13450266597444857, 'max_depth': 4}. Best is trial 0 with value: 0.548722174651341.
[I 2025-06-27 09:29:51,796] Trial 2 finished with value: 0.48560988292861823 and parameters: {'n_estimators': 269, 'learning_rate': 0.022176593990801036, 'max_depth': 5}. Best is trial 0 with value: 0.548722174651341.
[I 2025-06-27 09:29:52,891] Trial 3 finished with value: 0.5777127977050466 and parameters: {'n_estimators': 66, 'learning_rate': 0.09943753412516416, 'max_depth': 7}. Best is trial 3 with value: 0.5777127977050466.
[I 2025-06-27 09:29:55,138] Trial 4 finished with value: 0.5635378427677946 and parameters: {'n_estimators': 107, 'learning_rate': 0.0

Gradient Boosting
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 249709.3888
- Mean Absolute Error: 98092.5229
- R2 Score: 0.6938
Cross-Validation R2: 0.5964 (+/- 0.3051)
Best Parameters: {'n_estimators': 121, 'learning_rate': 0.280435733110256, 'max_depth': 9}




[I 2025-06-27 09:32:01,901] Trial 0 finished with value: 0.5231874549171094 and parameters: {'n_estimators': 170, 'learning_rate': 0.02821850299765049, 'max_depth': 8}. Best is trial 0 with value: 0.5231874549171094.
[I 2025-06-27 09:32:02,476] Trial 1 finished with value: 0.5764420387995841 and parameters: {'n_estimators': 147, 'learning_rate': 0.10716678393017137, 'max_depth': 4}. Best is trial 1 with value: 0.5764420387995841.
[I 2025-06-27 09:32:03,427] Trial 2 finished with value: 0.531086459353246 and parameters: {'n_estimators': 158, 'learning_rate': 0.056834049547027664, 'max_depth': 7}. Best is trial 1 with value: 0.5764420387995841.
[I 2025-06-27 09:32:04,311] Trial 3 finished with value: 0.5330803389470583 and parameters: {'n_estimators': 107, 'learning_rate': 0.04941673156361871, 'max_depth': 8}. Best is trial 1 with value: 0.5764420387995841.
[I 2025-06-27 09:32:05,392] Trial 4 finished with value: 0.5488648897287763 and parameters: {'n_estimators': 199, 'learning_rate': 0

XGBoost
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 243176.3331
- Mean Absolute Error: 122437.5621
- R2 Score: 0.7096
Cross-Validation R2: 0.6250 (+/- 0.4761)
Best Parameters: {'n_estimators': 123, 'learning_rate': 0.23300330910687775, 'max_depth': 3}




[I 2025-06-27 09:32:44,053] Trial 0 finished with value: 0.29633269481826396 and parameters: {'n_estimators': 273, 'learning_rate': 0.010999976457410302, 'num_leaves': 25, 'max_depth': 5}. Best is trial 0 with value: 0.29633269481826396.
[I 2025-06-27 09:32:44,510] Trial 1 finished with value: 0.1728078695949384 and parameters: {'n_estimators': 246, 'learning_rate': 0.20899362882105885, 'num_leaves': 27, 'max_depth': 7}. Best is trial 0 with value: 0.29633269481826396.
[I 2025-06-27 09:32:44,938] Trial 2 finished with value: 0.30894061246004795 and parameters: {'n_estimators': 185, 'learning_rate': 0.028898179412532508, 'num_leaves': 44, 'max_depth': 9}. Best is trial 2 with value: 0.30894061246004795.
[I 2025-06-27 09:32:45,221] Trial 3 finished with value: 0.33052190724403774 and parameters: {'n_estimators': 118, 'learning_rate': 0.08863083918265997, 'num_leaves': 40, 'max_depth': 6}. Best is trial 3 with value: 0.33052190724403774.
[I 2025-06-27 09:32:45,692] Trial 4 finished with v

LightGBM
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 305425.6696
- Mean Absolute Error: 172564.2593
- R2 Score: 0.5419
Cross-Validation R2: 0.3387 (+/- 0.5714)
Best Parameters: {'n_estimators': 121, 'learning_rate': 0.10258630660049955, 'num_leaves': 23, 'max_depth': 6}




[I 2025-06-27 09:33:04,360] Trial 0 finished with value: 0.49794280025548054 and parameters: {'iterations': 91, 'learning_rate': 0.06202321423626717, 'depth': 6}. Best is trial 0 with value: 0.49794280025548054.
[I 2025-06-27 09:33:09,272] Trial 1 finished with value: 0.5267673902947463 and parameters: {'iterations': 288, 'learning_rate': 0.042373979526771466, 'depth': 7}. Best is trial 1 with value: 0.5267673902947463.
[I 2025-06-27 09:33:15,879] Trial 2 finished with value: 0.4995922995353964 and parameters: {'iterations': 274, 'learning_rate': 0.27296546587825493, 'depth': 7}. Best is trial 1 with value: 0.5267673902947463.
[I 2025-06-27 09:33:18,226] Trial 3 finished with value: 0.5024616039355003 and parameters: {'iterations': 205, 'learning_rate': 0.015163022766165058, 'depth': 6}. Best is trial 1 with value: 0.5267673902947463.
[I 2025-06-27 09:33:21,821] Trial 4 finished with value: 0.5410080575345031 and parameters: {'iterations': 163, 'learning_rate': 0.04063653475826705, 'de

CatBoost
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 270143.9611
- Mean Absolute Error: 131018.9874
- R2 Score: 0.6416
Cross-Validation R2: 0.5769 (+/- 0.4659)
Best Parameters: {'iterations': 225, 'learning_rate': 0.14369073940801055, 'depth': 10}




In [16]:
# Compare models
results_df = pd.DataFrame(results).T
best_model = results_df['Test R2'].idxmax()
best_r2 = results_df['Test R2'].max()
print("===== Model Comparison =====")
print(results_df[['Test R2', 'Test MAE', 'Test RMSE', 'CV R2 Mean', 'CV R2 Std']].sort_values(by='Test R2', ascending=False))
print(f"\nBest Model: {best_model} with Test R2 Score: {best_r2:.4f}")

===== Model Comparison =====
                    Test R2       Test MAE      Test RMSE  CV R2 Mean  \
XGBoost            0.709613  122437.562141  243176.333139    0.625049   
Random Forest      0.708041  112156.379565  243833.429829    0.633942   
Ridge              0.703853  123235.158417  245576.372861    0.592925   
Gradient Boosting  0.693800   98092.522905  249709.388777    0.596444   
CatBoost           0.641635  131018.987387  270143.961064    0.576946   
LightGBM           0.541915  172564.259305  305425.669604    0.338732   

                   CV R2 Std  
XGBoost             0.476092  
Random Forest       0.404444  
Ridge               0.588902  
Gradient Boosting   0.305059  
CatBoost            0.465875  
LightGBM            0.571351  

Best Model: XGBoost with Test R2 Score: 0.7096


In [17]:
# Example prediction
sample = pd.DataFrame(columns=X.columns, data=np.array(['Maruti Suzuki Swift', 'Maruti', 'Petrol', 6, 100/6, 0]).reshape(1, -1))
print("\nPrediction for sample (Maruti Suzuki Swift, Maruti, Petrol, 6 years, 100 kms/year, non-premium):")
for name in models:
    if name in ['Ridge', 'Random Forest', 'Gradient Boosting', 'XGBoost', 'LightGBM']:
        model = models[name].set_params(**best_params[name])
        pipe = make_pipeline(column_trans, model)
        pipe.fit(X_train, y_train)
        pred = pipe.predict(sample)
    else:  # CatBoost
        model = CatBoostRegressor(**best_params[name], cat_features=['name', 'company', 'fuel_type'], verbose=0, random_state=42)
        model.fit(X_train, y_train)
        pred = model.predict(sample)
    print(f"{name}: {pred[0]:.2f}")


Prediction for sample (Maruti Suzuki Swift, Maruti, Petrol, 6 years, 100 kms/year, non-premium):
Ridge: 450362.22
Random Forest: 610535.12
Gradient Boosting: 601832.69
XGBoost: 463929.81
LightGBM: 508249.40




CatBoost: 494450.41
