In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor
import datetime


df = pd.read_csv('used_cars_simplified.csv')


if 'model' in df.columns:
    df = df.drop(columns=['model'])


df['milage'] = df['milage'].astype(str).str.replace(',', '', regex=False)
df['milage'] = df['milage'].str.replace('mi.', '', regex=False)
df['milage'] = df['milage'].str.replace('miles', '', regex=False)
df['milage'] = df['milage'].str.replace('mi', '', regex=False)
df['milage'] = df['milage'].str.strip()
df['milage'] = pd.to_numeric(df['milage'], errors='coerce')


df['price'] = df['price'].astype(str).str.replace('$', '', regex=False)
df['price'] = df['price'].str.replace(',', '', regex=False)
df['price'] = df['price'].str.strip()
df['price'] = pd.to_numeric(df['price'], errors='coerce')


for col in ['model_year', 'engine_displacement_l', 'engine_cylinders']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')


categorical_cols = [col for col in df.columns if df[col].dtype == 'object']
for col in categorical_cols:
    df[col] = df[col].fillna('missing')


price_q_low = df['price'].quantile(0.01)
price_q_high = df['price'].quantile(0.99)
milage_q_low = df['milage'].quantile(0.01)
milage_q_high = df['milage'].quantile(0.99)
df = df[(df['price'] >= price_q_low) & (df['price'] <= price_q_high)]
df = df[(df['milage'] >= milage_q_low) & (df['milage'] <= milage_q_high)]
df = df.dropna(subset=['price'])
current_year = datetime.datetime.now().year


df['car_age'] = current_year - df['model_year']
df['price_log'] = np.log1p(df['price'])
X = df.drop(['price', 'price_log'], axis=1)
y = df['price_log']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
categorical_features = [col for col in X.columns if X[col].dtype == 'object']

model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    eval_metric='R2',
    random_seed=42,
    cat_features=categorical_features,
    verbose=100
)
model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

y_pred_log = model.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test)
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.4f}")


0:	learn: 0.0610500	test: 0.0585040	best: 0.0585040 (0)	total: 16.3ms	remaining: 16.2s
100:	learn: 0.8524421	test: 0.8140291	best: 0.8140757 (99)	total: 1.32s	remaining: 11.8s
200:	learn: 0.8855146	test: 0.8292187	best: 0.8292187 (200)	total: 2.62s	remaining: 10.4s
300:	learn: 0.9060949	test: 0.8362151	best: 0.8363954 (291)	total: 3.97s	remaining: 9.23s
400:	learn: 0.9223016	test: 0.8394363	best: 0.8394724 (399)	total: 5.31s	remaining: 7.93s
500:	learn: 0.9341599	test: 0.8418657	best: 0.8418903 (499)	total: 7.03s	remaining: 7s
600:	learn: 0.9421979	test: 0.8429077	best: 0.8429594 (589)	total: 9.54s	remaining: 6.33s
700:	learn: 0.9492305	test: 0.8441232	best: 0.8441429 (698)	total: 11s	remaining: 4.68s
800:	learn: 0.9560746	test: 0.8447581	best: 0.8447581 (800)	total: 12.4s	remaining: 3.08s
900:	learn: 0.9616897	test: 0.8455992	best: 0.8457361 (893)	total: 13.8s	remaining: 1.51s
999:	learn: 0.9656903	test: 0.8460735	best: 0.8460735 (999)	total: 15.2s	remaining: 0us

bestTest = 0.8460735

In [8]:
pip install optuna xgboost

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.5.0


In [11]:
pip install catboost


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
