In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 1. Load data

df = pd.read_csv('used_cars_simplified.csv')

# 2. Drop 'model' column to avoid high cardinality
if 'model' in df.columns:
    df = df.drop(columns=['model'])

# 3. Clean 'milage' column: remove commas, 'mi.', and convert to float
if 'milage' in df.columns:
    df['milage'] = df['milage'].astype(str).str.replace(',', '', regex=False)
    df['milage'] = df['milage'].str.replace('mi.', '', regex=False)
    df['milage'] = df['milage'].str.replace('miles', '', regex=False)
    df['milage'] = df['milage'].str.replace('mi', '', regex=False)
    df['milage'] = df['milage'].str.strip()
    df['milage'] = pd.to_numeric(df['milage'], errors='coerce')

# 4. Clean other numeric columns if needed
for col in ['model_year', 'engine_displacement_l', 'engine_cylinders', 'price']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# 5. Define features and target
X = df.drop('price', axis=1)
y = df['price']

# 6. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 7. Feature lists (match your columns exactly)
numeric_features = ['model_year', 'milage', 'engine_displacement_l', 'engine_cylinders']
categorical_features = ['brand', 'fuel_type', 'transmission_type', 'ext_col', 'int_col', 'accident', 'clean_title']

# 8. Preprocessing pipeline with SimpleImputer
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numeric_features),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ]), categorical_features)
])

# 9. Full pipeline
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# 10. Hyperparameter grid
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# 11. GridSearchCV
search = GridSearchCV(pipeline, param_grid, cv=3, scoring='r2', n_jobs=-1, verbose=2)
search.fit(X_train, y_train)

print("Best parameters:", search.best_params_)

# 12. Evaluate on test set
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.4f}")


Fitting 3 folds for each of 81 candidates, totalling 243 fits


ValueError: 
All the 243 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
243 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/ensemble/_forest.py", line 360, in fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py", line 1387, in check_X_y
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py", line 1397, in _check_y
    y = check_array(
        ^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py", line 1107, in check_array
    _assert_all_finite(
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py", line 120, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py", line 169, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input y contains NaN.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 1. Load data

df = pd.read_csv('used_cars_simplified.csv')

# 2. Drop 'model' column to avoid high cardinality
if 'model' in df.columns:
    df = df.drop(columns=['model'])

# 3. Clean 'milage' column: remove commas, 'mi.', and convert to float
if 'milage' in df.columns:
    df['milage'] = df['milage'].astype(str).str.replace(',', '', regex=False)
    df['milage'] = df['milage'].str.replace('mi.', '', regex=False)
    df['milage'] = df['milage'].str.replace('miles', '', regex=False)
    df['milage'] = df['milage'].str.replace('mi', '', regex=False)
    df['milage'] = df['milage'].str.strip()
    df['milage'] = pd.to_numeric(df['milage'], errors='coerce')

# 4. Clean other numeric columns if needed
for col in ['model_year', 'engine_displacement_l', 'engine_cylinders', 'price']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# 5. Drop rows where price is missing
# This is the key fix for your error!
df = df.dropna(subset=['price'])

# 6. Define features and target
X = df.drop('price', axis=1)
y = df['price']

# 7. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 8. Feature lists (match your columns exactly)
numeric_features = ['model_year', 'milage', 'engine_displacement_l', 'engine_cylinders']
categorical_features = ['brand', 'fuel_type', 'transmission_type', 'ext_col', 'int_col', 'accident', 'clean_title']

# 9. Preprocessing pipeline with SimpleImputer
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numeric_features),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ]), categorical_features)
])

# 10. Full pipeline
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# 11. Hyperparameter grid
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# 12. GridSearchCV
search = GridSearchCV(pipeline, param_grid, cv=3, scoring='r2', n_jobs=-1, verbose=2)
search.fit(X_train, y_train)

print("Best parameters:", search.best_params_)

# 13. Evaluate on test set
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.4f}")


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 1. Load data

df = pd.read_csv('used_cars_simplified.csv')

# 2. Drop 'model' column to avoid high cardinality
if 'model' in df.columns:
    df = df.drop(columns=['model'])

# 3. Clean 'milage' column: remove commas, 'mi.', and convert to float
if 'milage' in df.columns:
    df['milage'] = df['milage'].astype(str).str.replace(',', '', regex=False)
    df['milage'] = df['milage'].str.replace('mi.', '', regex=False)
    df['milage'] = df['milage'].str.replace('miles', '', regex=False)
    df['milage'] = df['milage'].str.replace('mi', '', regex=False)
    df['milage'] = df['milage'].str.strip()
    df['milage'] = pd.to_numeric(df['milage'], errors='coerce')

# 4. Clean 'price' column: remove '$', ',', and convert to float
if 'price' in df.columns:
    df['price'] = df['price'].astype(str).str.replace('$', '', regex=False)
    df['price'] = df['price'].str.replace(',', '', regex=False)
    df['price'] = df['price'].str.strip()
    df['price'] = pd.to_numeric(df['price'], errors='coerce')

# 5. Clean other numeric columns if needed
for col in ['model_year', 'engine_displacement_l', 'engine_cylinders']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# 6. Drop rows where price is missing
# This is the key fix for your error!
df = df.dropna(subset=['price'])

# 7. Define features and target
X = df.drop('price', axis=1)
y = df['price']

# 8. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 9. Feature lists (match your columns exactly)
numeric_features = ['model_year', 'milage', 'engine_displacement_l', 'engine_cylinders']
categorical_features = ['brand', 'fuel_type', 'transmission_type', 'ext_col', 'int_col', 'accident', 'clean_title']

# 10. Preprocessing pipeline with SimpleImputer
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numeric_features),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ]), categorical_features)
])

# 11. Full pipeline
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# 12. Hyperparameter grid
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# 13. GridSearchCV
search = GridSearchCV(pipeline, param_grid, cv=3, scoring='r2', n_jobs=-1, verbose=2)
search.fit(X_train, y_train)

print("Best parameters:", search.best_params_)

# 14. Evaluate on test set
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.4f}")


Fitting 3 folds for each of 81 candidates, totalling 243 fits



KeyboardInterrupt



In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 1. Load data

df = pd.read_csv('used_cars_simplified.csv')

# 2. Drop 'model' column to avoid high cardinality
if 'model' in df.columns:
    df = df.drop(columns=['model'])

# 3. Clean 'milage' column: remove commas, 'mi.', and convert to float
if 'milage' in df.columns:
    df['milage'] = df['milage'].astype(str).str.replace(',', '', regex=False)
    df['milage'] = df['milage'].str.replace('mi.', '', regex=False)
    df['milage'] = df['milage'].str.replace('miles', '', regex=False)
    df['milage'] = df['milage'].str.replace('mi', '', regex=False)
    df['milage'] = df['milage'].str.strip()
    df['milage'] = pd.to_numeric(df['milage'], errors='coerce')

# 4. Clean 'price' column: remove '$', ',', and convert to float
if 'price' in df.columns:
    df['price'] = df['price'].astype(str).str.replace('$', '', regex=False)
    df['price'] = df['price'].str.replace(',', '', regex=False)
    df['price'] = df['price'].str.strip()
    df['price'] = pd.to_numeric(df['price'], errors='coerce')

# 5. Clean other numeric columns if needed
for col in ['model_year', 'engine_displacement_l', 'engine_cylinders']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# 6. Drop rows where price is missing
# This is the key fix for your error!
df = df.dropna(subset=['price'])

# 7. Define features and target
X = df.drop('price', axis=1)
y = df['price']

# 8. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 9. Feature lists (match your columns exactly)
numeric_features = ['model_year', 'milage', 'engine_displacement_l', 'engine_cylinders']
categorical_features = ['brand', 'fuel_type', 'transmission_type', 'ext_col', 'int_col', 'accident', 'clean_title']

# 10. Preprocessing pipeline with SimpleImputer
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numeric_features),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ]), categorical_features)
])

# 11. Full pipeline with Linear Regression
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', LinearRegression())
])

# 12. Train
pipeline.fit(X_train, y_train)

# 13. Evaluate on test set
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.4f}")


Mean Absolute Error: 24456.05
Mean Squared Error: 19083118060.66
R-squared: 0.0664




In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 1. Load data
df = pd.read_csv('used_cars_simplified.csv')

# 2. Drop 'model' column to avoid high cardinality
if 'model' in df.columns:
    df = df.drop(columns=['model'])

# 3. Clean 'milage' column: remove commas, 'mi.', and convert to float
if 'milage' in df.columns:
    df['milage'] = df['milage'].astype(str).str.replace(',', '', regex=False)
    df['milage'] = df['milage'].str.replace('mi.', '', regex=False)
    df['milage'] = df['milage'].str.replace('miles', '', regex=False)
    df['milage'] = df['milage'].str.replace('mi', '', regex=False)
    df['milage'] = df['milage'].str.strip()
    df['milage'] = pd.to_numeric(df['milage'], errors='coerce')

# 4. Clean 'price' column: remove '$', ',', and convert to float
if 'price' in df.columns:
    df['price'] = df['price'].astype(str).str.replace('$', '', regex=False)
    df['price'] = df['price'].str.replace(',', '', regex=False)
    df['price'] = df['price'].str.strip()
    df['price'] = pd.to_numeric(df['price'], errors='coerce')

# 5. Clean other numeric columns if needed
for col in ['model_year', 'engine_displacement_l', 'engine_cylinders']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# 6. Drop rows where price is missing
df = df.dropna(subset=['price'])

# 7. Define features and target
X = df.drop('price', axis=1)
y = df['price']

# 8. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 9. Feature lists (match your columns exactly)
numeric_features = ['model_year', 'milage', 'engine_displacement_l', 'engine_cylinders']
categorical_features = ['brand', 'fuel_type', 'transmission_type', 'ext_col', 'int_col', 'accident', 'clean_title']

# 10. Preprocessing pipeline with SimpleImputer
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numeric_features),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ]), categorical_features)
])

# 11. Full pipeline with Random Forest
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42))
])

# 12. Train
pipeline.fit(X_train, y_train)

# 13. Evaluate on test set
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.4f}")


Mean Absolute Error: 19175.72
Mean Squared Error: 17158710102.61
R-squared: 0.1605




In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

# 1. Load data
df = pd.read_csv('used_cars_simplified.csv')

# 2. Drop 'model' column to avoid high cardinality
if 'model' in df.columns:
    df = df.drop(columns=['model'])

# 3. Clean 'milage' column: remove commas, 'mi.', and convert to float
if 'milage' in df.columns:
    df['milage'] = df['milage'].astype(str).str.replace(',', '', regex=False)
    df['milage'] = df['milage'].str.replace('mi.', '', regex=False)
    df['milage'] = df['milage'].str.replace('miles', '', regex=False)
    df['milage'] = df['milage'].str.replace('mi', '', regex=False)
    df['milage'] = df['milage'].str.strip()
    df['milage'] = pd.to_numeric(df['milage'], errors='coerce')

# 4. Clean 'price' column: remove '$', ',', and convert to float
if 'price' in df.columns:
    df['price'] = df['price'].astype(str).str.replace('$', '', regex=False)
    df['price'] = df['price'].str.replace(',', '', regex=False)
    df['price'] = df['price'].str.strip()
    df['price'] = pd.to_numeric(df['price'], errors='coerce')

# 5. Clean other numeric columns if needed
for col in ['model_year', 'engine_displacement_l', 'engine_cylinders']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# 6. Drop rows where price is missing
df = df.dropna(subset=['price'])

# 7. Define features and target
X = df.drop('price', axis=1)
y = df['price']

# 8. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 9. Feature lists (match your columns exactly)
numeric_features = ['model_year', 'milage', 'engine_displacement_l', 'engine_cylinders']
categorical_features = ['brand', 'fuel_type', 'transmission_type', 'ext_col', 'int_col', 'accident', 'clean_title']

# 10. Preprocessing pipeline with SimpleImputer
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numeric_features),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ]), categorical_features)
])

# 11. Full pipeline with XGBoost
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', XGBRegressor(n_estimators=300, max_depth=8, learning_rate=0.1, random_state=42, n_jobs=-1))
])

# 12. Train
pipeline.fit(X_train, y_train)

# 13. Evaluate on test set
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.4f}")


Mean Absolute Error: 18090.25
Mean Squared Error: 16812980224.00
R-squared: 0.1774




In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import optuna

# 1. Load data
df = pd.read_csv('used_cars_simplified.csv')

# 2. Drop 'model' column to avoid high cardinality
if 'model' in df.columns:
    df = df.drop(columns=['model'])

# 3. Clean 'milage' column: remove commas, 'mi.', and convert to float
if 'milage' in df.columns:
    df['milage'] = df['milage'].astype(str).str.replace(',', '', regex=False)
    df['milage'] = df['milage'].str.replace('mi.', '', regex=False)
    df['milage'] = df['milage'].str.replace('miles', '', regex=False)
    df['milage'] = df['milage'].str.replace('mi', '', regex=False)
    df['milage'] = df['milage'].str.strip()
    df['milage'] = pd.to_numeric(df['milage'], errors='coerce')

# 4. Clean 'price' column: remove '$', ',', and convert to float
if 'price' in df.columns:
    df['price'] = df['price'].astype(str).str.replace('$', '', regex=False)
    df['price'] = df['price'].str.replace(',', '', regex=False)
    df['price'] = df['price'].str.strip()
    df['price'] = pd.to_numeric(df['price'], errors='coerce')

# 5. Clean other numeric columns if needed
for col in ['model_year', 'engine_displacement_l', 'engine_cylinders']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# 6. Drop rows where price is missing
df = df.dropna(subset=['price'])

# 7. Define features and target
X = df.drop('price', axis=1)
y = df['price']

# 8. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 9. Feature lists (match your columns exactly)
numeric_features = ['model_year', 'milage', 'engine_displacement_l', 'engine_cylinders']
categorical_features = ['brand', 'fuel_type', 'transmission_type', 'ext_col', 'int_col', 'accident', 'clean_title']

# 10. Preprocessing pipeline with SimpleImputer
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numeric_features),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ]), categorical_features)
])

# 11. Optuna objective function
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 600),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 2.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 2.0),
        'random_state': 42,
        'n_jobs': -1
    }
    pipeline = Pipeline([
        ('preprocess', preprocessor),
        ('regressor', XGBRegressor(**params))
    ])
    scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='r2', n_jobs=-1)
    return np.mean(scores)

# 12. Run Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=40)
print('Best trial:', study.best_trial.params)

# 13. Train final model with best params
best_params = study.best_trial.params
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', XGBRegressor(**best_params))
])
pipeline.fit(X_train, y_train)

# 14. Evaluate on test set
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.4f}")


[I 2025-09-30 16:59:54,902] A new study created in memory with name: no-name-b77518ed-c868-456d-84d6-c2e0553e43b8
[I 2025-09-30 17:00:22,865] Trial 0 finished with value: 0.7541163563728333 and parameters: {'n_estimators': 414, 'max_depth': 11, 'learning_rate': 0.051094606309504416, 'subsample': 0.6001589330689612, 'colsample_bytree': 0.7227475159945312, 'reg_alpha': 1.6043095369720526, 'reg_lambda': 1.9129582264760636}. Best is trial 0 with value: 0.7541163563728333.
[I 2025-09-30 17:00:25,438] Trial 1 finished with value: 0.7354361613591512 and parameters: {'n_estimators': 401, 'max_depth': 7, 'learning_rate': 0.2627908282229473, 'subsample': 0.9784047423093085, 'colsample_bytree': 0.9907066604162295, 'reg_alpha': 0.3773675371737497, 'reg_lambda': 0.5938062059337341}. Best is trial 0 with value: 0.7541163563728333.
[I 2025-09-30 17:00:29,240] Trial 2 finished with value: 0.7572028636932373 and parameters: {'n_estimators': 338, 'max_depth': 9, 'learning_rate': 0.1331216586928483, 'sub

Best trial: {'n_estimators': 489, 'max_depth': 3, 'learning_rate': 0.19835034506233776, 'subsample': 0.6730991390060816, 'colsample_bytree': 0.6390499391469543, 'reg_alpha': 1.2321664650617599, 'reg_lambda': 1.5151252351368312}
Mean Absolute Error: 18496.51
Mean Squared Error: 17569193984.00
R-squared: 0.1404




In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor
import datetime

# 1. Load data
df = pd.read_csv('used_cars_simplified.csv')

# 2. Drop 'model' column to avoid high cardinality
if 'model' in df.columns:
    df = df.drop(columns=['model'])

# 3. Clean 'milage' column
df['milage'] = df['milage'].astype(str).str.replace(',', '', regex=False)
df['milage'] = df['milage'].str.replace('mi.', '', regex=False)
df['milage'] = df['milage'].str.replace('miles', '', regex=False)
df['milage'] = df['milage'].str.replace('mi', '', regex=False)
df['milage'] = df['milage'].str.strip()
df['milage'] = pd.to_numeric(df['milage'], errors='coerce')

# 4. Clean 'price' column
df['price'] = df['price'].astype(str).str.replace('$', '', regex=False)
df['price'] = df['price'].str.replace(',', '', regex=False)
df['price'] = df['price'].str.strip()
df['price'] = pd.to_numeric(df['price'], errors='coerce')

# 5. Clean other numeric columns
for col in ['model_year', 'engine_displacement_l', 'engine_cylinders']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# 6. Fill missing values in categorical columns with 'missing'
categorical_cols = [col for col in df.columns if df[col].dtype == 'object']
for col in categorical_cols:
    df[col] = df[col].fillna('missing')

# 7. Drop rows where price is missing and remove outliers
price_q_low = df['price'].quantile(0.01)
price_q_high = df['price'].quantile(0.99)
milage_q_low = df['milage'].quantile(0.01)
milage_q_high = df['milage'].quantile(0.99)
df = df[(df['price'] >= price_q_low) & (df['price'] <= price_q_high)]
df = df[(df['milage'] >= milage_q_low) & (df['milage'] <= milage_q_high)]
df = df.dropna(subset=['price'])

# 8. Feature engineering: car age
current_year = datetime.datetime.now().year
df['car_age'] = current_year - df['model_year']

# 9. Log-transform the target
df['price_log'] = np.log1p(df['price'])

# 10. Define features and target
X = df.drop(['price', 'price_log'], axis=1)
y = df['price_log']

# 11. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 12. Categorical features for CatBoost
categorical_features = [col for col in X.columns if X[col].dtype == 'object']

# 13. Train CatBoost
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    eval_metric='R2',  # Use R2 as the evaluation metric
    random_seed=42,
    cat_features=categorical_features,
    verbose=100
)
model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# 14. Evaluate on test set (convert back from log)
y_pred_log = model.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test)
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.4f}")


0:	learn: 0.0610500	test: 0.0585040	best: 0.0585040 (0)	total: 98.5ms	remaining: 1m 38s
100:	learn: 0.8524421	test: 0.8140291	best: 0.8140757 (99)	total: 2.87s	remaining: 25.6s
200:	learn: 0.8855146	test: 0.8292187	best: 0.8292187 (200)	total: 4.16s	remaining: 16.5s
300:	learn: 0.9060949	test: 0.8362151	best: 0.8363954 (291)	total: 5.5s	remaining: 12.8s
400:	learn: 0.9223016	test: 0.8394363	best: 0.8394724 (399)	total: 7.61s	remaining: 11.4s
500:	learn: 0.9341599	test: 0.8418657	best: 0.8418903 (499)	total: 9.59s	remaining: 9.55s
600:	learn: 0.9421979	test: 0.8429077	best: 0.8429594 (589)	total: 11s	remaining: 7.29s
700:	learn: 0.9492305	test: 0.8441232	best: 0.8441429 (698)	total: 12.4s	remaining: 5.28s
800:	learn: 0.9560746	test: 0.8447581	best: 0.8447581 (800)	total: 13.8s	remaining: 3.42s
900:	learn: 0.9616897	test: 0.8455992	best: 0.8457361 (893)	total: 15.1s	remaining: 1.66s
999:	learn: 0.9656903	test: 0.8460735	best: 0.8460735 (999)	total: 16.6s	remaining: 0us

bestTest = 0.8460

In [8]:
pip install optuna xgboost

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.5.0


In [11]:
pip install catboost


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
