In [None]:
# import os
# import gc
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# os.chdir('/content/drive/MyDrive/car_listing_project/find-best-car-listing')

In [1]:
from pathlib import Path

import category_encoders as ce
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

## Configuration

In [2]:
# scripts/config.py

# Where the fitted artefacts are written
model_save_path = 'models/saved_xgb_model.json'
pipeline_save_path = 'models/saved_pipeline.pkl'

# Held-out split written by training and read back by evaluation
test_data_path = 'data/test_data.csv'

# Target-encoding smoothing
smoothing = 0.2

# Columns that get target encoded
cat_features = ['make', 'model']

# Model input columns; the order here is the column order handed to the pipeline
features = [
    # vehicle identity and basic attributes
    'make', 'model', 'engine', 'listing_year', 'car_age',
    # fuel indicator columns
    'fuel_bifuel', 'fuel_diesel', 'fuel_hybrid', 'fuel_petrol',
    'is_luxury',
    # mileage
    'mileage_bin_1', 'mileage_bin_2', 'mileage_bin_3', 'mileage_bin_4',
    'log_mileage',
    # interaction terms
    'luxury_age_interaction',
    'age_auto_interaction',
    'age_engine_interaction',
    'lux_auto_interaction',
    'listing_month',
    'gearbox_type_automatic',
    'is_suv_truck', 'is_reliable', 'popularity',
    # country indicator columns
    'country_czech republic',
    'country_france',
    'country_germany',
    'country_italy',
    'country_japan',
    'country_romania',
    'country_russia',
    'country_south korea',
    'country_spain',
    'country_sweden',
    'country_uk',
    'country_ukraine',
    'country_unknown',
    'country_usa',
]

## Functions

In [3]:
# Function to calculate RMSE
def root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared error of ``y_pred`` against ``y_true``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


# Function to train the model
def train_model(data_path='data/all_clean.csv'):
    """
    Train the price-prediction pipeline and persist its artefacts.

    - Reads the CSV at ``data_path`` and selects the configured ``features``
      as inputs and ``price`` as the target.
    - Builds a pipeline that target-encodes ``cat_features`` (all other
      columns pass through) and fits an XGBoost regressor on ``log1p(price)``.
    - Fits on an 80/20 train-test split (``random_state=42``).
    - Saves the fitted XGBoost model, the full pipeline, and the held-out
      split (features plus a ``true_price`` column) to the configured paths.

    Parameters
    ----------
    data_path : str, default 'data/all_clean.csv'
        CSV file to train on. It must contain every column listed in
        ``features`` plus a ``price`` column.

    Returns
    -------
    None
    """
    # Load data
    df = pd.read_csv(data_path)
    X = df[features]
    y = df['price']

    # Create the output directories up front: a missing folder should fail
    # (or be fixed) now, not after the long model fit has finished.
    for output_path in (model_save_path, pipeline_save_path, test_data_path):
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Take the smoothing from the config cell rather than a duplicated literal,
    # so changing the configuration actually changes the encoder.
    encoder = ce.TargetEncoder(smoothing=smoothing)

    # Define the model with target transformation
    model_with_log_target = TransformedTargetRegressor(
        regressor=XGBRegressor(
            n_estimators=1500,
            max_depth=14,
            learning_rate=0.05,
            reg_alpha=0.1,
            reg_lambda=0.2,
            random_state=42,
            objective='reg:squarederror'
        ),
        func=np.log1p,          # Log transform for target
        inverse_func=np.expm1   # Inverse transform for predictions
    )

    # Preprocessing step: encode the categorical columns, keep the rest as-is
    preprocessor = ColumnTransformer(
        transformers=[
            ('target', encoder, cat_features)
        ],
        remainder='passthrough'
    )

    # Create pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('xgb', model_with_log_target)
    ])

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train pipeline
    pipeline.fit(X_train, y_train)

    # Save the XGBoost model and the pipeline
    pipeline.named_steps['xgb'].regressor_.save_model(model_save_path)  # Save XGBoost model
    joblib.dump(pipeline, pipeline_save_path)  # Save the full pipeline

    # Save test data for evaluation. ``assign`` builds a new frame instead of
    # writing a column into the slice returned by ``train_test_split``.
    test_data = X_test.assign(true_price=y_test)
    test_data.to_csv(test_data_path, index=False)


# Function to load the model pipeline
def load_model_pipeline():
    """
    Read the persisted pipeline back from ``pipeline_save_path``.

    Returns the object stored there by ``joblib.dump``.
    """
    saved_pipeline = joblib.load(pipeline_save_path)
    return saved_pipeline


# Function to evaluate the model
def evaluate_model():
    """
    Score the saved pipeline against the saved test split.

    Reads the CSV at ``test_data_path``, predicts with the pipeline returned
    by ``load_model_pipeline`` and prints RMSE and MAE.

    Returns
    -------
    tuple
        ``(rmse, mae)`` between the ``true_price`` column and the predictions.
    """
    held_out = pd.read_csv(test_data_path)

    # Separate the label column from the model inputs
    actual_prices = held_out['true_price']
    model_inputs = held_out.drop(columns=['true_price'])

    # Predict with the persisted pipeline
    predicted_prices = load_model_pipeline().predict(model_inputs)

    # Error metrics
    rmse = root_mean_squared_error(actual_prices, predicted_prices)
    mae = mean_absolute_error(actual_prices, predicted_prices)

    print(f"Model Performance on Test Data:\nRMSE: {rmse:.2f}\nMAE: {mae:.2f}")
    return rmse, mae


# End-to-end run: fit and persist, reload the persisted pipeline, score it
train_model()
pipeline = load_model_pipeline()
rmse, mae = evaluate_model()

# Report both metrics, one per line
print(f"RMSE: {rmse:.2f}", f"MAE: {mae:.2f}", sep="\n")


KeyboardInterrupt: 