In [4]:
from autogluon.tabular import TabularPredictor

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import re
import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping
from catboost import CatBoostRegressor, Pool

from xgboost import XGBRegressor


import random

from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

from autogluon.tabular import TabularPredictor

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
Original = pd.read_csv('data/used_cars.csv')
Original[['milage', 'price']] = Original[['milage', 'price']].map(
    lambda x: int(''.join(re.findall(r'\d+', x))))


In [15]:
test_df = pd.read_csv('data/test.csv')
print(test_df.isna().sum())
display(test_df.head(1))



id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type        3383
engine              0
transmission        0
ext_col             0
int_col             0
accident         1632
clean_title     14239
dtype: int64


Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes


In [6]:
def calculate_vehicle_age_features(df):
    """
    Calculate age-related features for vehicles.
    
    This function performs the following operations:
    1. Calculates the age of each vehicle based on the current year (2024) and the model year.
    2. Computes the annual milage for each vehicle.
    3. Calculates the average milage for each age group.
    4. Computes the average annual milage for each age group.
    
    Features extracted:
    - age: The age of the vehicle in years.
    - annual_milage: The average number of miles driven per year.
    - avg_milage_for_age: The average milage for all vehicles of the same age.
    - avg_annual_milage_for_age: The average annual milage for all vehicles of the same age.
    
    Args:
    df (pd.DataFrame): Input dataframe with 'model_year' and 'milage' columns
    
    Returns:
    pd.DataFrame: Dataframe with additional age-related features
    """
    current_year = 2024
    df['age'] = current_year - df['model_year']
    df['annual_milage'] = df['milage'] / df['age']
    
    df['avg_milage_for_age'] = df.groupby('age')['milage'].transform('mean')
    df['avg_annual_milage_for_age'] = df.groupby('age')['annual_milage'].transform('mean')
    
    return df


def identify_luxury_brands(df):
    """
    Identify luxury brands in the dataset and create a binary indicator.
    
    This function performs the following operations:
    1. Defines a set of luxury car brands.
    2. Creates a binary indicator column 'is_luxury' based on whether the brand is in the luxury set.
    
    Features extracted:
    - is_luxury: Binary indicator (0 or 1) representing whether the brand is considered luxury.
    
    Modifications:
    - Adds a new column 'is_luxury' to the dataframe.
    
    Args:
    df (pd.DataFrame): Input dataframe with 'brand' column
    
    Returns:
    pd.DataFrame: Dataframe with additional luxury brand indicator
    """
    luxury_brands = {
        'Mercedes-Benz', 'BMW', 'Audi', 'Porsche', 'Land Rover', 
        'Lexus', 'Jaguar', 'Bentley', 'Maserati', 'Lamborghini', 
        'Rolls-Royce', 'Ferrari', 'McLaren', 'Aston Martin', 'Maybach'
    }
    df['is_luxury'] = df['brand'].isin(luxury_brands).astype(int)
    
    return df


def enrich_dataset(df):
    """
    Enrich the dataset with all additional features.
    
    Args:
    df (pd.DataFrame): Input dataframe
    
    Returns:
    pd.DataFrame: Enriched dataframe with all additional features
    """
    df = calculate_vehicle_age_features(df)
    df = identify_luxury_brands(df)
    return df


# Enrich both training and testing datasets
train_enriched = enrich_dataset(train)
test_enriched = enrich_dataset(test)

In [7]:
def preprocess_categorical_features(dataframe, threshold=100):
    """
    Preprocess categorical features in the dataframe.

    This function performs the following operations:
    1. Replaces low-frequency categories with 'noise' for specified columns.
    2. Fills missing values with 'missing' for all categorical columns.
    3. Converts categorical columns to 'category' dtype.

    Args:
    dataframe (pd.DataFrame): Input dataframe to be processed.
    threshold (int): Minimum frequency for a category to be kept. Default is 100.

    Returns:
    pd.DataFrame: Processed dataframe with updated categorical features.
    """
    
    categorical_columns = [
        'brand', 'model', 'fuel_type', 'engine', 'transmission',
        'ext_col', 'int_col', 'accident', 'clean_title'
    ]
    columns_to_reduce = ['model', 'engine', 'transmission', 'ext_col', 'int_col']
    
    for column in columns_to_reduce:
        mask = dataframe[column].value_counts(dropna=False)[dataframe[column]].values < threshold
        dataframe.loc[mask, column] = "noise"
        
    for column in categorical_columns:
        dataframe[column] = dataframe[column].fillna('missing')
        dataframe[column] = dataframe[column].astype('category')
        
    return dataframe

train_processed = preprocess_categorical_features(train_enriched)
test_processed = preprocess_categorical_features(test_enriched)

X_features = train_processed.drop('price', axis=1)
y_target = train_processed['price']

Calculation of MAE and the difference between MAE and MSE. Thanks to @Backpacker for this idea.
OG comment: the best way to improve predictions is to handle outliers in the training data without removing them. one of participant did this well by using both MSE (robust to outliers) and MAE (less robust) to train two different models then used the difference between the models as a new feature to detect outliers. [Link](https://www.kaggle.com/competitions/playground-series-s4e9/discussion/536456)

# Innovative Outlier Handling: Combining MSE and MAE

1. MSE vs MAE:
    - MSE is more sensitive to outliers because it squares the errors, amplifying large deviations.
    - MAE is less sensitive to outliers as it uses absolute values, treating all deviations linearly.

2. Training two models:
    - One model is trained using MSE as the loss function.
    - Another model is trained using MAE as the loss function.

3. Detecting outliers:
    - The difference between the predictions of these two models can indicate potential outliers.
    - For normal data points, both models should predict similar values.
    - For outliers, the MSE model's prediction might deviate more from the MAE model's prediction.

4. Creating a new feature:
    - The difference between the MSE and MAE model predictions becomes a new feature.
    - This feature essentially quantifies the "outlierness" of each data point.

5. Handling outliers without removal:
    - Instead of removing outliers, which could lose valuable information, this approach allows the final model to learn from the "outlierness" feature.
    - The model can then adjust its predictions based on how likely a data point is to be an outlier.

This method is particularly clever because it:
    - Preserves all data points, including potential outliers.
    - Provides a data-driven way to identify and handle outliers.
    - Adds valuable information to the model without making strong assumptions about what constitutes an outlier.


In [8]:
import numpy as np
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

def train_model(X_train, y_train, X_val, y_val, model_type='LGBM', objective='MAE', cat_cols=None):
    """
    Train a model (LightGBM or CatBoost) and return predictions for validation data.
    
    This function trains either a LightGBM or CatBoost model based on the specified parameters
    and returns the trained model along with predictions for the validation data.
    
    Args:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training target values.
        X_val (pd.DataFrame): Validation features.
        y_val (pd.Series): Validation target values.
        model_type (str, optional): Type of model to use ('LGBM' for LightGBM or 'CAT' for CatBoost). Defaults to 'LGBM'.
        objective (str, optional): Objective function to use ('MAE' or 'MSE'). Defaults to 'MAE'.
        cat_cols (list, optional): List of categorical column names. Required for CatBoost. Defaults to None.
    
    Returns:
        tuple: A tuple containing:
            - trained model: The trained LightGBM or CatBoost model.
            - np.array: Predictions for validation data.
    
    Raises:
        ValueError: If an invalid model_type is specified.
    
    Example:
        >>> X_train = pd.DataFrame({'feature1': [1, 2, 3], 'feature2': ['A', 'B', 'C']})
        >>> y_train = pd.Series([10, 20, 30])
        >>> X_val = pd.DataFrame({'feature1': [4, 5], 'feature2': ['B', 'C']})
        >>> y_val = pd.Series([40, 50])
        >>> model, val_pred = train_model(X_train, y_train, X_val, y_val, model_type='LGBM', objective='MAE')
    """
    if model_type == 'LGBM':
        params = {
            'objective': objective,
            'n_estimators': 1000,
            'random_state': 1,
        }
        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
        callbacks = [lgb.log_evaluation(period=300), lgb.early_stopping(stopping_rounds=200)]
        
        model = lgb.train(
            params,
            train_data,
            valid_sets=[train_data, val_data],
            valid_names=['train', 'valid'],
            callbacks=callbacks    
        )
        val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    
    elif model_type == 'CAT':
        params = {
            'loss_function': objective,
            'iterations': 1000,
            'random_seed': 1,
            'early_stopping_rounds': 200
        }
        train_data = Pool(data=X_train, label=y_train, cat_features=cat_cols)
        val_data = Pool(data=X_val, label=y_val, cat_features=cat_cols)
        
        model = CatBoostRegressor(**params)
        model.fit(train_data, eval_set=val_data, verbose=150)
        
        val_pred = model.predict(X_val)
    
    else:
        raise ValueError("Invalid model_type. Choose either 'LGBM' or 'CAT'.")
    
    return model, val_pred

def cross_validate_and_predict(X_features, y_target, test_data, model_type='LGBM', objective='MAE'):
    """
    Perform cross-validation, train models, and make predictions on test data.
    
    This function implements a 5-fold cross-validation strategy, trains models on each fold,
    and generates out-of-fold predictions for the training data as well as predictions for the test data.
    
    Args:
        X_features (pd.DataFrame): Feature matrix for training data.
        y_target (pd.Series): Target variable for training data.
        test_data (pd.DataFrame): Processed test data.
        model_type (str, optional): Type of model to use ('LGBM' or 'CAT'). Defaults to 'LGBM'.
        objective (str, optional): Objective function to use ('MAE' or 'MSE'). Defaults to 'MAE'.
    
    Returns:
        tuple: A tuple containing:
            - np.array: Out-of-fold predictions for training data.
            - np.array: Predictions for test data.
    
    Raises:
        ValueError: If an invalid model_type is specified.
    
    Example:
        >>> X_features = pd.DataFrame({'feature1': range(100), 'feature2': ['A']*50 + ['B']*50})
        >>> y_target = pd.Series(range(100))
        >>> test_data = pd.DataFrame({'feature1': range(20), 'feature2': ['A']*10 + ['B']*10})
        >>> oof_pred, test_pred = cross_validate_and_predict(X_features, y_target, test_data, model_type='LGBM', objective='MAE')
    """
    cat_cols = X_features.select_dtypes(include=['object', 'category']).columns.tolist()
    print(f"Categorical columns: {cat_cols}")
    
    oof_predictions = np.zeros(len(X_features))
    test_predictions = np.zeros(len(test_data))
    kf = KFold(n_splits=5, shuffle=True, random_state=1)
    rmse_scores = []
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_features)):
        print(f"Training fold {fold + 1}/5 with {model_type}")

        X_train, X_val = X_features.iloc[train_idx], X_features.iloc[val_idx]
        y_train, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]

        model, val_pred = train_model(X_train, y_train, X_val, y_val, model_type, objective, cat_cols)
        
        if model_type == 'LGBM':
            test_pred = model.predict(test_data, num_iteration=model.best_iteration)
        elif model_type == 'CAT':
            test_pred = model.predict(test_data)
        else:
            raise ValueError("Invalid model_type. Choose either 'LGBM' or 'CAT'.")
        
        rmse = np.sqrt(mean_squared_error(y_val, val_pred))
        rmse_scores.append(rmse)

        print(f'{model_type} Fold RMSE: {rmse}')
        
        oof_predictions[val_idx] = val_pred
        test_predictions += test_pred / 5
    
    print(f'Mean RMSE: {np.mean(rmse_scores)}')
    return oof_predictions, test_predictions

def get_outlierness(X_features, y_target, test_data, model_type='LGBM'):
    """
    Calculate the 'outlierness' of data points by comparing MSE and MAE predictions.
    
    This function trains models using both MAE and MSE objectives, then calculates
    the difference between their predictions to create an 'outlierness' feature.
    This feature can help identify potential outliers in the data.
    
    Args:
        X_features (pd.DataFrame): Feature matrix for training data.
        y_target (pd.Series): Target variable for training data.
        test_data (pd.DataFrame): Processed test data.
        model_type (str, optional): Type of model to use ('LGBM' or 'CAT'). Defaults to 'LGBM'.
    
    Returns:
        tuple: A tuple containing:
            - pd.DataFrame: Training data with added 'MAE_pred' and 'outlierness' features.
            - pd.DataFrame: Test data with added 'MAE_pred' and 'outlierness' features.
    
    Note:
        The 'outlierness' feature is calculated as the difference between
        MSE-based predictions and MAE-based predictions. This calculation is done
        for both training and test data. For training data, we use out-of-fold
        predictions, while for test data, we use the average predictions from
        all folds. This means the 'outlierness' values may have slightly different
        distributions between train and test sets due to the different prediction methods.
    
    Example:
        >>> X_features = pd.DataFrame({'feature1': range(100), 'feature2': ['A']*50 + ['B']*50})
        >>> y_target = pd.Series(range(100))
        >>> test_data = pd.DataFrame({'feature1': range(20), 'feature2': ['A']*10 + ['B']*10})
        >>> X_features_with_outlierness, test_data_with_outlierness = get_outlierness(X_features, y_target, test_data, 'LGBM')
    """
    # Train and predict using MAE objective
    oof_mae, test_mae = cross_validate_and_predict(X_features, y_target, test_data, model_type, 'MAE')
    X_features['MAE_pred'] = oof_mae
    test_data['MAE_pred'] = test_mae

    # Train and predict using MSE objective
    oof_mse, test_mse = cross_validate_and_predict(X_features, y_target, test_data, model_type, 'MSE')

    # Calculate the difference between MSE and MAE predictions (outlierness)
    X_features['outlierness'] = oof_mse - X_features['MAE_pred']
    test_data['outlierness'] = test_mse - test_data['MAE_pred']

    return X_features, test_data

# Example usage:
X_features_with_outlierness, test_processed_with_outlierness = get_outlierness(X_features, y_target, test_processed, 'LGBM')
print(test_processed_with_outlierness.head())

Categorical columns: ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
Training fold 1/5 with LGBM
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004746 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1957
[LightGBM] [Info] Number of data points in the train set: 150826, number of used features: 17
[LightGBM] [Info] Start training from score 30825.000000
Training until validation scores don't improve for 200 rounds
[300]	train's l1: 16527	valid's l1: 16659.9
Early stopping, best iteration is:
[320]	train's l1: 16512.2	valid's l1: 16659.4
LGBM Fold RMSE: 63150.238787390896
Training fold 2/5 with LGBM
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004873 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not 

In [9]:
X_features_with_outlierness['price'] = y_target

predictor = TabularPredictor(
    label='price',
    eval_metric='rmse',
    problem_type='regression'
).fit(
    X_features_with_outlierness,
    presets='best_quality',
    time_limit=600, # in seconds
    verbosity=2,
    num_gpus=0,
    included_model_types=['GBM', 'CAT']
)



No path specified. Models will be saved in: "AutogluonModels/ag-20240927_184022"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.13
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.2.0: Wed Nov 15 21:59:33 PST 2023; root:xnu-10002.61.3~2/RELEASE_ARM64_T8112
CPU Count:          8
Memory Avail:       1.54 GB / 8.00 GB (19.3%)
Disk Space Avail:   99.32 GB / 460.43 GB (21.6%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit

In [16]:
display(test_processed_with_outlierness.head(1))

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,age,annual_milage,avg_milage_for_age,avg_annual_milage_for_age,is_luxury,MAE_pred,outlierness
0,188533,Land,noise,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes,9,10888.888889,81078.503981,9008.722665,0,16561.711113,3641.585516


In [17]:
test_processed_with_outlierness.columns


Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title', 'age',
       'annual_milage', 'avg_milage_for_age', 'avg_annual_milage_for_age',
       'is_luxury', 'MAE_pred', 'outlierness'],
      dtype='object')

In [10]:
predictor = TabularPredictor.load("AutogluonModels/ag-20240927_184022")
test_predictions = predictor.predict(test_processed_with_outlierness)

# Save predictions
print("Saving predictions...")

# Save out-of-fold predictions
oof_predictions = predictor.predict(X_features_with_outlierness)
oof_df = X_features_with_outlierness[['id']].copy()
oof_df['pred'] = oof_predictions
timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
oof_path = f"predictions/oof_autogluon_{timestamp}.csv"
oof_df.to_csv(oof_path, index=False)
print(f"Out-of-fold predictions saved to {oof_path}")

# Save test predictions
sub = pd.read_csv("data/sample_submission.csv")
sub['price'] = test_predictions
sub_path = f"predictions/submission_autogluon_{timestamp}.csv"
sub.to_csv(sub_path, index=False)
print(f"Test predictions saved to {sub_path}")


Saving predictions...
Out-of-fold predictions saved to predictions/oof_autogluon_20240928_002127.csv
Test predictions saved to predictions/submission_autogluon_20240928_002127.csv
