In [18]:
# Run/submission version number; interpolated into the submission CSV filename
# (predictions/submission_v{VERSION}.csv) in the prediction cell.
VERSION = 7

In [19]:
import pandas as pd

# Read the train and test CSVs (train first, then test) into raw frames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

# Show each frame's shape followed by a one-row preview.
print("Train Shape: ", train_df.shape)
display(train_df.head(1))

print("Test Shape: ", test_df.shape)
display(test_df.head(1))

Train Shape:  (188533, 13)


Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200


Test Shape:  (125690, 12)


Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes


In [20]:
# Column-name lists for categorical (CATS) and numeric (NUMS) features.
# NOTE(review): 'drive_type' is not among the columns shown for the loaded CSVs,
# and 'model_year', 'fuel_type', 'engine' and 'transmission' are dropped by
# feature_engg. Neither list is referenced by the other cells visible here —
# confirm whether they are still needed.
CATS = ['brand', 'model', 'model_year', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col',  'drive_type','accident', 'clean_title']
NUMS = ['milage']

In [43]:
# reference: https://www.kaggle.com/competitions/playground-series-s4e9/discussion/531566 (extract_transmission_features and extract_engine_features)

# reference: https://www.kaggle.com/competitions/playground-series-s4e9/discussion/531434 (decode_engine) - modified slightly

# reference: https://www.kaggle.com/competitions/playground-series-s4e9/discussion/531462 (extract_fuel_type)

import re

def extract_transmission_features(transmission) -> pd.Series:
    """Turn a free-text transmission description into a few coarse features.

    Args:
        transmission: Raw transmission text such as ``"6-Speed A/T"``. Anything
            that is not a ``str`` (for example ``None`` or the float ``NaN`` of
            a missing cell) yields the default feature values instead of
            raising.

    Returns:
        pd.Series with the keys
        ``transmission_type`` ('CVT', 'Manual', 'DCT Automatic',
        'Single-Speed', 'Variable', 'Automatic' or 'Unknown'),
        ``num_speeds`` (int, or None when no "<n>-speed"/"<n> speed" is found),
        ``auto_shift`` (0/1) and ``overdrive`` (0/1).
    """
    features = {
        'transmission_type': 'Unknown',
        'num_speeds': None,
        'auto_shift': 0,
        'overdrive': 0,
    }

    # Missing cells are not strings and have no .lower(); return the defaults.
    if not isinstance(transmission, str):
        return pd.Series(features)

    # Lowercase once so every keyword test below is case-insensitive.
    text = transmission.lower()

    # Transmission type: the first matching rule wins, so order matters.
    if 'cv' in text:
        features['transmission_type'] = 'CVT'
    elif 'manual' in text or 'mt' in text or 'm/t' in text:
        features['transmission_type'] = 'Manual'
    elif 'dct' in text:
        features['transmission_type'] = 'DCT Automatic'
    elif 'single-speed' in text:
        features['transmission_type'] = 'Single-Speed'
    elif 'variable' in text:
        features['transmission_type'] = 'Variable'
    elif 'automatic' in text or 'a/t' in text:
        features['transmission_type'] = 'Automatic'

    # Number of speeds: accept both "6-speed" and "6 speed".
    speed_match = re.search(r'(\d+)[-\s]speed', text)
    if speed_match:
        features['num_speeds'] = int(speed_match.group(1))

    # Binary flags for special shift modes.
    features['auto_shift'] = 1 if 'auto-shift' in text or 'dual shift mode' in text else 0
    features['overdrive'] = 1 if 'overdrive' in text else 0

    return pd.Series(features)

def extract_engine_features(row) -> pd.Series:
    """Parse horsepower, cylinder count and displacement from an engine string.

    Matching is case-sensitive ("HP", "Cylinder", "V6"/"I4", "L"/"Liter").

    Args:
        row: Raw engine description, e.g.
            ``"172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel"``. Non-string
            input (``None``/``NaN``) yields all-``None`` features instead of
            raising ``TypeError`` from ``re.search``.

    Returns:
        pd.Series with keys ``horsepower`` (float), ``num_cylinders`` (int) and
        ``engine_cc`` (litres * 1000); each is None when not found.
    """
    features = {
        'horsepower': None,
        'num_cylinders': None,
        'engine_cc': None,
    }

    # Missing cells are not strings; there is nothing to parse.
    if not isinstance(row, str):
        return pd.Series(features)

    hp_match = re.search(r'(\d+\.?\d*)HP', row)
    if hp_match:
        features['horsepower'] = float(hp_match.group(1))

    # Either "<n> Cylinder" or a V<n>/I<n> layout code; exactly one group is set.
    cylinders_match = re.search(r'(\d+)\s*Cylinder|V(\d+)|I(\d+)', row)
    if cylinders_match:
        features['num_cylinders'] = int(
            cylinders_match.group(1) or cylinders_match.group(2) or cylinders_match.group(3)
        )

    # Displacement is written in litres; scale to cubic centimetres.
    displacement_match = re.search(r'(\d+\.?\d*)\s*L(?:iter)?', row)
    if displacement_match:
        features['engine_cc'] = float(displacement_match.group(1)) * 1000

    return pd.Series(features)

def decode_engine(s: str) -> pd.Series:
    """Parse horsepower, displacement and cylinder count from an engine string.

    Matching is case-insensitive (the text is lowercased first).

    Args:
        s: Raw engine description, e.g.
            ``"172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel"``. Non-string
            input (``None``/``NaN``) yields all-``None`` features instead of
            raising on ``.lower()``.

    Returns:
        pd.Series with keys ``horsepower`` (float), ``engine_cc`` (float) and
        ``num_cylinders`` (int); each is None when not found.
        Note: despite its name, ``engine_cc`` holds the number written in front
        of "l" unchanged (e.g. 1.6 for "1.6L"), i.e. litres, not cubic
        centimetres.
    """
    features = {
        'horsepower': None,
        'engine_cc': None,
        'num_cylinders': None,
    }

    # Missing cells are not strings; there is nothing to parse.
    if not isinstance(s, str):
        return pd.Series(features)

    s = s.lower()

    # Horsepower: number immediately before "hp".
    hp_match = re.search(r'(\d+(\.\d+)?)\s*hp', s)
    features['horsepower'] = float(hp_match.group(1)) if hp_match else None

    # Displacement: number immediately before "l" (kept in litres, see docstring).
    cc_match = re.search(r'(\d+(\.\d+)?)\s*l', s)
    features['engine_cc'] = float(cc_match.group(1)) if cc_match else None

    # Cylinder count: integer immediately before "cylinder".
    cylinder_match = re.search(r'(\d+)\s*cylinder', s)
    features['num_cylinders'] = int(cylinder_match.group(1)) if cylinder_match else None

    return pd.Series(features)

def extract_fuel_type(row) -> pd.Series:
    """Map a free-text fuel description onto a normalized fuel label.

    The text is lowercased and checked against an ordered list of keyword
    rules; the first rule with any keyword present decides the label.

    Args:
        row: Raw fuel text. Non-string input (e.g. a missing value) is not
            inspected and produces a ``None`` label.

    Returns:
        pd.Series with the single key ``fuel_type_extracted`` holding
        'Gasoline', 'Hydrogen', 'Diesel', 'Hybrid', 'Electric', 'Flex Fuel',
        or None when nothing matches.
    """
    # (keywords, label) pairs in priority order; earlier rules win.
    keyword_rules = (
        (('gasoline',), 'Gasoline'),
        (('hydrogen',), 'Hydrogen'),
        (('diesel',), 'Diesel'),
        (('hybrid', 'gas/electric', 'electric/gas'), 'Hybrid'),
        (('electric',), 'Electric'),
        (('flex fuel', 'e85'), 'Flex Fuel'),
    )

    label = None
    if isinstance(row, str):
        text = row.lower()
        label = next(
            (name for keywords, name in keyword_rules
             if any(keyword in text for keyword in keywords)),
            None,
        )

    return pd.Series({'fuel_type_extracted': label})


def feature_engg(df: pd.DataFrame, reference_year: int = 2024, min_count: int = 100,
                 rare_reference: "pd.DataFrame | None" = None):
    """Build the engineered feature frame from a raw listings frame.

    Steps:
      1. Fill missing ``clean_title`` with 'No' and ``accident`` with
         'None reported'.
      2. Add ``model_age = reference_year - model_year``.
      3. Append the features parsed from ``transmission``, ``engine`` and
         ``fuel_type``.
      4. Drop the raw text columns, ``model_year`` and the sparsely populated
         parsed columns.
      5. Replace infrequent ``brand``/``model``/``ext_col``/``int_col`` values
         with the literal 'RARE'.

    Args:
        df: Raw frame. It is not modified; a copy is transformed and returned.
        reference_year: Year from which ``model_year`` is subtracted.
        min_count: Categories occurring fewer than this many times are
            replaced by 'RARE'.
        rare_reference: Optional frame whose value counts decide what is rare
            (e.g. the training frame, raw or already engineered, when
            transforming the test frame, so both share the same buckets).
            Values absent from the reference count as 0 and therefore become
            'RARE'. When None, the counts come from ``df`` itself.

    Returns:
        A new DataFrame with the engineered columns.
    """
    # Work on a copy so the caller's frame is never mutated.
    df = df.copy()

    df[['clean_title', 'accident']] = df[['clean_title', 'accident']].fillna({'clean_title': 'No', 'accident': 'None reported'})

    df['model_age'] = reference_year - df['model_year']

    # Row-wise parsing of the free-text columns; each returns one Series per row.
    trans_features = df['transmission'].apply(extract_transmission_features)
    engine_features = df['engine'].apply(decode_engine)
    fuel_features = df['fuel_type'].apply(extract_fuel_type)

    df = pd.concat([df, trans_features, engine_features, fuel_features], axis=1)

    # we are dropping num_speeds - 84k+ missing, horsepower 33k+missing, num_cylinders 37k+ missing.
    columns_to_drop = ['num_speeds', 'horsepower', 'num_cylinders', 'transmission', 'engine', 'fuel_type', 'model_year']
    df = df.drop(columns=columns_to_drop)

    # rare category substitution
    categorical_cols = ['brand', 'model', 'ext_col', 'int_col']
    counts_source = df if rare_reference is None else rare_reference
    for col in categorical_cols:
        counts = counts_source[col].value_counts()
        # Frequency of each row's value in the reference; unseen values count as 0.
        occurrences = df[col].map(counts).fillna(0)
        # Missing values are left as-is rather than being labelled 'RARE'.
        is_rare = df[col].notna() & (occurrences < min_count)
        df[col] = df[col].mask(is_rare, 'RARE')

    return df

# Build the engineered training frame and preview its first row.
train_df_fe = feature_engg(train_df)
display(train_df_fe.head(1))

Unnamed: 0,id,brand,model,milage,ext_col,int_col,accident,clean_title,price,model_age,transmission_type,auto_shift,overdrive,engine_cc,fuel_type_extracted
0,0,MINI,Cooper S Base,213000,Yellow,Gray,None reported,Yes,4200,17,Automatic,0,0,1.6,Gasoline


In [44]:
# Count missing values per column of the engineered training frame.
train_df_fe.isna().sum()

id                        0
brand                     0
model                     0
milage                    0
ext_col                   0
int_col                   0
accident                  0
clean_title               0
price                     0
model_age                 0
transmission_type         0
auto_shift                0
overdrive                 0
engine_cc              6698
fuel_type_extracted    5879
dtype: int64

In [47]:
from autogluon.tabular import TabularDataset, TabularPredictor

# Train an AutoGluon regressor on the engineered features.
# 'id' is dropped so that it is not used as a feature; 'price' is the label.
train_data = TabularDataset(train_df_fe.drop('id', axis=1))
# Regression scored by RMSE. No output path is given, so AutoGluon chooses the
# model directory itself.
predictor = TabularPredictor(label='price', eval_metric='rmse', problem_type='regression')
# 5-fold bagging, with the KNN, RF and XT model types excluded and a weighted
# ensemble fitted on top of the bagged models.
# NOTE(review): no `presets` or `time_limit` is passed, and num_cpus=8 is
# hard-coded — confirm these are intended.
predictor.fit(
    train_data,
    num_bag_folds=5, 
    excluded_model_types=['KNN','RF','XT'],
    fit_weighted_ensemble=True,
    num_cpus=8
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240921_070440"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.13
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.2.0: Wed Nov 15 21:59:33 PST 2023; root:xnu-10002.61.3~2/RELEASE_ARM64_T8112
CPU Count:          8
Memory Avail:       2.19 GB / 8.00 GB (27.4%)
Disk Space Avail:   114.12 GB / 460.43 GB (24.8%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x3416296f0>

In [48]:
# Per-model validation scores and timings as a table; silent=True suppresses
# the printed copy so only the returned frame is displayed.
predictor.leaderboard(silent=True)

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-72715.972919,root_mean_squared_error,3.967247,366.382877,0.001905,0.102987,2,True,8
1,LightGBMXT_BAG_L1,-72816.746267,root_mean_squared_error,1.184551,2.466789,1.184551,2.466789,1,True,1
2,CatBoost_BAG_L1,-72912.518696,root_mean_squared_error,0.256582,124.922143,0.256582,124.922143,1,True,3
3,LightGBM_BAG_L1,-73008.913903,root_mean_squared_error,0.691377,1.911765,0.691377,1.911765,1,True,2
4,NeuralNetFastAI_BAG_L1,-73098.406598,root_mean_squared_error,1.34889,121.389851,1.34889,121.389851,1,True,4
5,XGBoost_BAG_L1,-73408.840882,root_mean_squared_error,0.466337,109.223927,0.466337,109.223927,1,True,5
6,NeuralNetTorch_BAG_L1,-73452.068174,root_mean_squared_error,0.483942,115.589342,0.483942,115.589342,1,True,6
7,LightGBMLarge_BAG_L1,-73571.300425,root_mean_squared_error,1.304571,2.587804,1.304571,2.587804,1,True,7


In [49]:
# Permutation-based feature importance of the fitted predictor.
# NOTE(review): this is scored on the training data itself; importance is
# normally estimated on held-out data to avoid overfitting bias — TODO confirm
# against the AutoGluon feature_importance documentation.
predictor.feature_importance(train_data)

Computing feature importance via permutation shuffling for 13 features using 5000 rows with 5 shuffle sets...
	36.64s	= Expected runtime (7.33s per shuffle set)
	20.45s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
milage,4041.703846,846.803336,0.000218,5,5785.283618,2298.124074
brand,1461.493731,480.586127,0.001222,5,2451.027208,471.960254
model_age,1252.571662,292.109414,0.000331,5,1854.028983,651.114341
model,1013.442451,546.125527,0.007133,5,2137.92246,-111.037557
engine_cc,969.477863,391.493252,0.0026,5,1775.567881,163.387845
ext_col,542.448722,444.012717,0.026173,5,1456.677043,-371.779599
int_col,410.393778,215.40429,0.006526,5,853.914189,-33.126633
accident,198.249163,155.4463,0.023156,5,518.315243,-121.816917
auto_shift,184.341322,142.368752,0.022163,5,477.480551,-108.797907
fuel_type_extracted,67.852424,63.60581,0.037774,5,198.817667,-63.112819


In [50]:
# Apply the same feature engineering to the test set and preview its first row.
# NOTE(review): called like this, feature_engg derives the 'RARE' buckets from
# the value counts of the frame it is given, so the test set is bucketed by its
# own frequencies rather than the training-set ones — confirm that is intended.
test_df_fe = feature_engg(test_df)
display(test_df_fe.head(1))

Unnamed: 0,id,brand,model,milage,ext_col,int_col,accident,clean_title,model_age,transmission_type,auto_shift,overdrive,engine_cc,fuel_type_extracted
0,188533,Land,RARE,98000,White,Beige,None reported,Yes,9,Automatic,0,0,2.0,Gasoline


In [51]:
# Score the engineered test set and write the submission CSV.
import os

# Drop 'id' (not a feature) and any 'price' column left over from a previous run
# of this cell, so re-running does not feed old predictions back in as a feature.
test_data = TabularDataset(test_df_fe.drop(columns=['id', 'price'], errors='ignore'))

# Use the predictor fitted above. Loading from a hard-coded, timestamped run
# directory would silently score with a stale model (or fail) after a retrain.
# To score with a previously saved run instead, load it explicitly with
# TabularPredictor.load(<run directory>).
predictions = predictor.predict(test_data) # returns a pandas series

test_df_fe['price'] = predictions.values
display(test_df_fe.head(1))

# Make sure the output directory exists before writing.
os.makedirs('predictions', exist_ok=True)
test_df_fe[['id','price']].to_csv(f'predictions/submission_v{VERSION}.csv',index=False)

Unnamed: 0,id,brand,model,milage,ext_col,int_col,accident,clean_title,model_age,transmission_type,auto_shift,overdrive,engine_cc,fuel_type_extracted,price
0,188533,Land,RARE,98000,White,Beige,None reported,Yes,9,Automatic,0,0,2.0,Gasoline,19230.652344
