**Confirm we have enough memory**

In [16]:
# Report RAM and swap usage in GiB (-g) before starting the memory-hungry training runs.
!free -g

               total        used        free      shared  buff/cache   available
Mem:              12           1           0           0          10          10
Swap:              0           0           0


**Install kaggle and autogluon**

In [32]:
# Install the tooling into the runtime's system interpreter (--system) using uv.
!pip install uv
!uv pip install kaggle --system
# Pin AutoGluon to the release this notebook was actually run with (the run logs
# further down report "AutoGluon Version: 1.1.1") so reruns are reproducible.
!uv pip install autogluon==1.1.1 --system
!uv pip install torch==2.3.1 --system
# NOTE(review): the reason for removing torchaudio is not recorded in this notebook — confirm it is still needed.
!uv pip uninstall torchaudio --system

[2mAudited [1m1 package[0m [2min 89ms[0m[0m
[2mAudited [1m1 package[0m [2min 193ms[0m[0m
[2mAudited [1m1 package[0m [2min 110ms[0m[0m
[2mUninstalled [1m1 package[0m [2min 145ms[0m[0m
 [31m-[39m [1mtorchaudio[0m[2m==2.3.1[0m


**Mount Google drive and copy kaggle.json to local disk**

In [18]:
# Mount Google Drive at the relative path "mount" so the next cell can read
# files from mount/MyDrive.
from google.colab import drive
drive.mount('mount')

Drive already mounted at mount; to attempt to forcibly remount, call drive.mount("mount", force_remount=True).


In [19]:
# Copy the Kaggle API credentials into the working directory; the download cell
# points KAGGLE_CONFIG_DIR at this directory.
!cp mount/MyDrive/kaggle.json .

**Download the Kaggle California house prices competition dataset**

In [20]:
# KAGGLE_CONFIG_DIR=$(pwd) tells the kaggle CLI to look for kaggle.json in the current directory.
!KAGGLE_CONFIG_DIR=$(pwd) kaggle competitions download -c california-house-prices

california-house-prices.zip: Skipping, found more recently modified local copy (use --force to force download)


**Extract dataset files**

In [21]:
# Start from a clean extraction directory. `rm -rf` already succeeds when the
# directory is absent, so no bash-only `[[ -d ... ]]` guard is needed (under a
# plain POSIX sh the guard itself fails and the cleanup would be skipped).
!rm -rf california-house-prices
!ls
!unzip california-house-prices.zip -d california-house-prices
!ls
!pwd

california-house-prices.zip  kaggle_house.py  kaggle.json  logs  mount	sample_data
Archive:  california-house-prices.zip
  inflating: california-house-prices/sample_submission.csv  
  inflating: california-house-prices/test.csv  
  inflating: california-house-prices/train.csv  
california-house-prices      kaggle_house.py  logs   sample_data
california-house-prices.zip  kaggle.json      mount
/content


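**Download the model-trainer example Python file from GitHub (optional — the download is commented out because the `%%writefile` cell below writes `kaggle_house.py` locally)**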

In [22]:
#!wget https://raw.githubusercontent.com/autogluon/autogluon/master/examples/automm/kaggle_california_house_price/example_kaggle_house.py

**Run the experiments**

In [23]:
# Directory for the per-run log files written by the `tee` calls below; -p makes this safe to rerun.
!mkdir -p logs

In [24]:
%%writefile kaggle_house.py
import pandas as pd
import numpy as np
import argparse
import os
import random
from autogluon.tabular import TabularPredictor
from autogluon.multimodal import MultiModalPredictor
import torch as th


def _str2bool(value):
    """Convert a command-line token into a real boolean.

    argparse passes option values as strings, and every non-empty string
    (including "False") is truthy, so boolean options need an explicit
    conversion.

    Raises
    ------
    argparse.ArgumentTypeError
        If the token is not a recognised spelling of true/false.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f'Expected a boolean value, got {value!r}.')


def get_parser():
    """Build the command-line parser for the house-price experiments.

    Returns
    -------
    argparse.ArgumentParser
        Parser exposing the ensemble mode, the AutoMM fusion model, the text
        backbone, the data/experiment paths, the seed and the tax-column switch.
    """
    parser = argparse.ArgumentParser(
        description='The Basic Example of AutoGluon for House Price Prediction.')
    parser.add_argument('--mode',
                        choices=['stack5',
                                 'weighted',
                                 'single',
                                 'single_bag5'],
                        default='weighted',
                        help='"stack5" means 5-fold stacking. "weighted" means weighted ensemble.'
                             ' "single" means use a single model.'
                             ' "single_bag5" means 5-fold bagging via the AutoMM model.')
    parser.add_argument('--automm-mode', choices=['ft-transformer', 'mlp'],
                        default='ft-transformer', help='Fusion model in AutoMM.')
    parser.add_argument('--text-backbone', default='google/electra-small-discriminator',
                        help='Checkpoint name used for the hf_text model.')
    # Without an explicit type, "--cat-as-text False" would arrive as the truthy
    # string "False"; parse it into a genuine bool instead.
    parser.add_argument('--cat-as-text', type=_str2bool, default=False,
                        help='Whether AutoMM converts categorical columns to text (true/false).')
    parser.add_argument('--data_path', type=str, default='california-house-prices',
                        help='Directory containing train.csv, test.csv and sample_submission.csv.')
    parser.add_argument('--seed', type=int, default=123)
    parser.add_argument('--exp_path', default=None,
                        help='Output directory; derived from the other options when omitted.')
    parser.add_argument('--with_tax_values', default=1, type=int,
                        help='Non-zero keeps (log-scaled) tax columns; 0 drops them.')
    return parser


def get_automm_hyperparameters(mode, text_backbone, cat_as_text):
    """Assemble the AutoMM hyperparameter overrides for one fusion mode.

    Parameters
    ----------
    mode
        "ft-transformer" or "mlp"; selects which model names are listed.
    text_backbone
        Value stored under "model.hf_text.checkpoint_name".
    cat_as_text
        Value stored under "data.categorical.convert_to_text".

    Returns
    -------
    dict
        Hyperparameter overrides keyed by dotted config names.

    Raises
    ------
    NotImplementedError
        If ``mode`` is neither of the two supported values.
    """
    # Only the list of model names differs between the two modes.
    if mode == "ft-transformer":
        model_names = ["ft_transformer", "hf_text", "fusion_transformer"]
    elif mode == "mlp":
        model_names = ["categorical_mlp", "numerical_mlp", "hf_text", "fusion_mlp"]
    else:
        raise NotImplementedError(f"mode={mode} is not supported!")

    return {
        "model.names": model_names,
        "model.hf_text.checkpoint_name": text_backbone,
        "data.categorical.convert_to_text": cat_as_text,
    }


def preprocess(df, with_tax_values=True, log_scale_lot=True,
               log_scale_listed_price=True, has_label=True):
    """Return a cleaned, log-scaled copy of a house-price frame.

    The input frame is left untouched. The 'Id' column is removed, the
    elementary-school names are shortened, and the skewed numeric columns
    are moved to (natural) log scale.

    Parameters
    ----------
    df
        Frame holding at least 'Id', 'Elementary School' and whichever of
        'Lot', 'Listed Price', 'Tax assessed value', 'Annual tax amount' and
        'Sold Price' the enabled options touch.
    with_tax_values
        Truthy: replace both tax columns with log(x + 1). Falsy: drop them.
    log_scale_lot
        Replace 'Lot' with log(Lot + 1).
    log_scale_listed_price
        Replace 'Listed Price' with its log, clamped below at 0.
    has_label
        Replace the 'Sold Price' label with its log.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.
    """
    school_suffix = ' Elementary School'

    def _shorten_school(name):
        # The check deliberately mirrors the original: it tests for the suffix
        # without the leading space but trims it including the space.
        text = str(name)
        if text.endswith('Elementary School'):
            return text[:-len(school_suffix)]
        return name

    # drop() hands back a new frame, so the caller's data is never modified.
    out = df.drop(columns=['Id'])
    out['Elementary School'] = out['Elementary School'].apply(_shorten_school)

    if log_scale_lot:
        out['Lot'] = np.log(out['Lot'] + 1)

    if log_scale_listed_price:
        # Logs below zero (prices under 1, or -inf from a price of 0) are clamped to 0.
        out['Listed Price'] = np.log(out['Listed Price']).clip(0, None)

    tax_columns = ['Tax assessed value', 'Annual tax amount']
    if with_tax_values:
        for column in tax_columns:
            out[column] = np.log(out[column] + 1)
    else:
        out = out.drop(columns=tax_columns)

    if has_label:
        out['Sold Price'] = np.log(out['Sold Price'])
    return out


def set_seed(seed):
    """Seed the torch, NumPy and stdlib ``random`` generators with ``seed``."""
    import torch as th

    # Same order as before: torch first, then NumPy, then the stdlib generator.
    for seed_fn in (th.manual_seed, np.random.seed, random.seed):
        seed_fn(seed)


def train(args):
    """Train the predictor selected by ``args.mode`` and write a submission file.

    Reads train.csv, test.csv and sample_submission.csv from ``args.data_path``,
    preprocesses the frames, fits either a single ``MultiModalPredictor``
    ("single") or a ``TabularPredictor`` ("weighted", "stack5", "single_bag5"),
    and writes ``submission.csv`` (plus ``leaderboard.csv`` for the tabular
    modes) into ``args.exp_path``.

    Parameters
    ----------
    args
        Parsed command-line namespace from ``get_parser``. ``args.exp_path`` is
        filled in with a name derived from the other options when it is None.

    Raises
    ------
    NotImplementedError
        If ``args.mode`` is not one of the supported modes.
    """
    # (num_bag_folds, num_stack_levels) for every TabularPredictor-based mode.
    tabular_modes = {
        'weighted': (None, None),
        'stack5': (5, 1),
        'single_bag5': (5, 0),
    }
    # Reject unknown modes up front, before any data is read or a predictor
    # directory is created (previously 'single_bag4' slipped through the outer
    # check only to be rejected after the predictor had been constructed).
    if args.mode != 'single' and args.mode not in tabular_modes:
        raise NotImplementedError(f'mode={args.mode} is not supported!')

    # Training budget shared by every mode: 8 minutes, expressed in seconds.
    time_limit = 8 * 60

    if args.exp_path is None:
        # NOTE: a backbone such as "google/electra-small-discriminator" contains
        # a "/", so the default path is a nested directory.
        args.exp_path = f'automm_kaggle_house_{args.mode}_{args.automm_mode}_cat_to_text{args.cat_as_text}_{args.text_backbone}'

    set_seed(args.seed)
    train_df = pd.read_csv(os.path.join(args.data_path, 'train.csv'))
    test_df = pd.read_csv(os.path.join(args.data_path, 'test.csv'))
    # For the purpose of generating submission file
    submission_df = pd.read_csv(os.path.join(args.data_path, 'sample_submission.csv'))
    train_df = preprocess(train_df,
                          with_tax_values=args.with_tax_values, has_label=True)
    test_df = preprocess(test_df,
                         with_tax_values=args.with_tax_values, has_label=False)
    label_column = 'Sold Price'
    eval_metric = 'r2'

    automm_hyperparameters = get_automm_hyperparameters(args.automm_mode, args.text_backbone, args.cat_as_text)

    if args.mode == 'single':
        predictor = MultiModalPredictor(eval_metric=eval_metric, label=label_column, path=args.exp_path)
        predictor.fit(train_df, time_limit=time_limit, hyperparameters=automm_hyperparameters, seed=args.seed)
    else:
        num_bag_folds, num_stack_levels = tabular_modes[args.mode]
        if args.mode == 'single_bag5':
            # Bag only the AutoMM model; no tree models alongside it.
            tabular_hyperparameters = {
                'AG_AUTOMM': automm_hyperparameters,
            }
        else:
            tabular_hyperparameters = {
                'GBM': [
                    {},
                    {'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}},
                ],
                'CAT': {},
                'AG_AUTOMM': automm_hyperparameters,
            }
        predictor = TabularPredictor(eval_metric=eval_metric, label=label_column, path=args.exp_path)
        predictor.fit(train_df,
                      time_limit=time_limit,
                      hyperparameters=tabular_hyperparameters,
                      num_bag_folds=num_bag_folds,
                      num_stack_levels=num_stack_levels)
        leaderboard = predictor.leaderboard()
        leaderboard.to_csv(os.path.join(args.exp_path, 'leaderboard.csv'))

    # The label was trained in log space (see preprocess), so undo it here.
    predictions = np.exp(predictor.predict(test_df))
    submission_df['Sold Price'] = predictions
    submission_df.to_csv(os.path.join(args.exp_path, 'submission.csv'), index=None)


if __name__ == '__main__':
    # Script entry point: parse the CLI options, seed torch, then run training.
    cli_args = get_parser().parse_args()
    th.manual_seed(cli_args.seed)
    train(cli_args)

Overwriting kaggle_house.py


In [25]:
# Single MultiModalPredictor (MLP)
# Remove the previous run's output first. The default --exp_path embeds the backbone
# name "google/electra-small-discriminator", so results sit one directory level deeper.
!rm -rf /content/automm_kaggle_house_single_mlp_cat_to_textFalse_google/electra-small-discriminator
# stderr is merged into stdout and the combined stream is appended (tee -a) to the log file.
!python3 kaggle_house.py --automm-mode mlp --mode single 2>&1 | tee -a logs/automm_single_mlp.txt

2024-09-14 17:45:01.904794: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-14 17:45:02.232154: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-14 17:45:02.332874: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
AutoGluon Version:  1.1.1
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Pytorch Version:    2.3.1+cu121
CUDA Version:       12.1
M

In [26]:
# Single MultiModalPredictor (FT-Transformer For Tabular)
# Clear the previous run's output first, matching the MLP cell, so a rerun does not
# collide with a stale predictor directory.
# NOTE(review): confirm how MultiModalPredictor treats an existing, non-empty path.
!rm -rf /content/automm_kaggle_house_single_ft-transformer_cat_to_textFalse_google/electra-small-discriminator
!python3 kaggle_house.py --automm-mode ft-transformer --mode single 2>&1 | tee -a logs/automm_single_ft.txt

2024-09-14 17:56:09.781867: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-14 17:56:09.802891: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-14 17:56:09.809146: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
AutoGluon Version:  1.1.1
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Pytorch Version:    2.3.1+cu121
CUDA Version:       12.1
M

In [27]:
# MultiModalPredictor + other Tree Models (Weighted Ensemble)
# stderr is merged into stdout and appended (tee -a) to the log, so reruns accumulate in one file.
!python3 kaggle_house.py --automm-mode ft-transformer --mode weighted 2>&1 | tee -a logs/automm_ft_weighted.txt

2024-09-14 18:07:05.775193: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-14 18:07:05.795275: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-14 18:07:05.801468: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Memory Avail:       9.83 G

In [28]:
# MultiModalPredictor + other Tree Models (5-fold Stack Ensemble)
# stderr is merged into stdout and appended (tee -a) to the log, so reruns accumulate in one file.
!python3 kaggle_house.py --automm-mode ft-transformer --mode stack5 2>&1 | tee -a logs/automm_ft_stack5.txt

2024-09-14 18:16:13.510380: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-14 18:16:13.531139: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-14 18:16:13.537799: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Memory Avail:       9.85 G

In [29]:
# List every experiment's outputs. The extra "/*" is needed because the backbone name
# contains a "/", which nests each run's files one directory below automm_kaggle_house_*.
!ls /content/automm_kaggle_house*/*

/content/automm_kaggle_house_single_ft-transformer_cat_to_textFalse_google/electra-small-discriminator:
assets.json	     df_preprocessor.pkl				 hparams.yaml
config.yaml	     events.out.tfevents.1726336577.6f9fb638bc1d.4818.0  model.ckpt
data_processors.pkl  hf_text						 submission.csv

/content/automm_kaggle_house_single_mlp_cat_to_textFalse_google/electra-small-discriminator:
assets.json	     df_preprocessor.pkl				 hparams.yaml
config.yaml	     events.out.tfevents.1726335925.6f9fb638bc1d.1947.0  model.ckpt
data_processors.pkl  hf_text						 submission.csv

/content/automm_kaggle_house_stack5_ft-transformer_cat_to_textFalse_google/electra-small-discriminator:
leaderboard.csv  metadata.json	predictor.pkl	utils
learner.pkl	 models		submission.csv	version.txt

/content/automm_kaggle_house_weighted_ft-transformer_cat_to_textFalse_google/electra-small-discriminator:
leaderboard.csv  metadata.json	predictor.pkl	utils
learner.pkl	 models		submission.csv	version.txt


In [34]:
from autogluon.tabular import TabularPredictor
from autogluon.multimodal import MultiModalPredictor

# Saved AutoMM-only runs: reload each one and print its fit summary.
automm_run_dirs = (
    "/content/automm_kaggle_house_single_ft-transformer_cat_to_textFalse_google/electra-small-discriminator",
    "/content/automm_kaggle_house_single_mlp_cat_to_textFalse_google/electra-small-discriminator",
)
for run_dir in automm_run_dirs:
    loaded_predictor = MultiModalPredictor.load(run_dir)
    print(loaded_predictor.fit_summary())

# Saved tabular ensembles: reload each one and print its leaderboard.
tabular_run_dirs = (
    "/content/automm_kaggle_house_stack5_ft-transformer_cat_to_textFalse_google/electra-small-discriminator",
    "/content/automm_kaggle_house_weighted_ft-transformer_cat_to_textFalse_google/electra-small-discriminator",
)
for run_dir in tabular_run_dirs:
    loaded_predictor = TabularPredictor.load(run_dir)
    print(loaded_predictor.leaderboard())


Load pretrained checkpoint: /content/automm_kaggle_house_single_ft-transformer_cat_to_textFalse_google/electra-small-discriminator/model.ckpt


{'val_r2': None, 'training_time': None}


Load pretrained checkpoint: /content/automm_kaggle_house_single_mlp_cat_to_textFalse_google/electra-small-discriminator/model.ckpt


{'val_r2': None, 'training_time': None}
                 model  score_val eval_metric  pred_time_val    fit_time  \
0      LightGBM_BAG_L1   0.941223          r2       9.946417  196.122740   
1  WeightedEnsemble_L3   0.941223          r2       9.947645  196.186393   
2  WeightedEnsemble_L2   0.941223          r2       9.948374  196.216867   
3    LightGBMXT_BAG_L1   0.077052          r2       6.396061  123.393576   

   pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  \
0                9.946417         196.122740            1       True   
1                0.001228           0.063653            3       True   
2                0.001957           0.094127            2       True   
3                6.396061         123.393576            1       True   

   fit_order  
0          1  
1          4  
2          3  
3          2  
                 model  score_val eval_metric  pred_time_val    fit_time  \
0  WeightedEnsemble_L2   0.959899          r2       1.611901  248.0

In [35]:
#!KAGGLE_CONFIG_DIR=$(pwd) kaggle competitions submit -c california-house-prices -f /content/automm_kaggle_house_single_ft-transformer_cat_to_textFalse_google/electra-small-discriminator/submission.csv -m "my first submission"