# Import and setup Kaggle data sources

In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'cs-480-2024-spring:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F81655%2F8915386%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240812%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240812T025217Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D71b4e54c8b99120fa037c2f4d3fcc954a957c02cab223bb17a2ba05e95a5c729ca3f5604f5ce704be44a86059e2012213343d5c5bab84daba0207501dea93f4595d6517b8556ded4fb91e7577b4c73e8fa816205b9aee839668924d2f350ccb624d68795d4a4253f6c97f36740fc8e6cc0466827e615a178db6af47b71a9bba0854a9490593e36ba6042df7aaa610ff8642140e610844cc2ac614d2c6fa036564452d6b12be1316d1af69140c53647de2992af7b5c50762798e6999252dead3efba4117ec6f51ff4f16e07c0ff8052e11a023ab430b9ae89e91344d680742bcec035e79ac7d51dc01da206d471a31ec7fbf4669ef3ac4ab9bf89c169ef37e236,dinov2-embeddings:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5529912%2F9154108%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240812%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240812T025217Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D79dee6dce645e5cc4de5577e79f961491c9b04d5c8328573ef71c2b55d19b71cce2da8db6cb118859c20003bb5bbeac8ac9ea943e7fac37806e4b8beb9d196c35a0ded5e44946f8c91eba20eb4366001259343762a6415c3a99924add23d501611d2b5ff09923ecc33cd55d9b9cfe94fbb617124bc1bf62a44e363bde59044271786e5e33ec8575a8cc9747d093e195bd0be71e194cf0543b200089051e773535d2aba9e2f77991e3a0e3103f7eff81d97e59ac03732c4bcf1835d35d3c5ab9de14c89196b12f41abbfc364b737d01d64d706c3566245e23ebe981b6f555490b17665ea3975122e20bf610321c483b82025159c27626144e2a040afda9d51ef4'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading cs-480-2024-spring, 300784602 bytes compressed
Downloaded and uncompressed: cs-480-2024-spring
Downloading dinov2-embeddings, 221066382 bytes compressed
Downloaded and uncompressed: dinov2-embeddings
Data source import complete.


# Import necessary libraries

In [2]:
%pip install catboost



In [3]:
from typing import Callable

import numpy as np
import pandas as pd
import torch
import torch_xla.core.xla_model as xm
from catboost import Pool, CatBoostRegressor
from PIL import Image
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from torchvision import models, transforms
from tqdm import tqdm

# Set data paths and seed

In [4]:
BASE_DATA_PATH = '/kaggle/input/cs-480-2024-spring/data'
SEED = 42

# Load data

In [5]:
train_df = pd.read_csv('/kaggle/input/cs-480-2024-spring/data/train.csv')
train_df['image_path'] = train_df['id'].apply(lambda id_: f"{BASE_DATA_PATH}/train_images/{id_}.jpeg")

test_df = pd.read_csv('/kaggle/input/cs-480-2024-spring/data/test.csv')
test_df['image_path'] = test_df['id'].apply(lambda id_: f"{BASE_DATA_PATH}/test_images/{id_}.jpeg")

# Split data

In [6]:
train_data, val_data = train_test_split(train_df, test_size=0.1, shuffle=True, random_state=SEED)

# After splitting, the original indices from train_df are retained, which can be non-sequential and confusing.
train_data.reset_index(drop=True, inplace=True)
val_data.reset_index(drop=True, inplace=True)

In [7]:
NUM_FEATURES = 164
target_column_names = ['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean', 'X3112_mean']
feature_column_names = train_data.columns[:NUM_FEATURES]

# Filter outliers

In [8]:
def filter_outliers(df: pd.DataFrame, column_names: list[str], lower: float = 0.01, upper: float = 0.99) -> pd.DataFrame:
    lower_quantile = df[column_names].quantile(0.01)
    upper_quantile = df[column_names].quantile(0.99)

    mask = (df[column_names] > lower_quantile) & (df[column_names] < upper_quantile)
    filtered_df = df[mask.all(axis=1)]

    return filtered_df

In [9]:
filtered_train_data = filter_outliers(train_data, target_column_names)
filtered_val_data = filter_outliers(val_data, target_column_names)

# Standardize features

In [10]:
scaler = StandardScaler()

X_train = scaler.fit_transform(filtered_train_data[feature_column_names].values.astype(np.float32))
X_val = scaler.fit_transform(filtered_val_data[feature_column_names].values.astype(np.float32))
X_test = scaler.transform(test_df[feature_column_names].values.astype(np.float32))

In [11]:
y_train = filtered_train_data[target_column_names].values
y_val = filtered_val_data[target_column_names].values

# Device setup

In [12]:
# Check for TPU
try:
    device = xm.xla_device()
    print("Using TPU")
except RuntimeError:
    # Fallback to GPU or CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using {device}")

Using TPU


# Define transformations and embedding function

In [13]:
transform = transforms.Compose([
    transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])

In [14]:
def get_embeddings(df: pd.DataFrame, transform: Callable, model: Callable, batch_size: int = 64) -> list[np.ndarray]:
    embeddings = []
    image_tensors = []

    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        image_path = row['image_path']
        image = Image.open(image_path).convert('RGB')
        image_tensor = transform(image).unsqueeze(0).to(device)
        image_tensors.append(image_tensor)

        if len(image_tensors) == batch_size:
            batch_tensor = torch.cat(image_tensors, dim=0)
            with torch.no_grad():
                batch_embeddings = model(batch_tensor).cpu().numpy()
            embeddings.extend(batch_embeddings)
            image_tensors = []

    if image_tensors:
        batch_tensor = torch.cat(image_tensors, dim=0)
        with torch.no_grad():
            batch_embeddings = model(batch_tensor).cpu().numpy()
        embeddings.extend(batch_embeddings)

    return embeddings

# Run ablation studies

## ResNet

In [15]:
resnet = models.resnet50(pretrained=True).to(device).eval()

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 198MB/s]


In [16]:
train_embeddings = get_embeddings(df=filtered_train_data, transform=transform, model=resnet)
val_embeddings = get_embeddings(df=filtered_val_data, transform=transform, model=resnet)

100%|██████████| 34902/34902 [02:04<00:00, 279.72it/s]
100%|██████████| 3874/3874 [00:13<00:00, 286.92it/s]


In [17]:
poly = PolynomialFeatures(2)
X_train_poly = poly.fit_transform(X_train)
X_val_poly = poly.transform(X_val)
X_test_poly = poly.transform(X_test)

In [20]:
X_train_fused = pd.DataFrame(np.concatenate((X_train_poly, train_embeddings), axis=1))
X_train_fused["embeddings"] = list(train_embeddings)

X_val_fused = pd.DataFrame(np.concatenate((X_val_poly, val_embeddings), axis=1))
X_val_fused["embeddings"] = list(val_embeddings)

In [22]:
column_name_to_r2_score = {}
for idx, column in tqdm(enumerate(target_column_names), total=len(target_column_names)):
    col_y_train = y_train[:, idx]
    col_y_val = y_val[:, idx]

    train_pool = Pool(X_train_fused, col_y_train, embedding_features=["embeddings"])
    val_pool = Pool(X_val_fused, col_y_val, embedding_features=["embeddings"])

    # Train model
    model = CatBoostRegressor(iterations=1500, learning_rate=0.06, loss_function="RMSE", verbose=100, random_state=SEED)
    model.fit(train_pool, plot_file=f"resnet_plot.png")

    # Predict and evaluate
    col_y_val_pred = model.predict(val_pool)
    col_r2_score = r2_score(col_y_val, col_y_val_pred)
    column_name_to_r2_score[column] = col_r2_score

  0%|          | 0/6 [00:00<?, ?it/s]

0:	learn: 0.1289949	total: 487ms	remaining: 12m 9s
100:	learn: 0.1126989	total: 35.1s	remaining: 8m 6s
200:	learn: 0.1092677	total: 1m 9s	remaining: 7m 31s
300:	learn: 0.1054555	total: 1m 44s	remaining: 6m 56s
400:	learn: 0.1021016	total: 2m 19s	remaining: 6m 21s
500:	learn: 0.0991185	total: 2m 53s	remaining: 5m 46s
600:	learn: 0.0963134	total: 3m 28s	remaining: 5m 11s
700:	learn: 0.0936439	total: 4m 3s	remaining: 4m 37s
800:	learn: 0.0911833	total: 4m 37s	remaining: 4m 2s
900:	learn: 0.0887426	total: 5m 12s	remaining: 3m 27s
1000:	learn: 0.0864592	total: 5m 47s	remaining: 2m 52s
1100:	learn: 0.0842890	total: 6m 21s	remaining: 2m 18s
1200:	learn: 0.0822220	total: 6m 56s	remaining: 1m 43s
1300:	learn: 0.0802052	total: 7m 31s	remaining: 1m 9s
1400:	learn: 0.0782574	total: 8m 5s	remaining: 34.3s
1499:	learn: 0.0763719	total: 8m 40s	remaining: 0us


 17%|█▋        | 1/6 [11:05<55:26, 665.33s/it]

0:	learn: 6.2638464	total: 434ms	remaining: 10m 50s
100:	learn: 5.4557761	total: 35.2s	remaining: 8m 6s
200:	learn: 5.2838099	total: 1m 9s	remaining: 7m 32s
300:	learn: 5.0958309	total: 1m 44s	remaining: 6m 57s
400:	learn: 4.9374712	total: 2m 19s	remaining: 6m 22s
500:	learn: 4.7901450	total: 2m 54s	remaining: 5m 47s
600:	learn: 4.6536450	total: 3m 28s	remaining: 5m 12s
700:	learn: 4.5310481	total: 4m 3s	remaining: 4m 37s
800:	learn: 4.4124605	total: 4m 38s	remaining: 4m 2s
900:	learn: 4.3008675	total: 5m 12s	remaining: 3m 27s
1000:	learn: 4.1945155	total: 5m 47s	remaining: 2m 53s
1100:	learn: 4.0929426	total: 6m 22s	remaining: 2m 18s
1200:	learn: 3.9917464	total: 6m 57s	remaining: 1m 43s
1300:	learn: 3.8918756	total: 7m 31s	remaining: 1m 9s
1400:	learn: 3.7987440	total: 8m 6s	remaining: 34.4s
1499:	learn: 3.7100050	total: 8m 40s	remaining: 0us


 33%|███▎      | 2/6 [22:11<44:23, 665.76s/it]

0:	learn: 3.7276238	total: 458ms	remaining: 11m 27s
100:	learn: 2.8736196	total: 35.2s	remaining: 8m 7s
200:	learn: 2.7427102	total: 1m 10s	remaining: 7m 32s
300:	learn: 2.6093008	total: 1m 44s	remaining: 6m 57s
400:	learn: 2.4928311	total: 2m 19s	remaining: 6m 22s
500:	learn: 2.3941234	total: 2m 54s	remaining: 5m 47s
600:	learn: 2.3054147	total: 3m 28s	remaining: 5m 12s
700:	learn: 2.2234839	total: 4m 3s	remaining: 4m 37s
800:	learn: 2.1483889	total: 4m 38s	remaining: 4m 2s
900:	learn: 2.0798650	total: 5m 12s	remaining: 3m 27s
1000:	learn: 2.0115253	total: 5m 47s	remaining: 2m 53s
1100:	learn: 1.9480987	total: 6m 21s	remaining: 2m 18s
1200:	learn: 1.8884866	total: 6m 56s	remaining: 1m 43s
1300:	learn: 1.8322476	total: 7m 31s	remaining: 1m 9s
1400:	learn: 1.7825954	total: 8m 5s	remaining: 34.3s
1499:	learn: 1.7337476	total: 8m 40s	remaining: 0us


 50%|█████     | 3/6 [33:14<33:14, 664.67s/it]

0:	learn: 42.9122712	total: 443ms	remaining: 11m 3s
100:	learn: 37.3103878	total: 35s	remaining: 8m 5s
200:	learn: 35.6566631	total: 1m 9s	remaining: 7m 30s
300:	learn: 33.9095720	total: 1m 44s	remaining: 6m 55s
400:	learn: 32.4519186	total: 2m 18s	remaining: 6m 20s
500:	learn: 31.1610654	total: 2m 53s	remaining: 5m 46s
600:	learn: 30.0377226	total: 3m 28s	remaining: 5m 11s
700:	learn: 28.9804574	total: 4m 2s	remaining: 4m 36s
800:	learn: 28.0103366	total: 4m 37s	remaining: 4m 2s
900:	learn: 27.0753455	total: 5m 12s	remaining: 3m 27s
1000:	learn: 26.2056218	total: 5m 46s	remaining: 2m 52s
1100:	learn: 25.4489961	total: 6m 21s	remaining: 2m 18s
1200:	learn: 24.6501122	total: 6m 57s	remaining: 1m 43s
1300:	learn: 23.8983603	total: 7m 32s	remaining: 1m 9s
1400:	learn: 23.1797122	total: 8m 7s	remaining: 34.4s
1499:	learn: 22.5429698	total: 8m 42s	remaining: 0us


 67%|██████▋   | 4/6 [44:20<22:10, 665.26s/it]

0:	learn: 0.5396405	total: 441ms	remaining: 11m 1s
100:	learn: 0.4987711	total: 34.9s	remaining: 8m 3s
200:	learn: 0.4847864	total: 1m 9s	remaining: 7m 28s
300:	learn: 0.4686849	total: 1m 43s	remaining: 6m 53s
400:	learn: 0.4545944	total: 2m 18s	remaining: 6m 18s
500:	learn: 0.4417097	total: 2m 52s	remaining: 5m 44s
600:	learn: 0.4296889	total: 3m 27s	remaining: 5m 9s
700:	learn: 0.4183589	total: 4m 1s	remaining: 4m 35s
800:	learn: 0.4077869	total: 4m 36s	remaining: 4m 1s
900:	learn: 0.3976641	total: 5m 10s	remaining: 3m 26s
1000:	learn: 0.3876227	total: 5m 45s	remaining: 2m 52s
1100:	learn: 0.3783366	total: 6m 19s	remaining: 2m 17s
1200:	learn: 0.3688392	total: 6m 54s	remaining: 1m 43s
1300:	learn: 0.3598810	total: 7m 28s	remaining: 1m 8s
1400:	learn: 0.3515636	total: 8m 3s	remaining: 34.1s
1499:	learn: 0.3433679	total: 8m 37s	remaining: 0us


 83%|████████▎ | 5/6 [55:22<11:03, 663.96s/it]

0:	learn: 1877.2391535	total: 434ms	remaining: 10m 50s
100:	learn: 1550.9802270	total: 34.8s	remaining: 8m 2s
200:	learn: 1501.8361071	total: 1m 9s	remaining: 7m 27s
300:	learn: 1444.7401371	total: 1m 43s	remaining: 6m 53s
400:	learn: 1395.0052777	total: 2m 18s	remaining: 6m 19s
500:	learn: 1351.3172979	total: 2m 53s	remaining: 5m 45s
600:	learn: 1310.2192868	total: 3m 27s	remaining: 5m 11s
700:	learn: 1272.3935844	total: 4m 2s	remaining: 4m 36s
800:	learn: 1236.2227773	total: 4m 37s	remaining: 4m 1s
900:	learn: 1203.6890302	total: 5m 11s	remaining: 3m 27s
1000:	learn: 1171.4473284	total: 5m 46s	remaining: 2m 52s
1100:	learn: 1140.4215624	total: 6m 20s	remaining: 2m 18s
1200:	learn: 1110.4887994	total: 6m 55s	remaining: 1m 43s
1300:	learn: 1081.0061336	total: 7m 30s	remaining: 1m 8s
1400:	learn: 1053.4564589	total: 8m 4s	remaining: 34.2s
1499:	learn: 1026.8668434	total: 8m 39s	remaining: 0us


100%|██████████| 6/6 [1:06:23<00:00, 664.00s/it]


In [23]:
column_name_to_r2_score

{'X4_mean': 0.26663592530070646,
 'X11_mean': 0.2672305143032675,
 'X18_mean': 0.4271818786182754,
 'X26_mean': 0.19773881368366064,
 'X50_mean': 0.17483545490714691,
 'X3112_mean': 0.3350935325950368}

In [None]:
np.mean(list(column_name_to_r2_score.values()))

Delete ResNet model to free up RAM

In [24]:
del resnet

## DINOv2

In [25]:
dinov2 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitg14_reg').to(device).eval()

Using cache found in /root/.cache/torch/hub/facebookresearch_dinov2_main

xFormers is not available (SwiGLU)


xFormers is not available (Attention)


xFormers is not available (Block)



In [26]:
train_embeddings = get_embeddings(df=filtered_train_data, transform=transform, model=dinov2)
val_embeddings = get_embeddings(df=filtered_val_data, transform=transform, model=dinov2)

100%|██████████| 34902/34902 [23:30<00:00, 24.75it/s]
100%|██████████| 3874/3874 [02:34<00:00, 25.06it/s]


In [27]:
poly = PolynomialFeatures(2)
X_train_poly = poly.fit_transform(X_train)
X_val_poly = poly.transform(X_val)
X_test_poly = poly.transform(X_test)

In [28]:
X_train_fused = pd.DataFrame(np.concatenate((X_train_poly, train_embeddings), axis=1))
X_train_fused["embeddings"] = list(train_embeddings)

X_val_fused = pd.DataFrame(np.concatenate((X_val_poly, val_embeddings), axis=1))
X_val_fused["embeddings"] = list(val_embeddings)

In [41]:
column_name_to_r2_score = {}
column_name_to_model = {}
for idx, column in tqdm(enumerate(target_column_names), total=len(target_column_names)):
    col_y_train = y_train[:, idx]
    col_y_val = y_val[:, idx]

    train_pool = Pool(X_train_fused, col_y_train, embedding_features=["embeddings"])
    val_pool = Pool(X_val_fused, col_y_val, embedding_features=["embeddings"])

    # Train model
    model = CatBoostRegressor(iterations=1500, learning_rate=0.06, loss_function="RMSE", verbose=100, random_state=SEED)
    model.fit(train_pool, plot=True, plot_file=f"dinov2_plot_{column}")
    column_name_to_model[column] = model

    # Predict and evaluate
    col_y_val_pred = model.predict(val_pool)
    col_r2_score = r2_score(col_y_val, col_y_val_pred)
    column_name_to_r2_score[column] = col_r2_score
    print(f'Column name: {column}, R2 score: {col_r2_score}')

  0%|          | 0/6 [00:00<?, ?it/s]

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.1270802	total: 472ms	remaining: 11m 48s
100:	learn: 0.0938710	total: 36.1s	remaining: 8m 19s
200:	learn: 0.0909747	total: 1m 11s	remaining: 7m 44s
300:	learn: 0.0874547	total: 1m 47s	remaining: 7m 9s
400:	learn: 0.0842850	total: 2m 23s	remaining: 6m 34s
500:	learn: 0.0814281	total: 2m 59s	remaining: 5m 58s
600:	learn: 0.0787441	total: 3m 35s	remaining: 5m 23s
700:	learn: 0.0762099	total: 4m 12s	remaining: 4m 47s
800:	learn: 0.0738505	total: 4m 48s	remaining: 4m 12s
900:	learn: 0.0715473	total: 5m 25s	remaining: 3m 36s
1000:	learn: 0.0693403	total: 6m 1s	remaining: 3m
1100:	learn: 0.0673041	total: 6m 38s	remaining: 2m 24s
1200:	learn: 0.0653078	total: 7m 14s	remaining: 1m 48s
1300:	learn: 0.0634175	total: 7m 51s	remaining: 1m 12s
1400:	learn: 0.0615789	total: 8m 27s	remaining: 35.9s
1499:	learn: 0.0597969	total: 9m 4s	remaining: 0us


 17%|█▋        | 1/6 [12:41<1:03:25, 761.07s/it]

Column name: X4_mean, R2 score: 0.5208552357545682


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 6.1987035	total: 455ms	remaining: 11m 21s
100:	learn: 4.8104555	total: 36.1s	remaining: 8m 19s
200:	learn: 4.6473360	total: 1m 11s	remaining: 7m 43s
300:	learn: 4.4604783	total: 1m 47s	remaining: 7m 8s
400:	learn: 4.3038558	total: 2m 23s	remaining: 6m 32s
500:	learn: 4.1561097	total: 2m 59s	remaining: 5m 57s
600:	learn: 4.0222523	total: 3m 35s	remaining: 5m 21s
700:	learn: 3.8933887	total: 4m 10s	remaining: 4m 45s
800:	learn: 3.7734560	total: 4m 46s	remaining: 4m 10s
900:	learn: 3.6597546	total: 5m 23s	remaining: 3m 34s
1000:	learn: 3.5482732	total: 5m 59s	remaining: 2m 59s
1100:	learn: 3.4407625	total: 6m 35s	remaining: 2m 23s
1200:	learn: 3.3402135	total: 7m 11s	remaining: 1m 47s
1300:	learn: 3.2452233	total: 7m 48s	remaining: 1m 11s
1400:	learn: 3.1565135	total: 8m 24s	remaining: 35.7s
1499:	learn: 3.0667081	total: 9m	remaining: 0us


 33%|███▎      | 2/6 [25:12<50:22, 755.52s/it]  

Column name: X11_mean, R2 score: 0.49992807222586866


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 3.6776857	total: 451ms	remaining: 11m 15s
100:	learn: 2.3436412	total: 36.2s	remaining: 8m 21s
200:	learn: 2.2088199	total: 1m 12s	remaining: 7m 45s
300:	learn: 2.0682600	total: 1m 47s	remaining: 7m 10s
400:	learn: 1.9510773	total: 2m 24s	remaining: 6m 34s
500:	learn: 1.8502273	total: 3m	remaining: 5m 59s
600:	learn: 1.7610516	total: 3m 36s	remaining: 5m 23s
700:	learn: 1.6788720	total: 4m 12s	remaining: 4m 48s
800:	learn: 1.6051629	total: 4m 49s	remaining: 4m 12s
900:	learn: 1.5369798	total: 5m 26s	remaining: 3m 37s
1000:	learn: 1.4741143	total: 6m 3s	remaining: 3m 1s
1100:	learn: 1.4137463	total: 6m 41s	remaining: 2m 25s
1200:	learn: 1.3597469	total: 7m 18s	remaining: 1m 49s
1300:	learn: 1.3057044	total: 7m 55s	remaining: 1m 12s
1400:	learn: 1.2551011	total: 8m 32s	remaining: 36.2s
1499:	learn: 1.2077663	total: 9m 9s	remaining: 0us


 50%|█████     | 3/6 [37:54<37:55, 758.56s/it]

Column name: X18_mean, R2 score: 0.6356877876413756


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 42.6334399	total: 446ms	remaining: 11m 8s
100:	learn: 34.2627790	total: 36.3s	remaining: 8m 22s
200:	learn: 32.1874999	total: 1m 12s	remaining: 7m 46s
300:	learn: 29.9284725	total: 1m 48s	remaining: 7m 11s
400:	learn: 28.1662141	total: 2m 24s	remaining: 6m 36s
500:	learn: 26.6248377	total: 3m	remaining: 6m
600:	learn: 25.2869536	total: 3m 36s	remaining: 5m 24s
700:	learn: 24.0158420	total: 4m 12s	remaining: 4m 48s
800:	learn: 22.8816814	total: 4m 48s	remaining: 4m 12s
900:	learn: 21.8177050	total: 5m 25s	remaining: 3m 36s
1000:	learn: 20.8504732	total: 6m 2s	remaining: 3m
1100:	learn: 19.9262072	total: 6m 38s	remaining: 2m 24s
1200:	learn: 19.1011179	total: 7m 16s	remaining: 1m 48s
1300:	learn: 18.3297858	total: 7m 52s	remaining: 1m 12s
1400:	learn: 17.5970087	total: 8m 30s	remaining: 36s
1499:	learn: 16.8992595	total: 9m 6s	remaining: 0us


 67%|██████▋   | 4/6 [50:32<25:16, 758.07s/it]

Column name: X26_mean, R2 score: 0.38526584429212707


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.5337493	total: 451ms	remaining: 11m 16s
100:	learn: 0.4400010	total: 35.9s	remaining: 8m 17s
200:	learn: 0.4266356	total: 1m 11s	remaining: 7m 41s
300:	learn: 0.4107900	total: 1m 47s	remaining: 7m 6s
400:	learn: 0.3965682	total: 2m 22s	remaining: 6m 31s
500:	learn: 0.3837956	total: 2m 58s	remaining: 5m 55s
600:	learn: 0.3712189	total: 3m 34s	remaining: 5m 21s
700:	learn: 0.3593318	total: 4m 10s	remaining: 4m 45s
800:	learn: 0.3485046	total: 4m 46s	remaining: 4m 9s
900:	learn: 0.3380886	total: 5m 22s	remaining: 3m 34s
1000:	learn: 0.3279481	total: 5m 58s	remaining: 2m 58s
1100:	learn: 0.3185307	total: 6m 34s	remaining: 2m 23s
1200:	learn: 0.3092318	total: 7m 10s	remaining: 1m 47s
1300:	learn: 0.3002789	total: 7m 47s	remaining: 1m 11s
1400:	learn: 0.2917687	total: 8m 23s	remaining: 35.6s
1499:	learn: 0.2837218	total: 8m 59s	remaining: 0us


 83%|████████▎ | 5/6 [1:03:03<12:35, 755.74s/it]

Column name: X50_mean, R2 score: 0.4066759023843505


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1860.1476529	total: 450ms	remaining: 11m 13s
100:	learn: 1392.5250305	total: 36.1s	remaining: 8m 20s
200:	learn: 1328.8061898	total: 1m 11s	remaining: 7m 45s
300:	learn: 1260.7569295	total: 1m 47s	remaining: 7m 9s
400:	learn: 1202.1639613	total: 2m 23s	remaining: 6m 34s
500:	learn: 1151.8290450	total: 3m	remaining: 5m 59s
600:	learn: 1104.1521725	total: 3m 36s	remaining: 5m 23s
700:	learn: 1062.7741936	total: 4m 12s	remaining: 4m 47s
800:	learn: 1024.3042532	total: 4m 48s	remaining: 4m 11s
900:	learn: 987.0512843	total: 5m 25s	remaining: 3m 36s
1000:	learn: 953.3305636	total: 6m 2s	remaining: 3m
1100:	learn: 921.5299227	total: 6m 38s	remaining: 2m 24s
1200:	learn: 888.6085458	total: 7m 15s	remaining: 1m 48s
1300:	learn: 860.2407821	total: 7m 52s	remaining: 1m 12s
1400:	learn: 831.5954436	total: 8m 29s	remaining: 36s
1499:	learn: 806.1520826	total: 9m 6s	remaining: 0us


100%|██████████| 6/6 [1:16:06<00:00, 761.08s/it]

Column name: X3112_mean, R2 score: 0.5114467797172562





In [42]:
column_name_to_r2_score

{'X4_mean': 0.5208552357545682,
 'X11_mean': 0.49992807222586866,
 'X18_mean': 0.6356877876413756,
 'X26_mean': 0.38526584429212707,
 'X50_mean': 0.4066759023843505,
 'X3112_mean': 0.5114467797172562}

In [43]:
np.mean(list(column_name_to_r2_score.values()))

0.49330993700259107

# Generate submission CSV

In [44]:
submission_col_names = [col_name.replace('_mean', '') for col_name in target_column_names]
submission = pd.DataFrame(
    {
        'id': test_df['id'],
        **{col: 0 for col in submission_col_names}
    }
)

In [34]:
test_embeddings = get_embeddings(df=test_df, transform=transform, model=dinov2)

100%|██████████| 6391/6391 [04:14<00:00, 25.10it/s]


In [45]:
X_test_poly = poly.fit_transform(X_test)
X_test_fused = pd.DataFrame(np.concatenate((X_test_poly, test_embeddings), axis=1))
X_test_fused["embeddings"] = list(test_embeddings)

In [46]:
for idx, column in enumerate(target_column_names):
    test_pool = Pool(X_test_fused, embedding_features=["embeddings"])
    column_pred = column_name_to_model[column].predict(test_pool)
    submission[column.replace('_mean', '')] = column_pred

submission.to_csv('/kaggle/working/submission.csv', index=False)
submission.head()

Unnamed: 0,id,X4,X11,X18,X26,X50,X3112
0,154220505,1.148918,145.908123,19707.555492,3545.797691,15.173563,400151.938039
1,195736552,0.998605,152.258345,19699.61222,3464.25153,14.860193,399004.355931
2,182701773,0.984228,147.717668,19699.704267,3467.272704,14.889166,397916.079079
3,27688500,0.977196,138.863974,19699.586297,3473.176814,15.990721,397919.024411
4,195825045,0.919066,153.460128,19699.519735,3462.389063,14.832491,398630.514076
