In [None]:
import os

import numpy as np
import pandas as pd
import torch
from PIL import Image
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from catboost import Pool, CatBoostRegressor
from torchvision import transforms

tqdm.pandas()

In [None]:
class Config:
    """Notebook-wide settings shared by every cell below."""

    # Regression targets; one model is trained per entry.
    TARGET_COLUMNS = [
        'X4_mean',
        'X11_mean',
        'X18_mean',
        'X50_mean',
        'X26_mean',
        'X3112_mean',
    ]
    # Number of rows held out of the training table for validation.
    N_VAL_SAMPLES0 = 4096
    # Seed used for the RNGs, the train/val split and the models.
    SEED = 20898485
    # First CUDA device when one is visible, CPU otherwise.
    DEVICE = 'cpu' if not torch.cuda.is_available() else 'cuda:0'

def seed_everything(seed: int) -> None:
    """Seed every random number generator this notebook can touch.

    Covers Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices).

    Args:
        seed: Integer seed applied to all generators.
    """
    # Function-scope import so the cell does not depend on the import cell
    # being edited.
    import random

    # NOTE: the running interpreter reads PYTHONHASHSEED at start-up, so this
    # assignment only influences child processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU, not just the current one; silently ignored
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)

# Single settings object referenced by the cells below.
CONFIG = Config()
# Seed the RNGs before any splitting or model fitting happens.
seed_everything(CONFIG.SEED)
# Last expression of the cell: displays the selected device.
CONFIG.DEVICE

'cpu'

## Read Dataset

In [None]:
# Root folder of the competition data; every path below is derived from it.
DATA_DIR = '/kaggle/input/cs-480-2024-spring/data'

# Training table plus the path of the image that belongs to each row.
data = pd.read_csv(f'{DATA_DIR}/train.csv')
data['file_path'] = data['id'].apply(lambda s: f'{DATA_DIR}/train_images/{s}.jpeg')

# Test table, prepared the same way. `test` is an alias, not a copy.
data_test = pd.read_csv(f'{DATA_DIR}/test.csv')
data_test['file_path'] = data_test['id'].apply(lambda s: f'{DATA_DIR}/test_images/{s}.jpeg')
test = data_test

# Pick the tabular features by name. The previous positional slice `[1:-2]`
# removed two trailing columns although only `file_path` is appended after the
# features, which silently discarded the last feature column.
# NOTE(review): assumes test.csv holds `id` followed by feature columns only;
# the check below fails loudly if that is not the case.
NON_FEATURE_COLUMNS = ['id', 'file_path']
CONFIG.FEATURE_COLUMNS = data_test.columns.drop(NON_FEATURE_COLUMNS).values
missing_in_train = [c for c in CONFIG.FEATURE_COLUMNS if c not in data.columns]
assert not missing_in_train, f'Feature columns absent from train.csv: {missing_in_train}'

# Hold out a fixed number of rows for validation; seeded for reproducibility.
train, val = train_test_split(data, test_size=CONFIG.N_VAL_SAMPLES0, shuffle=True, random_state=CONFIG.SEED)
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)

In [None]:
def get_mask(df, labels_describe_df, target_columns=None):
    """Flag the rows whose targets all lie strictly inside the trimming bounds.

    Args:
        df: DataFrame containing the target columns.
        labels_describe_df: DataFrame indexed by target name with a ``'0.1%'``
            column (lower bound) and a ``'98%'`` column (upper bound).
        target_columns: Target names to check. Defaults to
            ``CONFIG.TARGET_COLUMNS`` when omitted.

    Returns:
        1-D boolean NumPy array with one entry per row of ``df``; ``True``
        when every target satisfies ``lower < value < upper``.
    """
    columns = list(CONFIG.TARGET_COLUMNS if target_columns is None else target_columns)
    labels = df[columns].to_numpy()
    bounds = labels_describe_df.loc[columns]
    v_min = bounds['0.1%'].to_numpy()
    v_max = bounds['98%'].to_numpy()
    # Bounds broadcast across rows: column j is compared with bound j.
    # Both comparisons are strict, so values equal to a bound are rejected.
    inside = (labels > v_min) & (labels < v_max)
    return inside.all(axis=1)

# Per-target summary statistics, taken from the training split only so the
# trimming bounds never see validation rows.
labels_describe_df = (
    train[CONFIG.TARGET_COLUMNS]
    .describe(percentiles=[0.001, 0.98])
    .round(3)
    .T
)

# Row-level keep/drop flags for each split, using the training bounds.
mask_train, mask_val = (get_mask(split, labels_describe_df) for split in (train, val))

# Splits with the flagged rows removed and a fresh 0..n-1 index.
train_mask = train.loc[mask_train].reset_index(drop=True)
val_mask = val.loc[mask_val].reset_index(drop=True)

# Report how many rows the mask removed from each split.
for subset, kept, original in (('train', train_mask, train), ('val', val_mask, val)):
    n_masked = len(original) - len(kept)
    perc_masked = (n_masked / len(original)) * 100
    print(f'===== {subset} shape: {kept.shape} =====')
    print(f'{subset} \t| # Masked Samples: {n_masked}')
    print(f'{subset} \t| % Masked Samples: {perc_masked:.3f}%')

===== train shape: (34845, 171) =====
train 	| # Masked Samples: 4422
train 	| % Masked Samples: 11.261%
===== val shape: (3660, 171) =====
val 	| # Masked Samples: 436
val 	| % Masked Samples: 10.645%


In [None]:
def _feature_matrix(frame):
    """Return the tabular feature columns of `frame` as a float32 array."""
    return frame[CONFIG.FEATURE_COLUMNS].values.astype(np.float32)


# Standardise the tabular features; statistics come from the masked training
# rows only and are then reused unchanged for validation and test.
FEATURE_SCALER = StandardScaler()
train_features_mask = FEATURE_SCALER.fit_transform(_feature_matrix(train_mask))
val_features_mask, test_features = (
    FEATURE_SCALER.transform(_feature_matrix(frame)) for frame in (val_mask, test)
)

# Target matrices, one column per entry of CONFIG.TARGET_COLUMNS.
y_train_mask, y_val_mask = (
    frame[CONFIG.TARGET_COLUMNS].values for frame in (train_mask, val_mask)
)


# Load the precomputed image embeddings, one .npy file per split.
# NOTE(review): assumes the rows of each file follow the row order of
# `train` / `val` / `test` — verify against the notebook that produced them.
suffix = 'image_embs_dinov2_vitg14_reg'
embedding_dir = '/kaggle/input/embedn'
train_image_embeddings, val_image_embeddings, test_image_embeddings = (
    np.load(f'{embedding_dir}/{split}_{suffix}.npy') for split in ('train', 'val', 'test')
)

# Drop the same rows that the masks removed from the tabular data.
train_image_embeddings = train_image_embeddings[mask_train]
val_image_embeddings = val_image_embeddings[mask_val]
print(f'Embeddings {suffix} loaded from dataset.')

Embeddings image_embs_dinov2_vitg14_reg loaded from dataset.


## Build the final feature DataFrames

In [None]:
# Number of leading columns kept from the degree-2 polynomial expansion.
first_n_poly_feats = 1000

# Fit the expansion once on the training features and reuse it for every
# split instead of refitting a new transformer on validation and test data.
# NOTE: the full expansion is materialised before it is truncated, which is
# memory-hungry for wide feature tables.
poly_expander = PolynomialFeatures(2).fit(train_features_mask)


def _stack_features(tabular_features, image_embeddings):
    """Concatenate the truncated polynomial features with the image embeddings."""
    poly_block = poly_expander.transform(tabular_features)[:, :first_n_poly_feats]
    return np.concatenate((poly_block, image_embeddings), axis=1)


def _to_model_frame(feature_matrix, image_embeddings):
    """Wrap a feature matrix in a DataFrame and attach the 'emb' vector column."""
    frame = pd.DataFrame(feature_matrix)
    # Each row's embedding is also stored as a single object so it can be
    # declared as an embedding feature when the CatBoost Pool is built.
    frame['emb'] = list(image_embeddings)
    return frame


train_features_mask_all = _stack_features(train_features_mask, train_image_embeddings)
val_features_mask_all = _stack_features(val_features_mask, val_image_embeddings)
test_features_all = _stack_features(test_features, test_image_embeddings)

X_train = _to_model_frame(train_features_mask_all, train_image_embeddings)
X_val = _to_model_frame(val_features_mask_all, val_image_embeddings)
X_test = _to_model_frame(test_features_all, test_image_embeddings)

In [None]:
%%time
# Fitted regressor and validation R2 for each target, keyed by target name.
models, scores = {}, {}

# Hyper-parameters shared by every per-target model.
catboost_params = dict(
    iterations=2000,
    learning_rate=0.06,
    loss_function='RMSE',
    verbose=0,
    random_state=CONFIG.SEED,
)

# Walk the target names together with the matching label columns.
per_target = zip(CONFIG.TARGET_COLUMNS, y_train_mask.T, y_val_mask.T)
for col, y_train, y_val in tqdm(per_target, total=len(CONFIG.TARGET_COLUMNS)):
    # The features are identical for every target; only the label changes.
    train_pool = Pool(X_train, y_train, embedding_features=['emb'])
    val_pool = Pool(X_val, y_val, embedding_features=['emb'])

    model = CatBoostRegressor(**catboost_params)
    model.fit(train_pool)
    models[col] = model

    # Score on the masked validation rows.
    y_pred = model.predict(val_pool)
    scores[col] = r2_score(y_val, y_pred)
    print(f'Target: {col}, R2: {scores[col]:.3f}')

print(f'Mean R2: {np.mean(list(scores.values())):.3f}')

  0%|          | 0/6 [00:00<?, ?it/s]

Target: X4_mean, R2: 0.530
Target: X11_mean, R2: 0.462
Target: X18_mean, R2: 0.649
Target: X50_mean, R2: 0.392
Target: X26_mean, R2: 0.360
Target: X3112_mean, R2: 0.531
Mean R2: 0.487
CPU times: user 7h 10min 45s, sys: 1min 24s, total: 7h 12min 10s
Wall time: 2h 12min 58s


In [None]:
# The test features do not depend on the target, so one Pool serves every
# model instead of being rebuilt on each loop iteration.
test_pool = Pool(X_test, embedding_features=['emb'])

# One prediction column per target, named without the '_mean' suffix and
# added in CONFIG.TARGET_COLUMNS order after the 'id' column.
submission = pd.DataFrame({'id': test['id']})
for col in CONFIG.TARGET_COLUMNS:
    submission[col.replace('_mean', '')] = models[col].predict(test_pool)

submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,X4,X11,X18,X50,X26,X3112
0,154220505,1.13855,145.811684,19709.041663,15.210981,3518.882815,399895.181866
1,195736552,0.959577,154.435916,19699.562707,14.978896,3459.992266,398609.298455
2,182701773,0.982636,148.129923,19699.078895,15.015769,3460.289849,398033.48572
3,27688500,0.981447,140.566423,19699.497689,16.180347,3483.132325,397916.792551
4,195825045,0.911916,152.008563,19699.302744,14.889485,3460.025855,398914.879821
