In [None]:
import os

import numpy as np
import pandas as pd
import torch
from PIL import Image
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from catboost import Pool, CatBoostRegressor
from torchvision import transforms

# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

In [None]:
class Config:
    """Notebook-wide settings; read through the shared ``CONFIG`` instance."""

    # Names of the columns the notebook treats as prediction targets.
    TARGET_COLUMNS = [
        'X4_mean',
        'X11_mean',
        'X18_mean',
        'X50_mean',
        'X26_mean',
        'X3112_mean',
    ]

    # --- Dataset ---
    # True: rebuild the frame from the raw CSV; False: read a pickled frame.
    RECOMPUTE_DATAFRAMES_TRAIN = True
    RECOMPUTE_DATAFRAMES_TEST = True
    # NOTE(review): not read by any of the cells visible here - confirm where
    # (or whether) this flag is honoured.
    RECOMPUTE_IMAGE_EMBEDDINGS = True
    # Number of rows held out of the training frame for validation.
    N_VAL_SAMPLES0 = 4096

    # --- Others ---
    SEED = 20898485
    # First CUDA device when one is available, CPU otherwise.
    DEVICE = 'cpu' if not torch.cuda.is_available() else 'cuda:0'
    
def seed_everything(seed: int):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices), and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: `random` is stdlib but not among the notebook's top-level imports.
    import random

    # Only interpreters started after this point (e.g. worker subprocesses)
    # pick this up; the running interpreter fixed its hash seed at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every GPU rather than only the current one; silently ignored
    # when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
        
# Build the shared settings object and seed the RNGs from it.
CONFIG = Config()
seed_everything(seed=CONFIG.SEED)
# Bare last expression so the cell output shows which device was selected.
CONFIG.DEVICE

In [None]:
# Load the train / test tables - rebuilt from the raw competition CSVs, or read
# from previously pickled frames - then split a validation set off the train data.
DATA_DIR = '/kaggle/input/cs-480-2024-spring/data'
PICKLE_DIR = '/kaggle/input/planttraits2024-eda-training-pub-dataset'

if CONFIG.RECOMPUTE_DATAFRAMES_TRAIN:
    train0 = pd.read_csv(f'{DATA_DIR}/train.csv')
    train0['file_path'] = train0['id'].apply(lambda s: f'{DATA_DIR}/train_images/{s}.jpeg')
else:
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    train0 = pd.read_pickle(f'{PICKLE_DIR}/train.pkl')

if CONFIG.RECOMPUTE_DATAFRAMES_TEST:
    test = pd.read_csv(f'{DATA_DIR}/test.csv')
    # Capture the feature names BEFORE the helper column is appended: every
    # CSV column after the first one (assumed to be 'id', as in the original
    # positional slice). Slicing [1:-2] after adding the single 'file_path'
    # column would also discard the last real CSV column.
    CONFIG.FEATURE_COLUMNS = test.columns.values[1:]
    test['file_path'] = test['id'].apply(lambda s: f'{DATA_DIR}/test_images/{s}.jpeg')
else:
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    test = pd.read_pickle(f'{PICKLE_DIR}/test.pkl')
    # Original positional slice kept for the pickled frame; presumably it ends
    # with two non-feature columns - TODO confirm against the pickle's schema.
    CONFIG.FEATURE_COLUMNS = test.columns.values[1:-2]

# Hold out a fixed number of rows for validation; seeded so the split is reproducible.
train, val = train_test_split(train0, test_size=CONFIG.N_VAL_SAMPLES0, shuffle=True, random_state=CONFIG.SEED)
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)

In [None]:
# def get_mask(df, labels_describe_df):
#     lower = []
#     higher = []
#     mask = np.empty(shape=df[CONFIG.TARGET_COLUMNS].shape, dtype=bool)
#     for idx, t in enumerate(CONFIG.TARGET_COLUMNS):
#         labels = df[t].values
#         v_min, v_max = labels_describe_df.loc[t]['0.1%'], labels_describe_df.loc[t]['98%']
#         mask[:,idx] = ((labels > v_min) & (labels < v_max))
#     return mask.min(axis=1)

# labels_describe_df = train[CONFIG.TARGET_COLUMNS].describe(percentiles=[0.001, 0.98]).round(3).T
# # Masks
# mask_train = get_mask(train, labels_describe_df)
# mask_val = get_mask(val, labels_describe_df)
# # Masked DataFrames
# train_mask = train[mask_train].reset_index(drop=True)
# val_mask = val[mask_val].reset_index(drop=True)

# for m, subset, full in zip([train_mask, val_mask], ['train', 'val'], [train, val]):
#     print(f'===== {subset} shape: {m.shape} =====')
#     n_masked = len(full) - len(m)
#     perc_masked = (n_masked / len(full)) * 100
#     print(f'{subset} \t| # Masked Samples: {n_masked}')
#     print(f'{subset} \t| % Masked Samples: {perc_masked:.3f}%')

## Compute DINOv2 image embeddings

In [None]:
def get_image_embeddings_dino(model, preprocess, batch_size, df):
    """Embed the images listed in ``df['file_path']`` with ``model``, batch by batch.

    Args:
        model: Callable applied to a stacked image batch that already lives on
            ``CONFIG.DEVICE``; must return a tensor whose first axis indexes
            the images of the batch (presumably shaped (batch, dim)).
        preprocess: Callable turning a PIL image into a tensor. All outputs
            within a batch must share one shape, because they are stacked.
        batch_size: Number of images per forward pass.
        df: DataFrame with a 'file_path' column of image paths.

    Returns:
        A list of NumPy arrays in row order, one per first-axis slice of the
        model output (i.e. one per image for a (batch, dim) output).
    """
    def _load(path):
        # The context manager releases the file handle promptly. convert('RGB')
        # guarantees three channels even for grayscale / CMYK files, which
        # would otherwise break 3-channel normalisation and torch.stack.
        with Image.open(path) as img:
            return preprocess(img.convert('RGB'))

    paths = df['file_path'].tolist()
    image_embeddings = []
    for start in tqdm(range(0, len(paths), batch_size)):
        batch_paths = paths[start:start + batch_size]
        image_tensor = torch.stack([_load(path) for path in batch_paths]).to(CONFIG.DEVICE)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            curr_image_embeddings = model(image_tensor)
        # extend() appends one array per first-axis slice of the batch output.
        image_embeddings.extend(curr_image_embeddings.cpu().numpy())
    return image_embeddings

In [None]:
# Pretrained DINOv2 ViT-g/14 (register variant) from torch.hub, switched to eval
# mode and used only to extract image embeddings.
# NOTE(review): CONFIG.RECOMPUTE_IMAGE_EMBEDDINGS is not consulted in this cell,
# so the embeddings are always recomputed - confirm whether that is intended.
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitg14_reg').to(CONFIG.DEVICE)
model.eval()

# The preprocessing differs from the original code (resize + crop): cropping
# loses information, so the whole image is only resized, to 126 px per side.
# 126 = 9 * 14, presumably chosen as a multiple of the model's 14-px patch size.
IMAGE_SIZE = 126
preprocess = transforms.Compose([
    # An explicit (h, w) pins both sides. A bare int only fixes the shorter
    # edge, so non-square images would yield tensors that cannot be stacked.
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor(),
    # Per-channel mean / std (the standard ImageNet statistics).
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])

batch_size = 64
suffix = 'image_embs_dinov2_vitg14_reg'

# Embed each split and persist it; np.save appends the '.npy' extension.
train_image_embeddings = get_image_embeddings_dino(model, preprocess, batch_size, train)
np.save(f'train_{suffix}', np.array(train_image_embeddings))

val_image_embeddings = get_image_embeddings_dino(model, preprocess, batch_size, val)
np.save(f'val_{suffix}', np.array(val_image_embeddings))

test_image_embeddings = get_image_embeddings_dino(model, preprocess, batch_size, test)
np.save(f'test_{suffix}', np.array(test_image_embeddings))