In [7]:
import os

import numpy as np
import pandas as pd
import torch
from PIL import Image
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from catboost import Pool, CatBoostRegressor
from torchvision import transforms
from torch import nn

import imageio.v3 as imageio
import albumentations as A
from albumentations.pytorch import ToTensorV2

from transformers import ViTModel
tqdm.pandas()

In [8]:
class Config:
    """Notebook-wide settings: target names, recompute switches, split size, seed, device."""

    # Regression target columns of train.csv; they are log10-transformed and
    # standardised in the data-preparation cell.
    TARGET_COLUMNS = [
        'X4_mean',
        'X11_mean',
        'X18_mean',
        'X50_mean',
        'X26_mean',
        'X3112_mean',
    ]

    # Dataset
    # NOTE(review): presumably these switches choose between recomputing an
    # artifact and reusing a cached copy -- confirm against the cells that read them.
    RECOMPUTE_DATAFRAMES_TRAIN = True
    RECOMPUTE_DATAFRAMES_TEST = True
    RECOMPUTE_IMAGE_EMBEDDINGS = True
    # Passed as `test_size` to train_test_split, i.e. the number of validation rows.
    N_VAL_SAMPLES0 = 4096

    # Others
    SEED = 20898485
    # Evaluated once, when the class body runs.
    DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    
def seed_everything(seed: int) -> None:
    """Seed every random number generator this notebook relies on.

    Covers the ``PYTHONHASHSEED`` environment variable, Python's stdlib
    ``random`` module, NumPy's global RNG and PyTorch (CPU plus every CUDA
    device).

    Args:
        seed: Integer seed applied to each generator.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    # Hash randomisation is fixed at interpreter start-up, so this only
    # influences processes launched after this call (e.g. worker subprocesses).
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds all GPUs rather than only the current one; silently ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
        
# One shared settings object; later cells read it and attach extra fields
# (e.g. FEATURE_COLUMNS).
CONFIG = Config()
# Seed the RNGs before anything stochastic (such as the train/val split) runs.
seed_everything(seed=CONFIG.SEED)
# Bare last expression so the cell output shows which device was selected.
CONFIG.DEVICE

'cuda:0'

In [9]:
# Read data
# Root of the competition dataset; all CSV and image paths are built from it.
DATA_DIR = '/kaggle/input/cs-480-2024-spring/data'

data = pd.read_csv(f'{DATA_DIR}/train.csv')
data['file_path'] = data['id'].apply(lambda s: f'{DATA_DIR}/train_images/{s}.jpeg')

data_test = pd.read_csv(f'{DATA_DIR}/test.csv')
data_test['file_path'] = data_test['id'].apply(lambda s: f'{DATA_DIR}/test_images/{s}.jpeg')
test = data_test  # alias used by the embedding cell below


# Transform the targets: log10 first, then standardise to zero mean / unit variance.
# All target columns are transformed in one vectorised call.
data[CONFIG.TARGET_COLUMNS] = np.log10(data[CONFIG.TARGET_COLUMNS])

# NOTE(review): the scaler is fitted on every labelled row *before* the split, so
# validation targets influence the scaling statistics -- confirm this is intended.
SCALER = StandardScaler()
data[CONFIG.TARGET_COLUMNS] = SCALER.fit_transform(data[CONFIG.TARGET_COLUMNS])

# Tabular feature columns = every test column except the identifier and the image
# path added above. The previous positional slice `[1:-2]` assumed two helper
# columns had been appended; only `file_path` is, so it silently dropped the last
# real feature as well. Selecting by name avoids that.
CONFIG.FEATURE_COLUMNS = data_test.columns.drop(['id', 'file_path']).values

# Train / validation split: N_VAL_SAMPLES0 rows are held out for validation.
train, val = train_test_split(
    data,
    test_size=CONFIG.N_VAL_SAMPLES0,
    shuffle=True,
    random_state=CONFIG.SEED,
)
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)

In [12]:
class ViTWithMLP(nn.Module):
    """Backbone followed by a two-layer MLP head.

    Args:
        vit: Backbone module. It must expose ``config.hidden_size`` and, when
            called with ``pixel_values=...``, return an object that has a
            ``last_hidden_state`` attribute.
        hidden_dim: Width of the hidden MLP layer.
        output_dim: Output size of the final linear layer.
    """

    def __init__(self, vit, hidden_dim, output_dim):
        super().__init__()
        self.vit = vit
        head_layers = [
            nn.Linear(vit.config.hidden_size, hidden_dim),
            nn.ReLU(),
            # Final projection; the notebook strips this layer after loading the
            # checkpoint so the model emits the hidden_dim-sized activations.
            nn.Linear(hidden_dim, output_dim),
        ]
        self.mlp = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run the backbone on ``x`` and pass the token at sequence index 0 to the MLP."""
        token_states = self.vit(pixel_values=x).last_hidden_state
        # Index 0 along the sequence axis (for Hugging Face ViTModel, the [CLS] token).
        first_token = token_states[:, 0, :]
        return self.mlp(first_token)
    
# Load the fine-tuned checkpoint. The file holds a whole pickled module (it is used
# directly as a model below), not a state_dict, so:
#   * weights_only=False is required on PyTorch versions whose default is True;
#   * map_location lets a checkpoint saved from a GPU load on a CPU-only host.
# SECURITY: unpickling executes arbitrary code -- only load checkpoints you trust.
model = torch.load(
    "/kaggle/input/models/model.pth",
    map_location=CONFIG.DEVICE,
    weights_only=False,
)
# Drop the last layer of the MLP head so the model returns the activations that
# feed it (used as image embeddings) instead of the final predictions.
model.mlp = nn.Sequential(*list(model.mlp.children())[:-1])
model.to(CONFIG.DEVICE)

ViTWithMLP(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-23): 24 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features

In [None]:
def get_image_embeddings(model, preprocess, batch_size, df):
    """Embed every image listed in ``df['file_path']`` with ``model``.

    Args:
        model: Callable mapping a stacked batch of image tensors to a tensor
            whose first axis indexes the images of the batch.
        preprocess: Albumentations-style callable, invoked as
            ``preprocess(image=array)['image']``; each result must be a tensor
            so the batch can be stacked.
        batch_size: Number of images per forward pass.
        df: DataFrame with a ``file_path`` column of readable image paths.

    Returns:
        list: One NumPy entry per image, in the row order of ``df``. Empty when
        ``df`` has no rows.
    """
    # Send inputs to wherever the model's weights live instead of trusting a
    # global setting; fall back to CONFIG.DEVICE for parameter-free callables.
    params = model.parameters() if hasattr(model, 'parameters') else ()
    first_param = next(iter(params), None)
    device = first_param.device if first_param is not None else torch.device(CONFIG.DEVICE)

    paths = df['file_path']
    image_embeddings = []
    for start in tqdm(range(0, len(df), batch_size)):
        # .iloc makes the positional slicing explicit, whatever the index of df is.
        batch_paths = paths.iloc[start:start + batch_size]
        batch = torch.stack(
            [preprocess(image=imageio.imread(path))['image'] for path in batch_paths]
        ).to(device)
        with torch.no_grad():  # inference only: no autograd graph needed
            batch_embeddings = model(batch)
        # extend() iterates over the first axis, appending one entry per image.
        image_embeddings.extend(batch_embeddings.cpu().numpy())
    return image_embeddings

model.eval()
# The preprocessing differs from the original code: originally it was resize + crop,
# but cropping loses information, so here we only resize (to 384x384).
MEAN = np.array([0.485, 0.456, 0.406])
STD = np.array([0.229, 0.224, 0.225])

preprocess = A.Compose([
        A.Resize(384, 384),
        # ToFloat rescales the pixels first, hence max_pixel_value=1 in Normalize.
        A.ToFloat(),
        A.Normalize(mean=MEAN, std=STD, max_pixel_value=1),
        ToTensorV2(),
    ])

batch_size = 64
suffix = 'ViTMLP'


def load_or_compute_embeddings(split_name, df):
    """Return the image embeddings for one split, reusing the saved copy when allowed.

    Args:
        split_name: Prefix of the cache file, e.g. ``'train'`` -> ``train_ViTMLP.npy``.
        df: DataFrame of the split; must have a ``file_path`` column.

    Returns:
        list: One embedding array per row of ``df``.
    """
    cache_path = f'{split_name}_{suffix}.npy'
    # Honour the config switch: skip the expensive forward passes only when
    # recomputation is disabled AND a saved copy exists.
    if not CONFIG.RECOMPUTE_IMAGE_EMBEDDINGS and os.path.exists(cache_path):
        # list(...) keeps the same type the freshly computed result has.
        return list(np.load(cache_path))
    embeddings = get_image_embeddings(model, preprocess, batch_size, df)
    np.save(cache_path, np.array(embeddings))
    return embeddings


train_image_embeddings = load_or_compute_embeddings('train', train)
val_image_embeddings = load_or_compute_embeddings('val', val)
test_image_embeddings = load_or_compute_embeddings('test', test)