In [2]:
# Imports
import numpy as np
import pandas as pd
import torch
from catboost import Pool, CatBoostRegressor
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from torchvision import transforms
from tqdm import tqdm

# Enable tqdm's pandas integration.
# NOTE(review): no pandas progress call is visible in this notebook — confirm it is still needed.
tqdm.pandas()

In [6]:
# Notebook-wide settings
class Config:
  """Container for the constants shared by every cell of this notebook."""

  # Seed passed to the train/validation split and to the CatBoost models
  SEED = 4242564

  # Fraction of the training rows held out as the validation split
  PERCENT_SAMPLES = 0.05

  # Use the first CUDA device when one is available, otherwise the CPU
  if torch.cuda.is_available():
    DEVICE = 'cuda:0'
  else:
    DEVICE = 'cpu'

  # Target columns to predict; one regressor is trained per entry
  TARGET_COLUMNS = [
    'X4_mean',
    'X11_mean',
    'X18_mean',
    'X50_mean',
    'X26_mean',
    'X3112_mean',
  ]


CONFIG = Config()

In [7]:
# Read train data frame
train_data = pd.read_csv('data/train.csv')

# Add associated image file path (one JPEG per row, named after the row id)
train_data['file_path'] = train_data['id'].apply(lambda x: f'data/train_images/{x}.jpeg')

# Split train data: CONFIG.PERCENT_SAMPLES of the rows become the validation
# split; the shuffle is seeded so the split is reproducible
train, val = train_test_split(
  train_data,
  test_size=CONFIG.PERCENT_SAMPLES,
  shuffle=True,
  random_state=CONFIG.SEED
)

# Reset index of data frame so both splits are indexed 0..len-1
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)

# Display train data frame
display(train.head(5))

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() rendered a stray "None" below the report
train.info()

Unnamed: 0,id,WORLDCLIM_BIO1_annual_mean_temperature,WORLDCLIM_BIO12_annual_precipitation,WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month,WORLDCLIM_BIO15_precipitation_seasonality,WORLDCLIM_BIO4_temperature_seasonality,WORLDCLIM_BIO7_temperature_annual_range,SOIL_bdod_0.5cm_mean_0.01_deg,SOIL_bdod_100.200cm_mean_0.01_deg,SOIL_bdod_15.30cm_mean_0.01_deg,...,VOD_X_1997_2018_multiyear_mean_m10,VOD_X_1997_2018_multiyear_mean_m11,VOD_X_1997_2018_multiyear_mean_m12,X4_mean,X11_mean,X18_mean,X26_mean,X50_mean,X3112_mean,file_path
0,185066198,6.232639,830.866638,169.199997,91.062294,167.254593,23.683334,124,140,129,...,0.360532,0.338625,0.312269,1.297415,144.66109,19699.100545,3482.17549,16.343673,397956.544797,data/train_images/185066198.jpeg
1,191001275,16.356569,832.933655,132.469391,73.091423,379.680695,23.976021,122,134,131,...,0.412964,0.437285,0.462721,1.078814,153.490531,19699.407783,3459.432536,14.571836,398955.436925,data/train_images/191001275.jpeg
2,196117937,5.896667,2765.228516,172.028564,23.324215,420.81073,21.460476,87,115,97,...,0.495464,0.508885,0.508996,0.969344,151.182845,19699.227216,3459.605043,14.62612,397695.73716,data/train_images/196117937.jpeg
3,195251890,9.290575,1390.661865,95.323807,23.530909,428.569458,23.512857,95,125,105,...,0.553267,0.584244,0.588045,0.980909,151.479983,19699.299781,3460.73631,15.281598,397719.534941,data/train_images/195251890.jpeg
4,195930638,22.305506,1231.607178,147.5,56.647408,444.378998,24.0,126,158,141,...,0.326244,0.334493,0.326527,1.273202,143.873792,19708.109956,3620.482252,15.595735,399098.734134,data/train_images/195930638.jpeg


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41194 entries, 0 to 41193
Columns: 171 entries, id to file_path
dtypes: float64(48), int64(122), object(1)
memory usage: 53.7+ MB


None

In [8]:
# Read test data frame
test = pd.read_csv('data/test.csv')

# Add associated image file path (one JPEG per row, named after the row id)
test['file_path'] = test['id'].apply(lambda x: f'data/test_images/{x}.jpeg')

# Display test data frame
display(test.head(5))

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() rendered a stray "None" below the report
test.info()

# Set feature columns, ie. all columns except id and file_path.
# Selecting by name instead of by position ([1:-1]) keeps this correct if the
# column order ever changes, and raises KeyError if either column is missing.
CONFIG.FEATURE_COLUMNS = test.columns.drop(['id', 'file_path']).values
print(f'Num Feature Columns: {len(CONFIG.FEATURE_COLUMNS)}')

Unnamed: 0,id,WORLDCLIM_BIO1_annual_mean_temperature,WORLDCLIM_BIO12_annual_precipitation,WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month,WORLDCLIM_BIO15_precipitation_seasonality,WORLDCLIM_BIO4_temperature_seasonality,WORLDCLIM_BIO7_temperature_annual_range,SOIL_bdod_0.5cm_mean_0.01_deg,SOIL_bdod_100.200cm_mean_0.01_deg,SOIL_bdod_15.30cm_mean_0.01_deg,...,VOD_X_1997_2018_multiyear_mean_m04,VOD_X_1997_2018_multiyear_mean_m05,VOD_X_1997_2018_multiyear_mean_m06,VOD_X_1997_2018_multiyear_mean_m07,VOD_X_1997_2018_multiyear_mean_m08,VOD_X_1997_2018_multiyear_mean_m09,VOD_X_1997_2018_multiyear_mean_m10,VOD_X_1997_2018_multiyear_mean_m11,VOD_X_1997_2018_multiyear_mean_m12,file_path
0,154220505,26.591249,2279.300049,146.199997,22.313732,44.629532,7.960001,114,125,117,...,0.287904,0.281897,0.282957,0.285863,0.286542,0.284718,0.287036,0.292942,0.292871,data/test_images/154220505.jpeg
1,195736552,22.575659,1640.576538,304.744904,82.645508,456.922668,19.664286,137,157,148,...,0.282085,0.302374,0.29198,0.26374,0.256895,0.288942,0.283586,0.276328,0.270898,data/test_images/195736552.jpeg
2,182701773,1.646429,762.785706,62.035713,35.019753,628.131958,26.042858,108,138,121,...,0.355498,0.375334,0.40442,0.420305,0.410467,0.381971,0.402281,0.361373,0.355861,data/test_images/182701773.jpeg
3,27688500,18.799629,41.582222,3.986667,30.636101,266.586731,20.665779,131,146,134,...,0.129487,0.134308,0.137946,0.138145,0.136799,0.135013,0.127216,0.120512,0.120655,data/test_images/27688500.jpeg
4,195825045,7.098363,519.127563,29.540817,20.231646,1060.959473,37.541325,110,145,129,...,0.32722,0.374367,0.44245,0.455588,0.43815,0.390645,0.361673,0.360657,0.326193,data/test_images/195825045.jpeg


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6391 entries, 0 to 6390
Columns: 165 entries, id to file_path
dtypes: float64(42), int64(122), object(1)
memory usage: 8.0+ MB


None

Num Feature Columns: 163


In [9]:
# Standardize the tabular features
def _feature_matrix(frame):
  """Return the CONFIG.FEATURE_COLUMNS of `frame` as a float32 numpy array."""
  return frame[CONFIG.FEATURE_COLUMNS].values.astype(np.float32)

FEATURE_SCALER = StandardScaler()

# The scaler is fitted on the training split only ...
train_features_mask = FEATURE_SCALER.fit_transform(_feature_matrix(train))

# ... and the fitted statistics are then reused for validation and test
val_features_mask = FEATURE_SCALER.transform(_feature_matrix(val))
test_features_mask = FEATURE_SCALER.transform(_feature_matrix(test))

In [10]:
# Get model image embeddings
def get_image_embeddings(model, data_frame, preprocessing_transforms, batch_size=64):
  """Run `model` over the images listed in `data_frame` and collect its outputs.

  Args:
    model: callable applied to a stacked image batch already moved to
      CONFIG.DEVICE; its output must support `.cpu().numpy()`.
    data_frame: frame with a 'file_path' column of image paths.
    preprocessing_transforms: callable mapping one PIL image to a tensor. All
      tensors in a batch must share a shape, because they are stacked.
    batch_size: number of images per forward pass.

  Returns:
    A list with one entry per row of `data_frame`, in row order. Each entry is
    one slice along the first axis of the model output (presumably a 1-D
    embedding vector — depends on `model`).
  """
  def _load_image(path):
    # The context manager closes the underlying file as soon as the image has
    # been transformed; convert('RGB') guarantees three channels, so grayscale
    # or CMYK files do not break a 3-channel normalization downstream.
    with Image.open(path) as image:
      return preprocessing_transforms(image.convert('RGB'))

  embeddings = []
  file_paths = data_frame['file_path'].tolist()

  for start in tqdm(range(0, len(file_paths), batch_size)):
    batch_paths = file_paths[start:start + batch_size]

    # Apply image transformations for preprocessing and build the batch
    image_tensor = torch.stack(
      [_load_image(path) for path in batch_paths]
    ).to(CONFIG.DEVICE)

    # Inference only: skip gradient tracking
    with torch.no_grad():
      cur_embeddings = model(image_tensor)

    # extend() over the 2-D array appends one row (one image) at a time
    embeddings.extend(cur_embeddings.cpu().numpy())
  return embeddings

In [11]:
# Load the 'dinov2_vitg14_reg' checkpoint from the facebookresearch/dinov2 hub repo
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitg14_reg')

# Move the weights to the configured device
model = model.to(CONFIG.DEVICE)

# Switch to evaluation mode (also the cell's displayed value)
model.eval()

Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_reg4_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dinov2_vitg14_reg4_pretrain.pth
100%|██████████| 4.23G/4.23G [00:52<00:00, 87.0MB/s]


DinoVisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 1536, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (blocks): ModuleList(
    (0-39): 40 x NestedTensorBlock(
      (norm1): LayerNorm((1536,), eps=1e-06, elementwise_affine=True)
      (attn): MemEffAttention(
        (qkv): Linear(in_features=1536, out_features=4608, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1536, out_features=1536, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): LayerScale()
      (drop_path1): Identity()
      (norm2): LayerNorm((1536,), eps=1e-06, elementwise_affine=True)
      (mlp): SwiGLUFFNFused(
        (w12): Linear(in_features=1536, out_features=8192, bias=True)
        (w3): Linear(in_features=4096, out_features=1536, bias=True)
      )
      (ls2): LayerScale()
      (drop_path2): Identity()
    )
  )
  (norm): LayerNorm((1536,), eps=1e-06, elementwise_affine=True)
  (head

In [12]:
# Build the image preprocessing pipeline
# Per-channel mean/std used for normalization (the values commonly used with
# ImageNet-pretrained backbones)
channel_mean = (0.485, 0.456, 0.406)
channel_std = (0.229, 0.224, 0.225)

# NOTE(review): Resize with a single int rescales the shorter side only, while
# get_image_embeddings stacks the results into one tensor — this presumably
# relies on the input images being square; confirm.
resize_step = transforms.Resize(224)
to_tensor_step = transforms.ToTensor()
normalize_step = transforms.Normalize(channel_mean, channel_std)

preprocessing_transforms = transforms.Compose([
  resize_step,
  to_tensor_step,
  normalize_step
])

# Embed every split with the same model and the same preprocessing
print("Getting training image embeddings...")
train_image_embeddings = get_image_embeddings(
  model, train, preprocessing_transforms
)

print("Getting validation image embeddings...")
val_image_embeddings = get_image_embeddings(
  model, val, preprocessing_transforms
)

print("Getting test image embeddings...")
test_image_embeddings = get_image_embeddings(
  model, test, preprocessing_transforms
)

Getting training image embeddings...


100%|██████████| 644/644 [2:01:06<00:00, 11.28s/it]


Getting validation image embeddings...


100%|██████████| 34/34 [06:22<00:00, 11.25s/it]


Getting test image embeddings...


100%|██████████| 100/100 [18:48<00:00, 11.28s/it]


In [13]:
# Combine scaled tabular features and image embeddings into one frame per split
def _with_embedding_column(scaled_features, image_embeddings):
  """Return a frame of `scaled_features` plus an 'embedding' column.

  The feature columns keep pandas' default integer names; the 'embedding'
  column stores one image embedding per row.
  """
  frame = pd.DataFrame(scaled_features)
  frame['embedding'] = list(image_embeddings)
  return frame

train_features_dino = _with_embedding_column(train_features_mask, train_image_embeddings)
val_features_dino = _with_embedding_column(val_features_mask, val_image_embeddings)
test_features_dino = _with_embedding_column(test_features_mask, test_image_embeddings)

In [22]:
# Fit one CatBoost regressor per target column
catboost_models = {}
r2_scores = {}
N = len(CONFIG.TARGET_COLUMNS)

print("Training CatBoost...")
for col in tqdm(CONFIG.TARGET_COLUMNS, total=N):

  # Labels of the current target for both splits
  y_train = train[col].values
  y_val = val[col].values

  # Wrap features + labels in CatBoost pools; 'embedding' is declared as an
  # embedding feature rather than a plain numeric column
  train_pool = Pool(train_features_dino, y_train, embedding_features=['embedding'])
  val_pool = Pool(val_features_dino, y_val, embedding_features=['embedding'])

  # Train with early stopping on the validation pool to limit overfitting
  catboost_model = CatBoostRegressor(
    iterations=10000,
    learning_rate=0.01,
    loss_function='RMSE',
    verbose=False,
    random_state=CONFIG.SEED
  )
  catboost_model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50)
  catboost_models[col] = catboost_model

  # Score on the validation pool.
  # NOTE(review): this is the same pool that drove early stopping, so the
  # reported R^2 is not an independent hold-out estimate.
  y_val_prediction = catboost_model.predict(val_pool)
  r2 = r2_score(y_val, y_val_prediction)
  r2_scores[col] = r2
  print(f'\nTarget Column: {col}, R2 Score: {r2}')

print("Finished training CatBoost...")
print(f"Mean R2 score: {np.mean(list(r2_scores.values()))}")

Training CatBoost...


 17%|█▋        | 1/6 [09:14<46:11, 554.38s/it]


Target Column: X4_mean, R2 Score: 0.5157840068209979


 33%|███▎      | 2/6 [17:25<34:28, 517.05s/it]


Target Column: X11_mean, R2 Score: 0.45718869269379847


 50%|█████     | 3/6 [24:26<23:40, 473.36s/it]


Target Column: X18_mean, R2 Score: 0.6302544480213915


 67%|██████▋   | 4/6 [33:10<16:26, 493.47s/it]


Target Column: X50_mean, R2 Score: 0.41917731095380795


 83%|████████▎ | 5/6 [39:03<07:22, 442.80s/it]


Target Column: X26_mean, R2 Score: 0.3075128072399489


100%|██████████| 6/6 [45:34<00:00, 455.80s/it]


Target Column: X3112_mean, R2 Score: 0.49980380497112
Finished training CatBoost...
Mean R2 score: 0.47162017845017745





In [23]:
# Format submission
# The test features do not depend on the target, so the CatBoost Pool is built
# once here instead of being rebuilt for every trait inside the loop
test_features_pool = Pool(test_features_dino, embedding_features=['embedding'])

submission = pd.DataFrame({'id': test['id']})

# Populate submission: one prediction column per trait, named without '_mean'
for col in CONFIG.TARGET_COLUMNS:
  # Create predictions for specific trait / column using trained CatBoost models
  submission[col.replace('_mean', '')] = catboost_models[col].predict(test_features_pool)

# Write to csv
# Explicit output order: CONFIG.TARGET_COLUMNS lists X50 before X26, the
# submission file has X26 before X50
submission = submission[['id', 'X4', 'X11', 'X18', 'X26', 'X50', 'X3112']]
submission.to_csv('submission.csv', index=False)

# Display submission data frame
display(submission.head(6))

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() rendered a stray "None" below the report
submission.info()

Unnamed: 0,id,X4,X11,X18,X26,X50,X3112
0,154220505,1.122965,145.289236,19707.721644,3563.762889,15.136789,400916.982724
1,195736552,1.045086,150.976706,19699.769492,3468.927767,15.047818,398728.775878
2,182701773,0.986401,151.817092,19699.213446,3460.658594,14.69882,397964.491886
3,27688500,0.994772,140.863414,19699.608419,3476.022805,15.948696,398202.144716
4,195825045,0.926844,152.601349,19699.348197,3461.882317,14.951588,398743.899686
5,90218281,1.301447,138.863754,19703.403691,3478.554142,17.074962,397793.779896


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6391 entries, 0 to 6390
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      6391 non-null   int64  
 1   X4      6391 non-null   float64
 2   X11     6391 non-null   float64
 3   X18     6391 non-null   float64
 4   X26     6391 non-null   float64
 5   X50     6391 non-null   float64
 6   X3112   6391 non-null   float64
dtypes: float64(6), int64(1)
memory usage: 349.6 KB


None