# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from torchvision.models import resnet50, ResNet50_Weights
from torchvision.transforms import v2
from torchmetrics.regression import R2Score

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import albumentations as A
from albumentations.pytorch import ToTensorV2

import xgboost as xgb
from tqdm import tqdm
import imageio.v3 as imageio

from PIL import Image
import cv2
import timm
import os
import psutil

# Read Data

In [41]:
ROOT = "/kaggle/input/data"

df_train = pd.read_csv(os.path.join(ROOT, "train.csv"))
df_test = pd.read_csv(os.path.join(ROOT, "test.csv"))

# Set up list of column names

In [42]:
target_columns = ["X4_mean", "X11_mean", "X18_mean", "X26_mean", "X50_mean", "X3112_mean"]
log_columns = ["X4_mean", "X11_mean", "X18_mean", "X26_mean", "X50_mean", "X3112_mean"]
feature_columns = [col for col in df_train.columns if col not in target_columns + ['id']]

# Pre-process data

In [43]:
features_scaler = StandardScaler()

train_ids = pd.DataFrame(df_train['id'])
test_ids = pd.DataFrame(df_test['id'])

# z-score normalization on feature columns
train_features_normalized = features_scaler.fit_transform(df_train[feature_columns])
test_features_normalized = features_scaler.transform(df_test[feature_columns])

df_train_augmented = pd.DataFrame(train_features_normalized, columns=feature_columns, index=df_train.index)
df_test_augmented = pd.DataFrame(test_features_normalized, columns=feature_columns, index=df_test.index)

# remove outliers
mask = (df_train_augmented.abs() < 3.7).all(axis=1)
df_train_augmented = df_train_augmented[mask]
train_ids = train_ids[mask]

df_train_augmented['id'] = train_ids
df_test_augmented['id'] = test_ids

# z-score normalization on target columns
target_scaler = StandardScaler()
y_train_normalized = pd.DataFrame(target_scaler.fit_transform(df_train.loc[mask, target_columns]), columns=target_columns, index=y_train.index)
df_train_augmented[target_columns] = y_train_normalized


# take logs of target columns
# for idx, target in enumerate(target_columns):
#     if target in log_columns:
#         df_train_augmented[target] = np.log10(df_train[target].values)
#     else:
#         df_train_augmented[target] = df_train[target]

# remove outliers
# for column in target_columns:
#     lower_quantile = df_train_augmented[column].quantile(0.005)
#     upper_quantile = df_train_augmented[column].quantile(0.985)
#     df_train_augmented = df_train_augmented[(df_train_augmented[column] >= lower_quantile) & (df_train_augmented[column] <= upper_quantile)]

# df_train_augmented[target_columns] = scaler.fit_transform(df_train_augmented[target_columns])

# Define plant dataset class

In [45]:
class PlantDataset(Dataset):
    def __init__(self, df, image_path, image_transforms=None):
        self.df = df
        self.df['id'] = self.df['id'].astype(str)
        self.image_transforms = image_transforms
        self.image_path = image_path
    
    def __getitem__(self, index):
        row = self.df.iloc[index]
        # get image and apply transformations
        image = Image.open(os.path.join(ROOT, self.image_path, str(row['id']) + ".jpeg")).convert("RGB")
        image = self.image_transforms(image)
        features = torch.tensor(row[feature_columns].values.astype(float), dtype=torch.float32)
        
        return image, features
    
    def __len__(self):
        return len(self.df)

# Function to extract the features from the images

In [46]:
def evaluate_images(model, loader):
    model.eval()
    features = []
    with torch.no_grad():
        for images, _ in loader:
            images = images.to(device)
            output = model(images).cpu().numpy()
            features.append(output)
    return np.vstack(features)

# Extract image features using ResNet-50 CNN

In [49]:
# use gpu if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# image transformations
train_transforms = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
test_transforms = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# test and train dataloaders
train_dataset = PlantDataset(df_train_augmented, "train_images", train_transforms)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)

test_dataset = PlantDataset(df_test_augmented, "test_images", test_transforms)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# extract image features using pre-trained ResNet-50
image_model = resnet50(weights=ResNet50_Weights.DEFAULT)
image_model.fc = nn.Identity()
image_model = image_model.to(device)
train_image_features = evaluate_images(image_model, train_loader)
test_image_features = evaluate_images(image_model, test_loader)

Device: cuda


# Combine extracted features with pre-processsed ancillary data

In [51]:
# combined data
X_train = np.hstack((train_image_features, df_train_augmented[feature_columns].values))
X_test = np.hstack((test_image_features, df_test_augmented[feature_columns].values))

# only ancillary features
# X_train = df_train_augmented[feature_columns].values
# X_test = df_test_augmented[feature_columns].values

# only image features
# X_train = train_image_features
# X_test = test_image_features

y_train = df_train_augmented[target_columns]

# Run XGBoost on each of the 6 target traits

In [52]:
models = {}
r2_scores = {}
for target in tqdm(target_columns):
    model = xgb.XGBRegressor(objective="reg:squarederror", learning_rate=0.01, n_estimators=500, max_depth=7)
    model.fit(X_train, y_train[target])
    predictions = model.predict(X_train)
    r2_scores[target] = r2_score(y_train[target], predictions)
    models[target] = model
    print(f"{target} R2: {r2_scores[target]}")
print(f"{np.mean(list(r2_scores.values()))}")

 17%|█▋        | 1/6 [12:33<1:02:45, 753.13s/it]

X4_mean R2: 0.4729572110056841


 33%|███▎      | 2/6 [25:27<51:02, 765.52s/it]  

X11_mean R2: 0.49924313434354506


 50%|█████     | 3/6 [36:49<36:21, 727.26s/it]

X18_mean R2: 0.6961518790337368


 67%|██████▋   | 4/6 [46:04<21:58, 659.34s/it]

X26_mean R2: 0.6103914821925748


 83%|████████▎ | 5/6 [58:23<11:28, 688.09s/it]

X50_mean R2: 0.42919814879324414


100%|██████████| 6/6 [1:09:36<00:00, 696.14s/it]

X3112_mean R2: 0.6052445591245019
0.5521977357488811





# Run models on test set

In [53]:
test_results = np.zeros((len(df_test), len(target_columns)))
for i, target_column in enumerate(target_columns):
    test_results[:, i] = models[target_column].predict(X_test)

In [54]:
submit_columns = ["X4", "X11", "X18", "X26", "X50", "X3112"]

# inverse normalize the targets
df_submission = pd.DataFrame(data=target_scaler.inverse_transform(test_results), columns=submit_columns)

# inverse any columns that were logged
# for col in log_columns:
#     sub_col = col.replace("_mean", "")
#     df_submission[sub_col] = 10 ** df_submission[sub_col]

df_submission.index = df_test['id']
df_submission.index.name = 'id'
df_submission.to_csv("submission.csv")

In [55]:
df_submission.head()

Unnamed: 0_level_0,X4,X11,X18,X26,X50,X3112
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
154220505,1.071805,146.134664,19709.442674,3575.966949,15.053401,402358.090562
195736552,1.068802,148.06224,19702.780221,3496.412239,15.156855,399195.816537
182701773,0.955643,151.903426,19699.621507,3464.190096,15.007744,398215.068552
27688500,1.018967,143.901742,19699.639298,3467.127458,15.548856,398240.031559
195825045,0.955423,150.142607,19699.582724,3464.374771,15.23172,398729.515808
