<a href="https://colab.research.google.com/github/adimehta9/CS480-PlantPredictTraits/blob/main/facebookresearch_dinov2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Kaggle Setup

In [None]:
!pip install -q kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c cs-480-2024-spring
!unzip cs-480-2024-spring.zip

# Install and Import Dependencies

In [None]:
!pip install pandas numpy scikit-learn torch torchvision pillow tqdm requests timm catboost



In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import os
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
from PIL import Image
from tqdm.notebook import tqdm
import torch
from sklearn.model_selection import train_test_split
import requests
import timm
from catboost import CatBoostRegressor
import pickle
from sklearn.metrics import mean_squared_error

# Preprocessing Data

In [None]:
# Load the raw training and test tables.
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

# Slice out the ancillary feature columns: column 0 is skipped in both tables,
# and the training table additionally stops before column 164.
first_feature_col, end_feature_col = 1, 164
X_train_ancillary = train_df.iloc[:, first_feature_col:end_feature_col]
X_test_ancillary = test_df.iloc[:, first_feature_col:]

In [None]:
# Standardise: the scaler is fitted on the training rows only and then
# applied to both the training and the test rows.
scaler = StandardScaler().fit(X_train_ancillary)
X_train_ancillary = scaler.transform(X_train_ancillary)
X_test_ancillary = scaler.transform(X_test_ancillary)

# Degree-2 polynomial expansion of the standardised features,
# again fitted on the training rows only.
poly = PolynomialFeatures(degree=2).fit(X_train_ancillary)
X_train_poly = poly.transform(X_train_ancillary)
X_test_poly = poly.transform(X_test_ancillary)

# Display the feature counts before and after the expansion.
X_train_ancillary.shape, X_train_poly.shape

((43363, 163), (43363, 13530))

In [None]:
# Loading Data Class
class PlantDataset(Dataset):
    """Dataset yielding ``(image, file_name)`` pairs for the rows of a dataframe.

    The value in column 0 of each row is used as the image identifier; the
    image is read from ``<image_folder>/<identifier>.jpeg``.

    Args:
        image_folder: Directory containing the ``.jpeg`` files.
        dataframe: Table whose first column holds the image identifier.
        transform: Optional callable applied to the loaded RGB image.
    """

    def __init__(self, image_folder, dataframe, transform=None):
        self.image_folder = image_folder
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        """Return the number of rows (and therefore images) in the dataset."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the image for row ``idx``.

        Returns:
            A tuple ``(image, img_name)`` where ``image`` is the RGB image
            (after ``transform`` if one was given) and ``img_name`` is the
            file name including the ``.jpeg`` extension.
        """
        img_name = str(self.dataframe.iloc[idx, 0]) + ".jpeg"
        img_path = os.path.join(self.image_folder, img_name)
        # convert() returns a new image object, so the underlying file handle
        # can be released as soon as the conversion is done.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        return image, img_name

In [None]:
# Build the image transforms, datasets and data loaders.
def _make_transform():
    """Resize to 224x224, convert to a tensor and normalise each channel."""
    steps = [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        # Widely used ImageNet channel mean / std values.
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
    return transforms.Compose(steps)


# Train and test images go through the same deterministic pipeline.
train_transform = _make_transform()
test_transform = _make_transform()

train_dataset = PlantDataset('./data/train_images', train_df, transform=train_transform)
test_dataset = PlantDataset('./data/test_images', test_df, transform=test_transform)

# shuffle=False keeps the batches in dataframe row order.
batch_size = 64
loader_options = dict(batch_size=batch_size, shuffle=False)
train_loader = DataLoader(train_dataset, **loader_options)
test_loader = DataLoader(test_dataset, **loader_options)




# Feature Extraction

In [None]:
# Fetch the 'dinov2_vitg14_reg' entry point of the DINOv2 hub repository
# and switch the network to evaluation mode.
model = torch.hub.load(repo_or_dir='facebookresearch/dinov2', model='dinov2_vitg14_reg')
model.eval()


In [None]:
# Run model on dataset in batches to extract features of each image
def extract_features(dataloader):
    """Run the global ``model`` over every batch of ``dataloader``.

    Args:
        dataloader: Iterable yielding ``(images, names)`` batches; only the
            images are used.

    Returns:
        A NumPy array with the per-batch model outputs stacked row-wise, in
        the order the batches were yielded.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    # Send each batch to whatever device the model currently lives on, so the
    # function works unchanged whether or not the model was moved to a GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    batch_features = []
    # Inference only: gradients are never needed, so disable them for the whole pass.
    with torch.no_grad():
        for images, _ in tqdm(dataloader):
            outputs = model(images.to(device))
            batch_features.append(outputs.cpu().numpy())

    if not batch_features:
        raise ValueError('dataloader yielded no batches; nothing to extract')
    return np.vstack(batch_features)

# Extract image features for the training set first, then for the test set.
# Kept as two statements so the training features stay bound even if the
# test pass fails.
X_train_images = extract_features(dataloader=train_loader)
X_test_images = extract_features(dataloader=test_loader)


In [None]:
# Optional: save the extracted image features and ancillary arrays to disk (not necessary to run)
# np.save('X_train_images.npy', X_train_images)
# np.save('X_test_images.npy', X_test_images)
# np.save('X_train_ancillary.npy', X_train_ancillary)
# np.save('X_test_ancillary.npy', X_test_ancillary)

In [None]:
# Optional: load previously saved image features and ancillary arrays from disk (not necessary to run)
# X_train_images = np.load('X_train_images.npy')
# X_test_images = np.load('X_test_images.npy')
# X_train_ancillary = np.load('X_train_ancillary.npy')
# X_test_ancillary = np.load('X_test_ancillary.npy')

# Regression Modelling

In [None]:
# Assemble the full design matrices: image features first, then the
# polynomial ancillary features, joined column-wise.
X_train_combined = np.concatenate([X_train_images, X_train_poly], axis=1)
X_test_combined = np.concatenate([X_test_images, X_test_poly], axis=1)

In [None]:
# Names of the trait columns to predict.
targets = ['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean', 'X3112_mean']
# Select the targets by name rather than by column position, so the selection
# does not silently depend on the column layout of train.csv.
y_train = train_df[targets]

In [None]:
# Create Catboost regressor model for each trait

models = {}

# The split depends only on the number of rows and random_state, so it is
# identical for every trait: compute it once instead of re-splitting the
# large feature matrix on every loop iteration.
X_train_split, X_val_split, y_train_split_all, y_val_split_all = train_test_split(
    X_train_combined, y_train, test_size=0.2, random_state=42
)

for target in y_train.columns:
    y_train_split = y_train_split_all[target]
    y_val_split = y_val_split_all[target]

    # Named `regressor` rather than `model` so the DINOv2 `model` that
    # extract_features reads from the global scope is not overwritten.
    regressor = CatBoostRegressor(iterations=1500, learning_rate=0.06, depth=6, loss_function='RMSE')
    regressor.fit(X_train_split, y_train_split, eval_set=(X_val_split, y_val_split), verbose=100)

    models[target] = regressor

    # Score on the held-out split (the same split that was passed as eval_set).
    y_val_pred = regressor.predict(X_val_split)
    rmse = np.sqrt(mean_squared_error(y_val_split, y_val_pred))
    print(f'Validation RMSE for {target}: {rmse}')

0:	learn: 0.1354912	test: 0.1364235	best: 0.1364235 (0)	total: 506ms	remaining: 12m 38s
100:	learn: 0.1064679	test: 0.1097424	best: 0.1097424 (100)	total: 36.3s	remaining: 8m 22s
200:	learn: 0.1012646	test: 0.1065336	best: 0.1065336 (200)	total: 1m 12s	remaining: 7m 47s
300:	learn: 0.0965583	test: 0.1045033	best: 0.1045033 (300)	total: 1m 48s	remaining: 7m 11s
400:	learn: 0.0926189	test: 0.1032622	best: 0.1032622 (400)	total: 2m 24s	remaining: 6m 35s
500:	learn: 0.0890310	test: 0.1023670	best: 0.1023670 (500)	total: 3m	remaining: 5m 59s
600:	learn: 0.0857960	test: 0.1016795	best: 0.1016795 (600)	total: 3m 36s	remaining: 5m 23s
700:	learn: 0.0828380	test: 0.1010537	best: 0.1010537 (700)	total: 4m 12s	remaining: 4m 47s
800:	learn: 0.0801465	test: 0.1005796	best: 0.1005796 (800)	total: 4m 48s	remaining: 4m 12s
900:	learn: 0.0774593	test: 0.1001764	best: 0.1001764 (900)	total: 5m 25s	remaining: 3m 36s
1000:	learn: 0.0750578	test: 0.0998203	best: 0.0998203 (1000)	total: 6m 1s	remaining: 3m


# Predictions

In [None]:
# Predict every trait with its own regressor and collect the results column-wise.
trait_names = ['X4', 'X11', 'X18', 'X26', 'X50', 'X3112']
trait_predictions = {
    trait: models[f'{trait}_mean'].predict(X_test_combined)
    for trait in trait_names
}
submission = pd.DataFrame(
    trait_predictions,
    index=np.arange(X_test_combined.shape[0]),
    columns=trait_names,
)

# Put the test ids in front and write the submission file.
submission.insert(0, 'id', test_df['id'])
submission.to_csv('20897086_mehta.csv', index=False)


In [None]:
# Preview the first five rows of the submission table.
submission.head(n=5)

Unnamed: 0,id,X4,X11,X18,X26,X50,X3112
0,154220505,1.106794,144.14782,19708.745852,3532.091228,15.152983,400599.86928
1,195736552,0.989651,151.866886,19699.387707,3486.066635,15.275854,398757.704511
2,182701773,0.944683,149.308118,19699.837358,3459.703097,15.083917,398262.597396
3,27688500,0.955066,140.684184,19699.063726,3478.650101,16.003674,398480.240561
4,195825045,0.925726,152.962552,19698.930366,3471.571944,14.884578,399157.509068
