In [1]:
import sys
import os
sys.path.append(os.path.abspath('..'))

In [2]:
#imports
import torch
from torch.utils.data import Dataset
import numpy as np
from src.updated_model import HarvestModel
from torch.utils.data import DataLoader
import src.preprocessing as pre
from torch.utils.data import random_split
import torch.nn as nn
import torch.optim as optim

In [11]:
from sklearn.model_selection import KFold


In [3]:
meta, y, mapping_dict = pre.load_data('../data/planting_meta.json','../data/y.csv','../data/mapping_dict.json')

In [4]:
#class
class HarvestDataset(Dataset):
    """
    A PyTorch Dataset class for handling harvest data.

    This dataset manages features and various categorical IDs (ranch, class, type, variety) 
    along with target variables for kilos measurements.

    Parameters
    ----------
    features : numpy.ndarray, shape (N, 4)
        Static features for N samples (hectares, plants_per_hectare, avg_plant_height, avg_leaf_count)
    ranch_ids : numpy.ndarray, shape (N,)
        Ranch identifier integers
    class_ids : numpy.ndarray, shape (N,)
        Class identifier integers
    type_ids : numpy.ndarray, shape (N,)
        Type identifier integers
    variety_ids : numpy.ndarray, shape (N,)
        Variety identifier integers
    Y_kilos : numpy.ndarray, shape (N, 20)
        Target kilos measurements for 20 timesteps

    Returns
    -------
    tuple
        Contains tensors for features, IDs and targets when indexed
    """
    def __init__(self, 
                 features,         # (N, 5)
                 ranch_ids,        # (N,)
                 class_ids,        # (N,)
                 type_ids,         # (N,)
                 variety_ids,      # (N,)
                 climate_data,     # (N, 100, 3)
                 Y_kilos,         # (N, 20)
                 mean=None,
                 std=None
                ):
    
       

        # Convert to tensors
        self.features = torch.tensor(features, dtype=torch.float32)
        self.ranch_ids = torch.tensor(ranch_ids, dtype=torch.long)
        self.class_ids = torch.tensor(class_ids, dtype=torch.long)
        self.type_ids = torch.tensor(type_ids, dtype=torch.long)
        self.variety_ids = torch.tensor(variety_ids, dtype=torch.long)
        self.climate_data = torch.tensor(climate_data, dtype=torch.float32)
        self.Y = torch.tensor(Y_kilos, dtype=torch.float32)
        nonzero = self.Y != 0

        idx = torch.arange(self.Y.size(1)).expand_as(self.Y)
        start = torch.where(nonzero, idx, torch.full_like(idx, self.Y.size(1))).min(dim=1).values
        end = torch.where(nonzero, idx, torch.full_like(idx, -1)).max(dim=1).values

        self.bounds = torch.stack([start, end], dim=1)

        self.climate_mean = mean
        self.climate_std = std

             

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        climate_data = self.climate_data[idx]
        if self.climate_mean is not None and self.climate_std is not None:
            climate_data = (climate_data - self.climate_mean) / (self.climate_std + 1e-6)
        return (
            self.features[idx],
            self.ranch_ids[idx],
            self.class_ids[idx],
            self.type_ids[idx],
            self.variety_ids[idx],
            climate_data,
            self.Y[idx],
            self.bounds[idx]
        )
    
    
    def get_shapes(self):
        """
        Returns a dictionary containing the shapes of all data tensors
        
        Returns
        -------
        dict
            Dictionary with tensor names as keys and their shapes as values
        """
        shapes = {
            'features': self.features.shape,
            'ranch_ids': self.ranch_ids.shape,
            'class_ids': self.class_ids.shape,
            'type_ids': self.type_ids.shape,
            'variety_ids': self.variety_ids.shape,
            'climate_data': self.climate_data.shape,
            'Y_kilos': self.Y.shape
        }
        return shapes

In [5]:
features = np.column_stack([
    meta['Ha'].to_numpy(),                    # Hectares
    meta['WeekTransplanted_sin'].to_numpy(),  # Week sine
    meta['WeekTransplanted_cos'].to_numpy(),  # Week cosine
    meta['Year'].to_numpy() - 2010,                  # Year
    np.ones(len(meta))                    # Constant feature
    ])
ranches = meta['Ranch'].to_numpy()
varieties = meta['Variety'].to_numpy()
classes = meta['Class'].to_numpy()
types = meta['Type'].to_numpy()
climate_data = np.array(meta.ClimateSeries.to_list())
y_kilos = y.iloc[:,:20].to_numpy()

dataset = HarvestDataset(
        features,
        ranches,
        classes,
        types,
        varieties,
        climate_data,
        y_kilos
    )

In [6]:
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create a DataLoader over the entire training set (no shuffle needed)
full_train_loader = DataLoader(train_dataset, batch_size=len(train_dataset), shuffle=False)

# Get all climate data from training set
features, ranch_id, class_id, type_id, variety_id, climate_data, y, bounds = next(iter(full_train_loader))

# Flatten and compute mean/std on training climate data
climate_data_flat = climate_data.view(-1, climate_data.shape[-1])
climate_mean = climate_data_flat.mean(dim=0)
climate_std = climate_data_flat.std(dim=0)

# Assign these stats to train and val datasets
train_dataset.dataset.climate_mean = climate_mean  # type: ignore
train_dataset.dataset.climate_std = climate_std    # type: ignore
val_dataset.dataset.climate_mean = climate_mean    # type: ignore
val_dataset.dataset.climate_std = climate_std      # type: ignore

# Now create DataLoaders for training and validation
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [10]:
model = HarvestModel()

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=25, gamma=0.1)
model.train()

HarvestModel(
  (encoder): ClimateEncoder(
    (feature_encoder): Sequential(
      (0): Linear(in_features=5, out_features=64, bias=True)
      (1): ReLU()
    )
    (climate_gru): GRU(3, 32, batch_first=True)
    (ranch_emb): Embedding(13, 4)
    (class_emb): Embedding(2, 4)
    (type_emb): Embedding(14, 4)
    (variety_emb): Embedding(59, 4)
    (type_to_class): Linear(in_features=4, out_features=4, bias=True)
    (variety_to_type): Linear(in_features=4, out_features=4, bias=True)
  )
  (harvest_clamp): Sequential(
    (0): Linear(in_features=112, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=2, bias=True)
  )
  (kilo_gru): GRU(3, 20, batch_first=True)
  (kilo_output): Sequential(
    (0): Linear(in_features=134, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=20, bias=True)
  )
)

In [8]:
for epoch in range(200):
    total_loss = 0
    for batch in train_loader:
        
        features, ranch_id, class_id, type_id, variety_id, climate_data, y, bounds = batch
        totals = y.sum().to(dtype=torch.float32)
        bounds = bounds.to(dtype=torch.float32)
        looped_loss = criterion(y,y)
        batch_size = y.size(0)
        log_kilos = torch.log1p(y) 
        week_numbers = torch.arange(0, 20).unsqueeze(0).repeat(batch_size,1)
        inputs = torch.stack([y, log_kilos, week_numbers], dim=2)

        for kilo_range in torch.randint(low =5,high=20,size=(5,)):
    
            inputs = inputs[:,:kilo_range,:]
            # Forward pass
            outputs, clamp = model(features, ranch_id, class_id, type_id, variety_id, climate_data, inputs)
            N = outputs.size(0)
            T = outputs.size(1)  # Should be 20

            # Create time indices [0, 1, ..., 19] and expand to shape (N, 20)
            time = torch.arange(T).unsqueeze(0).expand(N, T)

            # Get start and end indices
            start = clamp[:, 0].unsqueeze(1)  # Shape: (N, 1)
            end = clamp[:, 1].unsqueeze(1)    # Shape: (N, 1)

            # Create mask: 1 where i is within [start, end), 0 elsewhere
            mask = (time >= start) & (time < end)  # Shape: (N, 20)

            # Apply mask
            masked_harvests = outputs * mask
            loss_kilos = criterion(masked_harvests[:,kilo_range:], y[:,kilo_range:])
            loss_clamp = criterion(clamp/20, bounds/20)

            loss = loss_kilos + loss_clamp * totals
            looped_loss += loss

            # Backward and optimize
        optimizer.zero_grad()
        looped_loss.backward()
        optimizer.step()
        
        total_loss += looped_loss.item()
    
    scheduler.step()
    avg_loss = total_loss / len(train_loader)

    print(f"Epoch [{epoch+1}/{50}], Loss: {avg_loss:.4f}")



    



Epoch [1/50], Loss: 4786523.6717, Improvement: -43504.8102
Epoch [2/50], Loss: 4736850.7244, Improvement: -99622.1827
Epoch [3/50], Loss: 4695085.7093, Improvement: 50859.6544
Epoch [4/50], Loss: 4606462.7214, Improvement: 91444.7158
Epoch [5/50], Loss: 4525387.4428, Improvement: 134583.5979
Epoch [6/50], Loss: 4489573.3102, Improvement: 94956.7447
Epoch [7/50], Loss: 4429541.0994, Improvement: 205950.9670
Epoch [8/50], Loss: 4401808.7440, Improvement: 148980.5407
Epoch [9/50], Loss: 4363822.8358, Improvement: 90304.9225
Epoch [10/50], Loss: 4321295.7071, Improvement: -54894.5516
Epoch [11/50], Loss: 4314992.7756, Improvement: -97795.2593


KeyboardInterrupt: 