In [1]:
import sys
import os

# Add the parent directory to sys.path
sys.path.append(os.path.abspath('..'))

In [177]:
from scipy.optimize import curve_fit
import numpy as np
import pandas as pd
import torch



In [3]:
import src.preprocessing as pre

In [98]:
# Load the three objects used throughout this notebook from the project's
# preprocessing module: the planting metadata table, the weekly harvest summary,
# and the lookup dict that is later passed to `.map(...)` to turn the categorical
# columns (Ranch, Class, Type, Variety) into ids.
# NOTE(review): exact return types are defined in src.preprocessing — confirm there.
meta, weekly_summary, mapping_dict = pre.load_tomato(planting_meta_path='../data/planting_meta.json', weekly_summary_path='../data/weekly_summary.csv')

In [5]:
# Preview the first rows of the planting metadata.
meta.head()

Unnamed: 0,TransplantDate,Year,WeekTransplanted,Ranch,Variety,Class,Type,Ha,WeekTransplanted_sin,WeekTransplanted_cos,ClimateSeries
2013-02-13_Felicity_ZJL_Z18_6_0.39,2013-02-13,2013,7,ZJL,Felicity,CHE,Cherry Rojo,0.3938,0.748511,0.663123,"[[-0.5605881878, -3.1935067896000002, -0.25850..."
2013-02-13_Shiren_ZJL_Z18_6_0.39,2013-02-13,2013,7,ZJL,Shiren,CHE,Cherry Rojo,0.3938,0.748511,0.663123,"[[-0.5605881878, -3.1935067896000002, -0.25850..."
2013-02-15_Amsterdam_ZJL_Z18_2_0.27,2013-02-15,2013,7,ZJL,Amsterdam,BSUF,Uva Roja,0.27,0.748511,0.663123,"[[0.3146673828, -1.2573452111, -0.258501094600..."
2013-02-15_Felicity_ZJL_Z18_5_0.21,2013-02-15,2013,7,ZJL,Felicity,CHE,Cherry Rojo,0.2138,0.748511,0.663123,"[[0.3146673828, -1.2573452111, -0.258501094600..."
2013-02-15_Olivia_ZJL_Z18_2_0.54,2013-02-15,2013,7,ZJL,Olivia,BSUF,Uva Roja,0.54,0.748511,0.663123,"[[0.3146673828, -1.2573452111, -0.258501094600..."


In [99]:
# Reshape the weekly summary to wide form: one column per distinct
# 'WeeksAfterTransplant' value, filled with 'Kilos'. No `index=` is given, so the
# rows keep weekly_summary's existing index; combinations with no record become NaN.
df = weekly_summary.pivot(columns='WeeksAfterTransplant', values='Kilos')

In [100]:
# Keep only plantings with more than 400 kilos in total AND more than two distinct
# weekly values. Both conditions are row-wise, so they can be folded into a single
# mask; applying that same mask to `meta` and `df` keeps the two tables row-aligned.
# (Filtering `df` a second time without touching `meta` would leave `meta` with rows
# that no longer have a matching harvest curve.)
row_totals = df.sum(axis=1)
keep = (row_totals > 400) & (df.nunique(axis=1) > 2)
meta = meta[keep]
df = df[keep]

In [101]:
# Work week-major (weeks as rows) so the rolling window runs along time.
weeks_by_planting = df.fillna(0).T

# Trailing 3-week moving average; min_periods=1 lets the first weeks use
# whatever history is available instead of producing NaN.
smoothed = weeks_by_planting.rolling(window=3, min_periods=1).mean()

# Rescale every planting (column) so its smoothed total equals its raw total.
raw_totals = df.sum(axis=1)
smoothed_totals = smoothed.sum()
smoothed = smoothed * raw_totals / smoothed_totals

In [102]:
# Transpose back to one row per planting / one column per week (the smoothing
# cell worked on the transposed frame).
# NOTE(review): this cell is not idempotent — re-running it on its own flips the
# orientation again; re-run the smoothing cell first.
smoothed = smoothed.T

Fit Stats

In [103]:
def logistic(t, K, r, t0):
    """Evaluate the three-parameter logistic curve K / (1 + exp(-r * (t - t0))).

    Parameters
    ----------
    t : scalar or array-like
        Point(s) at which the curve is evaluated.
    K : float
        Upper asymptote of the curve.
    r : float
        Growth-rate coefficient multiplying (t - t0).
    t0 : float
        Midpoint: the curve equals K / 2 at t == t0.

    Returns
    -------
    Same shape as ``t`` (numpy broadcasting applies).
    """
    decay = np.exp(r * (t0 - t))
    return K / (decay + 1)

In [167]:
# Fit a logistic curve to the cumulative smoothed harvest of every planting.
# `results` gets exactly one entry per row of `smoothed` (in the same order), so it
# can be indexed by `smoothed.index` afterwards. A planting whose fit fails gets NaN
# placeholders instead of being skipped — skipping would shift every later row.
# Drop or impute those NaN rows before using the statistics as training targets.
results = []
failed_fits = []                       # labels of plantings whose fit did not succeed
n_weeks = smoothed.shape[1]
x = np.arange(n_weeks)                 # week index, identical for every planting
nan_params = np.full(3, np.nan)

for planting_id, row in smoothed.iterrows():
    y = row.cumsum().to_numpy()
    total_kilos = y[-1]

    # Initial guess: asymptote at the observed total, slow growth, midpoint mid-season.
    p0 = [total_kilos, 0.01, n_weeks / 2]
    # K is pinned to within ±5 % of the observed total; t0 must lie inside the season.
    bounds = (
        [total_kilos * 0.95, 1e-4, 0],        # lower bounds
        [total_kilos * 1.05, 1.5, n_weeks],   # upper bounds
    )

    try:
        popt, pcov = curve_fit(logistic, x, y, p0=p0, bounds=bounds)
    except (RuntimeError, ValueError) as err:
        # RuntimeError: the optimiser did not converge; ValueError: invalid input or
        # infeasible p0/bounds. Anything else (e.g. KeyboardInterrupt) propagates.
        print(f"Logistic fit failed for {planting_id!r}: {err}")
        failed_fits.append(planting_id)
        results.append([nan_params, nan_params])
        continue

    perr = np.sqrt(np.diag(pcov))           # standard deviation of parameters
    delta = 1.96 * perr                     # 95% confidence interval half-width
    results.append([popt, delta])

In [168]:
# One row per planting: the three fitted parameters followed by their
# 95 % confidence half-widths, indexed like `smoothed`.
stat_columns = ['K', 'r', 't', 'K_err', 'r_err', 't_err']
stat_rows = [np.concatenate(pair) for pair in results]
rf = pd.DataFrame(stat_rows, columns=stat_columns, index=smoothed.index)

In [175]:
# Side-by-side table: the smoothed weekly kilos columns followed by the six
# fit-statistic columns, joined on the shared row index.
df1 = pd.concat([smoothed,rf],axis=1)

In [181]:
# Column-wise maximum over all rows of df1; `.values` selects the maxima and
# discards the row positions that `max(dim=0)` also returns.
torch.tensor(df1.values).max(dim=0).values

torch.return_types.max(
values=tensor([0.0000e+00, 1.6138e+01, 2.7221e+02, 4.3204e+02, 2.0727e+03, 2.0727e+03,
        1.6407e+03, 8.7467e+02, 1.3333e+03, 1.9027e+03, 3.6467e+03, 6.8399e+03,
        9.0795e+03, 1.0058e+04, 1.1846e+04, 1.1675e+04, 1.4300e+04, 1.4709e+04,
        1.5432e+04, 1.6749e+04, 7.6055e+04, 1.5000e+00, 1.8136e+01, 1.0914e+04,
        1.6058e+00, 6.2950e+00], dtype=torch.float64),
indices=tensor([   0,  767,  500,  740,  740,  740,  740,  435, 3286, 3255, 3255,  763,
         503,  503,  438,  411,  411, 1677, 3053, 3053, 3053,  168, 3160,  177,
        2328,  686]))

Construct A Dataset Object

In [187]:
#imports
import torch
from torch.utils.data import Dataset
import torch.optim as optim
#class
class HarvestDataset(Dataset):
    """Tensor-backed dataset of planting inputs and (optionally) normalised targets.

    Parameters
    ----------
    features : array-like
        Static numeric features, one row per sample (expected (N, 5)).
    ranch_ids, class_ids, type_ids, variety_ids : array-like
        Integer category ids, one per sample (expected (N,)).
    climate_data : array-like
        Climate sequence per sample (expected (N, 100, 3)).
    Y_kilos : array-like, optional
        Weekly kilos targets (expected (N, 20)).
    stats : array-like, optional
        Fitted-curve statistics (expected (N, 6)).

    When at least one of ``Y_kilos`` / ``stats`` is given, the provided blocks are
    concatenated column-wise into ``self.outputs`` and normalised per column as
    ``(outputs - mean) / (2 * (max - min))`` into ``self.Y``. Without any target the
    dataset can still be used for inference; items then carry no target element.

    Raises
    ------
    ValueError
        If the inputs do not all have the same number of rows.
    """

    def __init__(self,
                 features,         # (N, 5)
                 ranch_ids,        # (N,)
                 class_ids,        # (N,)
                 type_ids,         # (N,)
                 variety_ids,      # (N,)
                 climate_data,     # (N, 100, 3)
                 Y_kilos = None,          # (N, 20)
                 stats = None          # (N, 6)
                ):

        # Convert to tensors
        self.features = torch.tensor(features, dtype=torch.float32)
        self.ranch_ids = torch.tensor(ranch_ids, dtype=torch.long)
        self.class_ids = torch.tensor(class_ids, dtype=torch.long)
        self.type_ids = torch.tensor(type_ids, dtype=torch.long)
        self.variety_ids = torch.tensor(variety_ids, dtype=torch.long)
        self.climate_data = torch.tensor(climate_data, dtype=torch.float32)

        # Targets are optional; keep the raw blocks so get_shapes() can report them.
        self.Y_kilos = None if Y_kilos is None else torch.tensor(Y_kilos, dtype=torch.float32)
        self.stats = None if stats is None else torch.tensor(stats, dtype=torch.float32)

        # Fail early on misaligned inputs instead of pairing the wrong rows later.
        n_rows = len(self.features)
        named = (
            ('ranch_ids', self.ranch_ids),
            ('class_ids', self.class_ids),
            ('type_ids', self.type_ids),
            ('variety_ids', self.variety_ids),
            ('climate_data', self.climate_data),
            ('Y_kilos', self.Y_kilos),
            ('stats', self.stats),
        )
        for name, tensor in named:
            if tensor is not None and len(tensor) != n_rows:
                raise ValueError(
                    f"{name} has {len(tensor)} rows but features has {n_rows}"
                )

        target_blocks = [block for block in (self.Y_kilos, self.stats) if block is not None]
        if target_blocks:
            self.outputs = torch.cat(target_blocks, dim=1)
            self.means = self.outputs.mean(dim=0)
            denom = 2*(self.outputs.max(dim=0).values - self.outputs.min(dim=0).values)
            # A constant column has max == min; dividing by 0 would turn the whole
            # column into NaN, so such columns are divided by 1 instead.
            self.denom = torch.where(denom == 0, torch.ones_like(denom), denom)
            self.Y = (self.outputs - self.means) / self.denom
        else:
            self.outputs = None
            self.means = None
            self.denom = None
            self.Y = None

    def __len__(self):
        """Number of samples (rows of ``features``)."""
        return len(self.features)

    def get_shapes(self):
        """
        Returns a dictionary containing the shapes of all data tensors

        Returns
        -------
        dict
            Dictionary with tensor names as keys and their shapes as values
            (``None`` for a target tensor that was not provided).
        """
        def shape_or_none(tensor):
            return None if tensor is None else tensor.shape

        shapes = {
            'features': self.features.shape,
            'ranch_ids': self.ranch_ids.shape,
            'class_ids': self.class_ids.shape,
            'type_ids': self.type_ids.shape,
            'variety_ids': self.variety_ids.shape,
            'climate_data': self.climate_data.shape,
            'Y_kilos': shape_or_none(self.Y_kilos),
            'stats': shape_or_none(self.stats),
            'Y': shape_or_none(self.Y),
        }
        return shapes

    def __getitem__(self, idx):
        """Return the model inputs for ``idx``, followed by the normalised target if any."""
        inputs = (
            self.features[idx],
            self.ranch_ids[idx],
            self.class_ids[idx],
            self.type_ids[idx],
            self.variety_ids[idx],
            self.climate_data[idx],
        )
        if self.Y is None:
            return inputs
        return inputs + (self.Y[idx],)

    def revert(self, arr):
        """Undo the target normalisation: ``arr * denom + means``.

        Raises
        ------
        ValueError
            If the dataset was built without targets (nothing to de-normalise with).
        """
        if self.denom is None:
            raise ValueError("dataset was built without targets; cannot revert normalisation")
        return arr * self.denom + self.means

PyTorch Model

In [186]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

class ClimateEncoder(nn.Module):
    """Encode one planting into a single vector.

    The encoding is the concatenation of
      * a ReLU-activated linear projection of the static features,
      * the last-timestep output of a GRU run over the climate sequence,
      * ranch, class, type and variety embeddings, where the type embedding is
        shifted by a linear map of the variety embedding and the class embedding
        is shifted by a linear map of that adjusted type embedding.

    ``combined_dim`` holds the width of the returned vector.
    """

    def __init__(self,
                 input_dim=5,
                 embedding_dim=4,
                 hidden_dim=64,
                 n_ranches=13,
                 n_classes=2,
                 n_types=14,
                 n_varieties=59,
                 climate_input_dim=3,
                 climate_hidden_dim=32):
        super().__init__()

        # Dense projection of the static feature vector.
        self.feature_encoder = nn.Sequential(nn.Linear(input_dim, hidden_dim), nn.ReLU())

        # Recurrent summary of the climate sequence; inputs are batch-first.
        self.climate_gru = nn.GRU(input_size=climate_input_dim,
                                  hidden_size=climate_hidden_dim,
                                  batch_first=True)

        # Every categorical embedding uses the same width.
        self.ranch_dim = self.class_dim = self.type_dim = self.variety_dim = embedding_dim

        self.ranch_emb = nn.Embedding(n_ranches, embedding_dim)
        self.class_emb = nn.Embedding(n_classes, embedding_dim)
        self.type_emb = nn.Embedding(n_types, embedding_dim)
        self.variety_emb = nn.Embedding(n_varieties, embedding_dim)

        # Linear maps that pass information up the variety -> type -> class chain.
        self.type_to_class = nn.Linear(embedding_dim, embedding_dim)
        self.variety_to_type = nn.Linear(embedding_dim, embedding_dim)

        # static projection + GRU state + four embeddings
        self.combined_dim = hidden_dim + climate_hidden_dim + 4 * embedding_dim

    def forward(self, features, ranch_id, class_id, type_id, variety_id, climate_data):
        """
        features: (batch_size, input_dim)
        climate_data: (batch_size, seq_len, climate_input_dim)

        Returns a (batch_size, combined_dim) tensor.
        """
        static_code = self.feature_encoder(features)

        # Run the GRU from an all-zero initial state and keep its final output.
        initial_state = torch.zeros(
            1,
            climate_data.size(0),
            self.climate_gru.hidden_size,
            device=climate_data.device,
        )
        gru_out, _ = self.climate_gru(climate_data, initial_state)
        climate_code = gru_out[:, -1, :]

        ranch_vec = self.ranch_emb(ranch_id)
        variety_vec = self.variety_emb(variety_id)
        # Variety informs type, and the adjusted type informs class.
        type_vec = self.type_emb(type_id) + self.variety_to_type(variety_vec)
        class_vec = self.class_emb(class_id) + self.type_to_class(type_vec)

        pieces = (static_code, climate_code, ranch_vec, class_vec, type_vec, variety_vec)
        return torch.cat(pieces, dim=-1)


In [184]:
class StatsPredictor(nn.Module):
    """Two-layer MLP head mapping an encoding to ``output_dim`` values.

    Parameters
    ----------
    encoder_dim : int
        Width of the incoming encoding.
    hidden_dim : int
        Width of the single hidden layer.
    output_dim : int
        Number of values produced per sample; also stored as ``self.output_dim``.
    """

    def __init__(self,
                 encoder_dim,
                 hidden_dim = 32,
                 output_dim = 6):
        super().__init__()

        self.output_dim = output_dim

        # Linear -> ReLU -> Linear
        layers = [
            nn.Linear(encoder_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.stats_predictor = nn.Sequential(*layers)

    def forward(self, encoding):
        """
        encoding: (batch_size, encoder_dim)

        Returns a (batch_size, output_dim) tensor.
        """
        head = self.stats_predictor
        return head(encoding)

In [185]:
class HarvestModel(nn.Module):
    """Predict weekly kilos together with logistic-curve statistics.

    Pipeline of ``forward``:
      1. ``ClimateEncoder`` turns the inputs into one encoding per sample.
      2. ``StatsPredictor`` maps the encoding to the curve statistics; the first
         three are read as (K, r, t0) of a logistic curve.
      3. ``logistic_pmf`` converts (K, r, t0) into per-week increments of that curve.
      4. ``final_kilos`` maps [encoding, increments] to ``output_dim`` weekly values.
      5. The weekly values and the statistics are concatenated column-wise.

    The returned tensor therefore has ``output_dim + stats_predictor.output_dim``
    columns (20 + 6 with the defaults).
    """

    def __init__(self,
                 input_dim=5,
                 hidden_dim=64,
                 embedding_dim=4,
                 n_ranches=13,
                 n_classes=2,
                 n_types=14,
                 n_varieties=59,
                 climate_input_dim=3,
                 climate_hidden_dim=32,
                 output_dim=20):
        super().__init__()

        self.encoder = ClimateEncoder(
            input_dim=input_dim,
            hidden_dim=hidden_dim,
            embedding_dim=embedding_dim,
            n_ranches=n_ranches,
            n_classes=n_classes,
            n_types=n_types,
            n_varieties=n_varieties,
            climate_input_dim=climate_input_dim,
            climate_hidden_dim=climate_hidden_dim
        )

        self.stats_predictor = StatsPredictor(
            encoder_dim=self.encoder.combined_dim
        )

        # Week indices 0..output_dim-1. Registered as a (non-persistent) buffer so it
        # follows the module across .to(device) calls without entering state_dict.
        self.register_buffer(
            "t", torch.arange(output_dim, dtype=torch.float32), persistent=False
        )

        # The head emits only the weekly values; forward() appends the statistics.
        # (Emitting output_dim + n_stats here and appending the statistics again
        # would yield n_stats surplus columns.)
        self.final_kilos = nn.Sequential(
            nn.Linear(self.encoder.combined_dim + output_dim, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim)
        )

    def forward(self, features, ranch_id, class_id, type_id, variety_id, climate_data):
        """
        features: (batch_size, input_dim)
        climate_data: (batch_size, seq_len, climate_input_dim)

        Returns (batch_size, output_dim + stats_predictor.output_dim): the weekly
        predictions followed by the predicted statistics.
        """
        encoded = self.encoder(features, ranch_id, class_id, type_id, variety_id, climate_data)

        o2 = self.stats_predictor(encoded)
        pmf = self.logistic_pmf(o2)

        together = torch.cat((encoded,pmf),dim=1)
        o1 = self.final_kilos(together)

        return torch.cat((o1,o2),dim=1)


    def logistic_pmf(self, X) -> torch.Tensor:
        """Per-week increments of the logistic curve described by ``X``.

        ``X`` holds (K, r, t0, ...) along its LAST dimension, so both a single
        statistics vector of shape (n_stats,) and a batch of shape
        (batch_size, n_stats) are accepted. The result has ``len(self.t)`` entries
        along the last dimension and sums to the curve's value at the final week.
        """
        # Slice (rather than index) so a trailing axis of size 1 is kept and
        # broadcasts against the week axis of self.t.
        K = X[..., 0:1]
        r = X[..., 1:2]
        t0 = X[..., 2:3]

        # Step 1: cumulative logistic, K / (1 + exp(-r * (t - t0))).
        # torch.sigmoid is the same function but does not overflow for large |r*(t-t0)|.
        cumulative = K * torch.sigmoid(r * (self.t - t0))

        # Step 2: approximate PMF as discrete difference along the week axis; a zero
        # is prepended so the first entry is the curve's value at week 0.
        leading_zero = torch.zeros_like(cumulative[..., :1])
        pmf = torch.diff(cumulative, dim=-1, prepend=leading_zero)

        return pmf

In [188]:

def train_harvest_model(train_dataset, num_epochs=5, batch_size=32, lr=1e-3, model=None):
    """Train a harvest model with Adam on an MSE loss and return it.

    Parameters
    ----------
    train_dataset : Dataset
        Yields 7-tuples ``(features, ranch_id, class_id, type_id, variety_id,
        climate_data, Y)``; the first six are passed to the model, ``Y`` is the target.
    num_epochs : int
        Number of passes over the dataset.
    batch_size : int
        Mini-batch size; batches are reshuffled every epoch.
    lr : float
        Adam learning rate.
    model : nn.Module, optional
        Model to train. Defaults to ``HarvestModel()`` with its default sizes; pass a
        configured instance when the vocabulary sizes or dimensions differ.

    Returns
    -------
    nn.Module
        The trained model (the same object as ``model`` when one was supplied).
    """
    # Create DataLoader
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Initialize Model
    if model is None:
        model = HarvestModel()

    # Loss and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    model.train()

    for epoch in range(num_epochs):
        total_loss = 0.0

        for batch in train_loader:
            features, ranch_id, class_id, type_id, variety_id, climate_data, Y = batch

            # Forward pass
            outputs = model(features, ranch_id, class_id, type_id, variety_id, climate_data)
            loss = criterion(outputs, Y)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Mean of the per-batch losses for this epoch.
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    return model

In [192]:
# Train with the default hyper-parameters.
# NOTE(review): `dataset` is constructed in a cell that appears further down in
# this file; on a fresh kernel that cell (and the feature-building cell before it)
# must run before this one.
train_harvest_model(dataset)

TypeError: object of type 'HarvestDataset' has no len()

In [190]:
# Static numeric inputs, in column order: hectares, sine and cosine of the
# transplant week, year, and a constant 1.0 column.
static_columns = ['Ha', 'WeekTransplanted_sin', 'WeekTransplanted_cos', 'Year']
feature_columns = [meta[name].values for name in static_columns]
feature_columns.append(np.ones(len(meta)))
features = np.column_stack(feature_columns)

# Stack the per-planting climate series into a single array.
climate_data = np.stack(meta['ClimateSeries'].values)

# Translate each categorical column into ids via its lookup table in mapping_dict.
ranch_ids, class_ids, type_ids, variety_ids = (
    meta[name].map(mapping_dict[name]).values
    for name in ('Ranch', 'Class', 'Type', 'Variety')
)

In [191]:
# Bundle the prepared inputs with the two target blocks: the smoothed weekly
# kilos and the fitted-curve statistics.
dataset = HarvestDataset(
    features=features,
    ranch_ids=ranch_ids,
    class_ids=class_ids,
    type_ids=type_ids,
    variety_ids=variety_ids,
    climate_data=climate_data,
    Y_kilos=smoothed.to_numpy(),
    stats=rf.to_numpy(),
)