In [1]:
# Imports
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

(optional) Get datasets from github repo

In [2]:
# Access github repo
REPO_URL = "https://github.com/cchen744/CornYield_NN.git"
REPO_NAME = "CornYield_NN" # This is the folder name that will be created

# 4. Clone the repository
# We use the token for secure, authenticated access
!git clone https://github.com/cchen744/CornYield_NN.git

# 5. Change the working directory into the cloned repository folder
import os
os.chdir(REPO_NAME)

# Verify the files are there (you should see your notebook and dataset files)
print(f"Current directory contents in /{REPO_NAME}:")
!ls -F

Cloning into 'CornYield_NN'...
remote: Enumerating objects: 42, done.[K
remote: Counting objects: 100% (42/42), done.[K
remote: Compressing objects: 100% (36/36), done.[K
remote: Total 42 (delta 18), reused 23 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (42/42), 1.71 MiB | 6.79 MiB/s, done.
Resolving deltas: 100% (18/18), done.
Current directory contents in /CornYield_NN:
counties.csv		prism_weather_1984-1994.csv  soil_data.csv
final_dataset.csv	prism_weather_1995-2009.csv  weather_clean.csv
get_data.ipynb		prism_weather_2010-2024.csv  yield_clean.csv
LSTM_+_MLP_model.ipynb	prism_weather.csv	     yield.csv
nasa_weather.csv	soil_clean.csv


#Dataset preparation

We have 3 types of datasets:
1. yield data, which looks like: <br>
Year, County, Yield
2. monthly weather data, which looks like: <br>
County, Year, Month, solar_radiation, humidity, ...., vpd_max
3. soil dataset: <br>
County, bdod, cec, ....

In [3]:
# Load 3 datasets
yield_df_path = "/content/CornYield_NN/yield_clean.csv"
soil_df_path = "/content/CornYield_NN/soil_clean.csv"
weather_df_path = "/content/CornYield_NN/weather_clean.csv"
yield_df = pd.read_csv(yield_df_path)
soil_df = pd.read_csv(soil_df_path)
weather_df = pd.read_csv(weather_df_path)
print(yield_df.head())
print(soil_df.head())
print(weather_df.head())

   Year      County  Yield
0  2024       adams  120.1
1  2024  green lake  168.3
2  2024      juneau  141.0
3  2024   marquette  126.0
4  2024     portage  159.5
     County  bdod  cec  clay  nitrogen  phh2o  sand  silt  soc
0     adams   132  163   192       698     60   489   319  375
1   ashland   115  249   258       601     51   216   526  605
2    barron   133  150   185       516     55   317   498  677
3  bayfield   111  266   187       545     50   480   333  668
4     brown   134  289   308       781     66   328   364  429
  County  Year  Month  solar_radiation  humidity  wind_speed  wind_speed_max  \
0  adams  1984      4            16.85     75.80        4.40           12.36   
1  adams  1984      5            17.63     74.25        3.26            9.64   
2  adams  1984      6            20.75     78.96        3.42            9.15   
3  adams  1984      7            21.90     74.48        2.68            8.19   
4  adams  1984      8            18.52     68.76        2.31

We want the dataset put into LSTM has the format:<br>
(batch_size,num_months,num_features)

In [4]:
# prepare monthly sequences

# Select monthly columns for LSTM
monthly_features = weather_df.columns[3:]
# Sort and group monthly dataset by (County, Year)
grouped_weather = (weather_df
    .sort_values(["County", "Year", "Month"])
    .groupby(["County", "Year"])
)

# Build dictionary: key = (county, year), value = monthly sequence array
monthly_weather_dict = {}

for (county, year), g in grouped_weather:
    seq = g[monthly_features].values  # shape = (num_months, num_features)
    monthly_weather_dict[(county, year)] = seq

# Show an example
first_key = list(monthly_weather_dict.keys())[0]
print("Example key:", first_key)
print("Sequence shape:", monthly_weather_dict[first_key].shape)
print(monthly_weather_dict[first_key][:3])  # first 3 months


Example key: ('adams', np.int64(1984))
Sequence shape: (6, 11)
[[16.85 75.8   4.4  12.36  3.83 35.5  46.5  57.5  29.7   1.57 10.85]
 [17.63 74.25  3.26  9.64  2.27 41.4  53.7  65.9  40.2   1.12 13.4 ]
 [20.75 78.96  3.42  9.15  5.96 56.2  68.1  80.   57.    1.2  18.43]]


Train test split

In [None]:
# Original
"""
# Find all (county, year) pairs with yield + weather keys, store keys in a list
all_keys = list(monthly_weather_dict.keys())
# Perform train_test_split on key_list
train_keys, test_keys = train_test_split(all_keys, test_size=0.2, random_state=42)
"""

In [26]:
# UPDATE skip data that is not present in both yield and weather

# Get all (county, year) pairs from monthly_weather_dict
all_weather_keys = set(monthly_weather_dict.keys())

# Ensure 'Year' column in yield_df is np.int64 to match weather_dict keys
yield_df['Year'] = yield_df['Year'].astype(np.int64)

# Get all (county, year) pairs from yield_df with consistent Year type
all_yield_keys = set(tuple(row) for row in yield_df[['County', 'Year']].values)

# Find the intersection of keys (only keep pairs that have both weather and yield data)
all_keys = list(all_weather_keys.intersection(all_yield_keys))

# Perform train_test_split on key_list
train_keys, test_keys = train_test_split(all_keys, test_size=0.2, random_state=42)

In [34]:
# UPDATE fixed issue with incorrect datatypes and assignments
# Dataset preparations -- GitHub
class YieldDataset(Dataset):
    def __init__(self, keys, yield_df, soil_df, weather_dict):
        self.keys = keys
        self.yield_df = yield_df
        self.soil_df = soil_df
        self.weather_dict = weather_dict

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, idx):
      county, year = self.keys[idx]
      # get weather
      weather_seq = self.weather_dict[(county, year)] # shape: (n_month,n_features)
      weather_seq = torch.tensor(weather_seq, dtype=torch.float32) # UPDATE seq TO weather_seq
      # WAS weather_seq = torch.tensor(seq, dtype=torch.float32)
      # get yield
      y = self.yield_df[
      (self.yield_df["County"]==county) &
       (self.yield_df["Year"]==year)]["Yield"].values[0]
      # get soil
      soil_row = self.soil_df[self.soil_df["County"] == county].iloc[0]
      soil_vec = soil_row.drop("County").values
      soil_vec = torch.tensor(soil_vec.astype(np.float32), dtype=torch.float32) # shape: (n_soil_featuresï¼Œ) UPDATE soil_vec TO soil_vec.astype(np.float32)
      # WAS soil_vec = torch.tensor(soil_vec, dtype=torch.float32)

      return weather_seq, soil_vec, torch.tensor([y], dtype=torch.float32)

In [35]:
# Create DataLoader
batch_size = 32
train_dataset = YieldDataset(train_keys, yield_df, soil_df, monthly_weather_dict)
test_dataset = YieldDataset(test_keys, yield_df, soil_df, monthly_weather_dict)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [29]:
# Model
# Model assumes that there is at least 1 dimension in monthly, and static
class YieldLSTMMLPConnected(nn.Module):
    def __init__(self,
                    monthly_dim=11,     # Avg's by month (seq features, should be 11 features from weather_clean.csv)
                    monthly_layers=1,
                    monthly_hidden=64,
                    static_dim=8,    # number of static features (should be 8 features from soil data)
                    static_hidden=32,
                    head_hidden=64,
                    output_dim=1,
                    dropout=0.1
                    ):
        super().__init__()

        # Monthly branch LSTM
        self.lstm = nn.LSTM(
            input_size=monthly_dim,
            hidden_size=monthly_hidden,
            num_layers=monthly_layers,
            batch_first=True,
            bidirectional=False
        )
        self.monthly_proj = nn.Sequential(
            nn.Linear(monthly_hidden, monthly_hidden),
            nn.ReLU(),
            nn.Dropout(dropout)
        )


        # Static branch MLP
        self.static_proj = nn.Sequential(
            nn.Linear(static_dim, static_hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(static_hidden, static_hidden),
            nn.ReLU(),
        )

        # Combined head (output of combined branches)
        combined_dim = monthly_hidden + static_hidden
        self.head = nn.Sequential(
            nn.Linear(combined_dim, head_hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(head_hidden, head_hidden//2),
            nn.ReLU(),
            nn.Linear(head_hidden//2, output_dim)
        )

    def forward(self, weather_seq, static):
        feats = []

        # Monthly shape (batch, seq_len, monthly_dim)
        # LSTM: take last hidden state
        lstm_out, (h_n, c_n) = self.lstm(weather_seq)
        # h_n shape: (num_layers, batch, hidden)
        last_h = h_n[-1] # (batch, monthly_hidden)
        monthly_emb = self.monthly_proj(last_h)
        feats.append(monthly_emb)

        feats.append(self.static_proj(static))

        combined = torch.cat(feats, dim=1)
        out = self.head(combined)
        return out

In [36]:
# UPDATE removed yearly in for loops within train_loader
# Training
def train_model(
    model,
    train_loader,
    val_loader,
    num_epochs=50,
    lr=1e-3,
    weight_decay=1e-5,
    device="cuda" if torch.cuda.is_available() else "cpu",
    early_stop_patience=8
):

    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    best_val_loss = float("inf")
    patience_counter = 0

    for epoch in range(1, num_epochs+1):
        # -------- TRAIN MODE --------
        model.train()
        train_losses = []

        for monthly, static, target in train_loader: # UPDATE removed yearly
            monthly = monthly.to(device)
            static = static.to(device)
            target = target.to(device)

            optimizer.zero_grad()
            preds = model(monthly, static) # UPDATE removed yearly

            loss = criterion(preds, target)
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())

        # -------- VAL MODE --------
        model.eval()
        val_losses = []

        with torch.no_grad():
            for monthly, static, target in val_loader: # UPDATE removed yearly
                monthly = monthly.to(device)
                static = static.to(device)
                target = target.to(device)

                preds = model(monthly, static) # UPDATE removed yearly
                loss = criterion(preds, target)
                val_losses.append(loss.item())

        train_loss = np.mean(train_losses)
        val_loss = np.mean(val_losses)

        print(f"Epoch {epoch:03d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

        # ---- EARLY STOP ----
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), "best_yield_model.pt")
        else:
            patience_counter += 1
            if patience_counter >= early_stop_patience:
                print("Early stopping triggered!")
                break

    print("Training completed. Best model saved as best_yield_model.pt")

In [37]:
# Run Testing
model = YieldLSTMMLPConnected()
train_model(model, train_loader, test_loader)

Epoch 001 | Train Loss: 6000.5525 | Val Loss: 1131.2688
Epoch 002 | Train Loss: 1184.9809 | Val Loss: 1133.1634
Epoch 003 | Train Loss: 1091.6131 | Val Loss: 1001.7599
Epoch 004 | Train Loss: 996.0172 | Val Loss: 1442.7393
Epoch 005 | Train Loss: 1085.8099 | Val Loss: 1114.8598
Epoch 006 | Train Loss: 980.5539 | Val Loss: 1253.8137
Epoch 007 | Train Loss: 1004.3943 | Val Loss: 1143.7836
Epoch 008 | Train Loss: 976.9973 | Val Loss: 1017.7160
Epoch 009 | Train Loss: 974.0819 | Val Loss: 1386.3222
Epoch 010 | Train Loss: 937.3584 | Val Loss: 896.0145
Epoch 011 | Train Loss: 933.5188 | Val Loss: 962.7823
Epoch 012 | Train Loss: 895.8220 | Val Loss: 869.6092
Epoch 013 | Train Loss: 863.6295 | Val Loss: 838.9163
Epoch 014 | Train Loss: 865.8422 | Val Loss: 1398.5749
Epoch 015 | Train Loss: 817.3990 | Val Loss: 992.1067
Epoch 016 | Train Loss: 789.6973 | Val Loss: 1406.3847
Epoch 017 | Train Loss: 798.9112 | Val Loss: 747.2514
Epoch 018 | Train Loss: 738.9269 | Val Loss: 918.7277
Epoch 019 | 

In [39]:
# Load the best model
best_model = YieldLSTMMLPConnected()
best_model.load_state_dict(torch.load("best_yield_model.pt"))

<All keys matched successfully>