In [1]:
!pip install -q einops

## IMPORT LIBS

In [2]:
import os
import numpy as np
import random
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from torch.amp import autocast
from torch.utils.data import Dataset,DataLoader
import torch.nn as nn
import torchvision.models as models
from tqdm import tqdm
import gc
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
import h5py
from einops import rearrange
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr
from matplotlib import pyplot as plt
import seaborn as sns
from typing import List, Tuple
from datetime import datetime, timedelta
from pprint import pprint
import json
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler

## REPRODUCTIVITY

In [3]:
# Set environment variable
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

global_seed = 0

random.seed(global_seed)
np.random.seed(global_seed)

torch.manual_seed(global_seed)
torch.use_deterministic_algorithms(True)

In [4]:
def make_config(years: List[int], state: str, state_ansi: str, fips: str, crop_type: str, grow_season: List[int]):
    """
    Creates configuration for winter wheat data collection
    grow_season: List containing [start_month, end_month] of the growing cycle
                 For winter wheat, this spans across year boundary
    """
    config = {
        "FIPS": fips,
        "years": years,
        "state": state.upper(),
        "crop_type": crop_type,
        "data": {
            "HRRR": {
                "short_term": []
            },
            "USDA": [],
            "sentinel": []
        }
    }
   
    for year in years:
        # HRRR data - need to consider months from previous year's fall
        hrrr_files = []
        # Previous year's fall months (planting)
        for month in range(9, 13):  # September to December
            hrrr_files.append(f"HRRR/{year-1}/{state.upper()}/HRRR_{state_ansi}_{state.upper()}_{year-1}-{month:02d}.csv")
        # Current year's winter and spring months (growing and harvest)
        for month in range(1, 7):  # January to July
            hrrr_files.append(f"HRRR/{year}/{state.upper()}/HRRR_{state_ansi}_{state.upper()}_{year}-{month:02d}.csv")
        
        config["data"]["HRRR"]["short_term"].append(hrrr_files)
       
        # USDA data
        config["data"]["USDA"].append(f"USDA/{crop_type}/{year}/USDA_WinterWheat_County_{year}.csv")
       
        # Sentinel data - need to cover previous fall to current summer
        quarters = [
            # Previous year quarters
            (f"{year-1}-10-01", f"{year-1}-12-31"),  # Q4 (planting)
            # Current year quarters
            (f"{year}-01-01", f"{year}-03-31"),      # Q1 (winter growth)
            (f"{year}-04-01", f"{year}-06-30"),      # Q2 (spring growth)
        ]
       
        sentinel_files = []
        for start, end in quarters:
            sentinel_files.append(f"AG/{state.upper()}/{start[:4]}/Agriculture_{state_ansi}_{state.upper()}_{start}_{end}.h5")
       
        config["data"]["sentinel"].append(sentinel_files)
   
    return config

# Train
years = list(range(2018, 2022))
state = "IL"
state_ansi = "17"
fips = ['17011', '17013', '17023', '17025', '17027', '17033', '17037', '17047', '17049', '17067', 
        '17083', '17089', '17095', '17119', '17121', '17125', '17133', '17141', '17157', '17159', 
        '17163', '17173', '17177', '17179', '17189', '17201']
crop_type = "WinterWheat"
grow_season = [9, 6]  # September to July (spanning across years)

train_config = make_config(years, state, state_ansi, fips, crop_type, grow_season)
with open('train_config.json', 'w') as file:
    json.dump(train_config, file)

# Test
years = [2022]
test_config = make_config(years, state, state_ansi, fips, crop_type, grow_season)
with open('test_config.json', 'w') as file:
    json.dump(test_config, file)

## DATA LOADER

In [5]:
class Sentinel2Imagery(Dataset):
    def __init__(self, base_dir, config_file, transform=None):
        self.transform = transform
        self.base_dir = base_dir
        
        with open(config_file, 'r') as f:
            obj = json.load(f)
        
        self.fips_codes = obj["FIPS"]
        self.years = obj["years"]
        self.file_paths = obj["data"]["sentinel"]
    
    def __len__(self):
        return len(self.fips_codes) * len(self.years)

    def __getitem__(self, index):
        fips_index = index // len(self.years)
        year_index = index % len(self.years)
        
        fips_code = self.fips_codes[fips_index]
        year = self.years[year_index]
        file_paths = self.file_paths[year_index]
        
        temporal_list = []
        for file_path in file_paths:
            with h5py.File(os.path.join(self.base_dir, file_path), 'r') as hf:
                groups = hf[fips_code]
                for d in groups.keys():
                    grids = groups[d]["data"]
                    grids = torch.from_numpy(np.asarray(grids))
                    temporal_list.append(grids)
                hf.close()
        x = torch.stack(temporal_list)
        x = x.to(torch.float32)
        x = rearrange(x, 't g h w c -> t g c h w')
        if self.transform:
            t, g, _, _, _ = x.shape
            x = rearrange(x, 't g c h w -> (t g) c h w')
            x = self.transform(x)
            x = rearrange(x, '(t g) c h w -> t g c h w', t=t, g=g)
        return x, fips_code, year

class HRRRComputedDataset(Dataset):
    def __init__(self, base_dir, config_file, column_names=None):
        self.base_dir = base_dir
        self.day_range = [i + 1 for i in range(28)]
        
        with open(config_file, 'r') as f:
            obj = json.load(f)
        
        self.fips_codes = obj["FIPS"]
        self.years = obj["years"]
        self.short_term_file_path = obj["data"]["HRRR"]["short_term"]
        
        if column_names:
            self.column_names = column_names
        else:
            self.column_names = [
                'Avg Temperature (K)', 'Max Temperature (K)', 'Min Temperature (K)',
                'Precipitation (kg m**-2)', 'Relative Humidity (%)', 'Wind Gust (m s**-1)',
                'Wind Speed (m s**-1)', 'Downward Shortwave Radiation Flux (W m**-2)',
                'Vapor Pressure Deficit (kPa)'
            ]

    def __len__(self):
        return len(self.fips_codes) * len(self.years)

    def __getitem__(self, index):
        fips_index = index // len(self.years)
        year_index = index % len(self.years)
        
        fips_code = self.fips_codes[fips_index]
        year = self.years[year_index]
        short_term_file_paths = self.short_term_file_path[year_index]
        x_short = self.get_short_term_val(fips_code, short_term_file_paths)
        x_short = x_short.to(torch.float32)
        return x_short, fips_code, year

    def get_short_term_val(self, fips_code, file_paths):
        df_list = []
        for file_path in file_paths:
            tmp_df = pd.read_csv(os.path.join(self.base_dir, file_path))
            df_list.append(tmp_df)

        df = pd.concat(df_list, ignore_index=True)
        df["FIPS Code"] = df["FIPS Code"].astype(str).str.zfill(5)
        df = df[(df["FIPS Code"] == fips_code) & (df["Daily/Monthly"] == "Daily")]
        df.columns = df.columns.str.strip()

        group_month = df.groupby(['Month'])

        temporal_list = []
        for month, df_month in group_month:
            group_grid = df_month.groupby(['Grid Index'])

            time_series = []
            for grid, df_grid in group_grid:
                df_grid = df_grid.sort_values(by=['Day'], ascending=[True], na_position='first')
                df_grid = df_grid[df_grid.Day.isin(self.day_range)]
                df_grid = df_grid[self.column_names]
                val = self.signed_log_transform(torch.from_numpy(df_grid.values))
                time_series.append(val)

            temporal_list.append(torch.stack(time_series))

        x_short = torch.stack(temporal_list)
        x_short = rearrange(x_short, 'm g d p -> m d g p')
        return x_short

    def signed_log_transform(self, data):
        epsilon = 1e-9  # small constant to avoid log(0)
        return torch.sign(data) * torch.log10(torch.abs(data) + epsilon)

class USDACropDataset(Dataset):
    def __init__(self, base_dir, config_file, crop_type):
        self.base_dir = base_dir
        self.crop_type = crop_type
        
        with open(config_file, 'r') as f:
            obj = json.load(f)
        
        self.fips_codes = obj["FIPS"]
        self.years = obj["years"]
        self.file_paths = obj["data"]["USDA"]

        if crop_type == "Cotton":
            self.column_names = ['PRODUCTION, MEASURED IN 480 LB BALES', 'YIELD, MEASURED IN LB / ACRE']
        else:
            self.column_names = ['PRODUCTION, MEASURED IN BU', 'YIELD, MEASURED IN BU / ACRE']

        
    def __len__(self):
        return len(self.fips_codes) * len(self.years)
    def get_num_classes(self):
        return len(self.fips_encoder.classes_)
    def __getitem__(self, index):
        fips_index = index // len(self.years)
        year_index = index % len(self.years)
        
        fips_code = self.fips_codes[fips_index]
        year = self.years[year_index]
        file_path = self.file_paths[year_index]
        df = pd.read_csv(os.path.join(self.base_dir, file_path))

        df['state_ansi'] = df['state_ansi'].astype(str).str.zfill(2)
        df['county_ansi'] = df['county_ansi'].astype(str).str.zfill(3)

        df = df[(df["state_ansi"] == fips_code[:2]) & (df["county_ansi"] == fips_code[-3:])]

        df = df[self.column_names]
        x = torch.from_numpy(df.values)
        x = x.to(torch.float32)
        x = torch.log(torch.flatten(x, start_dim=0))
        return x, fips_code, year

## MODAL ARCHITECTURE

In [6]:
class RandomForestCropPredictor:
    def __init__(self, n_estimators=100):
        """
        Initialize the Random Forest predictor for crop yield/production
        """
        self.model = RandomForestRegressor(
            n_estimators=n_estimators,
            random_state=global_seed,
            n_jobs=-1
        )
        
    def aggregate_grids(self, hrrr_batch):
        """
        Aggregate features across grids using statistical measures
        hrrr_batch shape: [batch, months, days, grids, parameters]
        """
        # Move grids dimension to end for easier aggregation
        # New shape: [batch, months, days, parameters, grids]
        batch_data = hrrr_batch.permute(0, 1, 2, 4, 3)
        
        # Calculate statistics across grids
        grid_mean = batch_data.mean(dim=-1)
        grid_std = batch_data.std(dim=-1)
        grid_min = batch_data.min(dim=-1)[0]
        grid_max = batch_data.max(dim=-1)[0]
        
        # Concatenate all statistics
        # Shape: [batch, months, days, parameters * 4]
        aggregated = np.concatenate([
            grid_mean.numpy(),
            grid_std.numpy(),
            grid_min.numpy(),
            grid_max.numpy()
        ], axis=-1)
        
        # Flatten all dimensions except batch
        # Shape: [batch, months * days * parameters * 4]
        return aggregated.reshape(aggregated.shape[0], -1)
        
    def train(self, hrrr_loader, usda_loader):
        """
        Train the random forest model
        """
        X, y = [], []
        
        for (hrrr_batch, hrrr_fips, _), (usda_batch, usda_fips, _) in zip(hrrr_loader, usda_loader):
            # Verify data alignment
            assert all(h == u for h, u in zip(hrrr_fips, usda_fips)), "HRRR and USDA FIPS mismatch"
            
            # Aggregate grid features
            X.append(self.aggregate_grids(hrrr_batch))
            y.append(usda_batch.numpy())
        
        X = np.concatenate(X)
        y = np.concatenate(y)
        
        # Scale features
        X_scaled = X
        
        # Train model
        self.model.fit(X_scaled, y)
        
        # Calculate training metrics
        y_pred = self.model.predict(X_scaled)
        
        return {
            'rmse': np.sqrt(mean_squared_error(y, y_pred)),
            'mae': mean_absolute_error(y, y_pred)
        }
    
    def predict(self, hrrr_loader):
        """
        Make predictions using the trained model
        """
        X, fips = [], []
        
        for hrrr_batch, batch_fips, _ in hrrr_loader:
            X.append(self.aggregate_grids(hrrr_batch))
            fips.extend(batch_fips)
        
        X = np.concatenate(X)
        X_scaled = X
        
        return self.model.predict(X_scaled), fips
    
    def evaluate(self, hrrr_loader, usda_loader):
        """
        Evaluate the model performance
        """
        all_predictions, pred_fips = self.predict(hrrr_loader)
        
        all_ground_truth, true_fips = [], []
        for usda_batch, batch_fips, _ in usda_loader:
            all_ground_truth.append(usda_batch.numpy())
            true_fips.extend(batch_fips)
            
        all_ground_truth = np.concatenate(all_ground_truth)

        # Verify data alignment
        assert all(p == t for p, t in zip(pred_fips, true_fips)), "Prediction and true FIPS mismatch"
        results = {}
        for i, metric_name in enumerate(["Production", "Yield"]):
            y_true = torch.from_numpy(all_ground_truth[:, i])
            y_pred = torch.from_numpy(all_predictions[:, i])
            mae = torch.abs(y_true - y_pred).mean()
            mse = ((y_pred - y_true) ** 2).mean()
            rmse = torch.sqrt(mse)
            mape = (torch.abs(y_true - y_pred) / torch.abs(y_true)).mean() * 100
            smape = 100 * (torch.abs(y_true - y_pred) / ((torch.abs(y_true) + torch.abs(y_pred)) / 2)).mean()
            max_error = torch.abs(y_true - y_pred).max()
            corr = torch.corrcoef(torch.stack((y_pred, y_true)))
            metrics = {
                'MAE': round(mae.item(), 2),
                'MSE': round(mse.item(), 2),
                'RMSE': round(rmse.item(), 2),
                'MAPE': round(mape.item(), 2),
                'SMAPE': round(smape.item(), 2)
            }
            results[metric_name] = metrics
        return results
    
    def get_feature_importance(self, parameters, months):
        """
        Get feature importance scores for different statistics and parameters
        """
        importances = self.model.feature_importances_
        n_days = 28  # As defined in HRRRComputedDataset
        n_params = len(parameters)
        n_stats = 4  # mean, std, min, max
        
        # Reshape importances to [months, days, parameters, stats]
        shaped_imp = importances.reshape(len(months), n_days, n_params, n_stats)
        
        # Calculate importance per parameter and statistic
        param_importance = shaped_imp.mean(axis=(0, 1))  # Average across months and days
        
        importance_dict = {
            'parameters': {},
            'statistics': {
                'mean': shaped_imp[..., 0].mean(),
                'std': shaped_imp[..., 1].mean(),
                'min': shaped_imp[..., 2].mean(),
                'max': shaped_imp[..., 3].mean()
            }
        }
        
        for i, param in enumerate(parameters):
            importance_dict['parameters'][param] = {
                'mean': param_importance[i, 0],
                'std': param_importance[i, 1],
                'min': param_importance[i, 2],
                'max': param_importance[i, 3]
            }
            
        return importance_dict

In [7]:
class DecisionTreeCropPredictor:
    def __init__(self):
        """
        Initialize the Decision Tree predictor for crop yield/production
        """
        self.model = DecisionTreeRegressor(
            random_state=global_seed
        )
        
    def aggregate_grids(self, hrrr_batch):
        """
        Aggregate features across grids using statistical measures
        hrrr_batch shape: [batch, months, days, grids, parameters]
        """
        # Move grids dimension to end for easier aggregation
        # New shape: [batch, months, days, parameters, grids]
        batch_data = hrrr_batch.permute(0, 1, 2, 4, 3)
        
        # Calculate statistics across grids
        grid_mean = batch_data.mean(dim=-1)
        grid_std = batch_data.std(dim=-1)
        grid_min = batch_data.min(dim=-1)[0]
        grid_max = batch_data.max(dim=-1)[0]
        
        # Concatenate all statistics
        # Shape: [batch, months, days, parameters * 4]
        aggregated = np.concatenate([
            grid_mean.numpy(),
            grid_std.numpy(),
            grid_min.numpy(),
            grid_max.numpy()
        ], axis=-1)
        
        # Flatten all dimensions except batch
        # Shape: [batch, months * days * parameters * 4]
        return aggregated.reshape(aggregated.shape[0], -1)
        
    def train(self, hrrr_loader, usda_loader):
        """
        Train the decision tree model
        """
        X, y = [], []
        
        for (hrrr_batch, hrrr_fips, _), (usda_batch, usda_fips, _) in zip(hrrr_loader, usda_loader):
            # Verify data alignment
            assert all(h == u for h, u in zip(hrrr_fips, usda_fips)), "HRRR and USDA FIPS mismatch"
            
            # Aggregate grid features
            X.append(self.aggregate_grids(hrrr_batch))
            y.append(usda_batch.numpy())
        
        X = np.concatenate(X)
        y = np.concatenate(y)
        
        # Scale features
        X_scaled = X
        
        # Train model
        self.model.fit(X_scaled, y)
        
        # Calculate training metrics
        y_pred = self.model.predict(X_scaled)
        
        return {
            'rmse': np.sqrt(mean_squared_error(y, y_pred)),
            'mae': mean_absolute_error(y, y_pred)
        }
    
    def predict(self, hrrr_loader):
        """
        Make predictions using the trained model
        """
        X, fips = [], []
        
        for hrrr_batch, batch_fips, _ in hrrr_loader:
            X.append(self.aggregate_grids(hrrr_batch))
            fips.extend(batch_fips)
        
        X = np.concatenate(X)
        X_scaled = X
        
        return self.model.predict(X_scaled), fips
    
    def evaluate(self, hrrr_loader, usda_loader):
        """
        Evaluate the model performance
        """
        all_predictions, pred_fips = self.predict(hrrr_loader)
        
        all_ground_truth, true_fips = [], []
        for usda_batch, batch_fips, _ in usda_loader:
            all_ground_truth.append(usda_batch.numpy())
            true_fips.extend(batch_fips)
            
        all_ground_truth = np.concatenate(all_ground_truth)

        # Verify data alignment
        assert all(p == t for p, t in zip(pred_fips, true_fips)), "Prediction and true FIPS mismatch"
        results = {}
        for i, metric_name in enumerate(["Production", "Yield"]):
            y_true = torch.from_numpy(all_ground_truth[:, i])
            y_pred = torch.from_numpy(all_predictions[:, i])
            mae = torch.abs(y_true - y_pred).mean()
            mse = ((y_pred - y_true) ** 2).mean()
            rmse = torch.sqrt(mse)
            mape = (torch.abs(y_true - y_pred) / torch.abs(y_true)).mean() * 100
            smape = 100 * (torch.abs(y_true - y_pred) / ((torch.abs(y_true) + torch.abs(y_pred)) / 2)).mean()
            max_error = torch.abs(y_true - y_pred).max()
            corr = torch.corrcoef(torch.stack((y_pred, y_true)))
            metrics = {
                'MAE': round(mae.item(), 2),
                'MSE': round(mse.item(), 2),
                'RMSE': round(rmse.item(), 2),
                'MAPE': round(mape.item(), 2),
                'SMAPE': round(smape.item(), 2)
            }
            results[metric_name] = metrics
        return results
    
    def get_feature_importance(self, parameters, months):
        """
        Get feature importance scores for different statistics and parameters
        """
        importances = self.model.feature_importances_
        n_days = 28  # As defined in HRRRComputedDataset
        n_params = len(parameters)
        n_stats = 4  # mean, std, min, max
        
        # Reshape importances to [months, days, parameters, stats]
        shaped_imp = importances.reshape(len(months), n_days, n_params, n_stats)
        
        # Calculate importance per parameter and statistic
        param_importance = shaped_imp.mean(axis=(0, 1))  # Average across months and days
        
        importance_dict = {
            'parameters': {},
            'statistics': {
                'mean': shaped_imp[..., 0].mean(),
                'std': shaped_imp[..., 1].mean(),
                'min': shaped_imp[..., 2].mean(),
                'max': shaped_imp[..., 3].mean()
            }
        }
        
        for i, param in enumerate(parameters):
            importance_dict['parameters'][param] = {
                'mean': param_importance[i, 0],
                'std': param_importance[i, 1],
                'min': param_importance[i, 2],
                'max': param_importance[i, 3]
            }
            
        return importance_dict

## TRAIN AND TEST

## Winter Wheat

In [8]:
base_dir = "/kaggle/input/cropnetv2"

train_config = "train_config.json"
test_config = "test_config.json"
train_hrrr_dataset = HRRRComputedDataset(base_dir,train_config)
train_usda_dataset = USDACropDataset(base_dir,train_config,crop_type)

test_hrrr_dataset = HRRRComputedDataset(base_dir,test_config)
test_usda_dataset = USDACropDataset(base_dir,test_config,crop_type)

# Create data loaders
train_hrrr_loader = DataLoader(train_hrrr_dataset, batch_size=1, shuffle=False)
train_usda_loader = DataLoader(train_usda_dataset, batch_size=1, shuffle=False)
test_hrrr_loader = DataLoader(test_hrrr_dataset, batch_size=1, shuffle=False)
test_usda_loader = DataLoader(test_usda_dataset, batch_size=1, shuffle=False)

# Initialize and train model
predictor = DecisionTreeCropPredictor()

# Train and get metrics
train_metrics = predictor.train(train_hrrr_loader, train_usda_loader)

# Evaluate on test set
test_metrics = predictor.evaluate(test_hrrr_loader, test_usda_loader)
print(crop_type)
pprint(test_metrics)

WinterWheat
{'Production': {'MAE': 1.37,
                'MAPE': 9.93,
                'MSE': 3.69,
                'RMSE': 1.92,
                'SMAPE': 10.91},
 'Yield': {'MAE': 0.2, 'MAPE': 4.56, 'MSE': 0.06, 'RMSE': 0.25, 'SMAPE': 4.72}}


## Cotton

In [9]:
def make_config(years: List[int], state: str, state_ansi: str, fips: str, crop_type: str, grow_season: List[int]):

    config = {
        "FIPS": fips,
        "years": years,
        "state": state.upper(),
        "crop_type": crop_type,
        "data": {
            "HRRR": {
                "short_term": []
            },
            "USDA": [],
            "sentinel": []
        }
    }
    
    for year in years:
        # HRRR data
        hrrr_files = [
            f"HRRR/{year}/{state.upper()}/HRRR_{state_ansi}_{state.upper()}_{year}-{month:02d}.csv"
            for month in range(grow_season[0], grow_season[1] + 1)
        ]
        config["data"]["HRRR"]["short_term"].append(hrrr_files)
        
        # USDA data
        if crop_type=="Soybeans":
            config["data"]["USDA"].append(f"USDA/{crop_type}/{year}/USDA_Soybean_County_{year}.csv")
        else:
            config["data"]["USDA"].append(f"USDA/{crop_type}/{year}/USDA_{crop_type}_County_{year}.csv")
        
        # Sentinel data
        quarters = [
            (f"{year}-01-01", f"{year}-03-31"),
            (f"{year}-04-01", f"{year}-06-30"),
            (f"{year}-07-01", f"{year}-09-30"),
            (f"{year}-10-01", f"{year}-12-31")
        ]
        
        sentinel_files = []
        for start, end in quarters:
            quarter_start = datetime.strptime(start, "%Y-%m-%d")
            quarter_end = datetime.strptime(end, "%Y-%m-%d")
            if (grow_season[0] <= quarter_start.month <= grow_season[1]) or \
               (grow_season[0] <= quarter_end.month <= grow_season[1]):
                sentinel_files.append(f"AG/{state.upper()}/{year}/Agriculture_{state_ansi}_{state.upper()}_{start}_{end}.h5")
        
        config["data"]["sentinel"].append(sentinel_files)
    
    return config

# Train
years = list(range(2018,2022))
state = "AL"
state_ansi = "01"
fips = ['01003', '01015', '01019', '01031', '01039', '01045', '01047', '01053', '01061', 
        '01067', '01069', '01077', '01079', '01083', '01089', '01097', '01099', '01117'] 

crop_type = "Cotton"
grow_season = [4, 9]  # April to September

train_config = make_config(years, state, state_ansi, fips, crop_type, grow_season)
with open('train_config.json', 'w') as file:
    json.dump(train_config, file)

# Test
years = [2022]
test_config = make_config(years,  state, state_ansi, fips, crop_type, grow_season)
with open('test_config.json', 'w') as file:
    json.dump(test_config, file)

In [10]:
base_dir = "/kaggle/input/cropnetv2"

train_config = "train_config.json"
test_config = "test_config.json"
train_hrrr_dataset = HRRRComputedDataset(base_dir,train_config)
train_usda_dataset = USDACropDataset(base_dir,train_config,crop_type)

test_hrrr_dataset = HRRRComputedDataset(base_dir,test_config)
test_usda_dataset = USDACropDataset(base_dir,test_config,crop_type)

# Create data loaders
train_hrrr_loader = DataLoader(train_hrrr_dataset, batch_size=1, shuffle=False)
train_usda_loader = DataLoader(train_usda_dataset, batch_size=1, shuffle=False)
test_hrrr_loader = DataLoader(test_hrrr_dataset, batch_size=1, shuffle=False)
test_usda_loader = DataLoader(test_usda_dataset, batch_size=1, shuffle=False)

# Initialize and train model
predictor = DecisionTreeCropPredictor()

# Train and get metrics
train_metrics = predictor.train(train_hrrr_loader, train_usda_loader)

# Evaluate on test set
test_metrics = predictor.evaluate(test_hrrr_loader, test_usda_loader)
print(crop_type)
pprint(test_metrics)

Cotton
{'Production': {'MAE': 0.44,
                'MAPE': 4.46,
                'MSE': 0.32,
                'RMSE': 0.56,
                'SMAPE': 4.33},
 'Yield': {'MAE': 0.19, 'MAPE': 2.76, 'MSE': 0.05, 'RMSE': 0.21, 'SMAPE': 2.76}}


## Corn

In [11]:

def make_config(years: List[int], state: str, state_ansi: str, fips: str, crop_type: str, grow_season: List[int]):

    config = {
        "FIPS": fips,
        "years": years,
        "state": state.upper(),
        "crop_type": crop_type,
        "data": {
            "HRRR": {
                "short_term": []
            },
            "USDA": [],
            "sentinel": []
        }
    }
    
    for year in years:
        # HRRR data
        hrrr_files = [
            f"HRRR/{year}/{state.upper()}/HRRR_{state_ansi}_{state.upper()}_{year}-{month:02d}.csv"
            for month in range(grow_season[0], grow_season[1] + 1)
        ]
        config["data"]["HRRR"]["short_term"].append(hrrr_files)
        
        # USDA data
        if crop_type=="Soybeans":
            config["data"]["USDA"].append(f"USDA/{crop_type}/{year}/USDA_Soybean_County_{year}.csv")
        else:
            config["data"]["USDA"].append(f"USDA/{crop_type}/{year}/USDA_{crop_type}_County_{year}.csv")
        
        # Sentinel data
        quarters = [
            (f"{year}-01-01", f"{year}-03-31"),
            (f"{year}-04-01", f"{year}-06-30"),
            (f"{year}-07-01", f"{year}-09-30"),
            (f"{year}-10-01", f"{year}-12-31")
        ]
        
        sentinel_files = []
        for start, end in quarters:
            quarter_start = datetime.strptime(start, "%Y-%m-%d")
            quarter_end = datetime.strptime(end, "%Y-%m-%d")
            if (grow_season[0] <= quarter_start.month <= grow_season[1]) or \
               (grow_season[0] <= quarter_end.month <= grow_season[1]):
                sentinel_files.append(f"AG/{state.upper()}/{year}/Agriculture_{state_ansi}_{state.upper()}_{start}_{end}.h5")
        
        config["data"]["sentinel"].append(sentinel_files)
    
    return config

# Train
years = list(range(2018,2022))
state = "IL"
state_ansi = "17"
fips = ['17007', '17011', '17015', '17017', '17019', '17021', '17025', '17027',  
        '17037', '17049', '17053', '17055', '17057', '17059', '17061', '17063', '17073', 
        '17075', '17077', '17081', '17085', '17089', '17093', '17095', '17101', '17103', 
        '17105', '17107', '17113', '17115', '17117', '17119', '17121', '17123', '17133', 
        '17135', '17139', '17141', '17143', '17147', '17157', '17163', '17167', '17169', 
        '17173', '17175', '17177', '17179', '17189', '17193', '17195', '17201', '17203'] 
crop_type = "Corn"
grow_season = [4, 9]  # April to September

train_config = make_config(years, state, state_ansi, fips, crop_type, grow_season)
with open('train_config.json', 'w') as file:
    json.dump(train_config, file)

# Test
years = [2022]
test_config = make_config(years,  state, state_ansi, fips, crop_type, grow_season)
with open('test_config.json', 'w') as file:
    json.dump(test_config, file)

In [12]:
base_dir = "/kaggle/input/cropnetv2"

train_config = "train_config.json"
test_config = "test_config.json"
train_hrrr_dataset = HRRRComputedDataset(base_dir,train_config)
train_usda_dataset = USDACropDataset(base_dir,train_config,crop_type)

test_hrrr_dataset = HRRRComputedDataset(base_dir,test_config)
test_usda_dataset = USDACropDataset(base_dir,test_config,crop_type)

# Create data loaders
train_hrrr_loader = DataLoader(train_hrrr_dataset, batch_size=1, shuffle=False)
train_usda_loader = DataLoader(train_usda_dataset, batch_size=1, shuffle=False)
test_hrrr_loader = DataLoader(test_hrrr_dataset, batch_size=1, shuffle=False)
test_usda_loader = DataLoader(test_usda_dataset, batch_size=1, shuffle=False)

# Initialize and train model
predictor = DecisionTreeCropPredictor()

# Train and get metrics
train_metrics = predictor.train(train_hrrr_loader, train_usda_loader)

# Evaluate on test set
test_metrics = predictor.evaluate(test_hrrr_loader, test_usda_loader)
print(crop_type)
pprint(test_metrics)

Corn
{'Production': {'MAE': 0.63,
                'MAPE': 3.73,
                'MSE': 0.61,
                'RMSE': 0.78,
                'SMAPE': 3.71},
 'Yield': {'MAE': 0.08, 'MAPE': 1.54, 'MSE': 0.01, 'RMSE': 0.1, 'SMAPE': 1.54}}


## Soybeans

In [13]:
def make_config(years: List[int], state: str, state_ansi: str, fips: str, crop_type: str, grow_season: List[int]):

    config = {
        "FIPS": fips,
        "years": years,
        "state": state.upper(),
        "crop_type": crop_type,
        "data": {
            "HRRR": {
                "short_term": []
            },
            "USDA": [],
            "sentinel": []
        }
    }
    
    for year in years:
        # HRRR data
        hrrr_files = [
            f"HRRR/{year}/{state.upper()}/HRRR_{state_ansi}_{state.upper()}_{year}-{month:02d}.csv"
            for month in range(grow_season[0], grow_season[1] + 1)
        ]
        config["data"]["HRRR"]["short_term"].append(hrrr_files)
        
        # USDA data
        if crop_type=="Soybeans":
            config["data"]["USDA"].append(f"USDA/{crop_type}/{year}/USDA_Soybean_County_{year}.csv")
        else:
            config["data"]["USDA"].append(f"USDA/{crop_type}/{year}/USDA_{crop_type}_County_{year}.csv")
        
        # Sentinel data
        quarters = [
            (f"{year}-01-01", f"{year}-03-31"),
            (f"{year}-04-01", f"{year}-06-30"),
            (f"{year}-07-01", f"{year}-09-30"),
            (f"{year}-10-01", f"{year}-12-31")
        ]
        
        sentinel_files = []
        for start, end in quarters:
            quarter_start = datetime.strptime(start, "%Y-%m-%d")
            quarter_end = datetime.strptime(end, "%Y-%m-%d")
            if (grow_season[0] <= quarter_start.month <= grow_season[1]) or \
               (grow_season[0] <= quarter_end.month <= grow_season[1]):
                sentinel_files.append(f"AG/{state.upper()}/{year}/Agriculture_{state_ansi}_{state.upper()}_{start}_{end}.h5")
        
        config["data"]["sentinel"].append(sentinel_files)
    
    return config

# Train
years = list(range(2018,2022))
state = "IL"
state_ansi = "17"
fips = ['17005', '17007', '17009', '17011', '17015', '17019', '17025', '17027', '17037', 
        '17045', '17049', '17053', '17055', '17057', '17059', '17063', '17073', '17075', '17077', 
        '17081', '17089', '17091', '17095', '17101', '17103', '17105', '17113', '17115', '17117', 
        '17119', '17121', '17129', '17133', '17139', '17141', '17143', '17145', '17153', '17157', 
        '17163', '17167', '17173', '17177', '17179', '17189', '17193', '17197', '17201', '17203']

crop_type = "Soybeans"
grow_season = [4, 9]  # April to September

train_config = make_config(years, state, state_ansi, fips, crop_type, grow_season)
with open('train_config.json', 'w') as file:
    json.dump(train_config, file)

# Test
years = [2022]
test_config = make_config(years,  state, state_ansi, fips, crop_type, grow_season)
with open('test_config.json', 'w') as file:
    json.dump(test_config, file)


In [14]:
base_dir = "/kaggle/input/cropnetv2"

train_config = "train_config.json"
test_config = "test_config.json"
train_hrrr_dataset = HRRRComputedDataset(base_dir,train_config)
train_usda_dataset = USDACropDataset(base_dir,train_config,crop_type)

test_hrrr_dataset = HRRRComputedDataset(base_dir,test_config)
test_usda_dataset = USDACropDataset(base_dir,test_config,crop_type)

# Create data loaders
train_hrrr_loader = DataLoader(train_hrrr_dataset, batch_size=1, shuffle=False)
train_usda_loader = DataLoader(train_usda_dataset, batch_size=1, shuffle=False)
test_hrrr_loader = DataLoader(test_hrrr_dataset, batch_size=1, shuffle=False)
test_usda_loader = DataLoader(test_usda_dataset, batch_size=1, shuffle=False)

# Initialize and train model
predictor = DecisionTreeCropPredictor()

# Train and get metrics
train_metrics = predictor.train(train_hrrr_loader, train_usda_loader)

# Evaluate on test set
test_metrics = predictor.evaluate(test_hrrr_loader, test_usda_loader)
print(crop_type)
pprint(test_metrics)

Soybeans
{'Production': {'MAE': 0.7,
                'MAPE': 4.46,
                'MSE': 0.71,
                'RMSE': 0.84,
                'SMAPE': 4.52},
 'Yield': {'MAE': 0.13, 'MAPE': 3.15, 'MSE': 0.03, 'RMSE': 0.16, 'SMAPE': 3.18}}
