# PyTorch Neural Network for Cattle Milk Yield Prediction

This notebook implements nested cross-validation for PyTorch neural networks to predict milk yield.
- Uses MPS (Metal Performance Shaders) for GPU acceleration on Apple Silicon
- Uses entire dataset (no sampling)
- Saves models to models_nn folder


In [1]:
# Imports
import pandas as pd
import numpy as np
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from itertools import product
import joblib

# Create models_nn directory if it doesn't exist
os.makedirs('models_nn', exist_ok=True)

# Set device (MPS for Apple Silicon, fallback to CPU)
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS (Metal Performance Shaders)")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA")
else:
    device = torch.device("cpu")
    print("Using CPU")

print(f"Device: {device}")


Using MPS (Metal Performance Shaders)
Device: mps


## Load and Prepare Data

In [2]:
# Load cleaned data
TRAIN_PATH = "cleaned_train_nn.csv"
TEST_PATH = "cleaned_test_nn.csv"

train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"\nTrain columns: {train.columns.tolist()}")


Train shape: (209926, 41)
Test shape: (40000, 41)

Train columns: ['Cattle_ID', 'Age_Months', 'Weight_kg', 'Parity', 'Lactation_Stage', 'Days_in_Milk', 'Feed_Type', 'Feed_Quantity_kg', 'Feeding_Frequency', 'Water_Intake_L', 'Walking_Distance_km', 'Grazing_Duration_hrs', 'Resting_Hours', 'Ambient_Temperature_C', 'Humidity_percent', 'Housing_Score', 'FMD_Vaccine', 'Brucellosis_Vaccine', 'HS_Vaccine', 'BQ_Vaccine', 'Anthrax_Vaccine', 'IBR_Vaccine', 'BVD_Vaccine', 'Rabies_Vaccine', 'Previous_Week_Avg_Yield', 'Body_Condition_Score', 'Milking_Interval_hrs', 'Farm_ID', 'Mastitis', 'Milk_Yield_L', 'Breed_Brown Swiss', 'Breed_Brown Swiss ', 'Breed_Guernsey', 'Breed_Holstein', 'Breed_Holstien', 'Breed_Jersey', 'Management_System_Intensive', 'Management_System_Mixed', 'Management_System_Pastoral', 'Management_System_Semi_Intensive', 'Date_Ordinal']


In [3]:
# Separate features and target
id_cols = ['Cattle_ID'] if 'Cattle_ID' in train.columns else []
target_col = 'Milk_Yield_L'

X_train = train.drop(columns=[target_col] + id_cols, errors='ignore')
y_train = train[target_col]

X_test = test.drop(columns=id_cols, errors='ignore')

print(f"Training features shape: {X_train.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Test features shape: {X_test.shape}")


Training features shape: (209926, 39)
Training target shape: (209926,)
Test features shape: (40000, 40)


In [4]:
# Identify categorical and numerical columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")
print(f"\nNumerical columns ({len(numerical_cols)}): {numerical_cols}")


Categorical columns (1): ['Feed_Type']

Numerical columns (28): ['Age_Months', 'Weight_kg', 'Parity', 'Lactation_Stage', 'Days_in_Milk', 'Feed_Quantity_kg', 'Feeding_Frequency', 'Water_Intake_L', 'Walking_Distance_km', 'Grazing_Duration_hrs', 'Resting_Hours', 'Ambient_Temperature_C', 'Humidity_percent', 'Housing_Score', 'FMD_Vaccine', 'Brucellosis_Vaccine', 'HS_Vaccine', 'BQ_Vaccine', 'Anthrax_Vaccine', 'IBR_Vaccine', 'BVD_Vaccine', 'Rabies_Vaccine', 'Previous_Week_Avg_Yield', 'Body_Condition_Score', 'Milking_Interval_hrs', 'Farm_ID', 'Mastitis', 'Date_Ordinal']


## Preprocessing Pipeline


In [5]:
# Create preprocessing pipeline (same as modeling.ipynb)
preprocessor_scaled = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_cols),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ]), categorical_cols)
    ],
    remainder='drop'
)

print("Preprocessing pipeline created")


Preprocessing pipeline created


## PyTorch Dataset and Model Definitions


In [6]:
# PyTorch Dataset class
class CattleDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.FloatTensor(X)
        self.y = torch.FloatTensor(y) if y is not None else None
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        if self.y is not None:
            return self.X[idx], self.y[idx]
        return self.X[idx]

# Neural Network Model
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_sizes, activation='relu', dropout=0.0):
        super(NeuralNetwork, self).__init__()
        
        layers = []
        prev_size = input_size
        
        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(prev_size, hidden_size))
            if activation == 'relu':
                layers.append(nn.ReLU())
            elif activation == 'tanh':
                layers.append(nn.Tanh())
            if dropout > 0:
                layers.append(nn.Dropout(dropout))
            prev_size = hidden_size
        
        # Output layer (regression)
        layers.append(nn.Linear(prev_size, 1))
        
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x).squeeze()


## Training Function


In [7]:
def train_model(model, train_loader, val_loader, learning_rate, weight_decay, epochs, device):
    """Train a PyTorch neural network model"""
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    
    best_val_loss = float('inf')
    patience = 20
    patience_counter = 0
    
    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()
        
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        
        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break
    
    return model, best_val_loss


## Nested Cross-Validation Setup


In [8]:
# Setup nested CV
OUTER_CV = 5
INNER_CV = 3

outer_cv = KFold(n_splits=OUTER_CV, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=INNER_CV, shuffle=True, random_state=42)

print(f"Outer CV folds: {OUTER_CV}")
print(f"Inner CV folds: {INNER_CV}")
print("Using entire dataset (no sampling)")


Outer CV folds: 5
Inner CV folds: 3
Using entire dataset (no sampling)


## Hyperparameter Grid


In [9]:
# Hyperparameter grid (similar to sklearn MLPRegressor)
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (200, 100)],
    'alpha': [0.0001, 0.001, 0.01],  # weight_decay (L2 regularization)
    'learning_rate': [0.001, 0.01],
    'activation': ['relu', 'tanh']
}

print("Hyperparameter grid:")
for key, values in param_grid.items():
    print(f"  {key}: {values}")


Hyperparameter grid:
  hidden_layer_sizes: [(50,), (100,), (100, 50), (200, 100)]
  alpha: [0.0001, 0.001, 0.01]
  learning_rate: [0.001, 0.01]
  activation: ['relu', 'tanh']


## Nested Cross-Validation Function


In [10]:
def nested_cv_evaluation_pytorch(X, y, preprocessor, param_grid, outer_cv, inner_cv, device, batch_size=512, epochs=200):
    """
    Perform nested cross-validation for PyTorch neural network.
    Saves each fold's best model to the models_nn folder.
    """
    print(f"\n{'='*60}")
    print(f"Evaluating PyTorch Neural Network")
    print(f"{'='*60}")
    print(f"Using entire dataset: {len(X):,} samples")
    
    # Fit preprocessor on full data to get feature dimension
    X_processed = preprocessor.fit_transform(X)
    input_size = X_processed.shape[1]
    print(f"Input feature size after preprocessing: {input_size}")
    
    outer_scores = []
    best_params_list = []
    best_models = []
    
    for fold_idx, (train_idx, val_idx) in enumerate(outer_cv.split(X, y)):
        print(f"\nOuter Fold {fold_idx + 1}/{outer_cv.n_splits}")
        
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
        
        # Fit preprocessor on training fold
        X_train_processed = preprocessor.fit_transform(X_train_fold)
        X_val_processed = preprocessor.transform(X_val_fold)
        
        # Inner CV for hyperparameter tuning
        best_inner_score = float('inf')
        best_inner_params = None
        best_inner_model = None
        
        # Generate all parameter combinations
        param_combinations = list(product(*param_grid.values()))
        param_keys = list(param_grid.keys())
        
        print(f"  Testing {len(param_combinations)} hyperparameter combinations...")
        
        for param_idx, param_values in enumerate(param_combinations):
            params = dict(zip(param_keys, param_values))
            
            inner_scores = []
            
            for inner_train_idx, inner_val_idx in inner_cv.split(X_train_fold, y_train_fold):
                X_inner_train = X_train_processed[inner_train_idx]
                X_inner_val = X_train_processed[inner_val_idx]
                y_inner_train = y_train_fold.iloc[inner_train_idx].values
                y_inner_val = y_train_fold.iloc[inner_val_idx].values
                
                # Create datasets and dataloaders
                train_dataset = CattleDataset(X_inner_train, y_inner_train)
                val_dataset = CattleDataset(X_inner_val, y_inner_val)
                train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
                val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
                
                # Create and train model
                model = NeuralNetwork(
                    input_size=input_size,
                    hidden_sizes=params['hidden_layer_sizes'],
                    activation=params['activation'],
                    dropout=0.0
                )
                
                _, val_loss = train_model(
                    model, train_loader, val_loader,
                    learning_rate=params['learning_rate'],
                    weight_decay=params['alpha'],
                    epochs=epochs,
                    device=device
                )
                
                inner_scores.append(val_loss)
            
            avg_inner_score = np.mean(inner_scores)
            
            if avg_inner_score < best_inner_score:
                best_inner_score = avg_inner_score
                best_inner_params = params
            
            if (param_idx + 1) % 5 == 0:
                print(f"    Tested {param_idx + 1}/{len(param_combinations)} combinations...")
        
        # Train best model on full training fold with best parameters
        print(f"  Best hyperparameters found. Training final model...")
        train_dataset = CattleDataset(X_train_processed, y_train_fold.values)
        val_dataset = CattleDataset(X_val_processed, y_val_fold.values)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
        
        best_inner_model = NeuralNetwork(
            input_size=input_size,
            hidden_sizes=best_inner_params['hidden_layer_sizes'],
            activation=best_inner_params['activation'],
            dropout=0.0
        )
        best_inner_model, _ = train_model(
            best_inner_model, train_loader, val_loader,
            learning_rate=best_inner_params['learning_rate'],
            weight_decay=best_inner_params['alpha'],
            epochs=epochs,
            device=device
        )
        
        # Evaluate best model on outer validation set
        best_inner_model.eval()
        with torch.no_grad():
            X_val_tensor = torch.FloatTensor(X_val_processed).to(device)
            y_pred = best_inner_model(X_val_tensor).cpu().numpy()
        
        mse = mean_squared_error(y_val_fold, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_val_fold, y_pred)
        r2 = r2_score(y_val_fold, y_pred)
        
        outer_scores.append({
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2': r2
        })
        
        best_params_list.append(best_inner_params)
        best_models.append((best_inner_model, preprocessor))
        
        # Save this fold's best model and preprocessor
        model_path = f'models_nn/neuralnetwork_fold_{fold_idx + 1}.pth'
        preprocessor_path = f'models_nn/preprocessor_fold_{fold_idx + 1}.pkl'
        
        torch.save({
            'model_state_dict': best_inner_model.state_dict(),
            'model_params': {
                'input_size': input_size,
                'hidden_sizes': best_inner_params['hidden_layer_sizes'],
                'activation': best_inner_params['activation']
            },
            'hyperparams': best_inner_params
        }, model_path)
        
        joblib.dump(preprocessor, preprocessor_path)
        
        print(f"  Saved model to {model_path}")
        print(f"  Saved preprocessor to {preprocessor_path}")
        print(f"  Best params: {best_inner_params}")
        print(f"  Validation RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")
    
    # Aggregate results
    avg_rmse = np.mean([s['rmse'] for s in outer_scores])
    avg_mae = np.mean([s['mae'] for s in outer_scores])
    avg_r2 = np.mean([s['r2'] for s in outer_scores])
    std_rmse = np.std([s['rmse'] for s in outer_scores])
    
    print(f"\nPyTorch Neural Network Results:")
    print(f"  Average RMSE: {avg_rmse:.4f} (±{std_rmse:.4f})")
    print(f"  Average MAE: {avg_mae:.4f}")
    print(f"  Average R²: {avg_r2:.4f}")
    
    return {
        'outer_scores': outer_scores,
        'best_params': best_params_list,
        'best_models': best_models,
        'avg_rmse': avg_rmse,
        'avg_mae': avg_mae,
        'avg_r2': avg_r2,
        'std_rmse': std_rmse
    }


## Run Nested Cross-Validation


In [11]:
# Run nested CV
start_time = time.time()

results = nested_cv_evaluation_pytorch(
    X_train, y_train,
    preprocessor_scaled,
    param_grid,
    outer_cv,
    inner_cv,
    device=device,
    batch_size=512,
    epochs=200
)

total_time = time.time() - start_time
print(f"\nTotal time: {total_time/60:.2f} minutes")



Evaluating PyTorch Neural Network
Using entire dataset: 209,926 samples
Input feature size after preprocessing: 36

Outer Fold 1/5
  Testing 48 hyperparameter combinations...
    Tested 5/48 combinations...
    Tested 10/48 combinations...
    Tested 15/48 combinations...
    Tested 20/48 combinations...
    Tested 25/48 combinations...
    Tested 30/48 combinations...
    Tested 35/48 combinations...
    Tested 40/48 combinations...
    Tested 45/48 combinations...
  Best hyperparameters found. Training final model...
  Saved model to models_nn/neuralnetwork_fold_1.pth
  Saved preprocessor to models_nn/preprocessor_fold_1.pkl
  Best params: {'hidden_layer_sizes': (200, 100), 'alpha': 0.01, 'learning_rate': 0.001, 'activation': 'tanh'}
  Validation RMSE: 4.2592, MAE: 3.3082, R²: 0.3682

Outer Fold 2/5
  Testing 48 hyperparameter combinations...
    Tested 5/48 combinations...
    Tested 10/48 combinations...
    Tested 15/48 combinations...
    Tested 20/48 combinations...
    Tested 

  return F.mse_loss(input, target, reduction=self.reduction)


  Saved model to models_nn/neuralnetwork_fold_2.pth
  Saved preprocessor to models_nn/preprocessor_fold_2.pkl
  Best params: {'hidden_layer_sizes': (200, 100), 'alpha': 0.01, 'learning_rate': 0.001, 'activation': 'tanh'}
  Validation RMSE: 4.2490, MAE: 3.3099, R²: 0.3673

Outer Fold 3/5
  Testing 48 hyperparameter combinations...
    Tested 5/48 combinations...
    Tested 10/48 combinations...
    Tested 15/48 combinations...
    Tested 20/48 combinations...
    Tested 25/48 combinations...
    Tested 30/48 combinations...
    Tested 35/48 combinations...
    Tested 40/48 combinations...
    Tested 45/48 combinations...
  Best hyperparameters found. Training final model...


  return F.mse_loss(input, target, reduction=self.reduction)


  Saved model to models_nn/neuralnetwork_fold_3.pth
  Saved preprocessor to models_nn/preprocessor_fold_3.pkl
  Best params: {'hidden_layer_sizes': (200, 100), 'alpha': 0.01, 'learning_rate': 0.01, 'activation': 'relu'}
  Validation RMSE: 4.2780, MAE: 3.3522, R²: 0.3592

Outer Fold 4/5
  Testing 48 hyperparameter combinations...
    Tested 5/48 combinations...
    Tested 10/48 combinations...


KeyboardInterrupt: 

## Results Summary


In [None]:
# Display results summary
print("\n" + "="*60)
print("RESULTS SUMMARY")
print("="*60)
print(f"Average RMSE: {results['avg_rmse']:.4f} (±{results['std_rmse']:.4f})")
print(f"Average MAE: {results['avg_mae']:.4f}")
print(f"Average R²: {results['avg_r2']:.4f}")

print("\nBest parameters for each fold:")
for i, params in enumerate(results['best_params']):
    print(f"\nFold {i+1}:")
    for key, value in params.items():
        print(f"  {key}: {value}")


In [None]:
# Load cleaned data
TRAIN_PATH = "cleaned_train.csv"
TEST_PATH = "cleaned_test.csv"

train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"\nTrain columns: {train.columns.tolist()}")
