In [1]:
import sys
from pathlib import Path

# Add project root to sys.path (works in Jupyter)
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

In [2]:
import os
import time
import sys
import torch
import numpy as np
import yaml
import logging
from pathlib import Path
from src import architecture, dataset, trainer
from utils import experiment, logger, metrics, modules, profiler

### Neural Network Architecture Configuration

In [3]:
from config.schema import ArchitectureConfig
from src.architecture import NeuralNetwork

arch_config = ArchitectureConfig(
    in_size=9,
    out_size=3,
    hidden_layers=[64, 32],
    activation="ReLU",
    use_dropout=False,
    dropout=0.5,
    dropout_inplace=False,
    final_activation=None
)

model = NeuralNetwork(arch_config)

In [4]:
# Print current model architecture

from pprint import pprint

pprint(arch_config.model_dump(), sort_dicts=False)
print()
print(model)
print()
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {total_params}")

{'in_size': 9,
 'out_size': 3,
 'hidden_layers': [64, 32],
 'activation': 'ReLU',
 'use_dropout': False,
 'dropout': 0.5,
 'dropout_inplace': False,
 'final_activation': None}

NeuralNetwork(
  (layers): ModuleList(
    (0): Linear(in_features=9, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=3, bias=True)
  )
)

Number of trainable parameters: 2819


### Data Loading Configuration

In [5]:
from config.schema import DataConfig
from src.dataset import prepare_dataloaders

data_config = DataConfig(
    path_to_data="c:/Users/cervinka/cervinka/dataset_compressible_flow_60M_training_nstep180.csv",
    num_samples=5000,
    batch_size=50,
    in_cols=["A11", "A21", "A31", "A12", "A22", "A32", "A13", "A23", "A33"],
    out_cols=["Shear"],
    val_split=0.1,
    shuffle=False,
    sliding_window=None
)

train_loader, val_loader = prepare_dataloaders(data_config)

In [6]:
# Print current data configuration

from pprint import pprint

pprint(data_config.model_dump(), sort_dicts=False)
print()
def print_loader_stats(train_loader, val_loader):
    try:
        train_samples = len(train_loader.dataset)
        val_samples = len(val_loader.dataset)
    except Exception:
        # Fallback for custom loaders
        train_samples = sum(1 for _ in train_loader)
        val_samples = sum(1 for _ in val_loader)

    train_batches = len(train_loader)
    val_batches = len(val_loader)

    print(f"Train samples: {train_samples}")
    print(f"Validation samples: {val_samples}")
    print(f"Train batches: {train_batches}")
    print(f"Validation batches: {val_batches}")

    if hasattr(train_loader, 'batch_size'):
        print(f"Batch size: {train_loader.batch_size}")

    # Example batch shapes
    for batch_x, batch_y in train_loader:
        print(f"Example train batch X shape: {batch_x.shape}")
        print(f"Example train batch Y shape: {batch_y.shape}")
        break
    for batch_x, batch_y in val_loader:
        print(f"Example val batch X shape: {batch_x.shape}")
        print(f"Example val batch Y shape: {batch_y.shape}")
        break

print_loader_stats(train_loader, val_loader)

{'path_to_data': 'c:/Users/cervinka/cervinka/dataset_compressible_flow_60M_training_nstep180.csv',
 'num_samples': 5000,
 'batch_size': 50,
 'in_cols': ['A11', 'A21', 'A31', 'A12', 'A22', 'A32', 'A13', 'A23', 'A33'],
 'out_cols': ['Shear'],
 'val_split': 0.1,
 'shuffle': False,
 'sliding_window': None}

Train samples: 4500
Validation samples: 500
Train batches: 90
Validation batches: 10
Batch size: 50
Example train batch X shape: torch.Size([50, 9])
Example train batch Y shape: torch.Size([50, 1])
Example val batch X shape: torch.Size([50, 9])
Example val batch Y shape: torch.Size([50, 1])


### Neural Network Training Configuration

In [7]:
from config.schema import TrainingConfig

training_config = TrainingConfig(
    learning_rate=0.001,
    optimizer="Adam",
    loss_function="MSELoss",
    epochs=30,
    early_stopping=True,
    patience=10,
    scheduler="ReduceLROnPlateau",
    scheduler_patience=3,
    scheduler_factor=0.5,
    scheduler_threshold=0.01
)

In [8]:
# Print current training configuration

from pprint import pprint

pprint(training_config.model_dump(), sort_dicts=False)

{'learning_rate': 0.001,
 'optimizer': 'Adam',
 'loss_function': 'MSELoss',
 'epochs': 30,
 'early_stopping': True,
 'patience': 10,
 'scheduler': 'ReduceLROnPlateau',
 'scheduler_patience': 3,
 'scheduler_factor': 0.5,
 'scheduler_threshold': 0.01}


# Creating new experiment

In [9]:
# Set experiment name and seed
experiment_name = "exp1"
seed = 42

In [10]:
# Create new jupyter experiment

import random
import hashlib
from utils.experiment import create_experiment_dir
from utils.logger import setup_logger

# Set output directory for experiments (relative to notebooks/outputs)
output_dir = Path.cwd() / "outputs"
exp_dir = create_experiment_dir(str(output_dir), experiment_name)

# Set random seeds for reproducibility
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# --- Logging setup ---
log_file = os.path.join(exp_dir, "notebook.log")
logger = setup_logger(verbose=True, log_to_file=True, log_file=log_file)

logger.info(f"Experiment directory: {exp_dir}")
logger.info(f"Seed: {seed}")

# --- Logging setup ---
log_file = os.path.join(exp_dir, "notebook.log")
logger = setup_logger(verbose=True, log_to_file=True, log_file=log_file)

# --- Config hash for reproducibility ---
import json
def config_hash(*configs):
    # Combine all config dicts and hash them
    config_str = json.dumps([c.model_dump() for c in configs], sort_keys=True)
    return hashlib.md5(config_str.encode("utf-8")).hexdigest()

hash_val = config_hash(arch_config, training_config, data_config)
logger.info(f"Config hash: {hash_val}")

# --- CUDA/Device info ---
logger.info(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    logger.info(f"CUDA device: {torch.cuda.get_device_name(0)}")
    cuda_version = getattr(torch.version, "cuda", "N/A")  # type: ignore
    logger.info(f"CUDA version: {cuda_version}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# --- Model parameter count ---
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
logger.info(f"Total trainable parameters: {num_params}")

# --- Log experiment and configs ---
logger.info(f"Experiment: {experiment_name}")
logger.info(f"Model architecture:\n{model}")
logger.info(f"Training config: {training_config}")
logger.info(f"Data config: {data_config}")

# --- Log dataset stats ---
try:
    train_samples = len(train_loader.dataset) # type: ignore
    val_samples = len(val_loader.dataset) # type: ignore
except Exception:
    train_samples = sum(1 for _ in train_loader)
    val_samples = sum(1 for _ in val_loader)
logger.info(f"Training samples: {train_samples} | Validation samples: {val_samples} | Batch size: {data_config.batch_size}")

2025-08-19 14:14:36,178 | INFO | Experiment directory: c:\Users\cervinka\cervinka\GitHub\MathCAS\notebooks\outputs\2025-08-19_14-14-36_exp1
2025-08-19 14:14:36,178 | INFO | Seed: 42
2025-08-19 14:14:36,180 | INFO | Config hash: 549e7dd113d2d29099a8369ad1613133
2025-08-19 14:14:36,180 | INFO | CUDA available: True
2025-08-19 14:14:36,181 | INFO | CUDA device: NVIDIA GeForce RTX 3070
2025-08-19 14:14:36,182 | INFO | CUDA version: 12.1
2025-08-19 14:14:36,182 | INFO | Using device: cuda
2025-08-19 14:14:36,183 | INFO | Total trainable parameters: 2819
2025-08-19 14:14:36,183 | INFO | Experiment: exp1
2025-08-19 14:14:36,184 | INFO | Model architecture:
NeuralNetwork(
  (layers): ModuleList(
    (0): Linear(in_features=9, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=3, bias=True)
  )
)
2025-08-19 14:14:36,185 | INFO | Training config: learning_rate=0.001 optimizer='Adam' 

### Training

In [None]:
# Run the training loop #

from src.trainer import train_one_epoch, validate
from utils.modules import get_loss_function, get_optimizer, get_scheduler
from utils.metrics import format_time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

criterion = get_loss_function(training_config.loss_function)
optimizer = get_optimizer(
    training_config.optimizer,
    model.parameters(),
    training_config.learning_rate
)

scheduler = None
if training_config.scheduler == "ReduceLROnPlateau":
    scheduler = get_scheduler(
        "ReduceLROnPlateau",
        optimizer,
        patience=training_config.scheduler_patience or 2,
        factor=training_config.scheduler_factor or 0.5,
        threshold=training_config.scheduler_threshold or 1e-4,
        verbose=True
    )

best_val_loss = float("inf")
best_epoch = -1
patience_counter = 0
train_losses, val_losses, elapsed_times = [], [], []
start_time = time.time()

# Training loop
try:
    for epoch in range(training_config.epochs):

        train_loader, val_loader = prepare_dataloaders(data_config, epoch=epoch)

        train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
        val_loss = validate(model, val_loader, criterion, device)

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        # Step the scheduler
        if scheduler is not None:
            scheduler.step(val_loss)

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch + 1
            best_model_state = model.state_dict()
            patience_counter = 0
        else:
            patience_counter += 1

        elapsed_time = time.time() - start_time
        elapsed_times.append(elapsed_time)

        # Log epoch information
        current_lr = optimizer.param_groups[0]['lr']
        logger.info(
            f"Epoch {epoch + 1}/{training_config.epochs} | "
            f"Train Loss: {train_loss:.4e} | Val Loss: {val_loss:.4e} | "
            f"LR: {current_lr:.2e} | Time: {format_time(elapsed_time)}"
        )

        # Log scheduler events (ReduceLROnPlateau)
        if scheduler is not None and hasattr(scheduler, 'num_bad_epochs') and scheduler.num_bad_epochs == 0 and epoch > 0:
            logger.info(f"Learning rate reduced to {current_lr:.2e}")

        if training_config.early_stopping and patience_counter >= training_config.patience:
                    logger.info(f"Early stopping at epoch {epoch + 1}")
                    break
        
    total_time = time.time() - start_time
    logger.info("Training complete.")
    logger.info(f"Best model at epoch {best_epoch} | Val Loss: {best_val_loss:.4e}")
    logger.info(f"Total training time: {format_time(total_time)}")

except Exception as e:
    import traceback
    logger.error("Exception during training:\n" + traceback.format_exc())
    raise

  return F.mse_loss(input, target, reduction=self.reduction)
2025-08-19 14:14:42,215 | INFO | Epoch 1/30 | Train Loss: 8.5302e-01 | Val Loss: 1.2368e-01 | LR: 1.00e-03 | Time: 0:00:05
  return F.mse_loss(input, target, reduction=self.reduction)
2025-08-19 14:14:47,230 | INFO | Epoch 2/30 | Train Loss: 1.2306e-01 | Val Loss: 9.7242e-02 | LR: 1.00e-03 | Time: 0:00:10
2025-08-19 14:14:47,231 | INFO | Learning rate reduced to 1.00e-03
  return F.mse_loss(input, target, reduction=self.reduction)
2025-08-19 14:14:52,251 | INFO | Epoch 3/30 | Train Loss: 9.7776e-02 | Val Loss: 8.4598e-02 | LR: 1.00e-03 | Time: 0:00:15
2025-08-19 14:14:52,252 | INFO | Learning rate reduced to 1.00e-03
  return F.mse_loss(input, target, reduction=self.reduction)
2025-08-19 14:14:57,333 | INFO | Epoch 4/30 | Train Loss: 8.4708e-02 | Val Loss: 7.8318e-02 | LR: 1.00e-03 | Time: 0:00:20
2025-08-19 14:14:57,334 | INFO | Learning rate reduced to 1.00e-03
  return F.mse_loss(input, target, reduction=self.reduction)
20

### Save results

In [None]:
# Save best model #

best_model_path = os.path.join(exp_dir, "best_model.pt")
torch.save(best_model_state, best_model_path)
print(f"Best model saved to {best_model_path}")

Best model saved to c:\Users\cervinka\cervinka\GitHub\MathCAS\notebooks\outputs\2025-08-19_14-14-36_exp1\best_model.pt


In [None]:
# Save training history #

history_path = os.path.join(exp_dir, "training_history.pt")
torch.save({
    "optimizer_state": optimizer.state_dict(),
    "train_losses": train_losses,
    "val_losses": val_losses,
    "best_epoch": best_epoch,
    "best_val_loss": best_val_loss,
}, history_path)
print(f"Training history saved to {history_path}")

### Additional training (WIP)

In [None]:
# Reload best model weights
model.load_state_dict(torch.load(best_model_path))

In [None]:
# Reload optimizer state (if you want to preserve momentum, etc.)
optimizer.load_state_dict(torch.load(history_path)["optimizer_state"])