In [21]:
# Cell 1: Mount Drive and install dependencies
from google.colab import drive
drive.mount('/content/drive')

# Install required packages (should mostly be pre-installed)
!pip install torch torchvision torchaudio
!pip install matplotlib numpy pillow scikit-image opencv-python

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Cell 2: Create directories
import os
from pathlib import Path

# Root folder of the project on the mounted Drive (created if missing)
project_dir = Path('/content/drive/MyDrive/ResearchProject')
project_dir.mkdir(parents=True, exist_ok=True)

# Make sure each expected sub-folder exists underneath the root
for subfolder in ('checkpoints', 'logs'):
    project_dir.joinpath(subfolder).mkdir(exist_ok=True)

print("Project structure ready!")
print(f"Project dir: {project_dir}")

Project structure ready!
Project dir: /content/drive/MyDrive/ResearchProject


In [7]:
# Cell 3: Import and setup
import sys

# Make the project's modules on Drive importable. The membership check keeps
# the cell idempotent: re-running it does not pile up duplicate sys.path entries.
project_module_dir = '/content/drive/MyDrive/ResearchProject'
if project_module_dir not in sys.path:
    sys.path.append(project_module_dir)

import torch
from unet_denoiser import BlindVideoDenoiserUNet
from dataloader import BlindDenoiseDataset
from unet_denoiser_training import train, TemporalDenoiseDataset, create_train_val_split

# Set device: use the GPU when one is visible to torch, otherwise the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
print(f"GPU: {torch.cuda.get_device_name(0) if device == 'cuda' else 'CPU'}")

# Check GPU memory (important!)
# NOTE: the value printed is `total_memory` of device 0, i.e. the card's total
# capacity, not the amount that is currently free.
if device == 'cuda':
    print(f"GPU Memory Available: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

Using device: cuda
GPU: NVIDIA A100-SXM4-80GB
GPU Memory Available: 85.1 GB


In [12]:
"""
# At the start of your notebook, BEFORE training:
# Copying data to local Colab SSD
!cp -r "/content/drive/MyDrive/ResearchProject/DAVISDataset" /content/DAVISDataset
davis_root = '/content/DAVISDataset'
"""

"""
# One-time: create a zip on Drive (run once, then comment out)
!cd "/content/drive/MyDrive/ResearchProject" && zip -r DAVISDataset.zip DAVISDataset
"""



[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
  adding: DAVISDataset/juggling-selfie/00050.jpg (deflated 0%)
  adding: DAVISDataset/juggling-selfie/00038.jpg (deflated 0%)
  adding: DAVISDataset/juggling-selfie/00032.jpg (deflated 0%)
  adding: DAVISDataset/juggling-selfie/00004.jpg (deflated 0%)
  adding: DAVISDataset/juggling-selfie/00041.jpg (deflated 0%)
  adding: DAVISDataset/juggling-selfie/00023.jpg (deflated 0%)
  adding: DAVISDataset/juggling-selfie/00049.jpg (deflated 0%)
  adding: DAVISDataset/juggling-selfie/00067.jpg (deflated 0%)
  adding: DAVISDataset/juggling-selfie/00030.jpg (deflated 0%)
  adding: DAVISDataset/juggling-selfie/00042.jpg (deflated 0%)
  adding: DAVISDataset/juggling-selfie/00036.jpg (deflated 0%)
  adding: DAVISDataset/juggling-selfie/00070.jpg (deflated 0%)
  adding: DAVISDataset/juggling-selfie/00008.jpg (deflated 0%)
  adding: DAVISDataset/juggling-selfie/00047.jpg (deflated 0%)
  adding: DAVISDataset/juggling-selfie/0007

In [8]:
!cp "/content/drive/MyDrive/ResearchProject/DAVISDataset.zip" /content/
!unzip -q /content/DAVISDataset.zip -d /content/

In [10]:
# Cell 4: Create train/val split from DAVIS dataset
from unet_denoiser_training import create_data_loaders, create_train_val_split

# Tunable settings, gathered in one place instead of being buried in the calls.
davis_root = '/content/DAVISDataset'  # dataset extracted on the local disk
val_split = 0.2                       # 20% of videos for validation
split_seed = 42
resize_to = (256, 256)                # resize all frames to this size
batch_size = 16                       # adjust based on GPU memory
num_workers = 2

# An earlier TemporalDataLoader-based setup (batch_size=8, 384x384 frames) used
# to sit here as quoted-out code; it was dead and has been removed.

# Split by video (per the original note, videos are not mixed between train and val)
train_dataset, val_dataset = create_train_val_split(
    davis_root, val_split=val_split, seed=split_seed,
    resize_to=resize_to, use_fp16=True
)
train_loader, val_loader = create_data_loaders(
    train_dataset, val_dataset, batch_size=batch_size, num_workers=num_workers
)

print(f"\nTrain loader batches per epoch: {len(train_loader)}")
print(f"Val loader batches per epoch: {len(val_loader)}")

Total videos: 150
Train videos: 120 (8714 frames)
Val videos: 30 (2017 frames)
Resolution: 256x256, FP16: True

DataLoader config:
  Batch size: 16
  Num workers: 2
  Pin memory: True
  Train batches/epoch: 544
  Val batches/epoch: 127

Train loader batches per epoch: 544
Val loader batches per epoch: 127


In [11]:
# Cell 5: Initialize model and train
model = BlindVideoDenoiserUNet(
    in_channels=9,   # presumably three stacked RGB frames -- TODO confirm against the dataset
    out_channels=3,
    base_channels=64,
    num_stages=3
)

print(f"Total model parameters: {sum(p.numel() for p in model.parameters()):,}")

# Start training.
# - `device` is the value detected in the setup cell, so the same cell also
#   works on a runtime without a GPU instead of hard-coding 'cuda'.
# - loss_type is one of 'l1', 'l2', or 'combined'; loss_alpha is only used for
#   'combined' (0.7 = 70% L1, 30% L2).
# - Checkpoints and logs are written to the local runtime disk; they are not on
#   Drive until copied there.
trained_model, logger = train(
    model, train_loader, val_loader,
    num_epochs=100, initial_lr=1e-3, device=device,
    checkpoint_dir='/content/checkpoints',   # save locally, fast
    log_dir='/content/logs',
    loss_type='combined', loss_alpha=0.7,
    use_amp=True, use_torch_compile=True
)

Total model parameters: 4,541,184


  scaler = GradScaler() if (use_amp and device == "cuda") else None


torch.compile enabled (reduce-overhead mode)

Training config: AMP=ON, Loss=combined, LR=0.001
Epochs: 1-100, Patience: 15



  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


Epoch    1 | Train Loss: 0.148041 | Val Loss: 0.049157 | LR: 1.00e-03 | Time: 749.9s
  → Best model saved! (Val Loss: 0.049157)


  with autocast():
  with autocast():


Epoch    2 | Train Loss: 0.046428 | Val Loss: 0.048124 | LR: 1.00e-03 | Time: 732.8s
  → Best model saved! (Val Loss: 0.048124)


  with autocast():
  with autocast():


Epoch    3 | Train Loss: 0.043799 | Val Loss: 0.043528 | LR: 9.99e-04 | Time: 724.3s
  → Best model saved! (Val Loss: 0.043528)
Epoch    4 | Train Loss: 0.041486 | Val Loss: 0.039928 | LR: 9.98e-04 | Time: 728.1s
  → Best model saved! (Val Loss: 0.039928)
Epoch    5 | Train Loss: 0.039911 | Val Loss: 0.039606 | LR: 9.96e-04 | Time: 727.6s
  → Best model saved! (Val Loss: 0.039606)
Epoch    6 | Train Loss: 0.038740 | Val Loss: 0.039172 | LR: 9.94e-04 | Time: 734.3s
  → Best model saved! (Val Loss: 0.039172)
Epoch    7 | Train Loss: 0.037137 | Val Loss: 0.039927 | LR: 9.91e-04 | Time: 729.7s
Epoch    8 | Train Loss: 0.036751 | Val Loss: 0.036548 | LR: 9.88e-04 | Time: 728.6s
  → Best model saved! (Val Loss: 0.036548)
Epoch    9 | Train Loss: 0.036316 | Val Loss: 0.041450 | LR: 9.84e-04 | Time: 727.4s
Epoch   10 | Train Loss: 0.036096 | Val Loss: 0.034323 | LR: 9.80e-04 | Time: 726.6s
  Checkpoint saved: /content/checkpoints/checkpoint_epoch_010.pt
  → Best model saved! (Val Loss: 0.03432

KeyboardInterrupt: 

In [22]:
# Cell 6: Plot training curves (run this periodically or after training)
import shutil

from unet_denoiser_training import TrainingLogger
import matplotlib.pyplot as plt

# Training writes its logs to the local runtime disk ('/content/logs'), while
# this cell plots from the project's logs folder on Drive. Sync the local logs
# to Drive first so the plot reflects the current run and the logs outlive the
# runtime. When no local logs exist, the Drive folder is used as it is.
# NOTE(review): assumes TrainingLogger reads previously written metrics from
# log_dir -- confirm in unet_denoiser_training.
local_log_dir = Path('/content/logs')
drive_log_dir = project_dir / 'logs'
if local_log_dir.is_dir():
    shutil.copytree(local_log_dir, drive_log_dir, dirs_exist_ok=True)

logger = TrainingLogger(log_dir=str(drive_log_dir))
logger.plot_metrics()  # This will save and display the curves

Metrics saved to /content/drive/MyDrive/ResearchProject/logs


In [23]:
# Copy the best checkpoint from the local runtime disk into the project's
# checkpoints folder on Drive.
!cp /content/checkpoints/best_model.pt "/content/drive/MyDrive/ResearchProject/checkpoints/"

In [24]:
# Cell 7: Load best model for inference
best_ckpt_path = project_dir / 'checkpoints' / 'best_model.pt'
checkpoint = torch.load(str(best_ckpt_path), map_location=device)

# Restore the saved weights into the existing model, then switch it to eval mode
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# Report which epoch the checkpoint comes from and its validation loss
print(f"Best model loaded from epoch {checkpoint['epoch']}")
print(f"Best val loss: {checkpoint['val_loss']:.6f}")

Best model loaded from epoch 42
Best val loss: 0.027644


In [25]:
# Copy the best checkpoint from the local runtime disk to Drive when training is done.
# NOTE(review): the same copy command already appears in an earlier cell.
!cp /content/checkpoints/best_model.pt "/content/drive/MyDrive/ResearchProject/checkpoints/"

In [26]:
# Show size and modification time of the checkpoint copy on Drive.
!ls -lh "/content/drive/MyDrive/ResearchProject/checkpoints/best_model.pt"

-rw------- 1 root root 52M Feb 15 23:11 /content/drive/MyDrive/ResearchProject/checkpoints/best_model.pt


In [27]:
# Print the MD5 checksum of the checkpoint copy on Drive.
!md5sum /content/drive/MyDrive/ResearchProject/checkpoints/best_model.pt

f070564bade98ad8133698af629a1ed4  /content/drive/MyDrive/ResearchProject/checkpoints/best_model.pt


In [28]:
# Compare the checkpoint stored on Drive with the one on the local disk by
# loading each on the CPU and printing its epoch and validation loss.
drive_ckpt_path = "/content/drive/MyDrive/ResearchProject/checkpoints/best_model.pt"
local_ckpt_path = "/content/checkpoints/best_model.pt"

# Drive copy first ...
ckpt_drive = torch.load(drive_ckpt_path, map_location='cpu')
print(f"Drive copy - Epoch: {ckpt_drive['epoch']}, Val loss: {ckpt_drive['val_loss']:.6f}")

# ... then the local copy
ckpt_local = torch.load(local_ckpt_path, map_location='cpu')
print(f"Local copy - Epoch: {ckpt_local['epoch']}, Val loss: {ckpt_local['val_loss']:.6f}")

Drive copy - Epoch: 42, Val loss: 0.027644
Local copy - Epoch: 42, Val loss: 0.027644
