In [None]:
# !ls -R /content/wide-resnet.pytorch | sed -n '1,200p'

In [1]:
# Clean any previous clones
import shutil, os, sys, subprocess, textwrap

REPO_DIR = "/content/wide-resnet.pytorch"
if os.path.isdir(REPO_DIR):
    print("Removing old repo at", REPO_DIR)
    shutil.rmtree(REPO_DIR)

# Fresh clone
!git clone https://github.com/bmsookim/wide-resnet.pytorch.git /content/wide-resnet.pytorch

# Verify the clone really exists and has files
import os, itertools
assert os.path.isdir(REPO_DIR), f"Repo not found at {REPO_DIR}"
top = os.listdir(REPO_DIR)
print("Top-level entries:", top[:50])

# Print first ~200 lines of the tree
print(subprocess.run(
    ["bash","-lc",f"ls -la {REPO_DIR}; echo '---'; ls -R {REPO_DIR} | sed -n '1,200p'"],
    capture_output=True, text=True
).stdout)


Cloning into '/content/wide-resnet.pytorch'...
remote: Enumerating objects: 124, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 124 (delta 29), reused 29 (delta 29), pack-reused 89 (from 1)[K
Receiving objects: 100% (124/124), 670.63 KiB | 8.71 MiB/s, done.
Resolving deltas: 100% (66/66), done.
Top-level entries: ['.git', '.gitignore', 'INSTALL.md', 'LICENSE', 'README.md', 'SERVER.md', 'config.py', 'imgs', 'main.py', 'networks', 'scripts']
total 48
drwxr-xr-x 6 root root 4096 Aug 24 18:22 .
drwxr-xr-x 3 root root   41 Aug 24 18:22 ..
drwxr-xr-x 8 root root 4096 Aug 24 18:22 .git
-rw-r--r-- 1 root root 1185 Aug 24 18:22 .gitignore
-rw-r--r-- 1 root root 4999 Aug 24 18:22 INSTALL.md
-rw-r--r-- 1 root root 1067 Aug 24 18:22 LICENSE
-rw-r--r-- 1 root root 3619 Aug 24 18:22 README.md
-rw-r--r-- 1 root root 2819 Aug 24 18:22 SERVER.md
-rw-r--r-- 1 root root  787 Aug 24 18:22 config.py
drwxr-xr-x 2 root root  141 Aug 

In [2]:
!ls -R /content/wide-resnet.pytorch | sed -n '1,200p'

/content/wide-resnet.pytorch:
INSTALL.md
LICENSE
README.md
SERVER.md
config.py
imgs
main.py
networks
scripts

/content/wide-resnet.pytorch/imgs:
cifar100_image.png
cifar10_image.png
img_356.lua
pytorch.png
svhn_image.png

/content/wide-resnet.pytorch/networks:
__init__.py
lenet.py
resnet.py
vggnet.py
wide_resnet.py

/content/wide-resnet.pytorch/scripts:
cifar100_train.sh
cifar10_train.sh
resnet_cifar100_train.sh


In [3]:
!grep -nE "class |def " /content/wide-resnet.pytorch/networks/wide_resnet.py | head -n 30

10:def conv3x3(in_planes, out_planes, stride=1):
13:def conv_init(m):
22:class wide_basic(nn.Module):
23:    def __init__(self, in_planes, planes, dropout_rate, stride=1):
37:    def forward(self, x):
44:class Wide_ResNet(nn.Module):
45:    def __init__(self, depth, widen_factor, dropout_rate, num_classes):
63:    def _wide_layer(self, block, planes, num_blocks, dropout_rate, stride):
73:    def forward(self, x):


In [8]:
# Requirements
import numpy as np

In [9]:
# Import & define the model from this repo variant
import sys, torch
sys.path.append('/content/wide-resnet.pytorch')

from networks.wide_resnet import Wide_ResNet  # ← underscore!

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_CLASSES = 100

# Constructor: (depth, widen_factor, dropout_rate, num_classes)
model = Wide_ResNet(depth=28, widen_factor=10, dropout_rate=0.0, num_classes=NUM_CLASSES).to(device)

print("✅ Model defined:", type(model).__name__, "on", next(model.parameters()).device)

# Quick forward sanity check (CIFAR-sized input 3x32x32)
with torch.no_grad():
    x = torch.randn(2, 3, 32, 32, device=device)
    y = model(x)
    print("✅ Forward OK — output shape:", tuple(y.shape))
    assert y.shape[1] == NUM_CLASSES, "Output classes mismatch"


| Wide-Resnet 28x10
✅ Model defined: Wide_ResNet on cuda:0
✅ Forward OK — output shape: (2, 100)


In [21]:
# =========================
# Wide_ResNet + SI + Replay + KD on Split CIFAR-100
# =========================

# 0) Imports
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from copy import deepcopy
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset
import numpy as np
import random
from tqdm import tqdm

# 1) Model import (adjust path to where Wide_ResNet is cloned)
sys.path.append('/content/wide-resnet.pytorch')  # adjust if needed
from networks.wide_resnet import Wide_ResNet

# 2) Device and constants
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_CLASSES = 100  # CIFAR-100
NUM_TASKS = 10
BATCH_SIZE = 32
EPOCHS = 2  # increase for real training
MEMORY_MAX_PER_CLASS = 10  # replay buffer size per class

# 3) Data preparation: Split CIFAR-100 into 10 disjoint tasks
transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5071, 0.4865, 0.4409),
                         (0.2673, 0.2564, 0.2762))
])

train_dataset = datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR100(root='./data', train=False, download=True, transform=transform)

classes_per_task = NUM_CLASSES // NUM_TASKS
class_splits = [list(range(i * classes_per_task, (i + 1) * classes_per_task)) for i in range(NUM_TASKS)]

def make_loader(dataset, class_ids, batch_size=32, shuffle=True):
    idx = [i for i, (_, y) in enumerate(dataset) if y in class_ids]
    subset = Subset(dataset, idx)
    return DataLoader(subset, batch_size=batch_size, shuffle=shuffle, num_workers=2)

train_loaders = [make_loader(train_dataset, cls_ids, BATCH_SIZE, True) for cls_ids in class_splits]
test_loaders = [make_loader(test_dataset, cls_ids, BATCH_SIZE, False) for cls_ids in class_splits]

print(f" Created {len(train_loaders)} train loaders and {len(test_loaders)} test loaders")

# 4) Model definition
model = Wide_ResNet(depth=28, widen_factor=10, dropout_rate=0.0, num_classes=NUM_CLASSES).to(device)
with torch.no_grad():
    x = torch.randn(2, 3, 32, 32, device=device)
    y = model(x)
    print("Forward OK — output shape:", tuple(y.shape))

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=5e-4)

# 5) Evaluation helper
# Fix evaluate() for Split CIFAR-10
def evaluate(model, loader, class_offset):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), (y + class_offset).to(device)
            outputs = model(x)
            _, preds = outputs.max(1)
            correct += preds.eq(y).sum().item()
            total += y.size(0)
    return 100.0 * correct / total

# 6) Synaptic Intelligence class
class SynapticIntelligence:
    def __init__(self, model, c=600, xi=0.1, device=None):
        self.model = model
        self.c = c
        self.xi = xi
        self.device = device or next(model.parameters()).device
        self.params = [p for p in model.parameters() if p.requires_grad]
        self.omega = [torch.zeros_like(p.data, device=self.device) for p in self.params]
        self.theta_ref = [p.data.clone().detach() for p in self.params]
        self.w = [torch.zeros_like(p.data, device=self.device) for p in self.params]

    @torch.no_grad()
    def begin_task(self):
        for k in range(len(self.params)):
            self.w[k].zero_()

    def penalty(self):
        reg = 0.0
        for p, omega_i, theta_star in zip(self.params, self.omega, self.theta_ref):
            reg = reg + torch.sum(omega_i * (p - theta_star) ** 2)
        return self.c * reg

    @torch.no_grad()
    def update_w_after_step(self, old_params):
        for k, p in enumerate(self.params):
            if p.grad is None: continue
            delta = p.data - old_params[k]
            self.w[k].add_((-p.grad).detach() * delta)

    @torch.no_grad()
    def end_task(self):
        for k, p in enumerate(self.params):
            # denom = (p.data - self.theta_ref[k]) ** 2 + self.xi
            denom = torch.clamp((p.data - self.theta_ref[k]) ** 2 + self.xi, min=1e-6)
            self.omega[k].add_(self.w[k] / denom)
            self.theta_ref[k] = p.data.clone().detach()
            self.w[k].zero_()

# 7) Replay + KD helpers
memory = {}
seen_classes = set()

@torch.no_grad()
def update_memory_from_loader(loader, per_class=MEMORY_MAX_PER_CLASS):
    global memory, seen_classes
    counts = {}
    for xb, yb in loader:
        for x, y in zip(xb, yb):
            y = int(y.item())
            if counts.get(y, 0) < per_class:
                memory.setdefault(y, []).append((x.cpu(), y))
                counts[y] = counts.get(y, 0) + 1
                seen_classes.add(y)

def sample_memory(batch_size):
    if not memory or not seen_classes:
        return None, None
    pool = []
    for c in seen_classes:
        pool.extend(memory.get(c, []))
    if not pool:
        return None, None
    k = min(batch_size, len(pool))
    samples = random.sample(pool, k)
    X = torch.stack([s[0] for s in samples]).to(device)
    y = torch.tensor([s[1] for s in samples], device=device)
    return X, y

def kd_loss(student_logits, teacher_logits, T=2.0):
    p_s = F.log_softmax(student_logits / T, dim=1)
    p_t = F.softmax(teacher_logits / T, dim=1)
    return F.kl_div(p_s, p_t, reduction='batchmean') * (T * T)

# 8) SI instantiation
si = SynapticIntelligence(model, c=300.0, xi=0.1, device=device)

# 9) Training loop
results = np.zeros((NUM_TASKS, NUM_TASKS))
teacher = None

for t in range(NUM_TASKS):
    print(f"\n=== Starting Task {t + 1}/{NUM_TASKS} ===")

    if t > 0:
        teacher = deepcopy(model).to(device)
        teacher.eval()
        for p in teacher.parameters():
            p.requires_grad = False

    model.train()
    si.begin_task()

    loader = train_loaders[t]
    if len(loader) == 0:
        print(f"train_loaders[{t}] is empty! Skipping.")
        continue

    for epoch in range(EPOCHS):
        print(f"-- Epoch {epoch + 1}/{EPOCHS}")
        running_loss = 0.0

        for i, (x, y) in enumerate(tqdm(loader, desc=f"Task {t} | Epoch {epoch}")):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()

            logits_cur = model(x)
            ce_cur = criterion(logits_cur, y)
            reg_loss = si.penalty() if t > 0 else 0.0

            kd = 0.0
            ce_rep = 0.0
            if teacher is not None:
                xr, yr = sample_memory(batch_size=max(1, BATCH_SIZE // 4))
                if xr is not None:
                    with torch.no_grad():
                        t_logits = teacher(xr)
                    s_logits = model(xr)
                    ce_rep = criterion(s_logits, yr)
                    kd = kd_loss(s_logits, t_logits, T=2.0)
            """
            loss = ce_cur + 0.5 * (ce_rep if isinstance(ce_rep, torch.Tensor) else 0.0) \
                         + 0.5 * (kd if isinstance(kd, torch.Tensor) else 0.0) \
                         + (reg_loss if isinstance(reg_loss, torch.Tensor) else 0.0)
            """
            loss = ce_cur + 0.25 * ce_rep + 0.25 * kd + reg_loss  # lower weights for stability

            old_params = [p.data.clone() for p in si.params]
            #loss.backward()
            #optimizer.step()
            #si.update_w_after_step(old_params)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()

            running_loss += float(ce_cur.detach().cpu())
            if i % 50 == 0:
                print(f"Batch {i}/{len(loader)} | CE: {ce_cur:.4f} | SI: {reg_loss:.4f} | REP: {float(ce_rep):.4f} | KD: {float(kd):.4f}")

        avg_loss = running_loss / len(loader)
        print(f"Avg CE for Task {t}, Epoch {epoch+1}: {avg_loss:.4f}")
        torch.cuda.empty_cache()

    si.end_task()
    update_memory_from_loader(loader, per_class=MEMORY_MAX_PER_CLASS)

    # Eval
    for eval_t in range(t + 1):
        acc = evaluate(model, test_loaders[eval_t])
        results[t][eval_t] = acc
        print(f"[Task {t}] Eval on Task {eval_t}: {acc:.2f}%")

    print("\nAccuracy Summary After Task", t)
    for prev_task in range(t + 1):
        print(f"    Task {prev_task} accuracy: {results[t][prev_task]:.2f}%")

    if t > 0:
        print("\nForgetting After Task", t)
        for prev_task in range(t):
            acc_before = results[prev_task][prev_task]
            acc_now = results[t][prev_task]
            forgetting = acc_before - acc_now
            print(f"    Task {prev_task}: Before={acc_before:.2f}% Now={acc_now:.2f}% Forgotten={forgetting:.2f}%")

print("\nTraining complete. Results matrix:")
print(results)

Files already downloaded and verified
Files already downloaded and verified


KeyboardInterrupt: 

In [None]:
# =========================
# Wide_ResNet + SI + Replay + KD on Split CIFAR-10
# =========================

import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from copy import deepcopy
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset
import numpy as np
import random
from tqdm import tqdm

# 1) WideResNet import (adjust path if needed)
sys.path.append('/content/wide-resnet.pytorch')  # path to WRN implementation
from networks.wide_resnet import Wide_ResNet

# 2) Device and constants
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_CLASSES = 10
NUM_TASKS = 5            # 5 splits → 2 classes each
BATCH_SIZE = 32
EPOCHS = 2               # increase for better results
MEMORY_MAX_PER_CLASS = 10
LR = 0.001                
SI_C = 600               # strong SI regularization as per your finding

# 3) Data preparation: Split CIFAR-10 into 5 disjoint tasks (2 classes each)
transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465),
                         (0.2470, 0.2435, 0.2616))
])

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

classes_per_task = NUM_CLASSES // NUM_TASKS
class_splits = [list(range(i * classes_per_task, (i + 1) * classes_per_task)) for i in range(NUM_TASKS)]

def make_loader(dataset, class_ids, batch_size=32, shuffle=True):
    idx = [i for i, (_, y) in enumerate(dataset) if y in class_ids]
    subset = Subset(dataset, idx)
    return DataLoader(subset, batch_size=batch_size, shuffle=shuffle, num_workers=2)

train_loaders = [make_loader(train_dataset, cls_ids, BATCH_SIZE, True) for cls_ids in class_splits]
test_loaders = [make_loader(test_dataset, cls_ids, BATCH_SIZE, False) for cls_ids in class_splits]

print(f"Created {len(train_loaders)} train loaders and {len(test_loaders)} test loaders")

# 4) Model definition
model = Wide_ResNet(depth=28, widen_factor=10, dropout_rate=0.0, num_classes=NUM_CLASSES).to(device)
with torch.no_grad():
    x = torch.randn(2, 3, 32, 32, device=device)
    y = model(x)
    print("Forward OK — output shape:", tuple(y.shape))

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=LR, momentum=0.9, weight_decay=5e-4)

# 5) Evaluation helper
def evaluate(model, loader, class_offset=0):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), (y + class_offset).to(device)
            outputs = model(x)
            _, preds = outputs.max(1)
            correct += preds.eq(y).sum().item()
            total += y.size(0)
    return 100.0 * correct / total

# 6) Synaptic Intelligence
class SynapticIntelligence:
    def __init__(self, model, c=SI_C, xi=0.1, device=None):
        self.model = model
        self.c = c
        self.xi = xi
        self.device = device or next(model.parameters()).device
        self.params = [p for p in model.parameters() if p.requires_grad]
        self.omega = [torch.zeros_like(p.data, device=self.device) for p in self.params]
        self.theta_ref = [p.data.clone().detach() for p in self.params]
        self.w = [torch.zeros_like(p.data, device=self.device) for p in self.params]

    @torch.no_grad()
    def begin_task(self):
        for k in range(len(self.params)):
            self.w[k].zero_()

    def penalty(self):
        reg = 0.0
        for p, omega_i, theta_star in zip(self.params, self.omega, self.theta_ref):
            reg = reg + torch.sum(omega_i * (p - theta_star) ** 2)
        return self.c * reg

    @torch.no_grad()
    def update_w_after_step(self, old_params):
        for k, p in enumerate(self.params):
            if p.grad is None: continue
            delta = p.data - old_params[k]
            self.w[k].add_((-p.grad).detach() * delta)

    @torch.no_grad()
    def end_task(self):
        for k, p in enumerate(self.params):
            denom = torch.clamp((p.data - self.theta_ref[k]) ** 2 + self.xi, min=1e-6)
            self.omega[k].add_(self.w[k] / denom)
            self.theta_ref[k] = p.data.clone().detach()
            self.w[k].zero_()

# 7) Replay + KD helpers
memory = {}
seen_classes = set()

@torch.no_grad()
def update_memory_from_loader(loader, per_class=MEMORY_MAX_PER_CLASS):
    global memory, seen_classes
    counts = {}
    for xb, yb in loader:
        for x, y in zip(xb, yb):
            y = int(y.item())
            if counts.get(y, 0) < per_class:
                memory.setdefault(y, []).append((x.cpu(), y))
                counts[y] = counts.get(y, 0) + 1
                seen_classes.add(y)

def sample_memory(batch_size):
    if not memory or not seen_classes:
        return None, None
    pool = []
    for c in seen_classes:
        pool.extend(memory.get(c, []))
    if not pool:
        return None, None
    k = min(batch_size, len(pool))
    samples = random.sample(pool, k)
    X = torch.stack([s[0] for s in samples]).to(device)
    y = torch.tensor([s[1] for s in samples], device=device)
    return X, y

def kd_loss(student_logits, teacher_logits, T=2.0):
    p_s = F.log_softmax(student_logits / T, dim=1)
    p_t = F.softmax(teacher_logits / T, dim=1)
    return F.kl_div(p_s, p_t, reduction='batchmean') * (T * T)

# 8) SI instantiation
si = SynapticIntelligence(model, c=SI_C, xi=0.1, device=device)

# 9) Training loop
results = np.zeros((NUM_TASKS, NUM_TASKS))
teacher = None

for t in range(NUM_TASKS):
    print(f"\n=== Starting Task {t + 1}/{NUM_TASKS} ===")

    if t > 0:
        teacher = deepcopy(model).to(device)
        teacher.eval()
        for p in teacher.parameters():
            p.requires_grad = False

    model.train()
    si.begin_task()
    loader = train_loaders[t]

    for epoch in range(EPOCHS):
        print(f"-- Epoch {epoch + 1}/{EPOCHS}")
        running_loss = 0.0

        for i, (x, y) in enumerate(tqdm(loader, desc=f"Task {t} | Epoch {epoch}")):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()

            logits_cur = model(x)
            ce_cur = criterion(logits_cur, y)
            reg_loss = si.penalty() if t > 0 else 0.0

            kd, ce_rep = 0.0, 0.0
            if teacher is not None:
                xr, yr = sample_memory(batch_size=max(1, BATCH_SIZE // 4))
                if xr is not None:
                    with torch.no_grad():
                        t_logits = teacher(xr)
                    s_logits = model(xr)
                    ce_rep = criterion(s_logits, yr)
                    kd = kd_loss(s_logits, t_logits, T=2.0)

            loss = ce_cur + 0.5 * ce_rep + 0.5 * kd + reg_loss

            old_params = [p.data.clone() for p in si.params]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()
            si.update_w_after_step(old_params)

            running_loss += float(ce_cur.detach().cpu())
            if i % 50 == 0:
                print(f"Batch {i}/{len(loader)} | CE: {ce_cur:.4f} | SI: {float(reg_loss):.4f} | REP: {float(ce_rep):.4f} | KD: {float(kd):.4f}")

        print(f"Avg CE for Task {t}, Epoch {epoch+1}: {running_loss / len(loader):.4f}")
        torch.cuda.empty_cache()

    si.end_task()
    update_memory_from_loader(loader, per_class=MEMORY_MAX_PER_CLASS)

    # Eval
    for eval_t in range(t + 1):
        acc = evaluate(model, test_loaders[eval_t], class_offset=eval_t * classes_per_task)
        results[t][eval_t] = acc
        print(f"[Task {t}] Eval on Task {eval_t}: {acc:.2f}%")

    print("\nAccuracy Summary After Task", t)
    for prev_task in range(t + 1):
        print(f"    Task {prev_task}: {results[t][prev_task]:.2f}%")

print("\nTraining complete. Results matrix:")
print(results)

Files already downloaded and verified
Files already downloaded and verified
Created 5 train loaders and 5 test loaders
| Wide-Resnet 28x10
Forward OK — output shape: (2, 10)

=== Starting Task 1/5 ===
-- Epoch 1/2


Task 0 | Epoch 0:   1%|          | 3/313 [00:00<00:31,  9.87it/s]

Batch 0/313 | CE: 2.2624 | SI: 0.0000 | REP: 0.0000 | KD: 0.0000


Task 0 | Epoch 0:  17%|█▋        | 53/313 [00:03<00:15, 16.73it/s]

Batch 50/313 | CE: 0.5748 | SI: 0.0000 | REP: 0.0000 | KD: 0.0000


Task 0 | Epoch 0:  33%|███▎      | 103/313 [00:06<00:12, 16.65it/s]

Batch 100/313 | CE: 0.3596 | SI: 0.0000 | REP: 0.0000 | KD: 0.0000


Task 0 | Epoch 0:  49%|████▉     | 153/313 [00:09<00:09, 16.65it/s]

Batch 150/313 | CE: 0.5101 | SI: 0.0000 | REP: 0.0000 | KD: 0.0000


Task 0 | Epoch 0:  65%|██████▍   | 203/313 [00:12<00:06, 16.70it/s]

Batch 200/313 | CE: 0.1791 | SI: 0.0000 | REP: 0.0000 | KD: 0.0000


Task 0 | Epoch 0:  81%|████████  | 253/313 [00:15<00:03, 16.73it/s]

Batch 250/313 | CE: 0.3499 | SI: 0.0000 | REP: 0.0000 | KD: 0.0000


Task 0 | Epoch 0:  97%|█████████▋| 303/313 [00:18<00:00, 16.64it/s]

Batch 300/313 | CE: 0.4567 | SI: 0.0000 | REP: 0.0000 | KD: 0.0000


Task 0 | Epoch 0: 100%|██████████| 313/313 [00:18<00:00, 16.52it/s]


Avg CE for Task 0, Epoch 1: 0.4599
-- Epoch 2/2


Task 0 | Epoch 1:   1%|          | 3/313 [00:00<00:31,  9.76it/s]

Batch 0/313 | CE: 0.4839 | SI: 0.0000 | REP: 0.0000 | KD: 0.0000


Task 0 | Epoch 1:  17%|█▋        | 53/313 [00:03<00:15, 16.59it/s]

Batch 50/313 | CE: 0.2943 | SI: 0.0000 | REP: 0.0000 | KD: 0.0000


Task 0 | Epoch 1:  33%|███▎      | 103/313 [00:06<00:12, 16.62it/s]

Batch 100/313 | CE: 0.1956 | SI: 0.0000 | REP: 0.0000 | KD: 0.0000


Task 0 | Epoch 1:  49%|████▉     | 153/313 [00:09<00:09, 16.53it/s]

Batch 150/313 | CE: 0.1090 | SI: 0.0000 | REP: 0.0000 | KD: 0.0000


Task 0 | Epoch 1:  65%|██████▍   | 203/313 [00:12<00:06, 16.68it/s]

Batch 200/313 | CE: 0.3149 | SI: 0.0000 | REP: 0.0000 | KD: 0.0000


Task 0 | Epoch 1:  81%|████████  | 253/313 [00:15<00:03, 16.50it/s]

Batch 250/313 | CE: 0.3804 | SI: 0.0000 | REP: 0.0000 | KD: 0.0000


Task 0 | Epoch 1:  97%|█████████▋| 303/313 [00:18<00:00, 16.58it/s]

Batch 300/313 | CE: 0.1911 | SI: 0.0000 | REP: 0.0000 | KD: 0.0000


Task 0 | Epoch 1: 100%|██████████| 313/313 [00:19<00:00, 16.43it/s]

Avg CE for Task 0, Epoch 2: 0.2715





[Task 0] Eval on Task 0: 91.95%

Accuracy Summary After Task 0
    Task 0: 91.95%

=== Starting Task 2/5 ===
-- Epoch 1/2


Task 1 | Epoch 0:   1%|          | 2/313 [00:00<00:44,  6.96it/s]

Batch 0/313 | CE: 8.3805 | SI: 0.0000 | REP: 0.1859 | KD: 0.1180


Task 1 | Epoch 0:  17%|█▋        | 52/313 [00:05<00:25, 10.21it/s]

Batch 50/313 | CE: 1.3812 | SI: 0.2712 | REP: 0.1747 | KD: 1.3402


Task 1 | Epoch 0:  33%|███▎      | 102/313 [00:09<00:19, 10.74it/s]

Batch 100/313 | CE: 1.2716 | SI: 0.2200 | REP: 0.2020 | KD: 1.3158


Task 1 | Epoch 0:  49%|████▊     | 152/313 [00:14<00:15, 10.37it/s]

Batch 150/313 | CE: 0.9403 | SI: 0.2071 | REP: 0.2337 | KD: 0.9625


Task 1 | Epoch 0:  65%|██████▍   | 202/313 [00:19<00:10, 10.58it/s]

Batch 200/313 | CE: 0.8605 | SI: 0.2028 | REP: 0.1358 | KD: 0.8376


Task 1 | Epoch 0:  81%|████████  | 252/313 [00:24<00:05, 10.26it/s]

Batch 250/313 | CE: 0.8224 | SI: 0.2041 | REP: 0.1612 | KD: 1.2673


Task 1 | Epoch 0:  96%|█████████▋| 302/313 [00:28<00:01, 10.39it/s]

Batch 300/313 | CE: 0.8074 | SI: 0.1983 | REP: 0.1433 | KD: 0.8463


Task 1 | Epoch 0: 100%|██████████| 313/313 [00:30<00:00, 10.41it/s]


Avg CE for Task 1, Epoch 1: 1.4761
-- Epoch 2/2


Task 1 | Epoch 1:   1%|          | 2/313 [00:00<01:01,  5.04it/s]

Batch 0/313 | CE: 0.7094 | SI: 0.2014 | REP: 0.1552 | KD: 0.5644


Task 1 | Epoch 1:  17%|█▋        | 52/313 [00:05<00:25, 10.11it/s]

Batch 50/313 | CE: 0.9154 | SI: 0.2034 | REP: 0.2079 | KD: 0.5132


Task 1 | Epoch 1:  33%|███▎      | 102/313 [00:10<00:20, 10.33it/s]

Batch 100/313 | CE: 0.5696 | SI: 0.2066 | REP: 0.1953 | KD: 0.4033


Task 1 | Epoch 1:  49%|████▊     | 152/313 [00:15<00:15, 10.24it/s]

Batch 150/313 | CE: 1.1136 | SI: 0.2102 | REP: 0.2405 | KD: 0.5098


Task 1 | Epoch 1:  65%|██████▍   | 202/313 [00:19<00:10, 10.39it/s]

Batch 200/313 | CE: 0.7563 | SI: 0.2157 | REP: 0.0998 | KD: 0.5285


Task 1 | Epoch 1:  81%|████████  | 252/313 [00:24<00:05, 10.35it/s]

Batch 250/313 | CE: 0.8544 | SI: 0.2345 | REP: 0.0746 | KD: 0.5201


Task 1 | Epoch 1:  96%|█████████▋| 302/313 [00:29<00:01, 10.33it/s]

Batch 300/313 | CE: 1.1486 | SI: 0.2360 | REP: 0.1740 | KD: 0.6487


Task 1 | Epoch 1: 100%|██████████| 313/313 [00:30<00:00, 10.21it/s]

Avg CE for Task 1, Epoch 2: 0.8007





[Task 1] Eval on Task 0: 58.60%
[Task 1] Eval on Task 1: 0.00%

Accuracy Summary After Task 1
    Task 0: 58.60%
    Task 1: 0.00%

=== Starting Task 3/5 ===
-- Epoch 1/2


Task 2 | Epoch 0:   1%|          | 2/313 [00:00<01:00,  5.14it/s]

Batch 0/313 | CE: 6.4262 | SI: 0.0000 | REP: 0.1761 | KD: 0.3000


Task 2 | Epoch 0:  17%|█▋        | 52/313 [00:05<00:24, 10.58it/s]

Batch 50/313 | CE: 3.4519 | SI: 0.1933 | REP: 1.2444 | KD: 0.8244


Task 2 | Epoch 0:  33%|███▎      | 102/313 [00:09<00:19, 10.86it/s]

Batch 100/313 | CE: 1.7261 | SI: 0.1946 | REP: 0.9797 | KD: 1.9299


Task 2 | Epoch 0:  49%|████▊     | 152/313 [00:14<00:15, 10.26it/s]

Batch 150/313 | CE: 1.3535 | SI: 0.2708 | REP: 0.9413 | KD: 1.8703


Task 2 | Epoch 0:  65%|██████▍   | 202/313 [00:19<00:10, 10.28it/s]

Batch 200/313 | CE: 0.9003 | SI: 0.2881 | REP: 0.6675 | KD: 1.1079


Task 2 | Epoch 0:  81%|████████  | 252/313 [00:24<00:05, 10.42it/s]

Batch 250/313 | CE: 0.9414 | SI: 0.2953 | REP: 0.3751 | KD: 1.5331


Task 2 | Epoch 0:  96%|█████████▋| 302/313 [00:29<00:01, 10.14it/s]

Batch 300/313 | CE: 0.7079 | SI: 0.2975 | REP: 1.0994 | KD: 2.1865


Task 2 | Epoch 0: 100%|██████████| 313/313 [00:30<00:00, 10.30it/s]


Avg CE for Task 2, Epoch 1: 1.8879
-- Epoch 2/2


Task 2 | Epoch 1:   1%|          | 2/313 [00:00<00:50,  6.20it/s]

Batch 0/313 | CE: 0.5980 | SI: 0.3096 | REP: 0.6367 | KD: 1.5944


Task 2 | Epoch 1:  17%|█▋        | 53/313 [00:05<00:25, 10.28it/s]

Batch 50/313 | CE: 0.6851 | SI: 0.3034 | REP: 0.3211 | KD: 0.8285


Task 2 | Epoch 1:  33%|███▎      | 103/313 [00:10<00:19, 10.99it/s]

Batch 100/313 | CE: 0.6503 | SI: 0.3140 | REP: 1.6878 | KD: 0.9782


Task 2 | Epoch 1:  49%|████▉     | 153/313 [00:14<00:14, 11.02it/s]

Batch 150/313 | CE: 0.6033 | SI: 0.3174 | REP: 0.2713 | KD: 0.6914


Task 2 | Epoch 1:  65%|██████▍   | 203/313 [00:19<00:10, 10.34it/s]

Batch 200/313 | CE: 2.8719 | SI: 0.3356 | REP: 0.8138 | KD: 0.9948


Task 2 | Epoch 1:  81%|████████  | 253/313 [00:24<00:05, 10.31it/s]

Batch 250/313 | CE: 0.6851 | SI: 0.3282 | REP: 0.7195 | KD: 0.8234


Task 2 | Epoch 1:  97%|█████████▋| 303/313 [00:29<00:00, 10.28it/s]

Batch 300/313 | CE: 3.1574 | SI: 0.3286 | REP: 0.5287 | KD: 0.5442


Task 2 | Epoch 1: 100%|██████████| 313/313 [00:30<00:00, 10.33it/s]

Avg CE for Task 2, Epoch 2: 0.7473





[Task 2] Eval on Task 0: 71.95%
[Task 2] Eval on Task 1: 6.60%
[Task 2] Eval on Task 2: 0.00%

Accuracy Summary After Task 2
    Task 0: 71.95%
    Task 1: 6.60%
    Task 2: 0.00%

=== Starting Task 4/5 ===
-- Epoch 1/2


Task 3 | Epoch 0:   1%|          | 2/313 [00:00<00:54,  5.67it/s]

Batch 0/313 | CE: 5.9930 | SI: 0.0000 | REP: 1.4178 | KD: 0.1234


Task 3 | Epoch 0:  17%|█▋        | 52/313 [00:05<00:24, 10.64it/s]

Batch 50/313 | CE: 2.8591 | SI: 0.1711 | REP: 1.6523 | KD: 0.2570


Task 3 | Epoch 0:  33%|███▎      | 102/313 [00:10<00:20, 10.29it/s]

Batch 100/313 | CE: 2.4999 | SI: 0.2115 | REP: 1.3509 | KD: 0.5809


Task 3 | Epoch 0:  49%|████▊     | 152/313 [00:14<00:15, 10.31it/s]

Batch 150/313 | CE: 1.5841 | SI: 0.2597 | REP: 0.6346 | KD: 0.9119


Task 3 | Epoch 0:  56%|█████▌    | 174/313 [00:17<00:13, 10.37it/s]