In [None]:

# Combined U-Net + EfficientViT 
import os, math, time, random
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision, torchvision.transforms as T

# ---------------- Utilities ----------------
def count_params(m):
    """Count the parameters of a module in a single pass.

    Args:
        m: Object exposing ``parameters()`` that yields tensors
            (e.g. an ``nn.Module``).

    Returns:
        Tuple ``(total, trainable)`` where ``total`` is the number of scalar
        elements over all parameters and ``trainable`` counts only those
        whose ``requires_grad`` flag is set.
    """
    total = 0
    trainable = 0
    for param in m.parameters():
        size = param.numel()
        total += size
        if param.requires_grad:
            trainable += size
    return total, trainable

def try_flops(model, input_size=(1,3,32,32)):
    """Best-effort FLOPs estimate using the optional ``thop`` package.

    The measurement runs on a deep copy of ``model`` switched to eval mode
    and under ``torch.no_grad()``, so the dummy forward pass cannot touch
    the real model (e.g. BatchNorm running statistics, or profiling buffers
    that some ``thop`` versions attach to modules).

    Args:
        model: Module to profile.
        input_size: Shape of the random dummy input fed to the model.

    Returns:
        ``2 * MACs`` as reported by ``thop.profile``, or ``None`` when
        ``thop`` is not installed or profiling fails. A failure other than
        the missing dependency is reported through a ``RuntimeWarning``.
    """
    import copy
    import warnings

    try:
        from thop import profile
    except ImportError:
        # thop is an optional dependency: silently skip the estimate.
        return None

    try:
        # Models without parameters have no device to query; default to CPU.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')
        probe = copy.deepcopy(model).eval()
        dummy = torch.randn(*input_size, device=device)
        with torch.no_grad():
            macs, _ = profile(probe, inputs=(dummy,), verbose=False)
        return 2 * macs
    except Exception as exc:
        # Profiling is purely informational, so never abort the caller;
        # surface the reason instead of hiding it.
        warnings.warn(f"FLOPs profiling failed: {exc!r}", RuntimeWarning)
        return None

# ---------------- Droppath ----------------
class DropPath(nn.Module):
    """Stochastic depth: randomly zero whole samples of a residual branch.

    During training each sample (first dimension) is kept with probability
    ``1 - p`` and kept samples are scaled by ``1 / (1 - p)``. In eval mode,
    or when ``p == 0``, the input is returned unchanged.

    Args:
        p: Drop probability in ``[0, 1]``.

    Raises:
        ValueError: If ``p`` lies outside ``[0, 1]``.
    """
    def __init__(self, p=0.):
        super().__init__()
        if not 0.0 <= p <= 1.0:
            raise ValueError(f"drop probability must be in [0, 1], got {p}")
        self.p = p

    def forward(self, x):
        if not self.training or self.p == 0.:
            return x
        keep = 1 - self.p
        # One Bernoulli draw per sample, broadcast over all other dims.
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        mask = x.new_empty(shape).bernoulli_(keep)
        # With p == 1 every sample is dropped; skip the rescale so the
        # result is zeros instead of NaN from a division by zero.
        if keep > 0:
            mask.div_(keep)
        return x * mask

# ---------------- Core building blocks ----------------
class DWConv(nn.Module):
    """Parameter-free depthwise 3x3 averaging over a token grid.

    The kernel is a constant ``1/9`` box filter applied per channel
    (``groups == C``) with zero padding of 1, so the module owns no
    learnable weights.
    """
    def __init__(self):
        super().__init__()

    def forward(self, x, H, W):
        """Smooth tokens spatially.

        Args:
            x: Tensor of shape ``[B, N, C]``; it is reshaped to
                ``[B, C, H, W]``, so ``N`` must equal ``H * W``.
            H: Grid height.
            W: Grid width.

        Returns:
            Tensor of shape ``[B, N, C]``.
        """
        batch, _, channels = x.shape
        # Tokens -> image layout so the filter runs over the 2-D grid.
        grid = x.transpose(1, 2).reshape(batch, channels, H, W)
        box_kernel = grid.new_ones((channels, 1, 3, 3)) / 9
        smoothed = F.conv2d(grid, box_kernel, groups=channels, padding=1)
        # Back to token layout.
        return smoothed.reshape(batch, channels, -1).transpose(1, 2)

class FFN(nn.Module):
    """Two-layer token-wise MLP: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        dim: Input and output feature size.
        hidden_dim: Width of the hidden layer.
        drop: Dropout probability used after the activation and the output.
    """
    def __init__(self, dim, hidden_dim, drop=0.):
        super().__init__()
        expand = nn.Linear(dim, hidden_dim)
        activation = nn.GELU()
        hidden_drop = nn.Dropout(drop)
        project = nn.Linear(hidden_dim, dim)
        output_drop = nn.Dropout(drop)
        # Kept as a single Sequential named ``net`` so parameter names
        # (net.0.*, net.3.*) stay the same.
        self.net = nn.Sequential(expand, activation, hidden_drop, project, output_drop)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.net(x)

class CascadedGroupAttention(nn.Module):
    """Per-head self-attention over channel groups with a cascade.

    The channel dimension is split into ``num_heads`` equal groups. Each
    head attends over its own group; from the second head on, the detached
    output of the previous head is added to the head's input first. Head
    outputs are concatenated and passed through a final linear layer.

    Args:
        dim: Total channel size; must be divisible by ``num_heads``.
        num_heads: Number of heads / channel groups.
        qk_ratio: Query/key width as a fraction of the per-head width
            (at least 1 feature is always used).
    """
    def __init__(self, dim, num_heads=4, qk_ratio=0.25):
        super().__init__()
        assert dim % num_heads == 0, "dim must be divisible by num_heads"
        self.h = num_heads
        self.head_dim = dim // num_heads
        self.qk_dim = max(1, int(self.head_dim * qk_ratio))

        def per_head_linears(out_features):
            # One bias-free projection per head.
            return nn.ModuleList(
                [nn.Linear(self.head_dim, out_features, bias=False) for _ in range(self.h)]
            )

        self.q_projs = per_head_linears(self.qk_dim)
        self.k_projs = per_head_linears(self.qk_dim)
        self.v_projs = per_head_linears(self.head_dim)
        self.out = nn.Linear(dim, dim)

    def forward(self, x, H, W):
        """Run cascaded attention.

        Args:
            x: Tensor of shape ``[B, N, C]``.
            H: Unused; accepted for a uniform block signature.
            W: Unused; accepted for a uniform block signature.

        Returns:
            Tensor of shape ``[B, N, C]``.
        """
        B, N, C = x.shape
        scale = math.sqrt(max(1.0, float(self.qk_dim)))
        # h views of shape [B, N, head_dim], one per channel group.
        groups = x.reshape(B, N, self.h, self.head_dim).unbind(dim=2)

        carried = None
        outputs = []
        for group, to_q, to_k, to_v in zip(groups, self.q_projs, self.k_projs, self.v_projs):
            feat = group if carried is None else group + carried
            scores = (to_q(feat) @ to_k(feat).transpose(-2, -1)) / scale
            weights = torch.softmax(scores, dim=-1)
            head_out = weights @ to_v(feat)  # [B, N, head_dim]
            outputs.append(head_out)
            # The cascade passes values only; gradients do not flow through it.
            carried = head_out.detach()

        return self.out(torch.cat(outputs, dim=-1))

class EfficientBlock(nn.Module):
    """Pre-norm 'sandwich' block: FFN -> DWConv -> attention -> FFN.

    Each of the four sub-layers is applied to a LayerNorm-ed copy of the
    input and added back residually; all four residual branches share one
    DropPath instance.

    Args:
        dim: Token feature size.
        num_heads: Head count for the attention sub-layer.
        mlp_ratio: Hidden width of both FFNs as a multiple of ``dim``.
        drop_path: Drop probability handed to DropPath.
        qk_ratio: Query/key width ratio handed to the attention sub-layer.
    """
    def __init__(self, dim, num_heads=4, mlp_ratio=2.0, drop_path=0.0, qk_ratio=0.25):
        super().__init__()
        hidden_dim = int(dim * mlp_ratio)

        # Sub-layer 1: token-wise FFN.
        self.norm1 = nn.LayerNorm(dim)
        self.ffn1 = FFN(dim, hidden_dim)
        # Sub-layer 2: spatial mixing over the H x W token grid.
        self.norm2 = nn.LayerNorm(dim)
        self.dw = DWConv()
        # Sub-layer 3: cascaded group attention.
        self.norm3 = nn.LayerNorm(dim)
        self.attn = CascadedGroupAttention(dim, num_heads=num_heads, qk_ratio=qk_ratio)
        # Sub-layer 4: token-wise FFN.
        self.norm4 = nn.LayerNorm(dim)
        self.ffn2 = FFN(dim, hidden_dim)

        self.dp = DropPath(drop_path)

    def forward(self, x, H, W):
        """Apply the four residual sub-layers to tokens ``x`` laid out on an ``H`` x ``W`` grid."""
        branch = self.ffn1(self.norm1(x))
        x = x + self.dp(branch)

        branch = self.dw(self.norm2(x), H, W)
        x = x + self.dp(branch)

        branch = self.attn(self.norm3(x), H, W)
        x = x + self.dp(branch)

        branch = self.ffn2(self.norm4(x))
        return x + self.dp(branch)

# ---------------- U-Net + EfficientViT Model ----------------
class UNetEfficientViT(nn.Module):
    """U-Net-shaped image classifier built from EfficientBlock stages.

    Layout:
      * stem: two 3x3 conv/BN/ReLU layers, the second with stride 2;
      * encoder: one stage of EfficientBlocks per entry of ``enc_channels``,
        with a stride-2 conv between consecutive stages;
      * decoder: per entry of ``dec_channels`` a stride-2 transposed conv,
        fusion with the encoder feature map of the matching resolution,
        then a stage of EfficientBlocks;
      * head: 3x3 conv refinement, global average pooling, linear classifier.

    All skip-projection and fusion layers are registered in ``__init__`` so
    that they are trained, moved with ``.to(device)`` and stored in
    ``state_dict()``.

    Args:
        in_ch: Number of input image channels.
        num_classes: Number of output logits.
        enc_channels: Channel width of each encoder stage.
        dec_channels: Channel width of each decoder stage.
        enc_depths: Number of blocks in each encoder stage.
        decoder_depths: Number of blocks in each decoder stage.
        heads: Attention heads per encoder stage; decoder stages use
            ``max(1, heads[-1] // 2)``.
        mlp_ratio: FFN hidden-width multiplier for every block.
        qk_ratio: Query/key width ratio for every block.
        drop_path_rate: Final drop-path rate; rates increase linearly from 0
            across all encoder and decoder blocks.

    Raises:
        ValueError: If there are more decoder stages than encoder skips
            (``len(enc_channels) - 1``).
    """
    def __init__(self,
                 in_ch=3,
                 num_classes=10,
                 enc_channels=(48, 96, 160),
                 dec_channels=(96, 64),
                 enc_depths=(1,2,3),
                 decoder_depths=(1,1),
                 heads=(2,4,4),
                 mlp_ratio=2.0,
                 qk_ratio=0.25,
                 drop_path_rate=0.1):
        super().__init__()

        # Every decoder stage consumes the encoder output one level above it,
        # so the decoder cannot be deeper than the number of downsamplings.
        num_dec_stages = min(len(dec_channels), len(decoder_depths))
        if num_dec_stages > len(enc_channels) - 1:
            raise ValueError(
                f"decoder has {num_dec_stages} stages but the encoder only provides "
                f"{len(enc_channels) - 1} skip connections"
            )

        self.stem = nn.Sequential(
            nn.Conv2d(in_ch, enc_channels[0], kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(enc_channels[0]),
            nn.ReLU(inplace=True),
            nn.Conv2d(enc_channels[0], enc_channels[0], kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(enc_channels[0]),
            nn.ReLU(inplace=True),
        )

        # ---- encoder stages ----
        self.enc_channels = enc_channels
        self.enc_depths = enc_depths
        self.enc_stages = nn.ModuleList()
        self.enc_proj = nn.ModuleList()
        total_blocks = sum(enc_depths) + sum(decoder_depths)
        dp_rates = torch.linspace(0, drop_path_rate, total_blocks).tolist()
        dp_idx = 0
        for i, (C, L, h) in enumerate(zip(enc_channels, enc_depths, heads)):
            self.enc_proj.append(nn.Identity())
            blocks = nn.ModuleList()
            for _ in range(L):
                blocks.append(EfficientBlock(dim=C, num_heads=h, mlp_ratio=mlp_ratio,
                                             drop_path=dp_rates[dp_idx], qk_ratio=qk_ratio))
                dp_idx += 1
            self.enc_stages.append(blocks)

            if i < len(enc_channels) - 1:
                # Stride-2 conv halves the resolution and widens the channels.
                setattr(self, f"down{i}", nn.Sequential(
                    nn.Conv2d(C, enc_channels[i+1], kernel_size=3, stride=2, padding=1, bias=False),
                    nn.BatchNorm2d(enc_channels[i+1]),
                    nn.ReLU(inplace=True)
                ))

        # ---- decoder stages ----
        self.up_convs = nn.ModuleList()
        self.skip_projs = nn.ModuleList()
        self.fuse_convs = nn.ModuleList()
        self.dec_stages = nn.ModuleList()
        in_ch_dec = enc_channels[-1]
        for i, (C_out, L) in enumerate(zip(dec_channels, decoder_depths)):
            # Decoder stage i is fused with encoder stage (last - 1 - i).
            skip_ch = enc_channels[-(i + 2)]
            self.up_convs.append(nn.ConvTranspose2d(in_ch_dec, C_out, kernel_size=2, stride=2))
            # 1x1 conv aligns the skip's channel count with the upsampled map.
            self.skip_projs.append(
                nn.Identity() if skip_ch == C_out else nn.Conv2d(skip_ch, C_out, kernel_size=1)
            )
            # Reduce the concatenated [upsampled, skip] map back to C_out.
            self.fuse_convs.append(nn.Sequential(
                nn.Conv2d(2 * C_out, C_out, kernel_size=1, bias=False),
                nn.BatchNorm2d(C_out),
                nn.ReLU(inplace=True)
            ))
            blocks = nn.ModuleList()
            for _ in range(L):
                blocks.append(EfficientBlock(dim=C_out, num_heads=max(1, heads[-1]//2), mlp_ratio=mlp_ratio,
                                             drop_path=dp_rates[dp_idx], qk_ratio=qk_ratio))
                dp_idx += 1
            self.dec_stages.append(blocks)
            in_ch_dec = C_out

        self.refine = nn.Sequential(
            nn.Conv2d(in_ch_dec, in_ch_dec, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(in_ch_dec),
            nn.ReLU(inplace=True)
        )
        # classifier
        self.global_pool = nn.AdaptiveAvgPool2d(1)
        self.head = nn.Linear(in_ch_dec, num_classes)

    @staticmethod
    def _run_blocks(x, blocks):
        """Flatten a ``[B, C, H, W]`` map to tokens, apply ``blocks``, and restore the map."""
        B, C, H, W = x.shape
        tokens = x.reshape(B, C, -1).permute(0, 2, 1)  # [B, N, C]
        for blk in blocks:
            tokens = blk(tokens, H, W)
        return tokens.permute(0, 2, 1).reshape(B, C, H, W)

    def forward(self, x):
        """Classify a batch of images.

        Args:
            x: Tensor of shape ``[B, in_ch, H, W]``.

        Returns:
            Logits of shape ``[B, num_classes]``.
        """
        B = x.shape[0]
        skips = []
        x = self.stem(x)  # spatial size halved by the stem
        for i, blocks in enumerate(self.enc_stages):
            x = self._run_blocks(x, blocks)
            skips.append(x)
            if i < len(self.enc_stages) - 1:
                x = getattr(self, f"down{i}")(x)

        stages = zip(self.up_convs, self.skip_projs, self.fuse_convs, self.dec_stages)
        for i, (upconv, skip_proj, fuse, blocks) in enumerate(stages):
            x = upconv(x)
            # skips[-1] is the bottleneck itself; step one level up per stage.
            skip = skips[-(i + 2)]
            if skip.shape[-2:] != x.shape[-2:]:
                # Sizes can disagree for odd input resolutions.
                skip = F.interpolate(skip, size=tuple(x.shape[-2:]), mode='bilinear', align_corners=False)
            x = fuse(torch.cat([x, skip_proj(skip)], dim=1))
            x = self._run_blocks(x, blocks)

        x = self.refine(x)
        x = self.global_pool(x).reshape(B, -1)
        return self.head(x)

# ---------------- Data (CIFAR-10) ----------------
def get_cifar10_loaders(bs=128, num_workers=4):
    """Build the CIFAR-10 train and test data loaders.

    The training pipeline applies RandAugment, a padded random crop and a
    random horizontal flip before tensor conversion and normalization; the
    test pipeline only converts and normalizes. Data is stored under
    ``./data`` and downloaded when missing.

    Args:
        bs: Batch size for both loaders.
        num_workers: Worker process count for both loaders.

    Returns:
        Tuple ``(train_loader, test_loader)``; only the training loader
        shuffles.
    """
    channel_mean = (0.4914,0.4822,0.4465)
    channel_std = (0.2470,0.2435,0.2616)

    augmentations = [
        T.RandAugment(num_ops=2, magnitude=9),
        T.RandomCrop(32, padding=4),
        T.RandomHorizontalFlip(),
    ]
    train_transforms = T.Compose(augmentations + [T.ToTensor(), T.Normalize(channel_mean, channel_std)])
    test_transforms = T.Compose([T.ToTensor(), T.Normalize(channel_mean, channel_std)])

    train_set = torchvision.datasets.CIFAR10(
        root='./data', train=True, download=True, transform=train_transforms)
    test_set = torchvision.datasets.CIFAR10(
        root='./data', train=False, download=True, transform=test_transforms)

    train_loader = DataLoader(train_set, batch_size=bs, shuffle=True, num_workers=num_workers)
    test_loader = DataLoader(test_set, batch_size=bs, shuffle=False, num_workers=num_workers)
    return train_loader, test_loader

# ---------------- Mixup/CutMix  ----------------
def cutmix_box(size, lam):
    """Sample a random CutMix rectangle.

    The box covers a fraction ``sqrt(1 - lam)`` of each image side before
    clipping, is centred on a point drawn uniformly with ``np.random`` and
    is clipped to the image bounds.

    Args:
        size: Batch shape; index 2 is read as height and index 3 as width.
        lam: Mixing coefficient.

    Returns:
        Tuple ``(bbx1, bby1, bbx2, bby2)`` of box corner coordinates.
    """
    height, width = size[2], size[3]
    side_ratio = math.sqrt(1 - lam)
    half_w = int(width * side_ratio) // 2
    half_h = int(height * side_ratio) // 2

    # Draw x before y to keep the random stream order.
    center_x = np.random.randint(width)
    center_y = np.random.randint(height)

    left = np.clip(center_x - half_w, 0, width)
    top = np.clip(center_y - half_h, 0, height)
    right = np.clip(center_x + half_w, 0, width)
    bottom = np.clip(center_y + half_h, 0, height)
    return left, top, right, bottom

def mixup_cutmix(x, y, alpha=1.0, cutmix_prob=0.5):
    """Apply CutMix or Mixup to a batch.

    With probability ``cutmix_prob`` a rectangle of every image is replaced
    by the same rectangle of a randomly paired image (CutMix) and ``lam`` is
    recomputed from the pasted area. Otherwise the images are blended as
    ``lam * x + (1 - lam) * x[perm]`` (Mixup). In both cases the returned
    labels and ``lam`` describe the mixture actually present in the images.
    The input tensor is never modified in place.

    Args:
        x: Image batch of shape ``[B, C, H, W]``.
        y: Labels, indexable along the batch dimension.
        alpha: Beta distribution parameter for ``lam``. Values ``<= 0``
            disable mixing (``lam`` is fixed to 1).
        cutmix_prob: Probability of choosing CutMix over Mixup.

    Returns:
        Tuple ``(mixed_x, y_a, y_b, lam)`` where ``y_a`` are the original
        labels, ``y_b`` the labels of the paired samples and ``lam`` is a
        Python float weighting ``y_a``.
    """
    # np.random.beta rejects non-positive parameters; treat them as "off".
    lam = float(np.random.beta(alpha, alpha)) if alpha > 0 else 1.0
    rand_index = torch.randperm(x.size(0)).to(x.device)
    if np.random.rand() < cutmix_prob:
        bbx1, bby1, bbx2, bby2 = (int(v) for v in cutmix_box(x.size(), lam))
        # Work on a copy so the caller's batch is left untouched.
        x = x.clone()
        x[:, :, bby1:bby2, bbx1:bbx2] = x[rand_index, :, bby1:bby2, bbx1:bbx2]
        # The clipped box may be smaller than requested: use the real area.
        lam = 1.0 - ((bbx2 - bbx1) * (bby2 - bby1)) / (x.size(-1) * x.size(-2))
    else:
        # Mixup: blend the images so they agree with the blended targets.
        x = lam * x + (1.0 - lam) * x[rand_index]
    y_a, y_b = y, y[rand_index]
    return x, y_a, y_b, lam

# ---------------- Training  ----------------
def train_one_epoch(model, dl, optimizer, device, loss_fn, mixup_alpha=0.8):
    """Train ``model`` for one pass over ``dl`` with Mixup/CutMix targets.

    Accuracy is reported against the mixed targets: a prediction matching
    the first label counts ``lam`` and one matching the paired label counts
    ``1 - lam``.

    Args:
        model: Network to train; switched to train mode.
        dl: Iterable of ``(inputs, labels)`` batches.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar.
        mixup_alpha: ``alpha`` forwarded to ``mixup_cutmix``.

    Returns:
        Tuple ``(mean_loss, top1, top5)`` averaged over all samples. The
        "top5" figure uses ``min(5, num_classes)`` predictions.

    Raises:
        ValueError: If ``dl`` yields no samples.
    """
    model.train()
    total_loss = 0.0
    top1 = 0.0
    top5 = 0.0
    n = 0
    pbar = tqdm(dl, leave=False)
    for xb, yb in pbar:
        xb, yb = xb.to(device), yb.to(device)
        xb, ya, yb2, lam = mixup_cutmix(xb, yb, alpha=mixup_alpha)
        lam = float(lam)  # may arrive as a numpy scalar
        optimizer.zero_grad()
        out = model(xb)
        loss = lam * loss_fn(out, ya) + (1 - lam) * loss_fn(out, yb2)
        loss.backward()
        optimizer.step()
        bs = out.size(0)
        total_loss += loss.item() * bs
        n += bs

        with torch.no_grad():
            # topk cannot ask for more entries than there are classes.
            preds = out.topk(min(5, out.size(1)), dim=1)[1]
            # [B, k] hit masks, computed for the whole batch at once instead
            # of a per-sample Python loop.
            hits_a = preds == ya.unsqueeze(1)
            hits_b = preds == yb2.unsqueeze(1)
            top1 += lam * hits_a[:, 0].sum().item() + (1 - lam) * hits_b[:, 0].sum().item()
            top5 += lam * hits_a.any(dim=1).sum().item() + (1 - lam) * hits_b.any(dim=1).sum().item()

    if n == 0:
        raise ValueError("train_one_epoch received an empty data loader")
    return total_loss / n, top1 / n, top5 / n

def evaluate(model, dl, device, loss_fn):
    """Evaluate ``model`` on ``dl`` without gradient tracking.

    Args:
        model: Network to evaluate; switched to eval mode.
        dl: Iterable of ``(inputs, labels)`` batches.
        device: Device the batches are moved to.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar.

    Returns:
        Tuple ``(mean_loss, top1, top5)`` averaged over all samples. The
        "top5" figure uses ``min(5, num_classes)`` predictions.

    Raises:
        ValueError: If ``dl`` yields no samples.
    """
    model.eval()
    total_loss = 0.0
    top1 = 0
    top5 = 0
    n = 0
    with torch.no_grad():
        for xb, yb in dl:
            xb, yb = xb.to(device), yb.to(device)
            out = model(xb)
            loss = loss_fn(out, yb)
            bs = out.size(0)
            total_loss += loss.item() * bs
            n += bs
            # topk cannot ask for more entries than there are classes.
            preds = out.topk(min(5, out.size(1)), dim=1)[1]
            # [B, k] hit mask, computed for the whole batch at once instead
            # of a per-sample Python loop.
            hits = preds == yb.unsqueeze(1)
            top1 += hits[:, 0].sum().item()
            top5 += hits.any(dim=1).sum().item()
    if n == 0:
        raise ValueError("evaluate received an empty data loader")
    return total_loss / n, top1 / n, top5 / n

# ---------------- Plot  ----------------
def plot_history(hist, out_dir='outputs'):
    """Save loss and accuracy curves as PNG files.

    Writes ``loss.png`` (train/val loss) and ``accuracy.png`` (train top-1,
    val top-1 and val top-5, in percent) into ``out_dir``, creating the
    directory if needed.

    Args:
        hist: Mapping with per-epoch lists under the keys ``train_loss``,
            ``val_loss``, ``train_top1``, ``val_top1`` and ``val_top5``.
        out_dir: Destination directory.
    """
    os.makedirs(out_dir, exist_ok=True)
    epoch_axis = list(range(1, len(hist['train_loss']) + 1))

    def save_curves(keys, scale, ylabel, title, filename):
        # One figure per call: plot every requested series, then save and close.
        plt.figure(figsize=(6,4))
        for key in keys:
            values = hist[key] if scale is None else np.array(hist[key]) * scale
            plt.plot(epoch_axis, values, label=key)
        plt.xlabel('Epoch')
        plt.ylabel(ylabel)
        plt.legend()
        plt.grid(True)
        plt.title(title)
        plt.savefig(os.path.join(out_dir, filename), dpi=200)
        plt.close()

    save_curves(('train_loss', 'val_loss'), None, 'Loss', 'Loss', 'loss.png')
    save_curves(('train_top1', 'val_top1', 'val_top5'), 100, 'Accuracy (%)', 'Accuracy', 'accuracy.png')


def main():
    """Train UNetEfficientViT on CIFAR-10 with early stopping.

    Prints the parameter count (and FLOPs when available), trains with AdamW
    and a cosine learning-rate schedule, saves the best checkpoint whenever
    top-1 accuracy on the test loader improves, stops after ``patience``
    epochs without improvement, then saves the last checkpoint and the
    training curves under ``outputs/``.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    batch_size, epochs, patience = 128, 150, 15
    lr, wd = 3e-4, 0.05
    out_dir = 'outputs'
    best_path = os.path.join(out_dir, 'best_unet_efficientvit_cifar10.pth')
    last_path = os.path.join(out_dir, 'last_unet_efficientvit_cifar10.pth')

    model_cfg = dict(
        in_ch=3,
        num_classes=10,
        enc_channels=(48, 96, 160),
        dec_channels=(160, 96),
        enc_depths=(1,2,2),
        decoder_depths=(1,1),
        heads=(2,4,4),
        mlp_ratio=2.0,
        qk_ratio=0.25,
        drop_path_rate=0.1,
    )
    model = UNetEfficientViT(**model_cfg).to(device)

    total_params, trainable = count_params(model)
    print(f"Params: {total_params:,} ({total_params/1e6:.3f}M) | Trainable: {trainable:,}")
    flops = try_flops(model, input_size=(1,3,32,32))
    if flops is not None:
        print(f"FLOPs (approx): {flops/1e6:.1f}M")

    train_loader, test_loader = get_cifar10_loaders(bs=batch_size, num_workers=4)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)

    history = {key: [] for key in
               ('train_loss', 'train_top1', 'train_top5', 'val_loss', 'val_top1', 'val_top5')}

    best_val = 0.0
    wait = 0  # epochs since the last improvement
    os.makedirs(out_dir, exist_ok=True)

    for ep in range(1, epochs+1):
        t0 = time.time()
        train_loss, train_t1, train_t5 = train_one_epoch(
            model, train_loader, optimizer, device, loss_fn, mixup_alpha=0.8)
        val_loss, val_t1, val_t5 = evaluate(model, test_loader, device, loss_fn)
        scheduler.step()

        epoch_metrics = {
            'train_loss': train_loss, 'train_top1': train_t1, 'train_top5': train_t5,
            'val_loss': val_loss, 'val_top1': val_t1, 'val_top5': val_t5,
        }
        for key, value in epoch_metrics.items():
            history[key].append(value)

        if val_t1 > best_val:
            best_val = val_t1
            wait = 0
            torch.save(model.state_dict(), best_path)
        else:
            wait += 1

        t1 = time.time()
        print(f"E{ep:03d}/{epochs} | TL {train_loss:.3f} | VL {val_loss:.3f} | T1 {train_t1*100:.2f}% | V1 {val_t1*100:.2f}% | V5 {val_t5*100:.2f}% | Wait {wait} | time {t1-t0:.1f}s")

        if wait >= patience:
            print(f"Early stopping at epoch {ep} (no val acc improvement for {patience} epochs).")
            break

    torch.save(model.state_dict(), last_path)
    plot_history(history, out_dir=out_dir)
    print(f"Done. Best val Top-1: {best_val*100:.2f}%")

# Run training only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()


Params: 1,477,610 (1.478M) | Trainable: 1,477,610


100%|██████████| 170M/170M [00:02<00:00, 62.4MB/s]
                                                 

E001/150 | TL 2.307 | VL 2.253 | T1 10.87% | V1 14.42% | V5 59.71% | Wait 0 | time 45.7s


                                                 

E002/150 | TL 2.271 | VL 2.158 | T1 13.57% | V1 18.96% | V5 71.73% | Wait 0 | time 46.5s


                                                 

E003/150 | TL 2.230 | VL 2.108 | T1 15.52% | V1 19.10% | V5 77.75% | Wait 0 | time 48.1s


                                                 

E004/150 | TL 2.224 | VL 2.096 | T1 15.58% | V1 18.47% | V5 80.08% | Wait 1 | time 47.5s


                                                 

E005/150 | TL 2.214 | VL 2.061 | T1 16.25% | V1 21.59% | V5 80.71% | Wait 0 | time 47.6s


                                                 

E006/150 | TL 2.217 | VL 2.044 | T1 16.36% | V1 22.35% | V5 83.01% | Wait 0 | time 47.7s


                                                 

E007/150 | TL 2.204 | VL 2.036 | T1 16.87% | V1 20.74% | V5 82.30% | Wait 1 | time 47.7s


                                                 

E008/150 | TL 2.210 | VL 2.033 | T1 17.18% | V1 21.95% | V5 83.12% | Wait 2 | time 47.5s


                                                 

E009/150 | TL 2.198 | VL 2.060 | T1 17.67% | V1 23.48% | V5 83.44% | Wait 0 | time 47.7s


                                                 

E010/150 | TL 2.190 | VL 2.020 | T1 18.10% | V1 25.00% | V5 84.73% | Wait 0 | time 47.7s


                                                 

E011/150 | TL 2.199 | VL 2.009 | T1 18.34% | V1 24.79% | V5 83.31% | Wait 1 | time 47.7s


                                                 

E012/150 | TL 2.189 | VL 1.977 | T1 18.78% | V1 25.98% | V5 85.49% | Wait 0 | time 48.0s


                                                 

E013/150 | TL 2.204 | VL 2.005 | T1 18.10% | V1 25.81% | V5 85.08% | Wait 1 | time 48.0s


                                                 

E014/150 | TL 2.183 | VL 1.962 | T1 19.32% | V1 26.78% | V5 86.41% | Wait 0 | time 48.1s


                                                 

E015/150 | TL 2.192 | VL 1.973 | T1 19.19% | V1 28.49% | V5 88.09% | Wait 0 | time 48.0s


                                                 

E016/150 | TL 2.170 | VL 2.003 | T1 20.49% | V1 25.17% | V5 82.99% | Wait 1 | time 48.1s


                                                 

E017/150 | TL 2.174 | VL 1.957 | T1 20.66% | V1 28.77% | V5 86.83% | Wait 0 | time 48.1s


                                                 

E018/150 | TL 2.166 | VL 1.955 | T1 21.39% | V1 32.22% | V5 88.73% | Wait 0 | time 47.8s


                                                 

E019/150 | TL 2.166 | VL 1.924 | T1 21.86% | V1 32.76% | V5 89.97% | Wait 0 | time 47.7s


                                                 

E020/150 | TL 2.146 | VL 1.898 | T1 23.24% | V1 34.83% | V5 91.06% | Wait 0 | time 47.7s


                                                 

E021/150 | TL 2.139 | VL 1.901 | T1 23.90% | V1 33.22% | V5 89.87% | Wait 1 | time 47.5s


                                                 

E022/150 | TL 2.134 | VL 1.874 | T1 24.20% | V1 34.67% | V5 90.22% | Wait 2 | time 47.6s


                                                 

E023/150 | TL 2.108 | VL 1.845 | T1 25.75% | V1 36.13% | V5 91.71% | Wait 0 | time 47.8s


                                                 

E024/150 | TL 2.110 | VL 1.815 | T1 25.89% | V1 39.25% | V5 92.78% | Wait 0 | time 47.7s


                                                 

E025/150 | TL 2.099 | VL 1.815 | T1 26.35% | V1 38.40% | V5 92.00% | Wait 1 | time 47.6s


                                                 

E026/150 | TL 2.097 | VL 1.803 | T1 27.01% | V1 39.62% | V5 92.35% | Wait 0 | time 47.6s


                                                 

E027/150 | TL 2.104 | VL 1.799 | T1 26.90% | V1 39.95% | V5 94.02% | Wait 0 | time 47.7s


                                                 

E028/150 | TL 2.090 | VL 1.760 | T1 27.52% | V1 39.96% | V5 93.44% | Wait 0 | time 47.4s


                                                 

E029/150 | TL 2.080 | VL 1.859 | T1 27.89% | V1 36.90% | V5 91.79% | Wait 1 | time 47.7s


                                                 

E030/150 | TL 2.099 | VL 1.775 | T1 27.50% | V1 41.44% | V5 93.39% | Wait 0 | time 47.5s


                                                 

E031/150 | TL 2.061 | VL 1.754 | T1 29.55% | V1 41.76% | V5 93.05% | Wait 0 | time 47.6s


                                                 

E032/150 | TL 2.082 | VL 1.746 | T1 28.89% | V1 44.46% | V5 94.11% | Wait 0 | time 47.6s


                                                 

E033/150 | TL 2.085 | VL 1.750 | T1 28.75% | V1 43.38% | V5 93.51% | Wait 1 | time 47.5s


                                                 

E034/150 | TL 2.053 | VL 1.770 | T1 30.71% | V1 44.78% | V5 92.96% | Wait 0 | time 47.5s


                                                 

E035/150 | TL 2.061 | VL 1.679 | T1 30.25% | V1 47.88% | V5 94.50% | Wait 0 | time 47.6s


                                                 

E036/150 | TL 2.073 | VL 1.715 | T1 30.16% | V1 46.60% | V5 94.68% | Wait 1 | time 47.3s


                                                 

E037/150 | TL 2.075 | VL 1.709 | T1 30.08% | V1 49.10% | V5 94.77% | Wait 0 | time 47.6s


                                                 

E038/150 | TL 2.043 | VL 1.678 | T1 32.12% | V1 49.18% | V5 94.56% | Wait 0 | time 47.7s


                                                 

E039/150 | TL 2.051 | VL 1.686 | T1 31.50% | V1 49.75% | V5 94.75% | Wait 0 | time 47.6s


                                                 

E040/150 | TL 2.030 | VL 1.704 | T1 32.80% | V1 48.50% | V5 93.37% | Wait 1 | time 47.6s


                                                 

E041/150 | TL 2.031 | VL 1.657 | T1 33.25% | V1 50.64% | V5 95.33% | Wait 0 | time 47.9s


                                                 

E042/150 | TL 2.036 | VL 1.658 | T1 33.12% | V1 51.18% | V5 94.82% | Wait 0 | time 47.6s


                                                 

E043/150 | TL 2.043 | VL 1.628 | T1 33.20% | V1 52.29% | V5 95.37% | Wait 0 | time 47.8s


                                                 

E044/150 | TL 2.035 | VL 1.621 | T1 33.36% | V1 51.64% | V5 94.56% | Wait 1 | time 47.5s


                                                 

E045/150 | TL 2.021 | VL 1.614 | T1 34.10% | V1 55.36% | V5 95.45% | Wait 0 | time 47.7s


                                                 

E046/150 | TL 2.017 | VL 1.605 | T1 34.76% | V1 54.77% | V5 95.72% | Wait 1 | time 47.5s


                                                 

E047/150 | TL 2.017 | VL 1.571 | T1 35.04% | V1 55.72% | V5 95.72% | Wait 0 | time 47.8s


                                                 

E048/150 | TL 2.038 | VL 1.584 | T1 34.01% | V1 56.53% | V5 95.21% | Wait 0 | time 47.9s


                                                 

E049/150 | TL 2.019 | VL 1.616 | T1 34.76% | V1 55.28% | V5 95.66% | Wait 1 | time 48.1s


                                                 

E050/150 | TL 2.017 | VL 1.599 | T1 35.29% | V1 56.15% | V5 95.89% | Wait 2 | time 47.9s


                                                 

E051/150 | TL 2.007 | VL 1.588 | T1 35.91% | V1 56.06% | V5 95.81% | Wait 3 | time 48.2s


                                                 

E052/150 | TL 2.022 | VL 1.545 | T1 35.43% | V1 58.01% | V5 95.94% | Wait 0 | time 48.1s


                                                 

E053/150 | TL 1.988 | VL 1.566 | T1 37.12% | V1 56.70% | V5 95.53% | Wait 1 | time 48.3s


                                                 

E054/150 | TL 2.007 | VL 1.556 | T1 36.18% | V1 58.07% | V5 95.71% | Wait 0 | time 47.9s


                                                 

E055/150 | TL 2.023 | VL 1.543 | T1 35.80% | V1 59.10% | V5 95.94% | Wait 0 | time 48.3s


                                                 

E056/150 | TL 1.979 | VL 1.565 | T1 38.09% | V1 55.74% | V5 95.37% | Wait 1 | time 47.8s


                                                 

E057/150 | TL 2.001 | VL 1.502 | T1 36.77% | V1 59.87% | V5 96.40% | Wait 0 | time 47.8s


                                                 

E058/150 | TL 2.003 | VL 1.503 | T1 36.88% | V1 60.47% | V5 96.35% | Wait 0 | time 47.5s


                                                 

E059/150 | TL 1.999 | VL 1.514 | T1 36.96% | V1 61.74% | V5 96.38% | Wait 0 | time 47.8s


                                                 

E060/150 | TL 1.961 | VL 1.497 | T1 38.87% | V1 59.68% | V5 95.89% | Wait 1 | time 47.5s


                                                 

E061/150 | TL 1.999 | VL 1.519 | T1 37.09% | V1 61.78% | V5 96.74% | Wait 0 | time 47.8s


                                                 

E062/150 | TL 1.957 | VL 1.479 | T1 39.46% | V1 63.41% | V5 96.45% | Wait 0 | time 47.6s


                                                 

E063/150 | TL 1.968 | VL 1.532 | T1 38.71% | V1 62.43% | V5 95.69% | Wait 1 | time 47.7s


                                                 

E064/150 | TL 1.936 | VL 1.462 | T1 40.28% | V1 64.47% | V5 96.13% | Wait 0 | time 47.5s


                                                 

E065/150 | TL 1.970 | VL 1.508 | T1 38.63% | V1 61.53% | V5 95.24% | Wait 1 | time 47.7s


                                                 

E066/150 | TL 1.962 | VL 1.441 | T1 39.21% | V1 65.67% | V5 96.68% | Wait 0 | time 47.5s


                                                 

E067/150 | TL 1.958 | VL 1.477 | T1 39.86% | V1 64.74% | V5 95.92% | Wait 1 | time 47.7s


                                                 

E068/150 | TL 1.967 | VL 1.491 | T1 38.99% | V1 65.06% | V5 96.31% | Wait 2 | time 47.5s


                                                 

E069/150 | TL 1.976 | VL 1.475 | T1 38.86% | V1 63.95% | V5 96.69% | Wait 3 | time 47.6s


                                                 

E070/150 | TL 1.948 | VL 1.426 | T1 40.34% | V1 66.19% | V5 97.08% | Wait 0 | time 47.6s


                                                 

E071/150 | TL 1.950 | VL 1.480 | T1 40.14% | V1 64.47% | V5 96.54% | Wait 1 | time 47.6s


                                                 

E072/150 | TL 1.956 | VL 1.423 | T1 39.72% | V1 67.54% | V5 96.82% | Wait 0 | time 47.4s


                                                 

E073/150 | TL 1.968 | VL 1.443 | T1 38.85% | V1 66.63% | V5 96.86% | Wait 1 | time 47.7s


                                                 

E074/150 | TL 1.947 | VL 1.436 | T1 40.07% | V1 64.89% | V5 96.74% | Wait 2 | time 47.6s


                                                 

E075/150 | TL 1.947 | VL 1.414 | T1 40.23% | V1 67.10% | V5 97.18% | Wait 3 | time 48.2s


                                                 

E076/150 | TL 1.937 | VL 1.411 | T1 40.77% | V1 67.97% | V5 96.92% | Wait 0 | time 48.1s


                                                 

E077/150 | TL 1.944 | VL 1.463 | T1 40.49% | V1 66.90% | V5 96.74% | Wait 1 | time 47.7s


                                                 

E078/150 | TL 1.895 | VL 1.370 | T1 42.72% | V1 68.92% | V5 97.22% | Wait 0 | time 47.5s


                                                 

E079/150 | TL 1.942 | VL 1.396 | T1 40.61% | V1 68.25% | V5 97.13% | Wait 1 | time 47.8s


                                                 

E080/150 | TL 1.935 | VL 1.441 | T1 41.00% | V1 65.06% | V5 96.49% | Wait 2 | time 47.5s


                                                 

E081/150 | TL 1.931 | VL 1.376 | T1 41.16% | V1 68.78% | V5 97.10% | Wait 3 | time 47.8s


                                                 

E082/150 | TL 1.946 | VL 1.398 | T1 40.37% | V1 68.57% | V5 97.21% | Wait 4 | time 47.7s


                                                 

E083/150 | TL 1.942 | VL 1.380 | T1 40.71% | V1 68.64% | V5 97.17% | Wait 5 | time 47.7s


                                                 

E084/150 | TL 1.916 | VL 1.353 | T1 42.22% | V1 69.64% | V5 97.40% | Wait 0 | time 47.7s


                                                 

E085/150 | TL 1.911 | VL 1.389 | T1 42.10% | V1 68.94% | V5 97.55% | Wait 1 | time 47.8s


                                                 

E086/150 | TL 1.924 | VL 1.416 | T1 41.50% | V1 67.87% | V5 97.22% | Wait 2 | time 47.6s


                                                 

E087/150 | TL 1.912 | VL 1.398 | T1 42.49% | V1 69.52% | V5 97.46% | Wait 3 | time 47.8s


                                                 

E088/150 | TL 1.901 | VL 1.354 | T1 42.83% | V1 69.70% | V5 97.33% | Wait 0 | time 47.6s


                                                 

E089/150 | TL 1.926 | VL 1.361 | T1 42.02% | V1 70.38% | V5 97.37% | Wait 0 | time 47.8s


                                                 

E090/150 | TL 1.941 | VL 1.378 | T1 40.95% | V1 69.88% | V5 97.52% | Wait 1 | time 47.5s


                                                 

E091/150 | TL 1.938 | VL 1.351 | T1 41.11% | V1 69.22% | V5 97.43% | Wait 2 | time 47.7s


                                                 

E092/150 | TL 1.930 | VL 1.381 | T1 41.32% | V1 69.55% | V5 97.52% | Wait 3 | time 47.6s


                                                 

E093/150 | TL 1.925 | VL 1.372 | T1 41.99% | V1 69.92% | V5 97.24% | Wait 4 | time 47.7s


                                                 

E094/150 | TL 1.905 | VL 1.319 | T1 42.69% | V1 70.66% | V5 97.80% | Wait 0 | time 47.5s


                                                 

E095/150 | TL 1.889 | VL 1.353 | T1 43.49% | V1 69.87% | V5 97.41% | Wait 1 | time 47.7s


                                                 

E096/150 | TL 1.890 | VL 1.370 | T1 43.63% | V1 69.50% | V5 97.49% | Wait 2 | time 47.6s


                                                 

E097/150 | TL 1.887 | VL 1.346 | T1 43.79% | V1 71.32% | V5 97.79% | Wait 0 | time 47.7s


                                                 

E098/150 | TL 1.888 | VL 1.335 | T1 43.85% | V1 71.35% | V5 97.72% | Wait 0 | time 47.6s


                                                 

E099/150 | TL 1.895 | VL 1.341 | T1 43.44% | V1 71.55% | V5 97.61% | Wait 0 | time 47.8s


                                                 

E100/150 | TL 1.857 | VL 1.306 | T1 44.98% | V1 71.65% | V5 97.95% | Wait 0 | time 47.9s


                                                 

E101/150 | TL 1.910 | VL 1.304 | T1 42.54% | V1 72.49% | V5 97.74% | Wait 0 | time 48.2s


                                                 

E102/150 | TL 1.902 | VL 1.322 | T1 42.97% | V1 72.10% | V5 97.75% | Wait 1 | time 48.0s


                                                 

E103/150 | TL 1.898 | VL 1.365 | T1 43.20% | V1 70.29% | V5 97.34% | Wait 2 | time 48.3s


                                                 

E104/150 | TL 1.914 | VL 1.348 | T1 42.64% | V1 72.00% | V5 97.76% | Wait 3 | time 47.9s


                                                 

E105/150 | TL 1.900 | VL 1.313 | T1 43.49% | V1 72.81% | V5 97.68% | Wait 0 | time 48.2s


                                                 

E106/150 | TL 1.895 | VL 1.298 | T1 43.44% | V1 72.48% | V5 97.92% | Wait 1 | time 48.3s


                                                 

E107/150 | TL 1.905 | VL 1.320 | T1 42.93% | V1 71.69% | V5 97.76% | Wait 2 | time 48.0s


                                                 

E108/150 | TL 1.898 | VL 1.314 | T1 43.61% | V1 72.31% | V5 97.65% | Wait 3 | time 48.1s


                                                 

E109/150 | TL 1.890 | VL 1.307 | T1 43.78% | V1 73.18% | V5 97.81% | Wait 0 | time 48.1s


                                                 

E110/150 | TL 1.907 | VL 1.335 | T1 43.14% | V1 72.60% | V5 97.64% | Wait 1 | time 47.9s


                                                 

E111/150 | TL 1.861 | VL 1.294 | T1 45.14% | V1 72.85% | V5 97.94% | Wait 2 | time 48.2s


                                                 

E112/150 | TL 1.895 | VL 1.317 | T1 43.48% | V1 72.34% | V5 97.79% | Wait 3 | time 48.0s


                                                 

E113/150 | TL 1.882 | VL 1.317 | T1 44.27% | V1 72.37% | V5 97.77% | Wait 4 | time 47.8s


                                                 

E114/150 | TL 1.888 | VL 1.322 | T1 44.10% | V1 73.65% | V5 97.92% | Wait 0 | time 48.3s


                                                 

E115/150 | TL 1.895 | VL 1.326 | T1 44.04% | V1 73.20% | V5 97.71% | Wait 1 | time 47.8s


                                                 

E116/150 | TL 1.881 | VL 1.302 | T1 44.27% | V1 74.06% | V5 97.93% | Wait 0 | time 48.1s


                                                 

E117/150 | TL 1.880 | VL 1.292 | T1 44.72% | V1 73.60% | V5 98.01% | Wait 1 | time 48.0s


                                                 

E118/150 | TL 1.886 | VL 1.302 | T1 44.15% | V1 73.85% | V5 97.89% | Wait 2 | time 48.1s


                                                 

E119/150 | TL 1.915 | VL 1.337 | T1 42.96% | V1 73.90% | V5 97.66% | Wait 3 | time 48.1s


                                                 

E120/150 | TL 1.834 | VL 1.278 | T1 46.58% | V1 74.02% | V5 97.83% | Wait 4 | time 48.1s


                                                 

E121/150 | TL 1.877 | VL 1.282 | T1 44.88% | V1 74.47% | V5 98.03% | Wait 0 | time 48.2s


                                                 

E122/150 | TL 1.870 | VL 1.305 | T1 45.41% | V1 74.08% | V5 97.99% | Wait 1 | time 47.9s


                                                 

E123/150 | TL 1.902 | VL 1.294 | T1 43.72% | V1 74.49% | V5 97.78% | Wait 0 | time 48.0s


                                                 

E124/150 | TL 1.870 | VL 1.282 | T1 45.14% | V1 74.54% | V5 98.13% | Wait 0 | time 48.0s


                                                 

E125/150 | TL 1.898 | VL 1.298 | T1 43.88% | V1 74.84% | V5 97.98% | Wait 0 | time 48.3s


                                                 

E126/150 | TL 1.884 | VL 1.286 | T1 44.58% | V1 74.42% | V5 98.02% | Wait 1 | time 48.0s


                                                 

E127/150 | TL 1.881 | VL 1.286 | T1 44.80% | V1 75.22% | V5 98.09% | Wait 0 | time 48.2s


                                                 

E128/150 | TL 1.898 | VL 1.305 | T1 43.88% | V1 74.58% | V5 98.01% | Wait 1 | time 47.8s


                                                 

E129/150 | TL 1.879 | VL 1.302 | T1 44.48% | V1 74.26% | V5 98.04% | Wait 2 | time 48.1s


                                                 

E130/150 | TL 1.818 | VL 1.272 | T1 47.38% | V1 73.82% | V5 98.09% | Wait 3 | time 48.5s


                                                 

E131/150 | TL 1.861 | VL 1.263 | T1 45.62% | V1 74.46% | V5 97.96% | Wait 4 | time 48.0s


                                                 

E132/150 | TL 1.848 | VL 1.280 | T1 46.14% | V1 74.49% | V5 98.14% | Wait 5 | time 48.1s


                                                 

E133/150 | TL 1.861 | VL 1.276 | T1 45.22% | V1 74.87% | V5 98.08% | Wait 6 | time 47.9s


                                                 

E134/150 | TL 1.849 | VL 1.278 | T1 45.98% | V1 74.15% | V5 98.17% | Wait 7 | time 48.0s


                                                 

E135/150 | TL 1.903 | VL 1.276 | T1 43.42% | V1 74.93% | V5 98.07% | Wait 8 | time 48.0s


                                                 

E136/150 | TL 1.885 | VL 1.293 | T1 44.53% | V1 74.41% | V5 97.95% | Wait 9 | time 48.0s


                                                 

E137/150 | TL 1.876 | VL 1.285 | T1 44.65% | V1 74.22% | V5 98.05% | Wait 10 | time 47.8s


                                                 

E138/150 | TL 1.894 | VL 1.291 | T1 44.11% | V1 74.27% | V5 97.96% | Wait 11 | time 48.0s


                                                 

E139/150 | TL 1.839 | VL 1.274 | T1 46.53% | V1 75.03% | V5 97.98% | Wait 12 | time 48.0s


                                                 

E140/150 | TL 1.876 | VL 1.283 | T1 44.86% | V1 74.37% | V5 98.12% | Wait 13 | time 48.1s


                                                 

E141/150 | TL 1.869 | VL 1.275 | T1 45.36% | V1 74.64% | V5 98.08% | Wait 14 | time 48.3s


                                                 

E142/150 | TL 1.878 | VL 1.276 | T1 44.96% | V1 74.36% | V5 98.19% | Wait 15 | time 48.0s
Early stopping at epoch 142 (no val acc improvement for 15 epochs).
Done. Best val Top-1: 75.22%
