In [1]:
#!/usr/bin/env python
# coding: utf-8
import os
import pandas as pd
import numpy as np
import random
import wandb
from pprint import pprint

from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
from PIL import Image
import torchvision.models as visual_models
from pytorch_pretrained_vit import ViT

import transformers
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, BertModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
transformers.logging.set_verbosity_error()

transformers.__version__: 4.10.0


### 参数

In [2]:
class CFG:
    train_file = './data/train.tsv'    
    valid_file = './data/dev.tsv'   
    img_key = 'filepath'
    caption_key = 'title'
    max_text_len=34  
    image_size=224
    text_ptm='/home/yjw/ZYJ_WorkSpace/PTMs/chinese-roberta-wwm-ext/'
    img_ptm='Resnet50'  # ['ViT', 'Resnet50', 'VGG16']
    vit_name='B_16_imagenet1k'
    output_dir = './checkpoints/roberta-resnet-dim256-saved/'
    # 训练参数
    dim = 256
    device='cuda:0'
    epochs=50
    learning_rate = 1e-4
    batch_size=256
    eval_epoch = 2
    apex = True
    seed=42 
    # scheduler参数
    scheduler='cosine'                   # ['linear', 'cosine'] # lr scheduler 类型
    last_epoch=-1                        # 从第 last_epoch +1 个epoch开始训练
    batch_scheduler=True                 # 是否每个step结束后更新 lr scheduler
    weight_decay=0.01
    num_warmup_steps = 0
    num_cycles=0.5                    # 如果使用 cosine lr scheduler， 该参数决定学习率曲线的形状，0.5代表半个cosine曲线
    
    # log参数
    log_step = 5
    wandb = True
    key_metrics = 'image_to_text_R@10'
    


In [3]:
#=======设置全局seed保证结果可复现====
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

### 数据预处理

In [4]:
class TrainDataset(Dataset):
    def __init__(self, input_file):
        tokenizer = AutoTokenizer.from_pretrained(CFG.text_ptm)
        data_df = pd.read_csv(input_file, sep='\t')
        img_paths = data_df[CFG.img_key].values
        texts = data_df[CFG.caption_key].values
        img_size = CFG.image_size

        transformation = transforms.Compose([
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor(),
        ])
        
        self.inputs = []
        assert len(texts) == len(img_paths)
        print(f'load data from {input_file} len={len(texts)}')
        for text, img_path in tqdm(zip(texts, img_paths)):
            text_tensor = tokenizer(text, 
                                    max_length=CFG.max_text_len, 
                                    truncation=True, 
                                    return_tensors='pt', 
                                    padding="max_length",)
            for k,v in text_tensor.items():
                text_tensor[k] = v.squeeze()
            img_tensor = transformation(Image.open(img_path))
            self.inputs.append({'text':text_tensor, 'img':img_tensor})

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        return self.inputs[item]

### 模型定义

#### TextEncoder

In [5]:
class TextEncoder(nn.Module):
    def __init__(self, ptm_name, device, freeze=False):
        super().__init__()
        self.model = BertModel.from_pretrained(ptm_name)
        self.model.to(device)
        self.feat_dim = AutoConfig.from_pretrained(ptm_name).hidden_size

        self.freeze = freeze
        if freeze:
            for name ,param in self.model.named_parameters():
                param.requires_grad = False
        
    def forward(self, inputs):
        if self.freeze:
            self.model.eval()
        last_hidden_state = self.model(**inputs).last_hidden_state # [batch_size, seq_len, hidden_size]
        feature = torch.mean(last_hidden_state, axis=1) # [batch_size, hidden_size]

        feature = F.normalize(feature, dim=-1)
        return feature

#### ImageEncoder

In [6]:
class ImageEncoder(nn.Module):
    def __init__(self, ptm_name, device, vit_model_name=None):
        super().__init__()
        assert ptm_name in ['ViT', 'Resnet50', 'VGG16']
        if ptm_name == 'Resnet50':
            self.model = visual_models.resnet50(pretrained=True)
        elif ptm_name == 'ViT':
            self.model = ViT(vit_model_name, pretrained=True)
        else:
            self.model = visual_models.vgg16(pretrained=True)
        self.model.to(device)
        self.feat_dim = 1000

    def forward(self, inputs):
        feature = self.model(inputs) # [batch_size, 1000]
        feature = F.normalize(feature, dim=-1)
        return feature


#### SimpleCLIP

In [7]:
class SimpleCLIP(nn.Module):
    def __init__(self, dim, text_ptm_name, img_ptm_name, device, freeze=False, vit_model_name=None):
        super().__init__()
        self.device = device
        self.textencoder = TextEncoder(text_ptm_name, device, freeze=freeze)
        self.imgencoder = ImageEncoder(img_ptm_name, device, vit_model_name)

        self.text_projection = nn.Parameter(torch.empty(self.textencoder.feat_dim, dim)).to(device)
        self.img_projection = nn.Parameter(torch.empty(self.imgencoder.feat_dim, dim)).to(device)
        self.logit_scale = nn.Parameter(torch.ones([])).to(device)
        self.init_parameters()
    
    def init_parameters(self):
        nn.init.constant_(self.logit_scale, np.log(1 / 0.07))
        nn.init.normal_(self.text_projection, std=0.02)
        nn.init.normal_(self.img_projection, std=0.02)

    def loss(self, text_feat, img_feat, logit_scale):
        labels = torch.arange(text_feat.shape[0], device=self.device, dtype=torch.long)

        logits_per_image = logit_scale * img_feat @ text_feat.T   # [batch_size, batch_size]
        logits_per_text = logit_scale * text_feat @ img_feat.T   # [batch_size, batch_size]
        
        total_loss = (
            F.cross_entropy(logits_per_image, labels) +
            F.cross_entropy(logits_per_text, labels)
            ) / 2
        return total_loss

    def forward(self, text_inputs, img_inputs, outputLoss=False):
        text_feat = self.textencoder(text_inputs) @ self.text_projection # [batch_size, dim]
        img_feat = self.imgencoder(img_inputs) @ self.img_projection # [batch_size, dim]
        logit_scale = self.logit_scale.exp()
        if outputLoss:
            loss = self.loss(text_feat, img_feat, logit_scale)
            return loss, text_feat, img_feat, logit_scale
        else:
            return text_feat, img_feat, logit_scale


### 主程序

#### evaluate

In [8]:
def get_metrics(image_features, text_features, logit_scale):
    metrics = {}
    logits_per_image = (logit_scale * image_features @ text_features.t()).detach().cpu()
    logits_per_text = logits_per_image.t().detach().cpu()

    logits = {"image_to_text": logits_per_image, "text_to_image": logits_per_text}
    ground_truth = torch.arange(len(text_features)).view(-1, 1)

    for name, logit in logits.items():
        ranking = torch.argsort(logit, descending=True)
        preds = torch.where(ranking == ground_truth)[1]
        preds = preds.detach().cpu().numpy()
        metrics[f"{name}_mean_rank"] = preds.mean() + 1
        metrics[f"{name}_median_rank"] = np.floor(np.median(preds)) + 1
        for k in [1, 5, 10]:
            metrics[f"{name}_R@{k}"] = np.mean(preds < k)

    return metrics

def evaluate(model, valid_dataloader, device):
    model.eval()
    all_text_feat = []
    all_img_feat = []
    tk0 = tqdm(enumerate(valid_dataloader),total=len(valid_dataloader))
    total_loss = 0
    for step, batch in tk0:
        batch['img'] = batch['img'].to(device)
        for k,v in batch['text'].items():
            batch['text'][k] = v.to(device)
        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=CFG.apex):
                loss, text_feat, img_feat, logit_scale = model(batch['text'], 
                                                                batch['img'], 
                                                                outputLoss=True)
        total_loss += loss.item()
        all_text_feat.append(text_feat)
        all_img_feat.append(img_feat)
        
    metrics = get_metrics(image_features=torch.cat(all_img_feat),
                          text_features=torch.cat(all_text_feat),
                          logit_scale=logit_scale)
    metrics['eval_loss'] = total_loss / len(valid_dataloader)
    return metrics


#### train loop

In [9]:
def train_eval(model, train_dataloader, valid_dataloader, save_path):
    assert CFG.device.startswith('cuda') or CFG.device == 'cpu', ValueError("Invalid device.")
    device = torch.device(CFG.device)
    best_score = 0
    total_step = 0
    model.to(device)
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    if not len(train_dataloader):
        raise EOFError("Empty train_dataloader.")

    # 过滤掉冻结的权重
    param_optimizer = [(n, p) for n, p in model.named_parameters() if p.requires_grad]

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    # 设置权重decay
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": CFG.weight_decay},
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=CFG.learning_rate, weight_decay=CFG.weight_decay)
    
    num_train_steps = int(len(train_dataloader) * CFG.epochs)
    if CFG.scheduler=='cosine':
        scheduler = get_cosine_schedule_with_warmup(
                    optimizer, 
                    num_warmup_steps=CFG.num_warmup_steps, 
                    num_training_steps=num_train_steps, 
                    num_cycles=CFG.num_cycles, 
#                     last_epoch = ((CFG.last_epoch+1)/CFG.epochs)*num_train_steps
                )
    else:
        scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=CFG.num_warmup_steps, num_training_steps=num_train_steps
            )
    
    for cur_epc in range(int(CFG.epochs)):
        
        if cur_epc % CFG.eval_epoch == 0:
            metrics = evaluate(model, valid_dataloader, device)
            print(f"eval metrics = ")
            pprint(metrics)
            if CFG.wandb:
                wandb.log(metrics, step=total_step)
            if cur_epc > 0 and metrics[CFG.key_metrics] >= best_score:
                best_score = metrics[CFG.key_metrics]
                # model_save_path = os.path.join(save_path,f'epoch{cur_epc}.pt') # 保留所有checkpoint
                model_save_path = os.path.join(save_path,f'best_checkpoint.pt') # 保留最优checkpoint
                torch.save(model.state_dict(), model_save_path)
                print(f'save at {model_save_path}')
        
        training_loss = 0
        print("Epoch: {}".format(cur_epc))
        model.train()
        tk0 = tqdm(enumerate(train_dataloader),total=len(train_dataloader))
        for step, batch in tk0:
            total_step += 1
            batch['img'] = batch['img'].to(device)
            for k,v in batch['text'].items():
                batch['text'][k] = v.to(device)
            with torch.cuda.amp.autocast(enabled=CFG.apex):
                loss, _, _, _ = model(batch['text'], batch['img'], outputLoss=True)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()
            training_loss += loss.item()
            tk0.set_postfix(Epoch=cur_epc, Loss=training_loss/(step+1))
            if CFG.wandb and (step + 1) % CFG.log_step == 0:
                wandb.log({'train_loss':loss, 'lr':optimizer.param_groups[0]["lr"], 'epoch': cur_epc},
                          step=total_step)
        
    torch.cuda.empty_cache()          

#### 训练过程

In [10]:
if __name__ == '__main__':
    seed_everything(seed=42)
    if not os.path.exists(CFG.output_dir):
        os.makedirs(CFG.output_dir)
    with open(os.path.join(CFG.output_dir, 'config.txt'), 'w') as f:
        for k,v in CFG.__dict__.items():
            f.write(f'{k}: {v}\n')

    # 加载数据
    train_dataset = TrainDataset(CFG.train_file)
    valid_dataset = TrainDataset(CFG.valid_file)
    train_dataloader = DataLoader(train_dataset, batch_size=CFG.batch_size,)
    valid_dataloader = DataLoader(valid_dataset, batch_size=CFG.batch_size,)
    # 加载模型
    device = torch.device(CFG.device)
    clipModel = SimpleCLIP(CFG.dim, CFG.text_ptm, CFG.img_ptm, device, freeze=False)
    
    if CFG.wandb:
        wandb.init(project='SimpleCLIP', name=f'roberta-base-{CFG.img_ptm}-batch{CFG.batch_size}-dim{CFG.dim}')
    
    # 训练
    train_eval(clipModel, train_dataloader, valid_dataloader, CFG.output_dir)

load data from ./data/train.tsv len=40000


0it [00:00, ?it/s]

load data from ./data/dev.tsv len=5974


0it [00:00, ?it/s]

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mzyijie[0m (use `wandb login --relogin` to force relogin)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.02147236668194334,
 'image_to_text_R@1': 0.00033478406427854036,
 'image_to_text_R@10': 0.002176096417810512,
 'image_to_text_R@5': 0.001004352192835621,
 'image_to_text_mean_rank': 2825.9907934382322,
 'image_to_text_median_rank': 2796.0,
 'text_to_image_R@1': 0.0,
 'text_to_image_R@10': 0.0008369601606963508,
 'text_to_image_R@5': 0.0005021760964178105,
 'text_to_image_mean_rank': 2869.8068295949115,
 'text_to_image_median_rank': 2808.0}
Epoch: 0


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 1


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.014946842527327439,
 'image_to_text_R@1': 0.017073987278205558,
 'image_to_text_R@10': 0.10026782725142283,
 'image_to_text_R@5': 0.06210244392366923,
 'image_to_text_mean_rank': 571.0929025778373,
 'image_to_text_median_rank': 172.0,
 'text_to_image_R@1': 0.01774355540676264,
 'text_to_image_R@10': 0.10077000334784064,
 'text_to_image_R@5': 0.05825242718446602,
 'text_to_image_mean_rank': 564.6444593237362,
 'text_to_image_median_rank': 172.0}
save at ./roberta-resnet-dim256-saved/best_checkpoint.pt
Epoch: 2


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 3


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.015896394111526508,
 'image_to_text_R@1': 0.011550050217609641,
 'image_to_text_R@10': 0.08938734516237028,
 'image_to_text_R@5': 0.052393706059591566,
 'image_to_text_mean_rank': 681.2870773351189,
 'image_to_text_median_rank': 205.0,
 'text_to_image_R@1': 0.017241379310344827,
 'text_to_image_R@10': 0.09792433880147305,
 'text_to_image_R@5': 0.05691329092735186,
 'text_to_image_mean_rank': 673.4070974221627,
 'text_to_image_median_rank': 201.0}
Epoch: 4


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 5


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.015597575382950405,
 'image_to_text_R@1': 0.014060930699698694,
 'image_to_text_R@10': 0.1049548041513224,
 'image_to_text_R@5': 0.06160026782725142,
 'image_to_text_mean_rank': 668.876129896217,
 'image_to_text_median_rank': 178.0,
 'text_to_image_R@1': 0.01774355540676264,
 'text_to_image_R@10': 0.10780046869768999,
 'text_to_image_R@5': 0.06327418814864412,
 'text_to_image_mean_rank': 655.3362905925678,
 'text_to_image_median_rank': 176.0}
save at ./roberta-resnet-dim256-saved/best_checkpoint.pt
Epoch: 6


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 7


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.015881430202474196,
 'image_to_text_R@1': 0.015902243053230666,
 'image_to_text_R@10': 0.1083026447941078,
 'image_to_text_R@5': 0.06160026782725142,
 'image_to_text_mean_rank': 723.6787746903248,
 'image_to_text_median_rank': 192.0,
 'text_to_image_R@1': 0.017910947438901908,
 'text_to_image_R@10': 0.11533311014395715,
 'text_to_image_R@5': 0.07080682959491129,
 'text_to_image_mean_rank': 680.3029795781721,
 'text_to_image_median_rank': 185.0}
save at ./roberta-resnet-dim256-saved/best_checkpoint.pt
Epoch: 8


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 9


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.015782609775972862,
 'image_to_text_R@1': 0.01774355540676264,
 'image_to_text_R@10': 0.11064613324405759,
 'image_to_text_R@5': 0.06545028456645463,
 'image_to_text_mean_rank': 710.2870773351189,
 'image_to_text_median_rank': 186.0,
 'text_to_image_R@1': 0.022765316370940744,
 'text_to_image_R@10': 0.12269835955808503,
 'text_to_image_R@5': 0.07432206226983595,
 'text_to_image_mean_rank': 708.2022095748242,
 'text_to_image_median_rank': 177.0}
save at ./roberta-resnet-dim256-saved/best_checkpoint.pt
Epoch: 10


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 11


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.015704733862852056,
 'image_to_text_R@1': 0.01690659524606629,
 'image_to_text_R@10': 0.10696350853699364,
 'image_to_text_R@5': 0.06176765985939069,
 'image_to_text_mean_rank': 696.5878808168732,
 'image_to_text_median_rank': 180.0,
 'text_to_image_R@1': 0.024606628724472716,
 'text_to_image_R@10': 0.12470706394375627,
 'text_to_image_R@5': 0.07917643120187479,
 'text_to_image_mean_rank': 715.9834281888183,
 'text_to_image_median_rank': 179.0}
Epoch: 12


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 13


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.015825222595594823,
 'image_to_text_R@1': 0.017910947438901908,
 'image_to_text_R@10': 0.11332440575828591,
 'image_to_text_R@5': 0.07097422162705055,
 'image_to_text_mean_rank': 729.1404419149649,
 'image_to_text_median_rank': 188.0,
 'text_to_image_R@1': 0.024439236692333444,
 'text_to_image_R@10': 0.1297288249079344,
 'text_to_image_R@5': 0.08302644794107801,
 'text_to_image_mean_rank': 731.9859390693003,
 'text_to_image_median_rank': 184.0}
save at ./roberta-resnet-dim256-saved/best_checkpoint.pt
Epoch: 14


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 15


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.01560579271366199,
 'image_to_text_R@1': 0.022263140274522933,
 'image_to_text_R@10': 0.1206896551724138,
 'image_to_text_R@5': 0.07733511884834282,
 'image_to_text_mean_rank': 699.926682289923,
 'image_to_text_median_rank': 176.0,
 'text_to_image_R@1': 0.024104452628054905,
 'text_to_image_R@10': 0.13458319383997322,
 'text_to_image_R@5': 0.08520254435888852,
 'text_to_image_mean_rank': 702.6456310679612,
 'text_to_image_median_rank': 176.0}
save at ./roberta-resnet-dim256-saved/best_checkpoint.pt
Epoch: 16


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 17


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.015769570794266958,
 'image_to_text_R@1': 0.022095748242383664,
 'image_to_text_R@10': 0.12102443923669233,
 'image_to_text_R@5': 0.07348510210913961,
 'image_to_text_mean_rank': 735.4112822229662,
 'image_to_text_median_rank': 184.0,
 'text_to_image_R@1': 0.026113157013726147,
 'text_to_image_R@10': 0.12654837629728824,
 'text_to_image_R@5': 0.07934382323401407,
 'text_to_image_mean_rank': 733.5570806829595,
 'text_to_image_median_rank': 179.0}
save at ./roberta-resnet-dim256-saved/best_checkpoint.pt
Epoch: 18


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 19


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.015761700575239956,
 'image_to_text_R@1': 0.02259792433880147,
 'image_to_text_R@10': 0.12437227987947774,
 'image_to_text_R@5': 0.0780046869768999,
 'image_to_text_mean_rank': 743.0254435888852,
 'image_to_text_median_rank': 185.0,
 'text_to_image_R@1': 0.025443588885169066,
 'text_to_image_R@10': 0.13056578506863073,
 'text_to_image_R@5': 0.08302644794107801,
 'text_to_image_mean_rank': 733.5609306996987,
 'text_to_image_median_rank': 180.0}
save at ./roberta-resnet-dim256-saved/best_checkpoint.pt
Epoch: 20


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 21


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.015798594220541418,
 'image_to_text_R@1': 0.02058921995313023,
 'image_to_text_R@10': 0.12169400736524942,
 'image_to_text_R@5': 0.07666555071978574,
 'image_to_text_mean_rank': 734.7068965517242,
 'image_to_text_median_rank': 187.0,
 'text_to_image_R@1': 0.02795446936725812,
 'text_to_image_R@10': 0.1324070974221627,
 'text_to_image_R@5': 0.08202209574824239,
 'text_to_image_mean_rank': 739.1032808838299,
 'text_to_image_median_rank': 185.0}
Epoch: 22


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 23


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.01571654590467612,
 'image_to_text_R@1': 0.021426180113826583,
 'image_to_text_R@10': 0.12788751255440242,
 'image_to_text_R@5': 0.08018078339471041,
 'image_to_text_mean_rank': 748.2741881486442,
 'image_to_text_median_rank': 177.0,
 'text_to_image_R@1': 0.026950117174422496,
 'text_to_image_R@10': 0.13558754603280884,
 'text_to_image_R@5': 0.08386340810177435,
 'text_to_image_mean_rank': 748.1596919986608,
 'text_to_image_median_rank': 175.0}
save at ./roberta-resnet-dim256-saved/best_checkpoint.pt
Epoch: 24


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 25


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.01561029530906429,
 'image_to_text_R@1': 0.02259792433880147,
 'image_to_text_R@10': 0.1290592567793773,
 'image_to_text_R@5': 0.08151991965182458,
 'image_to_text_mean_rank': 735.8995647807164,
 'image_to_text_median_rank': 172.0,
 'text_to_image_R@1': 0.029628389688650822,
 'text_to_image_R@10': 0.14178105122196183,
 'text_to_image_R@5': 0.09240040174087713,
 'text_to_image_mean_rank': 738.8069969869434,
 'text_to_image_median_rank': 171.0}
save at ./roberta-resnet-dim256-saved/best_checkpoint.pt
Epoch: 26


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 27


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.015606950735673308,
 'image_to_text_R@1': 0.024104452628054905,
 'image_to_text_R@10': 0.12855708068295948,
 'image_to_text_R@5': 0.08369601606963509,
 'image_to_text_mean_rank': 740.8883495145631,
 'image_to_text_median_rank': 172.0,
 'text_to_image_R@1': 0.03163709407432206,
 'text_to_image_R@10': 0.1464680281218614,
 'text_to_image_R@5': 0.09725477067291596,
 'text_to_image_mean_rank': 744.305323066622,
 'text_to_image_median_rank': 166.0}
Epoch: 28


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 29


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.015621474051537612,
 'image_to_text_R@1': 0.023100100435219283,
 'image_to_text_R@10': 0.12872447271509876,
 'image_to_text_R@5': 0.07817207900903916,
 'image_to_text_mean_rank': 747.4281888182122,
 'image_to_text_median_rank': 170.0,
 'text_to_image_R@1': 0.02979578172079009,
 'text_to_image_R@10': 0.14278540341479745,
 'text_to_image_R@5': 0.0937395379979913,
 'text_to_image_mean_rank': 752.4857716772682,
 'text_to_image_median_rank': 165.0}
Epoch: 30


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 31


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.01563216431532055,
 'image_to_text_R@1': 0.021593572145965852,
 'image_to_text_R@10': 0.1329092735185805,
 'image_to_text_R@5': 0.08369601606963509,
 'image_to_text_mean_rank': 752.6476397723468,
 'image_to_text_median_rank': 165.0,
 'text_to_image_R@1': 0.02996317375292936,
 'text_to_image_R@10': 0.14295279544693673,
 'text_to_image_R@5': 0.09290257783729494,
 'text_to_image_mean_rank': 754.5306327418815,
 'text_to_image_median_rank': 169.0}
save at ./roberta-resnet-dim256-saved/best_checkpoint.pt
Epoch: 32


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 33


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.01571911476397266,
 'image_to_text_R@1': 0.023602276531637094,
 'image_to_text_R@10': 0.13039839303649145,
 'image_to_text_R@5': 0.08085035152326749,
 'image_to_text_mean_rank': 771.6956812855708,
 'image_to_text_median_rank': 168.0,
 'text_to_image_R@1': 0.0286240374958152,
 'text_to_image_R@10': 0.14010713090056914,
 'text_to_image_R@5': 0.09022430532306662,
 'text_to_image_mean_rank': 770.8883495145631,
 'text_to_image_median_rank': 170.0}
Epoch: 34


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 35


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.01574667659588158,
 'image_to_text_R@1': 0.023267492467358555,
 'image_to_text_R@10': 0.13006360897221292,
 'image_to_text_R@5': 0.08352862403749582,
 'image_to_text_mean_rank': 783.926682289923,
 'image_to_text_median_rank': 172.0,
 'text_to_image_R@1': 0.02879142952795447,
 'text_to_image_R@10': 0.1386006026113157,
 'text_to_image_R@5': 0.09022430532306662,
 'text_to_image_mean_rank': 780.1310679611651,
 'text_to_image_median_rank': 169.0}
Epoch: 36


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 37


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.015760160284116864,
 'image_to_text_R@1': 0.025610980917308335,
 'image_to_text_R@10': 0.1342484097756947,
 'image_to_text_R@5': 0.08218948778038165,
 'image_to_text_mean_rank': 785.0487110813525,
 'image_to_text_median_rank': 172.0,
 'text_to_image_R@1': 0.03046534984934717,
 'text_to_image_R@10': 0.14178105122196183,
 'text_to_image_R@5': 0.09039169735520589,
 'text_to_image_mean_rank': 782.8927017073987,
 'text_to_image_median_rank': 171.0}
save at ./roberta-resnet-dim256-saved/best_checkpoint.pt
Epoch: 38


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 39


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.015754130164471764,
 'image_to_text_R@1': 0.026113157013726147,
 'image_to_text_R@10': 0.13341144961499832,
 'image_to_text_R@5': 0.08235687981252092,
 'image_to_text_mean_rank': 784.3282557750251,
 'image_to_text_median_rank': 170.0,
 'text_to_image_R@1': 0.02946099765651155,
 'text_to_image_R@10': 0.1452962838968865,
 'text_to_image_R@5': 0.08972212922664881,
 'text_to_image_mean_rank': 783.706729159692,
 'text_to_image_median_rank': 170.0}
Epoch: 40


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 41


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.015725768986158073,
 'image_to_text_R@1': 0.024941412788751255,
 'image_to_text_R@10': 0.13608972212922665,
 'image_to_text_R@5': 0.08285905590893873,
 'image_to_text_mean_rank': 782.8729494476063,
 'image_to_text_median_rank': 167.0,
 'text_to_image_R@1': 0.03013056578506863,
 'text_to_image_R@10': 0.14462671576832942,
 'text_to_image_R@5': 0.09055908938734517,
 'text_to_image_mean_rank': 783.3927017073987,
 'text_to_image_median_rank': 170.0}
save at ./roberta-resnet-dim256-saved/best_checkpoint.pt
Epoch: 42


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 43


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.015728124999441206,
 'image_to_text_R@1': 0.025778372949447605,
 'image_to_text_R@10': 0.1362571141613659,
 'image_to_text_R@5': 0.08453297623033143,
 'image_to_text_mean_rank': 783.9948108470037,
 'image_to_text_median_rank': 168.0,
 'text_to_image_R@1': 0.031971878138600605,
 'text_to_image_R@10': 0.14496149983260798,
 'text_to_image_R@5': 0.09340475393371275,
 'text_to_image_mean_rank': 784.3972212922665,
 'text_to_image_median_rank': 170.0}
save at ./roberta-resnet-dim256-saved/best_checkpoint.pt
Epoch: 44


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 45


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.0157375979082038,
 'image_to_text_R@1': 0.025778372949447605,
 'image_to_text_R@10': 0.13826581854703715,
 'image_to_text_R@5': 0.08486776029460998,
 'image_to_text_mean_rank': 785.8473384666889,
 'image_to_text_median_rank': 167.0,
 'text_to_image_R@1': 0.03163709407432206,
 'text_to_image_R@10': 0.14563106796116504,
 'text_to_image_R@5': 0.09340475393371275,
 'text_to_image_mean_rank': 786.558921995313,
 'text_to_image_median_rank': 170.0}
save at ./roberta-resnet-dim256-saved/best_checkpoint.pt
Epoch: 46


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 47


  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

eval metrics = 
{'eval_loss': 0.0157456734450534,
 'image_to_text_R@1': 0.026280549045865416,
 'image_to_text_R@10': 0.13910277870773352,
 'image_to_text_R@5': 0.0841981921660529,
 'image_to_text_mean_rank': 786.4055908938734,
 'image_to_text_median_rank': 166.0,
 'text_to_image_R@1': 0.031134917977904252,
 'text_to_image_R@10': 0.14563106796116504,
 'text_to_image_R@5': 0.09323736190157349,
 'text_to_image_mean_rank': 787.2830599263475,
 'text_to_image_median_rank': 172.0}
save at ./roberta-resnet-dim256-saved/best_checkpoint.pt
Epoch: 48


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 49


  0%|          | 0/157 [00:00<?, ?it/s]

wandb: Network error (ReadTimeout), entering retry loop.
