In [2]:
from vit_model import MaskedAutoEncoderViT
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Smoke-test the patch embedding on one random input tensor of shape
# (1, 1, 224, 224); presumably (batch, channels, height, width) -- the model is
# built with in_chans=1 to match dim 1.
inputs = torch.randn((1, 1, 224, 224))
mae = MaskedAutoEncoderViT(in_chans=1)
patched_inputs = mae.patch_embed(inputs)
# Recorded output: torch.Size([1, 196, 768]).
print(patched_inputs.shape)

torch.Size([1, 196, 768])


In [7]:
# Add the positional embeddings for the patch tokens. Slot 0 of pos_embed is
# skipped here; a later cell combines pos_embed[:, :1, :] with the cls token.
# The sum is stored under a new name so that re-running this cell does not add
# the positional embedding to `patched_inputs` a second time.
embedded_inputs = patched_inputs + mae.pos_embed[:, 1:, :]

# 0.75 is presumably the fraction of patch tokens to drop: in the recorded
# output 49 of the 196 tokens remain in `masked_x`. The masking is random and
# no seed is set, so the concrete indices differ from run to run.
masked_x, mask, ids_restore = mae.random_masking(embedded_inputs, 0.75)
print(masked_x.shape)
print(mask.shape)
print(ids_restore.shape)

torch.Size([1, 49, 768])
torch.Size([1, 196])
torch.Size([1, 196])


In [9]:
# Display the restore indices returned by random_masking (shape (1, 196) per
# the shapes printed in the previous cell).
ids_restore

tensor([[ 96,  40, 161, 165,  72,  18,  47, 133,  31,  37,  82,  14, 135, 108,
         120, 130,  93, 158, 183, 122,  94, 111,  64, 137, 187, 152, 171,  15,
         136,   2, 162,  63, 144,   3, 173, 156, 103, 172,  62, 127,   9,  79,
         188,  29, 141, 184,  56,  17,   0, 147, 180, 194, 160, 179,  97, 153,
          60,  74,  71,  33, 125, 138,  44, 190, 124, 110,  92,   8, 192,  22,
         149,  59,  12,  41, 182, 163,  48,  99,  88,  65, 170, 132,  32, 114,
         113, 128,  80, 164, 131,  45,  57, 129,  58,  27,  83, 167,  16, 185,
          50,  68,  28,  67,  52, 123,  34, 166, 177, 126,  73, 181,  87,  55,
          81,  69,  42,  95,  23,  84, 104,   6,  13, 189, 155,  85,  19,  30,
         142, 148,  38,  49,  10,  53,   7,   5, 112, 176, 118, 116,  36, 186,
         134, 140, 115,  77, 146,  21,  91, 117, 109,  43,  76, 154, 143, 121,
          61, 150,  86,   4, 159,  20,  75,  66, 168,  11,  39,   1, 178,  46,
         193, 119,  24,  70,  35, 105,  54, 102, 174

In [12]:
# Minimal torch.gather demo: pick the values at positions 0, 3 and 5 of a
# 1-D tensor holding the integers 1..8.
temp = torch.arange(1, 9)
ids = torch.tensor([0, 3, 5])
torch.gather(temp, 0, ids)

tensor([1, 4, 6])

In [14]:
# Combine the class token with the first positional-embedding slot, then
# expand it along dim 0 to patched_inputs.shape[0]. That size is 1 in this
# notebook, which is why both printed shapes are identical.
cls_token = mae.cls_token + mae.pos_embed[:, :1, :]
cls_tokens = cls_token.expand(patched_inputs.shape[0], -1, -1)
print(cls_token.shape)
print(cls_tokens.shape)

torch.Size([1, 1, 768])
torch.Size([1, 1, 768])


In [9]:
import argparse

def get_args_parser():
    """Build the command-line parser for the fine-tuning script.

    Returns:
        argparse.ArgumentParser: parser exposing the training, model,
        optimizer, output/device and W&B options defined below. Call
        ``parse_args()`` on it to obtain the namespace.
    """
    parser = argparse.ArgumentParser('Fine-tuning on NDI images', add_help=True)
    # The option string must not contain a space: argparse derives the
    # attribute name from it, so '--batch size' would yield the unusable
    # attribute "batch size" instead of ``args.batch_size``.
    parser.add_argument('--batch_size', default=32, type=int, help='Batch size per GPU')
    parser.add_argument('--epochs', default=50, type=int)

    # Model parameters
    parser.add_argument('--model', default='mae_vit_large_patch16', type=str, metavar='MODEL',
                        help='Name of model to train')

    parser.add_argument('--input_size', default=224, type=int,
                        help='images input size')

    parser.add_argument('--mask_ratio', default=0.75, type=float,
                        help='Masking ratio (percentage of removed patches).')

    parser.add_argument('--layer_scale_init_value', default=1e-6, type=float,
                        help="Layer scale initial values")

    # Optimizer parameters
    # Help text now states the default that is actually configured (1e-4).
    parser.add_argument('--weight_decay', type=float, default=1e-4,
                        help='weight decay (default: 1e-4)')

    parser.add_argument('--lr', type=float, default=5e-3, metavar='LR',
                        help='learning rate (absolute lr)')

    parser.add_argument('--min_lr', type=float, default=0., metavar='LR',
                        help='lower lr bound for cosine schedulers that hit 0')

    # Output, device and seed parameters
    parser.add_argument('--output_dir', default='./checkpoints/',
                        help='path where to save, empty for no saving')
    parser.add_argument('--log_dir', default='./logs/',
                        help='path where to save the log')
    parser.add_argument('--device', default='cuda',
                        help='device to use for training / testing')
    parser.add_argument('--seed', default=19981303, type=int)

    # Wandb Parameters
    parser.add_argument('--project', default='Test which ViT suits NDI images best', type=str,
                        help="The name of the W&B project where you're sending the new run.")

    return parser

In [17]:
from torch.nn import functional as F

# Two batches of 32 random 768-dimensional vectors, each row scaled to unit
# L2 norm, so every entry of the product below is a cosine similarity.
a = F.normalize(torch.randn(32, 768), dim=1)
b = F.normalize(torch.randn(32, 768), dim=1)

# sim_mat[i, j] is the dot product of a[i] and b[j]; shape (32, 32).
sim_mat = a @ b.T

In [20]:
# Cross-entropy by hand: take the row-wise log-softmax of sim_mat, keep the
# diagonal (entry i of row i), and average the negated values.
logpt1 = torch.diag(F.log_softmax(sim_mat, dim=-1))
loss1 = logpt1.mean().neg()
print(loss1)

tensor(3.4728)


In [21]:
# Same loss through F.cross_entropy, with row i's target class being i. The
# number of targets follows sim_mat's row count instead of a hard-coded 32, so
# this cell keeps working if the batch size used to build sim_mat changes.
f_loss1 = F.cross_entropy(sim_mat, torch.arange(sim_mat.shape[0], device=sim_mat.device))
print(f_loss1)

tensor(3.4728)


In [7]:
from utils import AverageMeter, ProgressMeter

# Values shared by both progress meters, named once instead of repeating the
# literals in every call and prefix string.
NUM_BATCHES = 32  # presumably batches per epoch; rendered as "/32" in the progress line
EPOCH = 50        # epoch number shown in the prefix

# One meter per tracked quantity: (display name, format spec).
batch_time = AverageMeter('Batch Time', ':6.3f')
data_time = AverageMeter('Data Time', ':6.3f')
train_loss = AverageMeter('Train Loss', ':.4e')
train_acc_10 = AverageMeter('Train Acc@10', ':6.2f')
val_loss = AverageMeter('Val Loss', ':.4e')
val_acc_10 = AverageMeter('Val Acc@10', ':6.2f')
val_acc_20 = AverageMeter('Val Acc@20', ':6.2f')
val_acc_30 = AverageMeter('Val Acc@30', ':6.2f')

# The prefixes evaluate to exactly the same text as before; the epoch number
# is now interpolated rather than written inside a placeholder-less f-string.
train_progress = ProgressMeter(NUM_BATCHES, [batch_time, data_time, train_loss, train_acc_10],
                                prefix=f'Training Progress\tEpoch: [{EPOCH}]')
val_progress = ProgressMeter(NUM_BATCHES, [val_loss, val_acc_10, val_acc_20, val_acc_30],
                                prefix=f'Validation Progress\tEpoch: [{EPOCH}]')


In [2]:
# Show the instance attributes of the training progress meter.
vars(train_progress)

{'batch_fmtstr': '[{:2d}/32]',
 'meters': [<utils.AverageMeter at 0x2b61f7f00eb0>,
  <utils.AverageMeter at 0x2b61f7f00520>,
  <utils.AverageMeter at 0x2b61f7f00fa0>,
  <utils.AverageMeter at 0x2b61f7f00610>],
 'prefix': 'Training Progress\tEpoch: [50]'}

In [6]:
# Show the instance attributes of the second meter in the list ('Data Time').
vars(train_progress.meters[1])

{'name': 'Data Time', 'fmt': '6.3f', 'val': 0, 'avg': 0, 'sum': 0, 'count': 0}

In [8]:
# Render the progress line for batch index 10. No meter has been updated yet,
# so every value in the recorded output is zero.
train_progress.display(10)

Training Progress	Epoch: [50][10/32]	Batch Time  0.000 ( 0.000)	Data Time  0.000 ( 0.000)	Train Loss 0.0000e+00 (0.0000e+00)	Train Acc@10   0.00 (  0.00)


In [15]:
import timm

# List the model names known to timm that match the 'convnext*' wildcard.
timm.list_models('convnext*')

['convnext_atto',
 'convnext_atto_ols',
 'convnext_base',
 'convnext_base_384_in22ft1k',
 'convnext_base_in22ft1k',
 'convnext_base_in22k',
 'convnext_femto',
 'convnext_femto_ols',
 'convnext_large',
 'convnext_large_384_in22ft1k',
 'convnext_large_in22ft1k',
 'convnext_large_in22k',
 'convnext_nano',
 'convnext_nano_ols',
 'convnext_pico',
 'convnext_pico_ols',
 'convnext_small',
 'convnext_small_384_in22ft1k',
 'convnext_small_in22ft1k',
 'convnext_small_in22k',
 'convnext_tiny',
 'convnext_tiny_384_in22ft1k',
 'convnext_tiny_hnf',
 'convnext_tiny_in22ft1k',
 'convnext_tiny_in22k',
 'convnext_xlarge_384_in22ft1k',
 'convnext_xlarge_in22ft1k',
 'convnext_xlarge_in22k']

In [22]:
from models.vit_model import vit_tiny

# Build 'convnext_small_in22ft1k' through timm with pretrained weights. The
# recorded output shows the checkpoint being downloaded into the local torch
# hub cache, so this run needed network access.
model = timm.create_model('convnext_small_in22ft1k', pretrained=True)

Downloading: "https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_224.pth" to /home/gp.sc.cc.tohoku.ac.jp/duanct/.cache/torch/hub/checkpoints/convnext_small_22k_1k_224.pth


In [21]:
# Inspect the final fully connected layer of the classifier head.
# NOTE(review): this cell carries execution count 21 while the cell that
# creates `model` carries 22, so the recorded output may belong to a
# previously loaded model -- re-run after model creation to confirm.
model.head.fc

Linear(in_features=1024, out_features=1000, bias=True)

In [23]:
# Print the full module tree of the model created above (very long output).
model

ConvNeXt(
  (stem): Sequential(
    (0): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
    (1): LayerNorm2d((96,), eps=1e-06, elementwise_affine=True)
  )
  (stages): Sequential(
    (0): ConvNeXtStage(
      (downsample): Identity()
      (blocks): Sequential(
        (0): ConvNeXtBlock(
          (conv_dw): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96)
          (norm): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
          (mlp): Mlp(
            (fc1): Linear(in_features=96, out_features=384, bias=True)
            (act): GELU()
            (drop1): Dropout(p=0.0, inplace=False)
            (fc2): Linear(in_features=384, out_features=96, bias=True)
            (drop2): Dropout(p=0.0, inplace=False)
          )
          (drop_path): Identity()
        )
        (1): ConvNeXtBlock(
          (conv_dw): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96)
          (norm): LayerNorm((96,), eps=1e-06, elementwise_affine