In [None]:
!nvidia-smi

Tue Aug  9 20:00:17 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
from google.colab import drive
drive.mount('/content/drive')
!pwd 
%cd drive/MyDrive/models/dino-main/

Mounted at /content/drive
/content
/content/drive/MyDrive/models/dino-main


In [None]:
!nvidia-smi
!pip install timm==0.4.9
!pip install yacs
!pip install -U PyYAML
!pip install einops

Tue Aug  9 21:47:11 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P0    46W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Copyright (c) Facebook, Inc. and its affiliates.
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import sys
import datetime
import time
import math
import json
from pathlib import Path
from dataset import SingleData
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.distributed as dist
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torchvision import models as torchvision_models

import utils
import vision_transformer as vits
from vision_transformer import DINOHead

torchvision_archs = sorted(name for name in torchvision_models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(torchvision_models.__dict__[name]))

def get_args_parser():
    parser = argparse.ArgumentParser('DINO', add_help=False)

    # Model parameters
    parser.add_argument('--arch', default='vit_base', type=str,
        choices=['vit_tiny', 'vit_small', 'vit_base', 'xcit', 'deit_tiny', 'deit_small'] \
                + torchvision_archs + torch.hub.list("facebookresearch/xcit:main"),
        help="""Name of architecture to train. For quick experiments with ViTs,
        we recommend using vit_tiny or vit_small.""")
    parser.add_argument('--patch_size', default=16, type=int, help="""Size in pixels
        of input square patches - default 16 (for 16x16 patches). Using smaller
        values leads to better performance but requires more memory. Applies only
        for ViTs (vit_tiny, vit_small and vit_base). If <16, we recommend disabling
        mixed precision training (--use_fp16 false) to avoid unstabilities.""")
    parser.add_argument('--out_dim', default=512, type=int, help="""Dimensionality of
        the DINO head output. For complex and large datasets large values (like 65k) work well.""")
    parser.add_argument('--norm_last_layer', default=True, type=utils.bool_flag,
        help="""Whether or not to weight normalize the last layer of the DINO head.
        Not normalizing leads to better performance but can make the training unstable.
        In our experiments, we typically set this paramater to False with vit_small and True with vit_base.""")
    parser.add_argument('--momentum_teacher', default=0.996, type=float, help="""Base EMA
        parameter for teacher update. The value is increased to 1 during training with cosine schedule.
        We recommend setting a higher value with small batches: for example use 0.9995 with batch size of 256.""")
    parser.add_argument('--use_bn_in_head', default=False, type=utils.bool_flag,
        help="Whether to use batch normalizations in projection head (Default: False)")

    # Temperature teacher parameters
    parser.add_argument('--warmup_teacher_temp', default=0.04, type=float,
        help="""Initial value for the teacher temperature: 0.04 works well in most cases.
        Try decreasing it if the training loss does not decrease.""")
    parser.add_argument('--teacher_temp', default=0.04, type=float, help="""Final value (after linear warmup)
        of the teacher temperature. For most experiments, anything above 0.07 is unstable. We recommend
        starting with the default value of 0.04 and increase this slightly if needed.""")
    parser.add_argument('--warmup_teacher_temp_epochs', default=0, type=int,
        help='Number of warmup epochs for the teacher temperature (Default: 30).')

    # Training/Optimization parameters
    parser.add_argument('--use_fp16', type=utils.bool_flag, default=True, help="""Whether or not
        to use half precision for training. Improves training time and memory requirements,
        but can provoke instability and slight decay of performance. We recommend disabling
        mixed precision if the loss is unstable, if reducing the patch size or if training with bigger ViTs.""")
    parser.add_argument('--weight_decay', type=float, default=0.04, help="""Initial value of the
        weight decay. With ViT, a smaller value at the beginning of training works well.""")
    parser.add_argument('--weight_decay_end', type=float, default=0.4, help="""Final value of the
        weight decay. We use a cosine schedule for WD and using a larger decay by
        the end of training improves performance for ViTs.""")
    parser.add_argument('--clip_grad', type=float, default=3.0, help="""Maximal parameter
        gradient norm if using gradient clipping. Clipping with norm .3 ~ 1.0 can
        help optimization for larger ViT architectures. 0 for disabling.""")
    parser.add_argument('--batch_size_per_gpu', default=32, type=int,
        help='Per-GPU batch-size : number of distinct images loaded on one GPU.')
    parser.add_argument('--epochs', default=100, type=int, help='Number of epochs of training.')
    parser.add_argument('--freeze_last_layer', default=1, type=int, help="""Number of epochs
        during which we keep the output layer fixed. Typically doing so during
        the first epoch helps training. Try increasing this value if the loss does not decrease.""")
    parser.add_argument("--lr", default=0.0005, type=float, help="""Learning rate at the end of
        linear warmup (highest LR used during training). The learning rate is linearly scaled
        with the batch size, and specified here for a reference batch size of 256.""")
    parser.add_argument("--warmup_epochs", default=10, type=int,
        help="Number of epochs for the linear learning-rate warm up.")
    parser.add_argument('--min_lr', type=float, default=1e-6, help="""Target LR at the
        end of optimization. We use a cosine LR schedule with linear warmup.""")
    parser.add_argument('--optimizer', default='adamw', type=str,
        choices=['adamw', 'sgd', 'lars'], help="""Type of optimizer. We recommend using adamw with ViTs.""")
    parser.add_argument('--drop_path_rate', type=float, default=0.1, help="stochastic depth rate")

    # Multi-crop parameters
    parser.add_argument('--global_crops_scale', type=float, nargs='+', default=(0.4, 1.),
        help="""Scale range of the cropped image before resizing, relatively to the origin image.
        Used for large global view cropping. When disabling multi-crop (--local_crops_number 0), we
        recommand using a wider range of scale ("--global_crops_scale 0.14 1." for example)""")
    parser.add_argument('--local_crops_number', type=int, default=8, help="""Number of small
        local views to generate. Set this parameter to 0 to disable multi-crop training.
        When disabling multi-crop we recommend to use "--global_crops_scale 0.14 1." """)
    parser.add_argument('--local_crops_scale', type=float, nargs='+', default=(0.05, 0.4),
        help="""Scale range of the cropped image before resizing, relatively to the origin image.
        Used for small local view cropping of multi-crop.""")

    # Misc
    parser.add_argument('--data_path', default='/content/drive/MyDrive/datasets/CRC/train/', type=str,
        help='Please specify path to the ImageNet training data.')
    parser.add_argument('--output_dir', default=".", type=str, help='Path to save logs and checkpoints.')
    parser.add_argument('--saveckp_freq', default=20, type=int, help='Save checkpoint every x epochs.')
    parser.add_argument('--seed', default=0, type=int, help='Random seed.')
    parser.add_argument('--num_workers', default=10, type=int, help='Number of data loading workers per GPU.')
    parser.add_argument("--dist_url", default="env://", type=str, help="""url used to set up
        distributed training; see https://pytorch.org/docs/stable/distributed.html""")
    parser.add_argument("--local_rank", default=0, type=int, help="Please ignore and do not set this argument.")
    return parser


def train_dino(args):
    utils.init_distributed_mode(args)
    utils.fix_random_seeds(args.seed)
    print("git:\n  {}\n".format(utils.get_sha()))
    print("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))
    cudnn.benchmark = True

    # ============ preparing data ... ============
    transform = DataAugmentationDINO(
        args.global_crops_scale,
        args.local_crops_scale,
        args.local_crops_number,
    )
    dataset = datasets.ImageFolder(args.data_path, transform=transform)
    # class_name, train_img_list = get_data_list2('/content/drive/MyDrive/dataSets/CRC/train/',0.0)
    # class_name, train_img_list = get_data_list('D:\\original_images_5\\test-patches\\trainset\\SPQ\\final\\train\\final\\final',0.0)
    # dataset=SingleData(class_name, train_img_list, transform=transforms.ToTensor())
    sampler = torch.utils.data.DistributedSampler(dataset, shuffle=True)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        batch_size=args.batch_size_per_gpu,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    print(f"Data loaded: there are {len(dataset)} images.")
    return data_loader
    # ============ building student and teacher networks ... ============
    # we changed the name DeiT-S for ViT-S to avoid confusions
    args.arch = args.arch.replace("deit", "vit")
    # if the network is a Vision Transformer (i.e. vit_tiny, vit_small, vit_base)
    if args.arch in vits.__dict__.keys():
        student = vits.__dict__[args.arch](
            patch_size=args.patch_size,
            drop_path_rate=args.drop_path_rate,  # stochastic depth
        )
        teacher = vits.__dict__[args.arch](patch_size=args.patch_size)
        # student.load_state_dict(torch.load('/content/drive/MyDrive/models/dino-main/clipVitB16.pth'))
        # teacher.load_state_dict(torch.load('/content/drive/MyDrive/models/dino-main/clipVitB16.pth'))
        student.load_state_dict(torch.load('/content/drive/MyDrive/models/dino-main/clipVitB16.pth'))
        teacher.load_state_dict(torch.load('/content/drive/MyDrive/models/dino-main/clipVitB16.pth'))

        embed_dim = student.embed_dim
    # if the network is a XCiT
    elif args.arch in torch.hub.list("facebookresearch/xcit:main"):
        student = torch.hub.load('facebookresearch/xcit:main', args.arch,
                                 pretrained=False, drop_path_rate=args.drop_path_rate)
        teacher = torch.hub.load('facebookresearch/xcit:main', args.arch, pretrained=False)
        embed_dim = student.embed_dim
    # otherwise, we check if the architecture is in torchvision models
    elif args.arch in torchvision_models.__dict__.keys():
        student = torchvision_models.__dict__[args.arch]()
        teacher = torchvision_models.__dict__[args.arch]()
        embed_dim = student.fc.weight.shape[1]
        # embed_dim = student.state_dict()['encoder.layers.encoder_layer_11.mlp.3.weight'].shape[1]
        # embed_dim=1000
    else:
        print(f"Unknow architecture: {args.arch}")

    # multi-crop wrapper handles forward with inputs of different resolutions
    student = utils.MultiCropWrapper(student, DINOHead(
        embed_dim,
        args.out_dim,
        use_bn=args.use_bn_in_head,
        norm_last_layer=args.norm_last_layer,
    ))
    teacher = utils.MultiCropWrapper(
        teacher,
        DINOHead(embed_dim, args.out_dim, args.use_bn_in_head),
    )
    # move networks to gpu
    student, teacher = student.cuda(), teacher.cuda()
    # synchronize batch norms (if any)
    if utils.has_batchnorms(student):
        student = nn.SyncBatchNorm.convert_sync_batchnorm(student)
        teacher = nn.SyncBatchNorm.convert_sync_batchnorm(teacher)

        # we need DDP wrapper to have synchro batch norms working...
        teacher = nn.parallel.DistributedDataParallel(teacher, device_ids=[args.gpu])
        teacher_without_ddp = teacher.module
    else:
        # teacher_without_ddp and teacher are the same thing
        teacher_without_ddp = teacher
    student = nn.parallel.DistributedDataParallel(student, device_ids=[args.gpu])
    # teacher and student start with the same weights
    teacher_without_ddp.load_state_dict(student.module.state_dict())
    # there is no backpropagation through the teacher, so no need for gradients
    for p in teacher.parameters():
        p.requires_grad = False
    print(f"Student and Teacher are built: they are both {args.arch} network.")

    # ============ preparing loss ... ============
    dino_loss = DINOLoss(
        args.out_dim,
        args.local_crops_number + 2,  # total number of crops = 2 global crops + local_crops_number
        args.warmup_teacher_temp,
        args.teacher_temp,
        args.warmup_teacher_temp_epochs,
        args.epochs,
    ).cuda()

    # ============ preparing optimizer ... ============
    params_groups = utils.get_params_groups(student)
    if args.optimizer == "adamw":
        optimizer = torch.optim.AdamW(params_groups)  # to use with ViTs
    elif args.optimizer == "sgd":
        optimizer = torch.optim.SGD(params_groups, lr=0, momentum=0.9)  # lr is set by scheduler
    elif args.optimizer == "lars":
        optimizer = utils.LARS(params_groups)  # to use with convnet and large batches
    # for mixed precision training
    fp16_scaler = None
    if args.use_fp16:
        fp16_scaler = torch.cuda.amp.GradScaler()

    # ============ init schedulers ... ============
    lr_schedule = utils.cosine_scheduler(
        args.lr * (args.batch_size_per_gpu * utils.get_world_size()) / 256.,  # linear scaling rule
        args.min_lr,
        args.epochs, len(data_loader),
        warmup_epochs=args.warmup_epochs,
    )
    wd_schedule = utils.cosine_scheduler(
        args.weight_decay,
        args.weight_decay_end,
        args.epochs, len(data_loader),
    )
    # momentum parameter is increased to 1. during training with a cosine schedule
    momentum_schedule = utils.cosine_scheduler(args.momentum_teacher, 1,
                                               args.epochs, len(data_loader))
    print(f"Loss, optimizer and schedulers ready.")

    # ============ optionally resume training ... ============
    to_restore = {"epoch": 0}
    utils.restart_from_checkpoint(
        os.path.join(args.output_dir, "checkpoint.pth"),
        run_variables=to_restore,
        student=student,
        teacher=teacher,
        optimizer=optimizer,
        fp16_scaler=fp16_scaler,
        dino_loss=dino_loss,
    )
    start_epoch = to_restore["epoch"]

    start_time = time.time()
    print("Starting DINO training !")
    for epoch in range(start_epoch, args.epochs):
        data_loader.sampler.set_epoch(epoch)

        # ============ training one epoch of DINO ... ============
        train_stats = train_one_epoch(student, teacher, teacher_without_ddp, dino_loss,
            data_loader, optimizer, lr_schedule, wd_schedule, momentum_schedule,
            epoch, fp16_scaler, args)

        # ============ writing logs ... ============
        save_dict = {
            'student': student.state_dict(),
            'teacher': teacher.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch + 1,
            'args': args,
            'dino_loss': dino_loss.state_dict(),
        }
        if fp16_scaler is not None:
            save_dict['fp16_scaler'] = fp16_scaler.state_dict()
        utils.save_on_master(save_dict, os.path.join(args.output_dir, 'checkpoint.pth'))
        if args.saveckp_freq and epoch % args.saveckp_freq == 0:
            utils.save_on_master(save_dict, os.path.join(args.output_dir, f'checkpoint{epoch:04}.pth'))
        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch}
        if utils.is_main_process():
            with (Path(args.output_dir) / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))


def train_one_epoch(student, teacher, teacher_without_ddp, dino_loss, data_loader,
                    optimizer, lr_schedule, wd_schedule, momentum_schedule,epoch,
                    fp16_scaler, args):
    metric_logger = utils.MetricLogger(delimiter="  ")
    header = 'Epoch: [{}/{}]'.format(epoch, args.epochs)
    for it, (images, dd) in enumerate(metric_logger.log_every(data_loader, 10, header)):
        # update weight decay and learning rate according to their schedule
        print('DDD=>',dd)
        print('images=>',images)
        it = len(data_loader) * epoch + it  # global training iteration
        for i, param_group in enumerate(optimizer.param_groups):
            param_group["lr"] = lr_schedule[it]
            if i == 0:  # only the first group is regularized
                param_group["weight_decay"] = wd_schedule[it]

        # move images to gpu
        images = [im.cuda(non_blocking=True) for im in images]
        # teacher and student forward passes + compute dino loss
        with torch.cuda.amp.autocast(fp16_scaler is not None):
            teacher_output = teacher(images[:2])  # only the 2 global views pass through the teacher
            student_output = student(images)
            loss = dino_loss(student_output, teacher_output, epoch)

        if not math.isfinite(loss.item()):
            print("Loss is {}, stopping training".format(loss.item()), force=True)
            sys.exit(1)

        # student update
        optimizer.zero_grad()
        param_norms = None
        if fp16_scaler is None:
            loss.backward()
            if args.clip_grad:
                param_norms = utils.clip_gradients(student, args.clip_grad)
            utils.cancel_gradients_last_layer(epoch, student,
                                              args.freeze_last_layer)
            optimizer.step()
        else:
            fp16_scaler.scale(loss).backward()
            if args.clip_grad:
                fp16_scaler.unscale_(optimizer)  # unscale the gradients of optimizer's assigned params in-place
                param_norms = utils.clip_gradients(student, args.clip_grad)
            utils.cancel_gradients_last_layer(epoch, student,
                                              args.freeze_last_layer)
            fp16_scaler.step(optimizer)
            fp16_scaler.update()

        # EMA update for the teacher
        with torch.no_grad():
            m = momentum_schedule[it]  # momentum parameter
            for param_q, param_k in zip(student.module.parameters(), teacher_without_ddp.parameters()):
                param_k.data.mul_(m).add_((1 - m) * param_q.detach().data)

        # logging
        torch.cuda.synchronize()
        metric_logger.update(loss=loss.item())
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        metric_logger.update(wd=optimizer.param_groups[0]["weight_decay"])
    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}


class DINOLoss(nn.Module):
    def __init__(self, out_dim, ncrops, warmup_teacher_temp, teacher_temp,
                 warmup_teacher_temp_epochs, nepochs, student_temp=0.1,
                 center_momentum=0.9):
        super().__init__()
        self.student_temp = student_temp
        self.center_momentum = center_momentum
        self.ncrops = ncrops
        self.register_buffer("center", torch.zeros(1, out_dim))
        # we apply a warm up for the teacher temperature because
        # a too high temperature makes the training instable at the beginning
        self.teacher_temp_schedule = np.concatenate((
            np.linspace(warmup_teacher_temp,
                        teacher_temp, warmup_teacher_temp_epochs),
            np.ones(nepochs - warmup_teacher_temp_epochs) * teacher_temp
        ))

    def forward(self, student_output, teacher_output, epoch):
        """
        Cross-entropy between softmax outputs of the teacher and student networks.
        """
        student_out = student_output / self.student_temp
        student_out = student_out.chunk(self.ncrops)

        # teacher centering and sharpening
        temp = self.teacher_temp_schedule[epoch]
        teacher_out = F.softmax((teacher_output - self.center) / temp, dim=-1)
        teacher_out = teacher_out.detach().chunk(2)

        total_loss = 0
        n_loss_terms = 0
        for iq, q in enumerate(teacher_out):
            for v in range(len(student_out)):
                if v == iq:
                    # we skip cases where student and teacher operate on the same view
                    continue
                loss = torch.sum(-q * F.log_softmax(student_out[v], dim=-1), dim=-1)
                total_loss += loss.mean()
                n_loss_terms += 1
        total_loss /= n_loss_terms
        self.update_center(teacher_output)
        return total_loss

    @torch.no_grad()
    def update_center(self, teacher_output):
        """
        Update center used for teacher output.
        """
        batch_center = torch.sum(teacher_output, dim=0, keepdim=True)
        dist.all_reduce(batch_center)
        batch_center = batch_center / (len(teacher_output) * dist.get_world_size())

        # ema update
        self.center = self.center * self.center_momentum + batch_center * (1 - self.center_momentum)


class DataAugmentationDINO(object):
    def __init__(self, global_crops_scale, local_crops_scale, local_crops_number):
        flip_and_color_jitter = transforms.Compose([
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomApply(
                [transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)],
                p=0.8
            ),
            transforms.RandomGrayscale(p=0.2),
        ])
        normalize = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ])

        # first global crop
        self.global_transfo1 = transforms.Compose([
            transforms.RandomResizedCrop(224, scale=global_crops_scale, interpolation=Image.BICUBIC),
            flip_and_color_jitter,
            utils.GaussianBlur(1.0),
            normalize,
        ])
        # second global crop
        self.global_transfo2 = transforms.Compose([
            transforms.RandomResizedCrop(224, scale=global_crops_scale, interpolation=Image.BICUBIC),
            flip_and_color_jitter,
            utils.GaussianBlur(0.1),
            utils.Solarization(0.2),
            normalize,
        ])
        # transformation for the local small crops
        self.local_crops_number = local_crops_number
        self.local_transfo = transforms.Compose([
            transforms.RandomResizedCrop(96, scale=local_crops_scale, interpolation=Image.BICUBIC),
            flip_and_color_jitter,
            utils.GaussianBlur(p=0.5),
            normalize,
        ])

    def __call__(self, image):
        crops = []
        crops.append(self.global_transfo1(image))
        crops.append(self.global_transfo2(image))
        for _ in range(self.local_crops_number):
            crops.append(self.local_transfo(image))
        return crops


if __name__ == '__main__':
    parser = argparse.ArgumentParser('DINO', parents=[get_args_parser()])
    args = parser.parse_args("")
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)
    data_loader=train_dino(args)

Will run the code on one GPU.
| distributed init (rank 0): env://


Using cache found in /root/.cache/torch/hub/facebookresearch_xcit_main


git:
  sha: N/A, status: clean, branch: N/A

arch: vit_base
batch_size_per_gpu: 32
clip_grad: 3.0
data_path: /content/drive/MyDrive/datasets/CRC/train/
dist_url: env://
drop_path_rate: 0.1
epochs: 100
freeze_last_layer: 1
global_crops_scale: (0.4, 1.0)
gpu: 0
local_crops_number: 8
local_crops_scale: (0.05, 0.4)
local_rank: 0
lr: 0.0005
min_lr: 1e-06
momentum_teacher: 0.996
norm_last_layer: True
num_workers: 10
optimizer: adamw
out_dim: 512
output_dir: .
patch_size: 16
rank: 0
saveckp_freq: 20
seed: 0
teacher_temp: 0.04
use_bn_in_head: False
use_fp16: True
warmup_epochs: 10
warmup_teacher_temp: 0.04
warmup_teacher_temp_epochs: 0
weight_decay: 0.04
weight_decay_end: 0.4
world_size: 1


  "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. "


Data loaded: there are 42885 images.


  cpuset_checked))


In [None]:

for i, (images, labels) in enumerate(data_loader, 0):
      sample_fname, _ = data_loader.dataset.samples[i]
      print(images)
      print(labels)
      print(sample_fname)
      break

In [None]:
data_loader.batch_sampler.sampler.dataset.imgs

In [None]:
import torch
import torch.nn as nn
query_features=torch.load('/content/drive/MyDrive/models/dino-main/train_features.pt',map_location=torch.device('cpu'))
test_img_list=torch.load('/content/drive/MyDrive/models/dino-main/img_names.pt',map_location=torch.device('cpu'))
query_features = nn.functional.normalize(query_features, dim=1, p=2)

# query_features
sim = torch.mm(query_features, query_features.T)


for i in range(len(test_img_list)):
        print(i)
        for j in range(len(test_img_list)):
            index1=test_img_list[i].rfind('/')
            index2=test_img_list[j].rfind('/')
            if test_img_list[i][index1+4:index1+7] == test_img_list[j][index2+4:index2+7] and i!=j and test_img_list[i][index1+1:index1+3] == test_img_list[j][index2+1:index2+3] :
                sim[i][j]=0


# for i in range(300):
#     sim[i][i]=-100.0





# simNumpy=sim.cpu().numpy()
# ranks = torch.argsort(-sim, dim=0).cpu().numpy()

In [None]:
query_features.shape

torch.Size([42885, 768])

In [None]:
sim = torch.mm(query_features, query_features.T)


In [None]:

# simNumpy=sim.cpu().numpy()
ranks = torch.argsort(-sim, dim=0).cpu().numpy()

In [None]:
sim[:,0]

tensor([ 1.0000,  0.6289,  0.3779,  ...,  0.0679,  0.1584, -0.0128])

In [None]:
len(sim)

42885

In [None]:
torch.argsort(-sim[:,0], dim=0).cpu().numpy()[:6]

array([    0, 17545, 14905, 19109, 21898, 21533])

In [None]:
KNN=[]
for i in range(len(sim)):
  print(i)
  KNN.append(torch.argsort(-sim[:,i], dim=0).cpu().numpy()[:6])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
37885
37886
37887
37888
37889
37890
37891
37892
37893
37894
37895
37896
37897
37898
37899
37900
37901
37902
37903
37904
37905
37906
37907
37908
37909
37910
37911
37912
37913
37914
37915
37916
37917
37918
37919
37920
37921
37922
37923
37924
37925
37926
37927
37928
37929
37930
37931
37932
37933
37934
37935
37936
37937
37938
37939
37940
37941
37942
37943
37944
37945
37946
37947
37948
37949
37950
37951
37952
37953
37954
37955
37956
37957
37958
37959
37960
37961
37962
37963
37964
37965
37966
37967
37968
37969
37970
37971
37972
37973
37974
37975
37976
37977
37978
37979
37980
37981
37982
37983
37984
37985
37986
37987
37988
37989
37990
37991
37992
37993
37994
37995
37996
37997
37998
37999
38000
38001
38002
38003
38004
38005
38006
38007
38008
38009
38010
38011
38012
38013
38014
38015
38016
38017
38018
38019
38020
38021
38022
38023
38024
38025
38026
38027
38028
38029
38030
38031
38032
38033
38034
38035
38036
38037
38038
38039
38040

In [None]:
KNN

[array([    0, 17545, 14905, 19109, 21898, 21533]),
 array([   1, 3300, 7819, 3286, 3258, 3256]),
 array([   2, 3451, 2456, 2386, 2387, 7819]),
 array([   3, 7774, 7856, 7819, 3475, 3285]),
 array([   4, 7701, 1310, 4467, 2336, 7774]),
 array([    5,  1200,  9472,  6829,  1201, 13001]),
 array([   6, 2111, 4509, 2115, 3237, 2138]),
 array([    7, 21884, 25910,  8346, 13896, 25101]),
 array([   8, 1650,  522, 7911, 7309,  296]),
 array([   9,  481,  468,  582,  494, 1748]),
 array([   10,  3138, 10974,  7939,  5127,  8039]),
 array([   11,  8108, 11561, 11552,  8092,  4384]),
 array([   12, 11561,  4366,  4287, 11562,   780]),
 array([   13, 11561,  8092, 11552, 11906, 11913]),
 array([   14, 11941, 11563,  8048, 11914, 11626]),
 array([  15,  667, 7960, 1798,  468, 7267]),
 array([  16, 2459, 2385, 2452, 3450, 2405]),
 array([   17,  6370,  3759, 23985,  3746, 10457]),
 array([  18,  506, 7283,  468, 3464, 7974]),
 array([  19,  494,  296,  483, 7274, 7281]),
 array([   20, 10575, 1048

In [None]:
from numpy import save
save('/content/drive/MyDrive/models/dino-main/KNN.npy', KNN)

In [None]:
from numpy import load
data=load('/content/drive/MyDrive/models/dino-main/KNN.npy')

In [None]:
import argparse
import os
import sys
import datetime
import time
import math
import json
from pathlib import Path
from dataset import SingleData
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.distributed as dist
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torchvision import models as torchvision_models

import utils
import vision_transformer as vits
from vision_transformer import DINOHead


dataset = datasets.ImageFolder(args.data_path, transform=transform)
sampler = torch.utils.data.DistributedSampler(dataset, shuffle=True)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        batch_size=args.batch_size_per_gpu,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True,
    )

IndentationError: ignored

In [None]:
# Copyright (c) Facebook, Inc. and its affiliates.
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import sys
import datetime
import time
import math
import json
from pathlib import Path
from dataset import SingleData
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.distributed as dist
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torchvision import models as torchvision_models



import torch
from torch import nn
import torch.distributed as dist
import torch.backends.cudnn as cudnn
from torchvision import models as torchvision_models
from torchvision import transforms as pth_transforms
from PIL import Image, ImageFile
import numpy as np
from dataset import SingleData
import utils
from utils2 import  LabelSmoothingCrossEntropyLoss, BatchHardTripletLoss, ImageReader, MPerClassSampler
from torchvision import datasets, transforms
import vision_transformer as vits
from eval_knn import extract_features
import math




import utils
import vision_transformer as vits
from vision_transformer import DINOHead

torchvision_archs = sorted(name for name in torchvision_models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(torchvision_models.__dict__[name]))





class OxfordParisDataset(torch.utils.data.Dataset):
    def __init__(self, class_name, img_list, transform=None,imsize=None):

      self.img_list = img_list
      self.class_name = class_name
      print(self.class_name)
      self.label_list = []
      for i in range(len(img_list)):
          pre_fix = self.img_list[i].split('/')[-1].split('_')[0]
          self.label_list.append(self.class_name.index(pre_fix))
      self.label_list = np.array(self.label_list)
      self.transform = transform


      self.samples = self.img_list
      self.transform = transform
      self.imsize = imsize

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        # path = os.path.join(self.cfg["dir_images"], self.samples[index] + ".jpg")
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        # with open(path, 'rb') as f:
        img = Image.open(self.img_list[index])
        img = img.convert('RGB')
        if self.imsize is not None:
            img.thumbnail((self.imsize, self.imsize), Image.ANTIALIAS)
        if self.transform is not None:
            img = self.transform(img)
        return img, index













def get_args_parser():
    parser = argparse.ArgumentParser('DINO', add_help=False)

    # Model parameters
    parser.add_argument('--arch', default='vit_base', type=str,
        choices=['vit_tiny', 'vit_small', 'vit_base', 'xcit', 'deit_tiny', 'deit_small'] \
                + torchvision_archs + torch.hub.list("facebookresearch/xcit:main"),
        help="""Name of architecture to train. For quick experiments with ViTs,
        we recommend using vit_tiny or vit_small.""")
    parser.add_argument('--patch_size', default=16, type=int, help="""Size in pixels
        of input square patches - default 16 (for 16x16 patches). Using smaller
        values leads to better performance but requires more memory. Applies only
        for ViTs (vit_tiny, vit_small and vit_base). If <16, we recommend disabling
        mixed precision training (--use_fp16 false) to avoid unstabilities.""")
    parser.add_argument('--out_dim', default=512, type=int, help="""Dimensionality of
        the DINO head output. For complex and large datasets large values (like 65k) work well.""")
    parser.add_argument('--norm_last_layer', default=True, type=utils.bool_flag,
        help="""Whether or not to weight normalize the last layer of the DINO head.
        Not normalizing leads to better performance but can make the training unstable.
        In our experiments, we typically set this paramater to False with vit_small and True with vit_base.""")
    parser.add_argument('--momentum_teacher', default=0.996, type=float, help="""Base EMA
        parameter for teacher update. The value is increased to 1 during training with cosine schedule.
        We recommend setting a higher value with small batches: for example use 0.9995 with batch size of 256.""")
    parser.add_argument('--use_bn_in_head', default=False, type=utils.bool_flag,
        help="Whether to use batch normalizations in projection head (Default: False)")

    # Temperature teacher parameters
    parser.add_argument('--warmup_teacher_temp', default=0.04, type=float,
        help="""Initial value for the teacher temperature: 0.04 works well in most cases.
        Try decreasing it if the training loss does not decrease.""")
    parser.add_argument('--teacher_temp', default=0.04, type=float, help="""Final value (after linear warmup)
        of the teacher temperature. For most experiments, anything above 0.07 is unstable. We recommend
        starting with the default value of 0.04 and increase this slightly if needed.""")
    parser.add_argument('--warmup_teacher_temp_epochs', default=0, type=int,
        help='Number of warmup epochs for the teacher temperature (Default: 30).')

    # Training/Optimization parameters
    parser.add_argument('--use_fp16', type=utils.bool_flag, default=True, help="""Whether or not
        to use half precision for training. Improves training time and memory requirements,
        but can provoke instability and slight decay of performance. We recommend disabling
        mixed precision if the loss is unstable, if reducing the patch size or if training with bigger ViTs.""")
    parser.add_argument('--weight_decay', type=float, default=0.04, help="""Initial value of the
        weight decay. With ViT, a smaller value at the beginning of training works well.""")
    parser.add_argument('--weight_decay_end', type=float, default=0.4, help="""Final value of the
        weight decay. We use a cosine schedule for WD and using a larger decay by
        the end of training improves performance for ViTs.""")
    parser.add_argument('--clip_grad', type=float, default=3.0, help="""Maximal parameter
        gradient norm if using gradient clipping. Clipping with norm .3 ~ 1.0 can
        help optimization for larger ViT architectures. 0 for disabling.""")
    parser.add_argument('--batch_size_per_gpu', default=32, type=int,
        help='Per-GPU batch-size : number of distinct images loaded on one GPU.')
    parser.add_argument('--epochs', default=100, type=int, help='Number of epochs of training.')
    parser.add_argument('--freeze_last_layer', default=1, type=int, help="""Number of epochs
        during which we keep the output layer fixed. Typically doing so during
        the first epoch helps training. Try increasing this value if the loss does not decrease.""")
    parser.add_argument("--lr", default=0.0005, type=float, help="""Learning rate at the end of
        linear warmup (highest LR used during training). The learning rate is linearly scaled
        with the batch size, and specified here for a reference batch size of 256.""")
    parser.add_argument("--warmup_epochs", default=10, type=int,
        help="Number of epochs for the linear learning-rate warm up.")
    parser.add_argument('--min_lr', type=float, default=1e-6, help="""Target LR at the
        end of optimization. We use a cosine LR schedule with linear warmup.""")
    parser.add_argument('--optimizer', default='adamw', type=str,
        choices=['adamw', 'sgd', 'lars'], help="""Type of optimizer. We recommend using adamw with ViTs.""")
    parser.add_argument('--drop_path_rate', type=float, default=0.1, help="stochastic depth rate")

    # Multi-crop parameters
    parser.add_argument('--global_crops_scale', type=float, nargs='+', default=(0.4, 1.),
        help="""Scale range of the cropped image before resizing, relatively to the origin image.
        Used for large global view cropping. When disabling multi-crop (--local_crops_number 0), we
        recommand using a wider range of scale ("--global_crops_scale 0.14 1." for example)""")
    parser.add_argument('--local_crops_number', type=int, default=8, help="""Number of small
        local views to generate. Set this parameter to 0 to disable multi-crop training.
        When disabling multi-crop we recommend to use "--global_crops_scale 0.14 1." """)
    parser.add_argument('--local_crops_scale', type=float, nargs='+', default=(0.05, 0.4),
        help="""Scale range of the cropped image before resizing, relatively to the origin image.
        Used for small local view cropping of multi-crop.""")

    # Misc
    parser.add_argument('--data_path', default='/content/drive/MyDrive/datasets/CRC/train/', type=str,
        help='Please specify path to the ImageNet training data.')
    parser.add_argument('--output_dir', default=".", type=str, help='Path to save logs and checkpoints.')
    parser.add_argument('--saveckp_freq', default=20, type=int, help='Save checkpoint every x epochs.')
    parser.add_argument('--seed', default=0, type=int, help='Random seed.')
    parser.add_argument('--num_workers', default=10, type=int, help='Number of data loading workers per GPU.')
    parser.add_argument("--dist_url", default="env://", type=str, help="""url used to set up
        distributed training; see https://pytorch.org/docs/stable/distributed.html""")
    parser.add_argument("--local_rank", default=0, type=int, help="Please ignore and do not set this argument.")
    return parser


def get_data_list(data_path, ratio=0.0):
    img_list = []
    for root, dirs, files in os.walk(data_path):
        if files == []:
            class_name = dirs
        elif dirs == []:
            for f in files:
                img_path = os.path.join(root, f)
                img_list.append(img_path)

    np.random.seed(1)
    train_img_list =img_list
    return class_name, train_img_list

def train_dino(args):
    utils.init_distributed_mode(args)
    utils.fix_random_seeds(args.seed)
    print("git:\n  {}\n".format(utils.get_sha()))
    print("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))
    cudnn.benchmark = True

    # ============ preparing data ... ============
    transform = DataAugmentationDINO(
        args.global_crops_scale,
        args.local_crops_scale,
        args.local_crops_number,
    )
    dataset = datasets.ImageFolder(args.data_path, transform=transform)
    # data_path="/content/drive/MyDrive/datasets/CRC/train"
    # class_name, train_img_list = get_data_list(data_path,0.0)
    # dataset=OxfordParisDataset(class_name, train_img_list, transform=transform)


    # class_name, train_img_list = get_data_list2('/content/drive/MyDrive/dataSets/CRC/train/',0.0)
    # class_name, train_img_list = get_data_list('D:\\original_images_5\\test-patches\\trainset\\SPQ\\final\\train\\final\\final',0.0)
    # dataset=SingleData(class_name, train_img_list, transform=transforms.ToTensor())
    sampler = torch.utils.data.DistributedSampler(dataset, shuffle=True)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        batch_size=args.batch_size_per_gpu,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    print(f"Data loaded: there are {len(dataset)} images.")



    # ============ building student and teacher networks ... ============
    # we changed the name DeiT-S for ViT-S to avoid confusions
    args.arch = args.arch.replace("deit", "vit")
    # if the network is a Vision Transformer (i.e. vit_tiny, vit_small, vit_base)
    if args.arch in vits.__dict__.keys():
        student = vits.__dict__[args.arch](
            patch_size=args.patch_size,
            drop_path_rate=args.drop_path_rate,  # stochastic depth
        )
        teacher = vits.__dict__[args.arch](patch_size=args.patch_size)
        # student.load_state_dict(torch.load('/content/drive/MyDrive/models/dino-main/clipVitB16.pth'))
        # teacher.load_state_dict(torch.load('/content/drive/MyDrive/models/dino-main/clipVitB16.pth'))
        student.load_state_dict(torch.load('/content/drive/MyDrive/models/dino-main/clipVitB16.pth'))
        teacher.load_state_dict(torch.load('/content/drive/MyDrive/models/dino-main/clipVitB16.pth'))

        embed_dim = student.embed_dim
    # if the network is a XCiT
    elif args.arch in torch.hub.list("facebookresearch/xcit:main"):
        student = torch.hub.load('facebookresearch/xcit:main', args.arch,
                                 pretrained=False, drop_path_rate=args.drop_path_rate)
        teacher = torch.hub.load('facebookresearch/xcit:main', args.arch, pretrained=False)
        embed_dim = student.embed_dim
    # otherwise, we check if the architecture is in torchvision models
    elif args.arch in torchvision_models.__dict__.keys():
        student = torchvision_models.__dict__[args.arch]()
        teacher = torchvision_models.__dict__[args.arch]()
        embed_dim = student.fc.weight.shape[1]
        # embed_dim = student.state_dict()['encoder.layers.encoder_layer_11.mlp.3.weight'].shape[1]
        # embed_dim=1000
    else:
        print(f"Unknow architecture: {args.arch}")

    # multi-crop wrapper handles forward with inputs of different resolutions
    student = utils.MultiCropWrapper(student, DINOHead(
        embed_dim,
        args.out_dim,
        use_bn=args.use_bn_in_head,
        norm_last_layer=args.norm_last_layer,
    ))
    teacher = utils.MultiCropWrapper(
        teacher,
        DINOHead(embed_dim, args.out_dim, args.use_bn_in_head),
    )
    # move networks to gpu
    student, teacher = student.cuda(), teacher.cuda()





    # torch.save(data_loader.dataset.img_list, 'img_names.pt')
    train_features,train_label = extract_features(teacher, data_loader, False, multiscale=False)
    query_features,query_label = extract_features(teacher, data_loader, False, multiscale=False)
    torch.save(train_features, 'train_features.pt')
    torch.save(query_features, 'query_features.pt')
    torch.save(train_label, 'train_label.pt')
    torch.save(query_label, 'query_label.pt')

    # return data_loader








    # synchronize batch norms (if any)
    if utils.has_batchnorms(student):
        student = nn.SyncBatchNorm.convert_sync_batchnorm(student)
        teacher = nn.SyncBatchNorm.convert_sync_batchnorm(teacher)

        # we need DDP wrapper to have synchro batch norms working...
        teacher = nn.parallel.DistributedDataParallel(teacher, device_ids=[args.gpu])
        teacher_without_ddp = teacher.module
    else:
        # teacher_without_ddp and teacher are the same thing
        teacher_without_ddp = teacher
    student = nn.parallel.DistributedDataParallel(student, device_ids=[args.gpu])
    # teacher and student start with the same weights
    teacher_without_ddp.load_state_dict(student.module.state_dict())
    # there is no backpropagation through the teacher, so no need for gradients
    for p in teacher.parameters():
        p.requires_grad = False
    print(f"Student and Teacher are built: they are both {args.arch} network.")

    # ============ preparing loss ... ============
    dino_loss = DINOLoss(
        args.out_dim,
        args.local_crops_number + 2,  # total number of crops = 2 global crops + local_crops_number
        args.warmup_teacher_temp,
        args.teacher_temp,
        args.warmup_teacher_temp_epochs,
        args.epochs,
    ).cuda()

    # ============ preparing optimizer ... ============
    params_groups = utils.get_params_groups(student)
    if args.optimizer == "adamw":
        optimizer = torch.optim.AdamW(params_groups)  # to use with ViTs
    elif args.optimizer == "sgd":
        optimizer = torch.optim.SGD(params_groups, lr=0, momentum=0.9)  # lr is set by scheduler
    elif args.optimizer == "lars":
        optimizer = utils.LARS(params_groups)  # to use with convnet and large batches
    # for mixed precision training
    fp16_scaler = None
    if args.use_fp16:
        fp16_scaler = torch.cuda.amp.GradScaler()

    # ============ init schedulers ... ============
    lr_schedule = utils.cosine_scheduler(
        args.lr * (args.batch_size_per_gpu * utils.get_world_size()) / 256.,  # linear scaling rule
        args.min_lr,
        args.epochs, len(data_loader),
        warmup_epochs=args.warmup_epochs,
    )
    wd_schedule = utils.cosine_scheduler(
        args.weight_decay,
        args.weight_decay_end,
        args.epochs, len(data_loader),
    )
    # momentum parameter is increased to 1. during training with a cosine schedule
    momentum_schedule = utils.cosine_scheduler(args.momentum_teacher, 1,
                                               args.epochs, len(data_loader))
    print(f"Loss, optimizer and schedulers ready.")

    # ============ optionally resume training ... ============
    to_restore = {"epoch": 0}
    utils.restart_from_checkpoint(
        os.path.join(args.output_dir, "checkpoint.pth"),
        run_variables=to_restore,
        student=student,
        teacher=teacher,
        optimizer=optimizer,
        fp16_scaler=fp16_scaler,
        dino_loss=dino_loss,
    )
    start_epoch = to_restore["epoch"]

    start_time = time.time()
    print("Starting DINO training !")
    for epoch in range(start_epoch, args.epochs):
        data_loader.sampler.set_epoch(epoch)

        # ============ training one epoch of DINO ... ============
        train_stats = train_one_epoch(student, teacher, teacher_without_ddp, dino_loss,
            data_loader, optimizer, lr_schedule, wd_schedule, momentum_schedule,
            epoch, fp16_scaler, args)

        # ============ writing logs ... ============
        save_dict = {
            'student': student.state_dict(),
            'teacher': teacher.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch + 1,
            'args': args,
            'dino_loss': dino_loss.state_dict(),
        }
        if fp16_scaler is not None:
            save_dict['fp16_scaler'] = fp16_scaler.state_dict()
        utils.save_on_master(save_dict, os.path.join(args.output_dir, 'checkpoint.pth'))
        if args.saveckp_freq and epoch % args.saveckp_freq == 0:
            utils.save_on_master(save_dict, os.path.join(args.output_dir, f'checkpoint{epoch:04}.pth'))
        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch}
        if utils.is_main_process():
            with (Path(args.output_dir) / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))


def train_one_epoch(student, teacher, teacher_without_ddp, dino_loss, data_loader,
                    optimizer, lr_schedule, wd_schedule, momentum_schedule,epoch,
                    fp16_scaler, args):
    metric_logger = utils.MetricLogger(delimiter="  ")
    header = 'Epoch: [{}/{}]'.format(epoch, args.epochs)
    for it, (images, _) in enumerate(metric_logger.log_every(data_loader, 10, header)):
        # update weight decay and learning rate according to their schedule
        it = len(data_loader) * epoch + it  # global training iteration
        for i, param_group in enumerate(optimizer.param_groups):
            param_group["lr"] = lr_schedule[it]
            if i == 0:  # only the first group is regularized
                param_group["weight_decay"] = wd_schedule[it]

        # move images to gpu
        images = [im.cuda(non_blocking=True) for im in images]
        # teacher and student forward passes + compute dino loss
        with torch.cuda.amp.autocast(fp16_scaler is not None):
            teacher_output = teacher(images[:2])  # only the 2 global views pass through the teacher
            student_output = student(images)
            loss = dino_loss(student_output, teacher_output, epoch)

        if not math.isfinite(loss.item()):
            print("Loss is {}, stopping training".format(loss.item()), force=True)
            sys.exit(1)

        # student update
        optimizer.zero_grad()
        param_norms = None
        if fp16_scaler is None:
            loss.backward()
            if args.clip_grad:
                param_norms = utils.clip_gradients(student, args.clip_grad)
            utils.cancel_gradients_last_layer(epoch, student,
                                              args.freeze_last_layer)
            optimizer.step()
        else:
            fp16_scaler.scale(loss).backward()
            if args.clip_grad:
                fp16_scaler.unscale_(optimizer)  # unscale the gradients of optimizer's assigned params in-place
                param_norms = utils.clip_gradients(student, args.clip_grad)
            utils.cancel_gradients_last_layer(epoch, student,
                                              args.freeze_last_layer)
            fp16_scaler.step(optimizer)
            fp16_scaler.update()

        # EMA update for the teacher
        with torch.no_grad():
            m = momentum_schedule[it]  # momentum parameter
            for param_q, param_k in zip(student.module.parameters(), teacher_without_ddp.parameters()):
                param_k.data.mul_(m).add_((1 - m) * param_q.detach().data)

        # logging
        torch.cuda.synchronize()
        metric_logger.update(loss=loss.item())
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        metric_logger.update(wd=optimizer.param_groups[0]["weight_decay"])
    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}


class DINOLoss(nn.Module):
    def __init__(self, out_dim, ncrops, warmup_teacher_temp, teacher_temp,
                 warmup_teacher_temp_epochs, nepochs, student_temp=0.1,
                 center_momentum=0.9):
        super().__init__()
        self.student_temp = student_temp
        self.center_momentum = center_momentum
        self.ncrops = ncrops
        self.register_buffer("center", torch.zeros(1, out_dim))
        # we apply a warm up for the teacher temperature because
        # a too high temperature makes the training instable at the beginning
        self.teacher_temp_schedule = np.concatenate((
            np.linspace(warmup_teacher_temp,
                        teacher_temp, warmup_teacher_temp_epochs),
            np.ones(nepochs - warmup_teacher_temp_epochs) * teacher_temp
        ))

    def forward(self, student_output, teacher_output, epoch):
        """
        Cross-entropy between softmax outputs of the teacher and student networks.
        """
        student_out = student_output / self.student_temp
        student_out = student_out.chunk(self.ncrops)

        # teacher centering and sharpening
        temp = self.teacher_temp_schedule[epoch]
        teacher_out = F.softmax((teacher_output - self.center) / temp, dim=-1)
        teacher_out = teacher_out.detach().chunk(2)

        total_loss = 0
        n_loss_terms = 0
        for iq, q in enumerate(teacher_out):
            for v in range(len(student_out)):
                if v == iq:
                    # we skip cases where student and teacher operate on the same view
                    continue
                loss = torch.sum(-q * F.log_softmax(student_out[v], dim=-1), dim=-1)
                total_loss += loss.mean()
                n_loss_terms += 1
        total_loss /= n_loss_terms
        self.update_center(teacher_output)
        return total_loss

    @torch.no_grad()
    def update_center(self, teacher_output):
        """
        Update center used for teacher output.
        """
        batch_center = torch.sum(teacher_output, dim=0, keepdim=True)
        dist.all_reduce(batch_center)
        batch_center = batch_center / (len(teacher_output) * dist.get_world_size())

        # ema update
        self.center = self.center * self.center_momentum + batch_center * (1 - self.center_momentum)


class DataAugmentationDINO(object):
    def __init__(self, global_crops_scale, local_crops_scale, local_crops_number):
        flip_and_color_jitter = transforms.Compose([
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomApply(
                [transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)],
                p=0.8
            ),
            transforms.RandomGrayscale(p=0.2),
        ])
        normalize = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ])

        # first global crop
        self.global_transfo1 = transforms.Compose([
            transforms.RandomResizedCrop(224, scale=global_crops_scale, interpolation=Image.BICUBIC),
            flip_and_color_jitter,
            utils.GaussianBlur(1.0),
            normalize,
        ])
        # second global crop
        self.global_transfo2 = transforms.Compose([
            transforms.RandomResizedCrop(224, scale=global_crops_scale, interpolation=Image.BICUBIC),
            flip_and_color_jitter,
            utils.GaussianBlur(0.1),
            utils.Solarization(0.2),
            normalize,
        ])
        # transformation for the local small crops
        self.local_crops_number = local_crops_number
        self.local_transfo = transforms.Compose([
            transforms.RandomResizedCrop(96, scale=local_crops_scale, interpolation=Image.BICUBIC),
            flip_and_color_jitter,
            utils.GaussianBlur(p=0.5),
            normalize,
        ])

    def __call__(self, image):
        crops = []
        crops.append(self.global_transfo1(image))
        crops.append(self.global_transfo2(image))
        for _ in range(self.local_crops_number):
            crops.append(self.local_transfo(image))
        return crops


if __name__ == '__main__':
    parser = argparse.ArgumentParser('DINO', parents=[get_args_parser()])
    args = parser.parse_args("")
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)
    dataloader=train_dino(args)

Will run the code on one GPU.
| distributed init (rank 0): env://


Using cache found in /root/.cache/torch/hub/facebookresearch_xcit_main


git:
  sha: N/A, status: clean, branch: N/A

arch: vit_base
batch_size_per_gpu: 32
clip_grad: 3.0
data_path: /content/drive/MyDrive/datasets/CRC/train/
dist_url: env://
drop_path_rate: 0.1
epochs: 100
freeze_last_layer: 1
global_crops_scale: (0.4, 1.0)
gpu: 0
local_crops_number: 8
local_crops_scale: (0.05, 0.4)
local_rank: 0
lr: 0.0005
min_lr: 1e-06
momentum_teacher: 0.996
norm_last_layer: True
num_workers: 10
optimizer: adamw
out_dim: 512
output_dir: .
patch_size: 16
rank: 0
saveckp_freq: 20
seed: 0
teacher_temp: 0.04
use_bn_in_head: False
use_fp16: True
warmup_epochs: 10
warmup_teacher_temp: 0.04
warmup_teacher_temp_epochs: 0
weight_decay: 0.04
weight_decay_end: 0.4
world_size: 1


  "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. "
  cpuset_checked))


Data loaded: there are 42885 images.
tensor([[[[ 2.1975,  2.2318,  2.2318,  ...,  0.8961,  0.9817,  1.0502],
          [ 2.1975,  2.2318,  2.2489,  ...,  0.9474,  1.0502,  1.1358],
          [ 2.2147,  2.2318,  2.2489,  ...,  1.0331,  1.1529,  1.2385],
          ...,
          [ 1.5297,  1.5297,  1.5982,  ...,  1.2728,  1.1700,  1.1358],
          [ 1.5297,  1.4954,  1.5125,  ...,  1.4098,  1.3242,  1.3070],
          [ 1.5468,  1.4954,  1.4612,  ...,  1.5468,  1.4612,  1.4269]],

         [[ 2.1660,  2.2535,  2.3235,  ...,  0.6779,  0.6954,  0.7479],
          [ 2.2010,  2.2710,  2.3410,  ...,  0.7129,  0.7479,  0.8004],
          [ 2.2360,  2.2885,  2.3410,  ...,  0.7304,  0.8004,  0.8529],
          ...,
          [ 1.5707,  1.5882,  1.6408,  ...,  1.1681,  1.0630,  1.0280],
          [ 1.5532,  1.5357,  1.5532,  ...,  1.3606,  1.2556,  1.2206],
          [ 1.5882,  1.5357,  1.5182,  ...,  1.5007,  1.3957,  1.3606]],

         [[ 2.5529,  2.5877,  2.6226,  ...,  1.4548,  1.5594,  1.

AttributeError: ignored

In [None]:
a = [[1], [2], [3]]
b = torch.FloatTensor(a)
b

tensor([[1.],
        [2.],
        [3.]])

In [None]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f2a5cd30790>

In [None]:
# Copyright (c) Facebook, Inc. and its affiliates.
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from google.colab import drive
drive.mount('/content/drive')
!pwd 
%cd drive/MyDrive/models/dino-main/
!nvidia-smi
!pip install timm==0.4.9
!pip install yacs
!pip install -U PyYAML
!pip install einops



# Copyright (c) Facebook, Inc. and its affiliates.
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import pickle
import argparse

import torch
from torch import nn
import torch.distributed as dist
import torch.backends.cudnn as cudnn
from torchvision import models as torchvision_models
from torchvision import transforms as pth_transforms
from PIL import Image, ImageFile
import numpy as np
from dataset import SingleData
import utils
from utils2 import  LabelSmoothingCrossEntropyLoss, BatchHardTripletLoss, ImageReader, MPerClassSampler
from torchvision import datasets, transforms
import vision_transformer as vits
from eval_knn import extract_features
import math


def get_data_list(data_path, ratio=0.0):
    img_list = []
    for root, dirs, files in os.walk(data_path):
        if files == []:
            class_name = dirs
        elif dirs == []:
            for f in files:
                img_path = os.path.join(root, f)
                img_list.append(img_path)

    np.random.seed(1)
    train_img_list =img_list
    return class_name, train_img_list

# class OxfordParisDataset(torch.utils.data.Dataset):
#     def __init__(self, dir_main, dataset, split, transform=None, imsize=None):
#         if dataset not in ['roxford5k', 'rparis6k']:
#             raise ValueError('Unknown dataset: {}!'.format(dataset))

#         # loading imlist, qimlist, and gnd, in cfg as a dict
#         gnd_fname = os.path.join(dir_main, dataset, 'gnd_{}.pkl'.format(dataset))
#         with open(gnd_fname, 'rb') as f:
#             cfg = pickle.load(f)
#         cfg['gnd_fname'] = gnd_fname
#         cfg['ext'] = '.jpg'
#         cfg['qext'] = '.jpg'
#         cfg['dir_data'] = os.path.join(dir_main, dataset)
#         cfg['dir_images'] = os.path.join(cfg['dir_data'], 'jpg')
#         cfg['n'] = len(cfg['imlist'])
#         cfg['nq'] = len(cfg['qimlist'])
#         cfg['im_fname'] = config_imname
#         cfg['qim_fname'] = config_qimname
#         cfg['dataset'] = dataset
#         self.cfg = cfg

#         self.samples = cfg["qimlist"] if split == "query" else cfg["imlist"]
#         self.transform = transform
#         self.imsize = imsize

#     def __len__(self):
#         return len(self.samples)

#     def __getitem__(self, index):
#         path = os.path.join(self.cfg["dir_images"], self.samples[index] + ".jpg")
#         ImageFile.LOAD_TRUNCATED_IMAGES = True
#         with open(path, 'rb') as f:
#             img = Image.open(f)
#             img = img.convert('RGB')
#         if self.imsize is not None:
#             img.thumbnail((self.imsize, self.imsize), Image.ANTIALIAS)
#         if self.transform is not None:
#             img = self.transform(img)
#         return img, index


class OxfordParisDataset(torch.utils.data.Dataset):
    def __init__(self, class_name, img_list, transform=None,imsize=None):

      self.img_list = img_list
      self.class_name = class_name
      print(self.class_name)
      self.label_list = []
      for i in range(len(img_list)):
          pre_fix = self.img_list[i].split('/')[-1].split('_')[0]
          self.label_list.append(self.class_name.index(pre_fix))
      self.label_list = np.array(self.label_list)
      self.transform = transform


      self.samples = self.img_list
      self.transform = transform
      self.imsize = imsize

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        # path = os.path.join(self.cfg["dir_images"], self.samples[index] + ".jpg")
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        # with open(path, 'rb') as f:
        img = Image.open(self.img_list[index])
        img = img.convert('RGB')
        if self.imsize is not None:
            img.thumbnail((self.imsize, self.imsize), Image.ANTIALIAS)
        if self.transform is not None:
            img = self.transform(img)
        return img, index,self.img_list[index]




def config_imname(cfg, i):
    return os.path.join(cfg['dir_images'], cfg['imlist'][i] + cfg['ext'])


def config_qimname(cfg, i):
    return os.path.join(cfg['dir_images'], cfg['qimlist'][i] + cfg['qext'])


if __name__ == '__main__':
    parser = argparse.ArgumentParser('Image Retrieval on revisited Paris and Oxford')
    parser.add_argument('--data_path', default='/content/drive/MyDrive/dataSets/oxbuild_images', type=str)
    parser.add_argument('--dataset', default='roxford5k', type=str, choices=['roxford5k', 'rparis6k'])
    parser.add_argument('--multiscale', default=False, type=utils.bool_flag)
    parser.add_argument('--imsize', default=224, type=int, help='Image size')
    # parser.add_argument('--pretrained_weights', default='/content/drive/MyDrive/models/dino-main/checkpoint 45k data-200ep/checkpoint0150.pth', type=str, help="Path to pretrained weights to evaluate.")
    # parser.add_argument('--pretrained_weights', default='/content/drive/MyDrive/models/dino-main/checkpoint 65K/checkpoint.pth', type=str, help="Path to pretrained weights to evaluate.")
    # parser.add_argument('--pretrained_weights', default='/content/drive/MyDrive/models/dino-main/checkpoint 512/checkpoint.pth', type=str, help="Path to pretrained weights to evaluate.")
    parser.add_argument('--pretrained_weights', default='/content/drive/MyDrive/models/dino-main/checkpoint.pth', type=str, help="Path to pretrained weights to evaluate.")
    # parser.add_argument('--pretrained_weights', default='/content/drive/MyDrive/models/dino-main/checkpoint-ViTB/checkpoint.pth', type=str, help="Path to pretrained weights to evaluate.")
    
    
    parser.add_argument('--use_cuda', default=True, type=utils.bool_flag)
    parser.add_argument('--arch', default='vit_base', type=str, help='Architecture')
    # parser.add_argument('--patch_size', default=16, type=int, help='Patch resolution of the model.')
    parser.add_argument('--patch_size', default=16, type=int, help='Patch resolution of the model.')
    parser.add_argument("--checkpoint_key", default="teacher", type=str,
        help='Key to use in the checkpoint (example: "teacher")')
    parser.add_argument('--num_workers', default=8, type=int, help='Number of data loading workers per GPU.')
    parser.add_argument("--dist_url", default="env://", type=str, help="""url used to set up
        distributed training; see https://pytorch.org/docs/stable/distributed.html""")
    parser.add_argument("--local_rank", default=1, type=int, help="Please ignore and do not set this argument.")
    args = parser.parse_args("")

    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))
    print("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))
    cudnn.benchmark = True

    # ============ preparing data ... ============
    transform = pth_transforms.Compose([
        pth_transforms.ToTensor(),
        pth_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])
    
    #################################################
    # data_path="/content/drive/MyDrive/dataSets/CRC/DINO-test-same/train"
    # data_path="/content/drive/MyDrive/dataSets/CRC/forDINO/train"
    data_path="/content/drive/MyDrive/datasets/CRC/train"
   
    # data_path="D:\\models\\CGD-master\\dataset\\img"
    # test_path="/content/drive/MyDrive/dataSets/CRC/fixed different test2"
    test_path="/content/drive/MyDrive/datasets/CRC/train"

    # test_path="/content/drive/MyDrive/dataSets/CRC/fixed same test"
    class_name, train_img_list = get_data_list(data_path,0.0)
    train_img_list=sorted(train_img_list)
    class_test_name, test_img_list = get_data_list(test_path,0.0)
    test_img_list=sorted(test_img_list)
    dataset_train=OxfordParisDataset(class_name, train_img_list, transform=transform)
    # train_sample = MPerClassSampler(train_data_set.label_list, batch_size=8)
    # data_loader_train = torch.utils.data.DataLoader(train_data_set, batch_sampler=train_sample, num_workers=8)
    # train_data_set = datasets.ImageFolder(data_path, transform=transform)
    # test_data_set = datasets.ImageFolder(test_path, transform=transform)
    dataset_query = OxfordParisDataset(class_test_name, test_img_list, transform=transform)
    # data_loader_query = torch.utils.data.DataLoader(test_data_set, batch_size=8, shuffle=False, num_workers=8)
    ##################################################
    
    # dataset_train = OxfordParisDataset(args.data_path, args.dataset, split="train", transform=transform, imsize=args.imsize)
    # dataset_query = OxfordParisDataset(args.data_path, args.dataset, split="query", transform=transform, imsize=args.imsize)
    # sampler = torch.utils.data.DistributedSampler(train_data_set, shuffle=False)
    sampler = torch.utils.data.DistributedSampler(dataset_train, shuffle=False)

    # sampler = torch.utils.data.Sampler(dataset_train)
    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        # train_data_set,
        # sampler=sampler,
        # sampler=train_sample,
        batch_size=1,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=False,
    )
    data_loader_query = torch.utils.data.DataLoader(
        dataset_query,
        # test_data_set,
        batch_size=1,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=False,
    )
    # print(f"train: {len(dataset_train)} imgs / query: {len(dataset_query)} imgs")
    # print(data_loader_train.dataset.samples)
    # ============ building network ... ============
    if "vit" in args.arch:
        model = vits.__dict__[args.arch](patch_size=args.patch_size, num_classes=0)
        print(f"Model {args.arch} {args.patch_size}x{args.patch_size} built.")
    elif "xcit" in args.arch:
        model = torch.hub.load('facebookresearch/xcit:main', args.arch, num_classes=0)
    elif args.arch in torchvision_models.__dict__.keys():
        model = torchvision_models.__dict__[args.arch](num_classes=0)
    else:
        print(f"Architecture {args.arch} non supported")
        sys.exit(1)
    if args.use_cuda:
        model.cuda()
    model.eval()

    # load pretrained weights
    if os.path.isfile(args.pretrained_weights):
        state_dict = torch.load(args.pretrained_weights, map_location="cpu")
        if args.checkpoint_key is not None and args.checkpoint_key in state_dict:
            print(f"Take key {args.checkpoint_key} in provided checkpoint dict")
            state_dict = state_dict[args.checkpoint_key]
        # remove `module.` prefix
        state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
        # remove `backbone.` prefix induced by multicrop wrapper
        state_dict = {k.replace("backbone.", ""): v for k, v in state_dict.items()}
        msg = model.load_state_dict(state_dict, strict=False)
        print('Pretrained weights found at {} and loaded with msg: {}'.format(args.pretrained_weights, msg))
    elif args.arch == "vit_small" and args.patch_size == 16:
        print("Since no pretrained weights have been provided, we load pretrained DINO weights on Google Landmark v2.")
        model.load_state_dict(torch.hub.load_state_dict_from_url(url="https://dl.fbaipublicfiles.com/dino/dino_vitsmall16_googlelandmark_pretrain/dino_vitsmall16_googlelandmark_pretrain.pth"))
    else:
        print("Warning: We use random weights.")

    ############################################################################
    # Step 1: extract features
    print("###########",data_loader_query.dataset.img_list)
    torch.save(data_loader_query.dataset.img_list, 'img_names.pt')
    train_features,train_label = extract_features(model, data_loader_train, args.use_cuda, multiscale=args.multiscale)
    query_features,query_label = extract_features(model, data_loader_query, args.use_cuda, multiscale=args.multiscale)
    torch.save(train_features, 'train_features.pt')
    torch.save(query_features, 'query_features.pt')
    torch.save(train_label, 'train_label.pt')
    torch.save(query_label, 'query_label.pt')
    

    
    
    print(train_features)
    print(query_features)
    if utils.get_rank() == 0:  # only rank 0 will work from now on
        # normalize features
        train_features = nn.functional.normalize(train_features, dim=1, p=2)
        query_features = nn.functional.normalize(query_features, dim=1, p=2)

        ############################################################################
        # Step 2: similarity
        # sim = torch.mm(train_features, query_features.T)
        # ranks = torch.argsort(-sim, dim=0).cpu().numpy()
        # torch.save(sim, 'sim.pt')
        # print(sim)

        # print(ranks)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content
/content/drive/MyDrive/models/dino-main
Sun Aug  7 12:08:52 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+--------------

  cpuset_checked))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  [35990/42885]  eta: 0:01:03    time: 0.009026  data: 0.000197  max mem: 471
  [36000/42885]  eta: 0:01:03    time: 0.008970  data: 0.000196  max mem: 471
  [36010/42885]  eta: 0:01:03    time: 0.008957  data: 0.000158  max mem: 471
  [36020/42885]  eta: 0:01:02    time: 0.009103  data: 0.000190  max mem: 471
  [36030/42885]  eta: 0:01:02    time: 0.008954  data: 0.000198  max mem: 471
  [36040/42885]  eta: 0:01:02    time: 0.008818  data: 0.000148  max mem: 471
  [36050/42885]  eta: 0:01:02    time: 0.008921  data: 0.000137  max mem: 471
  [36060/42885]  eta: 0:01:02    time: 0.008937  data: 0.000134  max mem: 471
  [36070/42885]  eta: 0:01:02    time: 0.008948  data: 0.000142  max mem: 471
  [36080/42885]  eta: 0:01:02    time: 0.009049  data: 0.000168  max mem: 471
  [36090/42885]  eta: 0:01:02    time: 0.008999  data: 0.000184  max mem: 471
  [36100/42885]  eta: 0:01:02    time: 0.008973  data: 0.000156  max mem: 471

RuntimeError: ignored

In [None]:
i=0
for samples, index,path in data_loader_train:
  print(path)
  i+=1
  if i ==100:
    break

  cpuset_checked))


['/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_00.png']
['/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_010.png']
['/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_011.png']
['/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_012.png']
['/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_015.png']
['/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_08.png']
['/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_09.png']
['/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_10.png']
['/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_100.png']
['/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_101.png']
['/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_1010.png']
['/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_1011.png']
['/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_1012.png']
['/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_1014.png']
['/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_1015.png']
['/conten

In [None]:
train_img_list[40000]

'/content/drive/MyDrive/datasets/CRC/train/s3/s3_175_49_1922.png'

In [None]:
sorted(train_img_list)

['/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_00.png',
 '/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_010.png',
 '/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_011.png',
 '/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_012.png',
 '/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_015.png',
 '/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_08.png',
 '/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_09.png',
 '/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_10.png',
 '/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_100.png',
 '/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_101.png',
 '/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_1010.png',
 '/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_1011.png',
 '/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_1012.png',
 '/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_1014.png',
 '/content/drive/MyDrive/datasets/CRC/train/s1/s1_002_2_1015.png',
 '/conten

In [None]:

test_img_list[40000]

'/content/drive/MyDrive/datasets/CRC/train/s2/s2_017_22_714.png'

In [None]:
dataloader.__dict__.keys()

dict_keys(['dataset', 'num_workers', 'prefetch_factor', 'pin_memory', 'pin_memory_device', 'timeout', 'worker_init_fn', '_DataLoader__multiprocessing_context', '_dataset_kind', 'batch_size', 'drop_last', 'sampler', 'batch_sampler', 'generator', 'collate_fn', 'persistent_workers', '_DataLoader__initialized', '_IterableDataset_len_called', '_iterator'])

In [None]:
len(dataloader.dataset.imgs)

42885

In [None]:
dataloader.dataset.imgs[10000]

('/content/drive/MyDrive/datasets/CRC/train/s1/s1_157_77_45.png', 0)

In [None]:
test_img_list

['/content/drive/MyDrive/datasets/CRC/test/s3/s3_016_0_03.png',
 '/content/drive/MyDrive/datasets/CRC/test/s3/s3_016_0_18.png',
 '/content/drive/MyDrive/datasets/CRC/test/s3/s3_016_0_21.png',
 '/content/drive/MyDrive/datasets/CRC/test/s3/s3_016_0_11.png',
 '/content/drive/MyDrive/datasets/CRC/test/s3/s3_016_0_05.png',
 '/content/drive/MyDrive/datasets/CRC/test/s3/s3_016_0_110.png',
 '/content/drive/MyDrive/datasets/CRC/test/s3/s3_016_0_13.png',
 '/content/drive/MyDrive/datasets/CRC/test/s3/s3_016_0_07.png',
 '/content/drive/MyDrive/datasets/CRC/test/s3/s3_016_0_20.png',
 '/content/drive/MyDrive/datasets/CRC/test/s3/s3_016_0_06.png',
 '/content/drive/MyDrive/datasets/CRC/test/s3/s3_016_0_08.png',
 '/content/drive/MyDrive/datasets/CRC/test/s3/s3_016_0_09.png',
 '/content/drive/MyDrive/datasets/CRC/test/s3/s3_016_0_19.png',
 '/content/drive/MyDrive/datasets/CRC/test/s3/s3_016_0_12.png',
 '/content/drive/MyDrive/datasets/CRC/test/s3/s3_016_0_15.png',
 '/content/drive/MyDrive/datasets/CRC/t

In [None]:
paths

# New Section

In [None]:
!nvidia-smi
from google.colab import drive
drive.mount('/content/drive')
!pwd 
%cd drive/MyDrive/models/dino-main/
!nvidia-smi
!pip install timm==0.4.9
!pip install yacs
!pip install -U PyYAML
!pip install einops
# data
from numpy import load
KNN=load('/content/drive/MyDrive/models/dino-main/KNN.npy')

Mon Aug  8 03:30:09 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

array([[ 2627,  2607,  2807,  2846,  2682,  3329],
       [ 3300,  7819,  3286,  3258,  3256,  3285],
       [ 3451,  2456,  2387,  2386,  7819,  7774],
       ...,
       [34973, 35378, 34958, 35580, 34955, 35608],
       [33584, 34955, 35542, 38431, 35634, 35366],
       [35542, 35580, 34955, 33584, 35634, 34973]])

In [None]:
# Copyright (c) Facebook, Inc. and its affiliates.
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import sys
import datetime
import time
import math
import json
from pathlib import Path
from dataset import SingleData
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.distributed as dist
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torchvision import models as torchvision_models

import utils
import vision_transformer as vits
from vision_transformer import DINOHead
from numpy import load
KNN=load('/content/drive/MyDrive/models/dino-main/KNN2.npy')
torchvision_archs = sorted(name for name in torchvision_models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(torchvision_models.__dict__[name]))

def get_args_parser():
    parser = argparse.ArgumentParser('DINO', add_help=False)

    # Model parameters
    parser.add_argument('--arch', default='vit_base', type=str,
        choices=['vit_tiny', 'vit_small', 'vit_base', 'xcit', 'deit_tiny', 'deit_small'] \
                + torchvision_archs + torch.hub.list("facebookresearch/xcit:main"),
        help="""Name of architecture to train. For quick experiments with ViTs,
        we recommend using vit_tiny or vit_small.""")
    parser.add_argument('--patch_size', default=16, type=int, help="""Size in pixels
        of input square patches - default 16 (for 16x16 patches). Using smaller
        values leads to better performance but requires more memory. Applies only
        for ViTs (vit_tiny, vit_small and vit_base). If <16, we recommend disabling
        mixed precision training (--use_fp16 false) to avoid unstabilities.""")
    parser.add_argument('--out_dim', default=512, type=int, help="""Dimensionality of
        the DINO head output. For complex and large datasets large values (like 65k) work well.""")
    parser.add_argument('--norm_last_layer', default=True, type=utils.bool_flag,
        help="""Whether or not to weight normalize the last layer of the DINO head.
        Not normalizing leads to better performance but can make the training unstable.
        In our experiments, we typically set this paramater to False with vit_small and True with vit_base.""")
    parser.add_argument('--momentum_teacher', default=0.996, type=float, help="""Base EMA
        parameter for teacher update. The value is increased to 1 during training with cosine schedule.
        We recommend setting a higher value with small batches: for example use 0.9995 with batch size of 256.""")
    parser.add_argument('--use_bn_in_head', default=False, type=utils.bool_flag,
        help="Whether to use batch normalizations in projection head (Default: False)")

    # Temperature teacher parameters
    parser.add_argument('--warmup_teacher_temp', default=0.04, type=float,
        help="""Initial value for the teacher temperature: 0.04 works well in most cases.
        Try decreasing it if the training loss does not decrease.""")
    parser.add_argument('--teacher_temp', default=0.04, type=float, help="""Final value (after linear warmup)
        of the teacher temperature. For most experiments, anything above 0.07 is unstable. We recommend
        starting with the default value of 0.04 and increase this slightly if needed.""")
    parser.add_argument('--warmup_teacher_temp_epochs', default=0, type=int,
        help='Number of warmup epochs for the teacher temperature (Default: 30).')

    # Training/Optimization parameters
    parser.add_argument('--use_fp16', type=utils.bool_flag, default=True, help="""Whether or not
        to use half precision for training. Improves training time and memory requirements,
        but can provoke instability and slight decay of performance. We recommend disabling
        mixed precision if the loss is unstable, if reducing the patch size or if training with bigger ViTs.""")
    parser.add_argument('--weight_decay', type=float, default=0.04, help="""Initial value of the
        weight decay. With ViT, a smaller value at the beginning of training works well.""")
    parser.add_argument('--weight_decay_end', type=float, default=0.4, help="""Final value of the
        weight decay. We use a cosine schedule for WD and using a larger decay by
        the end of training improves performance for ViTs.""")
    parser.add_argument('--clip_grad', type=float, default=3.0, help="""Maximal parameter
        gradient norm if using gradient clipping. Clipping with norm .3 ~ 1.0 can
        help optimization for larger ViT architectures. 0 for disabling.""")
    parser.add_argument('--batch_size_per_gpu', default=32, type=int,
        help='Per-GPU batch-size : number of distinct images loaded on one GPU.')
    parser.add_argument('--epochs', default=100, type=int, help='Number of epochs of training.')
    parser.add_argument('--freeze_last_layer', default=1, type=int, help="""Number of epochs
        during which we keep the output layer fixed. Typically doing so during
        the first epoch helps training. Try increasing this value if the loss does not decrease.""")
    parser.add_argument("--lr", default=0.0005, type=float, help="""Learning rate at the end of
        linear warmup (highest LR used during training). The learning rate is linearly scaled
        with the batch size, and specified here for a reference batch size of 256.""")
    parser.add_argument("--warmup_epochs", default=10, type=int,
        help="Number of epochs for the linear learning-rate warm up.")
    parser.add_argument('--min_lr', type=float, default=1e-6, help="""Target LR at the
        end of optimization. We use a cosine LR schedule with linear warmup.""")
    parser.add_argument('--optimizer', default='adamw', type=str,
        choices=['adamw', 'sgd', 'lars'], help="""Type of optimizer. We recommend using adamw with ViTs.""")
    parser.add_argument('--drop_path_rate', type=float, default=0.1, help="stochastic depth rate")

    # Multi-crop parameters
    parser.add_argument('--global_crops_scale', type=float, nargs='+', default=(0.4, 1.),
        help="""Scale range of the cropped image before resizing, relatively to the origin image.
        Used for large global view cropping. When disabling multi-crop (--local_crops_number 0), we
        recommand using a wider range of scale ("--global_crops_scale 0.14 1." for example)""")
    parser.add_argument('--local_crops_number', type=int, default=8, help="""Number of small
        local views to generate. Set this parameter to 0 to disable multi-crop training.
        When disabling multi-crop we recommend to use "--global_crops_scale 0.14 1." """)
    parser.add_argument('--local_crops_scale', type=float, nargs='+', default=(0.05, 0.4),
        help="""Scale range of the cropped image before resizing, relatively to the origin image.
        Used for small local view cropping of multi-crop.""")

    # Misc
    parser.add_argument('--data_path', default='/content/drive/MyDrive/datasets/CRC/train/', type=str,
        help='Please specify path to the ImageNet training data.')
    parser.add_argument('--output_dir', default=".", type=str, help='Path to save logs and checkpoints.')
    parser.add_argument('--saveckp_freq', default=1, type=int, help='Save checkpoint every x epochs.')
    parser.add_argument('--seed', default=0, type=int, help='Random seed.')
    parser.add_argument('--num_workers', default=10, type=int, help='Number of data loading workers per GPU.')
    parser.add_argument("--dist_url", default="env://", type=str, help="""url used to set up
        distributed training; see https://pytorch.org/docs/stable/distributed.html""")
    parser.add_argument("--local_rank", default=0, type=int, help="Please ignore and do not set this argument.")
    return parser
class ImageFolderWithPaths(datasets.ImageFolder):
    """Custom dataset that includes image file paths. Extends
    torchvision.datasets.ImageFolder
    """

    # override the __getitem__ method. this is the method that dataloader calls
    def __getitem__(self, index):
        # this is what ImageFolder normally returns 
        original_tuple = super(ImageFolderWithPaths, self).__getitem__(index)
        # the image file path
        path = self.imgs[index][0]
        # make a new tuple that includes original and the path
        tuple_with_path = (original_tuple + (path,)+(index,))
        return tuple_with_path


def train_dino(args):
    utils.init_distributed_mode(args)
    utils.fix_random_seeds(args.seed)
    print("git:\n  {}\n".format(utils.get_sha()))
    print("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))
    cudnn.benchmark = True

    # ============ preparing data ... ============
    transform = DataAugmentationDINO(
        args.global_crops_scale,
        args.local_crops_scale,
        args.local_crops_number,
    )
    # dataset = datasets.ImageFolder(args.data_path, transform=transform)
    dataset = ImageFolderWithPaths(args.data_path, transform=transform)

    
    # class_name, train_img_list = get_data_list2('/content/drive/MyDrive/dataSets/CRC/train/',0.0)
    # class_name, train_img_list = get_data_list('D:\\original_images_5\\test-patches\\trainset\\SPQ\\final\\train\\final\\final',0.0)
    # dataset=SingleData(class_name, train_img_list, transform=transforms.ToTensor())
    sampler = torch.utils.data.DistributedSampler(dataset, shuffle=True)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        batch_size=args.batch_size_per_gpu,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    print(f"Data loaded: there are {len(dataset)} images.")
    # return data_loader
    # ============ building student and teacher networks ... ============
    # we changed the name DeiT-S for ViT-S to avoid confusions
    args.arch = args.arch.replace("deit", "vit")
    # if the network is a Vision Transformer (i.e. vit_tiny, vit_small, vit_base)
    if args.arch in vits.__dict__.keys():
        student = vits.__dict__[args.arch](
            patch_size=args.patch_size,
            drop_path_rate=args.drop_path_rate,  # stochastic depth
        )
        teacher = vits.__dict__[args.arch](patch_size=args.patch_size)
        # student.load_state_dict(torch.load('/content/drive/MyDrive/models/dino-main/clipVitB16.pth'))
        # teacher.load_state_dict(torch.load('/content/drive/MyDrive/models/dino-main/clipVitB16.pth'))
        student.load_state_dict(torch.load('/content/drive/MyDrive/models/dino-main/clipVitB16.pth'))
        teacher.load_state_dict(torch.load('/content/drive/MyDrive/models/dino-main/clipVitB16.pth'))

        embed_dim = student.embed_dim
    # if the network is a XCiT
    elif args.arch in torch.hub.list("facebookresearch/xcit:main"):
        student = torch.hub.load('facebookresearch/xcit:main', args.arch,
                                 pretrained=False, drop_path_rate=args.drop_path_rate)
        teacher = torch.hub.load('facebookresearch/xcit:main', args.arch, pretrained=False)
        embed_dim = student.embed_dim
    # otherwise, we check if the architecture is in torchvision models
    elif args.arch in torchvision_models.__dict__.keys():
        student = torchvision_models.__dict__[args.arch]()
        teacher = torchvision_models.__dict__[args.arch]()
        embed_dim = student.fc.weight.shape[1]
        # embed_dim = student.state_dict()['encoder.layers.encoder_layer_11.mlp.3.weight'].shape[1]
        # embed_dim=1000
    else:
        print(f"Unknow architecture: {args.arch}")

    # multi-crop wrapper handles forward with inputs of different resolutions
    student = utils.MultiCropWrapper(student, DINOHead(
        embed_dim,
        args.out_dim,
        use_bn=args.use_bn_in_head,
        norm_last_layer=args.norm_last_layer,
    ))
    teacher = utils.MultiCropWrapper(
        teacher,
        DINOHead(embed_dim, args.out_dim, args.use_bn_in_head),
    )
    # move networks to gpu
    student, teacher = student.cuda(), teacher.cuda()
    # synchronize batch norms (if any)
    if utils.has_batchnorms(student):
        student = nn.SyncBatchNorm.convert_sync_batchnorm(student)
        teacher = nn.SyncBatchNorm.convert_sync_batchnorm(teacher)

        # we need DDP wrapper to have synchro batch norms working...
        teacher = nn.parallel.DistributedDataParallel(teacher, device_ids=[args.gpu])
        teacher_without_ddp = teacher.module
    else:
        # teacher_without_ddp and teacher are the same thing
        teacher_without_ddp = teacher
    student = nn.parallel.DistributedDataParallel(student, device_ids=[args.gpu])
    # teacher and student start with the same weights
    teacher_without_ddp.load_state_dict(student.module.state_dict())
    # there is no backpropagation through the teacher, so no need for gradients
    for p in teacher.parameters():
        p.requires_grad = False
    print(f"Student and Teacher are built: they are both {args.arch} network.")

    # ============ preparing loss ... ============
    dino_loss = DINOLoss(
        args.out_dim,
        args.local_crops_number + 2,  # total number of crops = 2 global crops + local_crops_number
        args.warmup_teacher_temp,
        args.teacher_temp,
        args.warmup_teacher_temp_epochs,
        args.epochs,
    ).cuda()

    # ============ preparing optimizer ... ============
    params_groups = utils.get_params_groups(student)
    if args.optimizer == "adamw":
        optimizer = torch.optim.AdamW(params_groups)  # to use with ViTs
    elif args.optimizer == "sgd":
        optimizer = torch.optim.SGD(params_groups, lr=0, momentum=0.9)  # lr is set by scheduler
    elif args.optimizer == "lars":
        optimizer = utils.LARS(params_groups)  # to use with convnet and large batches
    # for mixed precision training
    fp16_scaler = None
    if args.use_fp16:
        fp16_scaler = torch.cuda.amp.GradScaler()

    # ============ init schedulers ... ============
    lr_schedule = utils.cosine_scheduler(
        args.lr * (args.batch_size_per_gpu * utils.get_world_size()) / 256.,  # linear scaling rule
        args.min_lr,
        args.epochs, len(data_loader),
        warmup_epochs=args.warmup_epochs,
    )
    wd_schedule = utils.cosine_scheduler(
        args.weight_decay,
        args.weight_decay_end,
        args.epochs, len(data_loader),
    )
    # momentum parameter is increased to 1. during training with a cosine schedule
    momentum_schedule = utils.cosine_scheduler(args.momentum_teacher, 1,
                                               args.epochs, len(data_loader))
    print(f"Loss, optimizer and schedulers ready.")

    # ============ optionally resume training ... ============
    to_restore = {"epoch": 0}
    utils.restart_from_checkpoint(
        os.path.join(args.output_dir, "checkpoint.pth"),
        run_variables=to_restore,
        student=student,
        teacher=teacher,
        optimizer=optimizer,
        fp16_scaler=fp16_scaler,
        dino_loss=dino_loss,
    )
    start_epoch = to_restore["epoch"]

    start_time = time.time()
    print("Starting DINO training !")
    for epoch in range(start_epoch, args.epochs):
        data_loader.sampler.set_epoch(epoch)

        # ============ training one epoch of DINO ... ============
        train_stats = train_one_epoch(student, teacher, teacher_without_ddp, dino_loss,
            data_loader, optimizer, lr_schedule, wd_schedule, momentum_schedule,
            epoch, fp16_scaler, args)

        # ============ writing logs ... ============
        save_dict = {
            'student': student.state_dict(),
            'teacher': teacher.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch + 1,
            'args': args,
            'dino_loss': dino_loss.state_dict(),
        }
        if fp16_scaler is not None:
            save_dict['fp16_scaler'] = fp16_scaler.state_dict()
        utils.save_on_master(save_dict, os.path.join(args.output_dir, 'checkpoint.pth'))
        if args.saveckp_freq and epoch % args.saveckp_freq == 0:
            utils.save_on_master(save_dict, os.path.join(args.output_dir, f'checkpoint{epoch:04}.pth'))
        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch}
        if utils.is_main_process():
            with (Path(args.output_dir) / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))


def train_one_epoch(student, teacher, teacher_without_ddp, dino_loss, data_loader,
                    optimizer, lr_schedule, wd_schedule, momentum_schedule,epoch,
                    fp16_scaler, args):
    metric_logger = utils.MetricLogger(delimiter="  ")
    header = 'Epoch: [{}/{}]'.format(epoch, args.epochs)
    for it, (images, _,paths,indces) in enumerate(metric_logger.log_every(data_loader, 10, header)):
        # update weight decay and learning rate according to their schedule
        # print('DDD=>',dd)
        # print('images=>',paths)
        # print(index)
        # print(images[0])
        # for index in indces:
        #   temp=[]
        #   for 
        # np.random.randint(1,5)
        images2,b,c,d=data_loader.dataset.__getitem__(KNN[indces[0]][0])
        for i in range(len(images2)):
          images2[i]=images2[i].unsqueeze(0)

        nearst=[]
        for i in range(1,len(indces)):
          a,b,c,d=data_loader.dataset.__getitem__(KNN[indces[i]][0])
          nearst.append(a)
        
        for i in range(len(images2)):
          for j in range(len(nearst)):
            images2[i]=torch.cat((images2[i], nearst[j][i].unsqueeze(0)), 0)
            # print(images2[i].shape)
        # images2=[torch.Tensor() for _ in range(len(images))]
        # for i in range(len(images2)):
        #   for index in range(1,len(indces)):
        #     a,b,c,d=data_loader.dataset.__getitem__(index)
        #     images2[i]=torch.cat((images2[i], a[i].unsqueeze(0)), 0)
        #     print(images2[i].shape)

        # print(len(images2))
        # print(len(images2[0]))
        # print(images[0].shape)
        it = len(data_loader) * epoch + it  # global training iteration
        for i, param_group in enumerate(optimizer.param_groups):
            param_group["lr"] = lr_schedule[it]
            if i == 0:  # only the first group is regularized
                param_group["weight_decay"] = wd_schedule[it]

        # move images to gpu
        images = [im.cuda(non_blocking=True) for im in images]
        images2 = [im.cuda(non_blocking=True) for im in images2]
        
        # teacher and student forward passes + compute dino loss
        with torch.cuda.amp.autocast(fp16_scaler is not None):
            teacher_output = teacher(images[:2])  # only the 2 global views pass through the teacher
            student_output = student(images2)
            loss = dino_loss(student_output, teacher_output, epoch)

        if not math.isfinite(loss.item()):
            print("Loss is {}, stopping training".format(loss.item()), force=True)
            sys.exit(1)

        # student update
        optimizer.zero_grad()
        param_norms = None
        if fp16_scaler is None:
            loss.backward()
            if args.clip_grad:
                param_norms = utils.clip_gradients(student, args.clip_grad)
            utils.cancel_gradients_last_layer(epoch, student,
                                              args.freeze_last_layer)
            optimizer.step()
        else:
            fp16_scaler.scale(loss).backward()
            if args.clip_grad:
                fp16_scaler.unscale_(optimizer)  # unscale the gradients of optimizer's assigned params in-place
                param_norms = utils.clip_gradients(student, args.clip_grad)
            utils.cancel_gradients_last_layer(epoch, student,
                                              args.freeze_last_layer)
            fp16_scaler.step(optimizer)
            fp16_scaler.update()

        # EMA update for the teacher
        with torch.no_grad():
            m = momentum_schedule[it]  # momentum parameter
            for param_q, param_k in zip(student.module.parameters(), teacher_without_ddp.parameters()):
                param_k.data.mul_(m).add_((1 - m) * param_q.detach().data)

        # logging
        torch.cuda.synchronize()
        metric_logger.update(loss=loss.item())
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        metric_logger.update(wd=optimizer.param_groups[0]["weight_decay"])
    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}


class DINOLoss(nn.Module):
    def __init__(self, out_dim, ncrops, warmup_teacher_temp, teacher_temp,
                 warmup_teacher_temp_epochs, nepochs, student_temp=0.1,
                 center_momentum=0.9):
        super().__init__()
        self.student_temp = student_temp
        self.center_momentum = center_momentum
        self.ncrops = ncrops
        self.register_buffer("center", torch.zeros(1, out_dim))
        # we apply a warm up for the teacher temperature because
        # a too high temperature makes the training instable at the beginning
        self.teacher_temp_schedule = np.concatenate((
            np.linspace(warmup_teacher_temp,
                        teacher_temp, warmup_teacher_temp_epochs),
            np.ones(nepochs - warmup_teacher_temp_epochs) * teacher_temp
        ))

    def forward(self, student_output, teacher_output, epoch):
        """
        Cross-entropy between softmax outputs of the teacher and student networks.
        """
        student_out = student_output / self.student_temp
        student_out = student_out.chunk(self.ncrops)

        # teacher centering and sharpening
        temp = self.teacher_temp_schedule[epoch]
        teacher_out = F.softmax((teacher_output - self.center) / temp, dim=-1)
        teacher_out = teacher_out.detach().chunk(2)

        total_loss = 0
        n_loss_terms = 0
        for iq, q in enumerate(teacher_out):
            for v in range(len(student_out)):
                if v == iq:
                    # we skip cases where student and teacher operate on the same view
                    continue
                loss = torch.sum(-q * F.log_softmax(student_out[v], dim=-1), dim=-1)
                total_loss += loss.mean()
                n_loss_terms += 1
        total_loss /= n_loss_terms
        self.update_center(teacher_output)
        return total_loss

    @torch.no_grad()
    def update_center(self, teacher_output):
        """
        Update center used for teacher output.
        """
        batch_center = torch.sum(teacher_output, dim=0, keepdim=True)
        dist.all_reduce(batch_center)
        batch_center = batch_center / (len(teacher_output) * dist.get_world_size())

        # ema update
        self.center = self.center * self.center_momentum + batch_center * (1 - self.center_momentum)


class DataAugmentationDINO(object):
    def __init__(self, global_crops_scale, local_crops_scale, local_crops_number):
        flip_and_color_jitter = transforms.Compose([
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomApply(
                [transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)],
                p=0.8
            ),
            transforms.RandomGrayscale(p=0.2),
        ])
        normalize = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ])

        # first global crop
        self.global_transfo1 = transforms.Compose([
            transforms.RandomResizedCrop(224, scale=global_crops_scale, interpolation=Image.BICUBIC),
            flip_and_color_jitter,
            utils.GaussianBlur(1.0),
            normalize,
        ])
        # second global crop
        self.global_transfo2 = transforms.Compose([
            transforms.RandomResizedCrop(224, scale=global_crops_scale, interpolation=Image.BICUBIC),
            flip_and_color_jitter,
            utils.GaussianBlur(0.1),
            utils.Solarization(0.2),
            normalize,
        ])
        # transformation for the local small crops
        self.local_crops_number = local_crops_number
        self.local_transfo = transforms.Compose([
            transforms.RandomResizedCrop(96, scale=local_crops_scale, interpolation=Image.BICUBIC),
            flip_and_color_jitter,
            utils.GaussianBlur(p=0.5),
            normalize,
        ])

    def __call__(self, image):
        crops = []
        crops.append(self.global_transfo1(image))
        crops.append(self.global_transfo2(image))
        for _ in range(self.local_crops_number):
            crops.append(self.local_transfo(image))
        return crops


if __name__ == '__main__':
    parser = argparse.ArgumentParser('DINO', parents=[get_args_parser()])
    args = parser.parse_args("")
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)
    data_loader=train_dino(args)

Will run the code on one GPU.
| distributed init (rank 0): env://


Using cache found in /root/.cache/torch/hub/facebookresearch_xcit_main


git:
  sha: N/A, status: clean, branch: N/A

arch: vit_base
batch_size_per_gpu: 32
clip_grad: 3.0
data_path: /content/drive/MyDrive/datasets/CRC/train/
dist_url: env://
drop_path_rate: 0.1
epochs: 100
freeze_last_layer: 1
global_crops_scale: (0.4, 1.0)
gpu: 0
local_crops_number: 8
local_crops_scale: (0.05, 0.4)
local_rank: 0
lr: 0.0005
min_lr: 1e-06
momentum_teacher: 0.996
norm_last_layer: True
num_workers: 10
optimizer: adamw
out_dim: 512
output_dir: .
patch_size: 16
rank: 0
saveckp_freq: 1
seed: 0
teacher_temp: 0.04
use_bn_in_head: False
use_fp16: True
warmup_epochs: 10
warmup_teacher_temp: 0.04
warmup_teacher_temp_epochs: 0
weight_decay: 0.04
weight_decay_end: 0.4
world_size: 1


  "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. "
  cpuset_checked))


Data loaded: there are 42885 images.
Student and Teacher are built: they are both vit_base network.
Loss, optimizer and schedulers ready.
Found checkpoint at ./checkpoint.pth
=> loaded 'student' from checkpoint './checkpoint.pth' with msg <All keys matched successfully>
=> loaded 'teacher' from checkpoint './checkpoint.pth' with msg <All keys matched successfully>
=> loaded 'optimizer' from checkpoint: './checkpoint.pth'
=> loaded 'fp16_scaler' from checkpoint: './checkpoint.pth'
=> loaded 'dino_loss' from checkpoint './checkpoint.pth' with msg <All keys matched successfully>
Starting DINO training !
Epoch: [32/100]  [   0/1340]  eta: 1 day, 4:42:40  loss: 1.306212 (1.306212)  lr: 0.000054 (0.000054)  wd: 0.123551 (0.123551)  time: 77.134697  data: 18.948778  max mem: 11672
Epoch: [32/100]  [  10/1340]  eta: 4:12:27  loss: 1.187819 (1.197527)  lr: 0.000054 (0.000054)  wd: 0.123569 (0.123569)  time: 11.388844  data: 1.722863  max mem: 12009
Epoch: [32/100]  [  20/1340]  eta: 2:59:53  lo

In [None]:
pd = [torch.Tensor() for _ in range(4)]
for i in range(4):
    j = torch.tensor([3]).float()
    pd[i] = torch.cat((pd[i], j), 0)
pd

[tensor([3.]), tensor([3.]), tensor([3.]), tensor([3.])]

In [None]:
for i in range(4):
    j = torch.tensor([4]).float()
    pd[i] = torch.cat((pd[i], torch.tensor([j])), 0)
pd

[tensor([3., 4.]), tensor([3., 4.]), tensor([3., 4.]), tensor([3., 4.])]

In [None]:
output=[]
for i in range(3):
  a=torch.randn(1)
  output.append(a)
output
# torch.cat(output, dim=-1)

[tensor([0.4033]), tensor([0.8380]), tensor([-0.7193])]

In [None]:
output

[tensor([0.4033]), tensor([0.8380]), tensor([-0.7193])]

In [None]:
torch.Tensor(output)

tensor([ 0.4033,  0.8380, -0.7193])

In [None]:
x = torch.randn(2, 3,3)
y = torch.randn(2, 3,3)
print(x)
print(y)

tensor([[[-0.3620, -0.4097,  0.3955],
         [-0.0418, -0.4276, -1.8563],
         [-0.7525,  0.9091, -0.3030]],

        [[-0.4412,  0.8844,  0.3951],
         [ 0.8906, -0.6563,  0.4618],
         [ 0.0560, -0.1739, -0.7750]]])
tensor([[[-4.9871e-01, -1.8873e+00, -1.9635e+00],
         [ 1.3202e+00,  3.1139e+00, -7.0969e-01],
         [-1.2129e+00,  1.9360e-01, -5.0598e-01]],

        [[ 1.5205e+00,  1.3715e-03,  7.8831e-01],
         [ 1.3623e-01,  3.2163e-01,  3.9057e-01],
         [ 7.6883e-01, -1.0201e+00, -4.6545e-01]]])


In [None]:
x=x.unsqueeze(0)

In [None]:
x=torch.cat((x, y.unsqueeze(0)), 0)
print(x.shape)

torch.Size([4, 2, 3, 3])


In [None]:
z

tensor([[[[-0.2907, -0.1355, -0.4988],
          [ 2.9091, -1.1906,  1.3279],
          [-0.6615, -1.8332,  0.8427]],

         [[ 0.3563,  0.3791, -0.5160],
          [ 0.6322,  0.6372,  0.2922],
          [ 1.7044, -0.6808, -0.0822]]],


        [[[ 0.1391,  0.1222, -1.0129],
          [ 0.8878, -1.3862,  0.1869],
          [ 0.9350,  1.3363,  0.6449]],

         [[-0.8724,  1.6557, -0.9406],
          [-1.0563,  1.0062, -1.4946],
          [ 1.2123, -0.1757,  0.7299]]]])

In [None]:
z.shape

torch.Size([4, 3, 3])

In [None]:
k=[]
k.append(x)
k.append(y)
torch.Tensor(k)

ValueError: ignored

In [None]:
xnew_from_cat = torch.cat((x, x), 0)


In [None]:
xnew_from_cat.shape

torch.Size([4, 3, 3])

In [None]:

KNN[35838][np.random.randint(1,4)]

29101

In [None]:
import torch

In [None]:
data_loader.dataset.__dict__

{'batch_size': 32,
 'drop_last': True,
 'sampler': <torch.utils.data.distributed.DistributedSampler at 0x7f3166666690>}

In [None]:
test_img_list=torch.load('/content/drive/MyDrive/models/dino-main/img_names.pt',map_location=torch.device('cpu'))


In [None]:
# print(data_loader.dataset.__getitem__(35838))
print(test_img_list[35838])

/content/drive/MyDrive/datasets/CRC/train/s3/s3_172_33_422.png


In [None]:
ii=0
for inputs, labels, paths,index in data_loader:
    ii=inputs
    break


  cpuset_checked))


In [None]:
len(ii[0])

32

In [None]:
a,b,c,d=data_loader.dataset.__getitem__(40000)

NameError: ignored

In [None]:
data_loader.dataset.__getitem__((1,2))

TypeError: ignored

In [None]:
a[8].shape

torch.Size([3, 96, 96])

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.imshow(  a[6].permute(1, 2, 0)  )

NameError: ignored

In [None]:
data_loader

<torch.utils.data.dataloader.DataLoader at 0x7fa30fc02dd0>