# Download Dataset

In [None]:
# Download the homework data archive from Google Drive (by file id) as hw4_data.zip,
# extract it quietly into the current directory, then delete the archive.
!gdown --id 1HLJlpPtgys73C3epBTgGRON3mSvsLHC1 --output hw4_data.zip
!unzip -q hw4_data.zip
!rm hw4_data.zip

Downloading...
From: https://drive.google.com/uc?id=1HLJlpPtgys73C3epBTgGRON3mSvsLHC1
To: /content/hw4-YDBX/hw4_data.zip
100% 1.13G/1.13G [00:05<00:00, 214MB/s]


# Import Packages

In [None]:
import argparse
import csv
import glob
import os
import random

import numpy as np
import torch
import torch.nn as nn
import torchvision
from PIL import Image
from torch.utils import data
from torch.utils.data import DataLoader
from torchvision.transforms import transforms
from tqdm.auto import tqdm

# Set Random Seed

In [None]:
def fix_random_seeds(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU plus current/all CUDA
    devices), exports the seed as ``PYTHONHASHSEED`` and sets the cuDNN
    flags (benchmark off, deterministic on, cuDNN disabled).

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Host-side generators.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Device-side generators: the current CUDA device, then every device.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN switches; cuDNN itself is turned off entirely by the last line.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False

# Dataset Class

In [None]:
class Mini_dataset(data.Dataset):
    """Map-style dataset that loads an RGB image from disk for each index.

    Args:
        data_paths: Sequence of image file paths.
        labels: Sequence of labels, aligned index-by-index with ``data_paths``.
        transforms: Callable applied to the RGB ``PIL.Image`` before it is
            returned.
    """

    def __init__(self, data_paths, labels, transforms):
        self.data_paths = data_paths
        self.labels = labels
        self.transforms = transforms

    def __len__(self):
        """Return the number of image paths held by the dataset."""
        return len(self.data_paths)

    def __getitem__(self, index):
        """Return ``(transformed_image, label)`` for ``index``."""
        path, label = self.data_paths[index], self.labels[index]
        # Image.open keeps the underlying file open until the image is closed;
        # the context manager releases the handle deterministically instead of
        # waiting for garbage collection. convert() decodes the pixels and
        # returns a separate image, so the transform never touches a closed file.
        with Image.open(path) as raw:
            img = self.transforms(raw.convert('RGB'))
        return img, label

# Sampler

In [None]:
class CategoriesSampler():
    """Batch sampler that yields one tensor of dataset indices per episode.

    In ``'train'`` mode every batch draws ``n_cls`` random classes and
    ``n_per`` random samples from each, laid out sample-major: the first
    ``n_cls`` indices hold one sample of each chosen class, the next
    ``n_cls`` indices the second sample of each class, and so on.
    In any other mode the pre-defined index lists in ``pos`` are replayed
    in order.

    Args:
        label: Integer class label of every dataset item; classes are
            enumerated as ``0 .. max(label)``.
        n_batch: Number of batches (episodes) produced per iteration.
        n_cls: Classes per episode (train mode only).
        n_per: Samples drawn per class (train mode only).
        mode: ``'train'`` for random episodes; anything else replays ``pos``.
        pos: Sequence of index lists, one per batch; only read outside
            train mode.
    """

    def __init__(self, label, n_batch, n_cls, n_per, mode, pos):
        self.n_batch = n_batch
        self.n_cls = n_cls
        self.n_per = n_per
        self.mode = mode
        self.pos = pos

        label = np.array(label)
        # m_ind[c] holds the dataset indices whose label equals c.
        self.m_ind = [
            torch.from_numpy(np.argwhere(label == cls).reshape(-1))
            for cls in range(max(label) + 1)
        ]

    def __len__(self):
        """Return the number of batches produced per iteration."""
        return self.n_batch

    def __iter__(self):
        """Yield ``n_batch`` 1-D ``torch.long`` tensors of dataset indices."""
        for i_batch in range(self.n_batch):
            if self.mode == 'train':
                classes = torch.randperm(len(self.m_ind))[:self.n_cls]
                batch = []
                for c in classes:
                    members = self.m_ind[c]
                    chosen = torch.randperm(len(members))[:self.n_per]
                    batch.append(members[chosen])
                # (n_cls, n_per) -> (n_per, n_cls) -> flat, giving the
                # sample-major layout described in the class docstring.
                yield torch.stack(batch).t().reshape(-1)
            else:
                # Build the integer tensor directly: going through the float32
                # torch.Tensor constructor would round indices above 2**24.
                yield torch.as_tensor(self.pos[i_batch], dtype=torch.long)

# Get DataLoader Function

In [None]:
def _build_image_transforms():
    """Return the resize / to-tensor / normalize pipeline shared by both loaders."""
    return transforms.Compose([
        transforms.Resize((84, 84)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])


def _read_image_index(csv_path, data_dir):
    """Read a three-column ``<ignored>,<filename>,<label>`` CSV with a header row.

    Returns:
        ``(paths, label_ids)`` where ``paths[i]`` is ``data_dir`` joined with
        the row's filename and ``label_ids[i]`` is the integer id of the row's
        label. Ids are assigned in order of first appearance, so every row of
        a class gets the same id even when rows are not grouped by class.
    """
    paths = []
    label_ids = []
    class_to_id = {}
    with open(csv_path, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                continue  # tolerate blank lines, e.g. at the end of the file
            _, filename, label = row
            # Strip so that stray whitespace cannot split one class into two.
            label_id = class_to_id.setdefault(label.strip(), len(class_to_id))
            paths.append(os.path.join(data_dir, filename))
            label_ids.append(label_id)
    return paths, label_ids


def get_training_dataset(train_path, train_batch, n_workers, train_way, train_shot, train_query):
    """Build the episodic training ``DataLoader``.

    Images are read from ``<train_path>/train`` and labelled according to
    ``<train_path>/train.csv``.

    Args:
        train_path: Directory containing ``train/`` and ``train.csv``.
        train_batch: Number of episodes (batches) per pass over the loader.
        n_workers: Worker process count handed to ``DataLoader``.
        train_way: Classes sampled per episode.
        train_shot: Support samples per class.
        train_query: Query samples per class.

    Returns:
        A ``DataLoader`` whose batches hold
        ``train_way * (train_shot + train_query)`` images.
    """
    train_data_dir = os.path.join(train_path, 'train')
    train_csv_path = os.path.join(train_path, 'train.csv')

    train_paths, train_labels = _read_image_index(train_csv_path, train_data_dir)

    train_set = Mini_dataset(train_paths, train_labels, _build_image_transforms())
    train_sampler = CategoriesSampler(
        train_set.labels, train_batch, train_way, train_shot + train_query, 'train', None
    )
    train_loader = DataLoader(
        train_set,
        batch_sampler=train_sampler,
        num_workers=n_workers,
        pin_memory=True
    )

    return train_loader


def get_valid_dataset(valid_path, valid_batch, n_workers, valid_way, valid_shot, valid_query):
    """Build the validation ``DataLoader`` that replays pre-defined episodes.

    Images are read from ``<valid_path>/val`` and labelled according to
    ``<valid_path>/val.csv``. Each data row of ``<valid_path>/val_testcase.csv``
    (first column skipped) lists the dataset indices forming one episode.

    Args:
        valid_path: Directory containing ``val/``, ``val.csv`` and
            ``val_testcase.csv``.
        valid_batch: Number of episodes to replay.
        n_workers: Worker process count handed to ``DataLoader``.
        valid_way: Classes per episode (forwarded to the sampler).
        valid_shot: Support samples per class.
        valid_query: Query samples per class.

    Returns:
        A ``DataLoader`` yielding one batch per replayed episode.
    """
    valid_data_dir = os.path.join(valid_path, 'val')
    valid_csv_path = os.path.join(valid_path, 'val.csv')
    valid_test_case_csv_path = os.path.join(valid_path, 'val_testcase.csv')

    with open(valid_test_case_csv_path, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        pos = [[int(x) for x in row[1:]] for row in reader if row]

    valid_paths, valid_labels = _read_image_index(valid_csv_path, valid_data_dir)

    valid_set = Mini_dataset(valid_paths, valid_labels, _build_image_transforms())
    valid_sampler = CategoriesSampler(
        valid_set.labels, valid_batch, valid_way, valid_shot + valid_query, 'val', pos
    )
    valid_loader = DataLoader(
        valid_set,
        batch_sampler=valid_sampler,
        num_workers=n_workers,
        pin_memory=True
    )

    return valid_loader

# Model

In [None]:
class Convnet(nn.Module):
    """Four-stage convolutional encoder that returns one flat vector per image.

    Every stage is Conv2d(3x3, padding 1) -> BatchNorm2d -> ReLU -> MaxPool2d(2),
    so each stage halves the spatial resolution.

    Args:
        in_channels: Channels of the input images.
        hid_channels: Output channels of the first three stages.
        out_channels: Output channels of the last stage.
    """

    def __init__(self, in_channels=3, hid_channels=64, out_channels=64):
        super().__init__()
        widths = [in_channels, hid_channels, hid_channels, hid_channels, out_channels]
        stages = [
            self._make_stage(n_in, n_out)
            for n_in, n_out in zip(widths[:-1], widths[1:])
        ]
        self.encoder = nn.Sequential(*stages)

    @staticmethod
    def _make_stage(n_in, n_out):
        """Build one conv/bn/relu/pool stage."""
        # The batch-norm layer is created and its scale re-drawn from U(0, 1)
        # before the convolution is constructed, so random numbers are
        # consumed in the same order on every construction.
        norm = nn.BatchNorm2d(n_out)
        nn.init.uniform_(norm.weight)
        conv = nn.Conv2d(n_in, n_out, 3, padding=1)
        return nn.Sequential(conv, norm, nn.ReLU(), nn.MaxPool2d(2))

    def forward(self, x):
        """Encode ``x`` and flatten everything but the batch dimension."""
        features = self.encoder(x)
        return features.view(features.size(0), -1)

# Distance Function

In [None]:
def euclidean_metric(a, b):
    """Return the negated squared Euclidean distance between all row pairs.

    Args:
        a: Tensor whose first dimension indexes ``n`` vectors.
        b: Tensor whose first dimension indexes ``m`` vectors.

    Returns:
        Tensor of shape ``(n, m)``; entry ``[i, j]`` is
        ``-sum((a[i] - b[j]) ** 2)``, so larger means closer.
    """
    rows, cols = a.shape[0], b.shape[0]
    lhs = torch.unsqueeze(a, 1).expand(rows, cols, -1)
    rhs = torch.unsqueeze(b, 0).expand(rows, cols, -1)
    squared_diff = (lhs - rhs) ** 2
    return -squared_diff.sum(dim=2)

def cosine_similarity(a, b):
    """Return the cosine similarity between all row pairs of ``a`` and ``b``.

    Args:
        a: Tensor whose first dimension indexes ``n`` vectors.
        b: Tensor whose first dimension indexes ``m`` vectors.

    Returns:
        Tensor of shape ``(n, m)``; entry ``[i, j]`` is the cosine similarity
        of ``a[i]`` and ``b[j]`` (computed with ``eps=1e-6``).
    """
    rows, cols = a.shape[0], b.shape[0]
    lhs = torch.unsqueeze(a, 1).expand(rows, cols, -1)
    rhs = torch.unsqueeze(b, 0).expand(rows, cols, -1)
    return nn.CosineSimilarity(dim=2, eps=1e-6)(lhs, rhs)

def distance_func(a, b, linear):
    """Score every row of ``a`` against every row of ``b`` after a learned projection.

    Both inputs are passed through ``linear`` and the pairwise dot products of
    the projected rows are returned.

    Args:
        a: Tensor of shape ``(n, d)``.
        b: Tensor of shape ``(m, d)``.
        linear: Callable (e.g. ``nn.Linear``) mapping ``(*, d)`` to ``(*, k)``.

    Returns:
        Tensor of shape ``(n, m)``.
    """
    a = linear(a)
    b = linear(b)
    # A matrix product yields the (n, m) score matrix for any projection
    # width k; the element-wise product used previously only broadcast to
    # that shape when k == 1 (where both forms give the same values).
    return a @ b.t()

# Training

In [None]:
def training(device, model, optimizer, train_loader, test_loader, start_epoch, n_epochs, shot, query, train_way, test_way, linear):
    """Train ``model`` episodically with cosine-similarity prototype classification.

    Every batch is split into ``shot * train_way`` support images followed by
    the query images. Class prototypes are the mean support embedding per
    class; queries are classified by cosine similarity to the prototypes and
    the cross-entropy of those scores is minimised. After each epoch the mean
    loss, mean accuracy and accuracy standard deviation are appended to
    ``./record.txt`` and printed.

    Args:
        device: Device the image batches are moved to.
        model: Embedding network being trained.
        optimizer: Optimizer stepped once per batch.
        train_loader: Iterable of ``(images, labels)`` batches; the labels are
            ignored because targets are derived from the episode layout.
        test_loader: Accepted for interface compatibility; not used here.
        start_epoch: Index of the first epoch to run.
        n_epochs: Epoch index at which training stops (exclusive).
        shot: Support samples per class.
        query: Query samples per class.
        train_way: Classes per training episode.
        test_way: Accepted for interface compatibility; not used here.
        linear: Accepted for interface compatibility; not used here.

    NOTE(review): no checkpoint is written by this function, although the
    driver cell restores './hw4_1.pt' with 'model', 'optim' and 'last_epoch'
    entries — confirm where saving is supposed to happen.
    """
    criterion = nn.CrossEntropyLoss()
    n_support = shot * train_way

    for epoch in range(start_epoch, n_epochs):
        model.train()

        epoch_losses, epoch_accs = [], []

        for imgs, _ in tqdm(train_loader):
            imgs = imgs.to(device)
            support, queries = imgs[:n_support], imgs[n_support:]

            # Support images are laid out sample-major, so reshaping to
            # (shot, way, -1) and averaging over dim 0 gives one prototype
            # per class.
            prototypes = model(support).reshape(shot, train_way, -1).mean(dim=0)

            # Query images repeat the class order 0..way-1 for each query slot.
            targets = torch.arange(train_way).repeat(query).long().to(device)

            logits = cosine_similarity(model(queries), prototypes)
            batch_loss = criterion(logits, targets)
            batch_acc = (logits.argmax(dim=-1) == targets).float().mean()

            epoch_losses.append(batch_loss.item())
            epoch_accs.append(batch_acc.item())

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        mean_loss = sum(epoch_losses) / len(epoch_losses)
        mean_acc = sum(epoch_accs) / len(epoch_accs)
        acc_std = np.std(np.array(epoch_accs), axis=0)

        report = f"[ Train | {epoch + 1:03d}/{n_epochs:03d} ] loss = {mean_loss:.5f}, acc = {mean_acc:.5f}, acc_std = {acc_std:.5f}\n"
        with open('./record.txt', 'a') as f:
            f.write(report)
        print(report)

# Testing

In [None]:
def test(device, model, valid_loader, shot, query, test_way):
    """Predict query labels for every validation episode and write ``./test.csv``.

    The weights stored under ``'model'`` in ``./hw4_1.pt`` are loaded into
    ``model`` first. For each batch the first ``shot * test_way`` images form
    the support set, the rest are queries; each query is assigned the class of
    the prototype with the highest cosine similarity.

    Args:
        device: Device the image batches are moved to.
        model: Embedding network; its weights are overwritten from the checkpoint.
        valid_loader: Iterable of ``(images, labels)`` batches, one per episode.
        shot: Support samples per class.
        query: Query samples per class.
        test_way: Classes per episode.

    The CSV has an ``episode_id`` column followed by one ``query<i>`` column
    per query image (``query * test_way`` columns).
    """
    ckpt = torch.load('./hw4_1.pt', map_location='cpu')
    model.load_state_dict(ckpt['model'])
    model.eval()

    n_support = shot * test_way
    preds = []

    with torch.no_grad():
        for imgs, _ in tqdm(valid_loader):
            imgs = imgs.to(device)
            data_shot, data_query = imgs[:n_support], imgs[n_support:]

            # Mean support embedding per class.
            proto = model(data_shot).reshape(shot, test_way, -1).mean(dim=0)

            logits = cosine_similarity(model(data_query), proto)
            preds.append(logits.argmax(dim=-1))

    # One header column per query image; derived from the episode settings
    # rather than hard-coded so other shot/way configurations stay aligned.
    header = ['episode_id'] + [f'query{i}' for i in range(query * test_way)]
    rows = [header] + [[i] + pred.cpu().tolist() for i, pred in enumerate(preds)]
    # newline='' lets the csv module control line endings itself.
    with open('./test.csv', 'w', newline='') as f:
        csv.writer(f, delimiter=',').writerows(rows)


In [None]:
# Driver cell: configure the run, build model/optimizer, optionally restore a
# checkpoint, build the data loaders and start training.
device = "cuda" if torch.cuda.is_available() else "cpu"
seed = 0
# Seed before the model is constructed so weight initialisation is reproducible.
fix_random_seeds(seed)

# Training episodes: train_batch episodes per epoch, train_way classes each.
train_path = './hw4_data/mini'
train_batch = 100
train_way = 30

# Validation episodes: test_batch pre-defined episodes, test_way classes each.
test_path = './hw4_data/mini'
test_batch = 600
test_way = 5

# Support (shot) and query samples per class, shared by both loaders.
shot = 1
query = 15
n_workers = 0
n_epochs = 30

model = Convnet().to(device)
# 1600 = 64 channels * 5 * 5, the flattened Convnet output for 84x84 inputs.
# The layer is handed to training(), which currently only references it in
# commented-out code, but its parameters are still registered with the optimizer.
linear = nn.Linear(1600, 1).to(device)
optimizer = torch.optim.Adam(list(model.parameters()) + list(linear.parameters()), lr=1e-3)

# Resume automatically whenever a checkpoint file is present.
load_model = os.path.exists('./hw4_1.pt')

start_epoch = 0
if load_model:
    with open('./record.txt', 'a') as f:
        f.write('Loading model...\n')
    # The checkpoint is expected to hold 'last_epoch', 'model' and 'optim' entries.
    ckpt = torch.load(f'./hw4_1.pt', map_location='cpu')
    start_epoch = ckpt['last_epoch'] + 1
    model.load_state_dict(ckpt['model'])
    optimizer.load_state_dict(ckpt['optim'])
    # scheduler.load_state_dict(ckpt['scheduler'])
else:
    # Fresh run: truncate (or create) the training log.
    with open('./record.txt', 'w') as f:
        f.write('')

train_loader = get_training_dataset(train_path, train_batch, n_workers, train_way, shot, query)
valid_loader = get_valid_dataset(test_path, test_batch, n_workers, test_way, shot, query)

# preds = test(device, model, valid_loader, shot, query, test_way)
training(device, model, optimizer, train_loader, valid_loader, start_epoch, n_epochs, shot, query, train_way, test_way, linear)

# test_fns, test_loader = get_testing_dataset(test_repo, batch_size, n_workers, image_size)
# if mode == 'train':
#     n_epochs = 20
#     train_loader = get_training_dataset('./hw3_data/p1_data/train', batch_size, n_workers, image_size)
#     training(device, model, optimizer, train_loader, test_loader, start_epoch, n_epochs)
# if mode == 'test':
#     testing(device, model, test_fns, test_loader, csv_path)

In [None]:
!wget https://www.dropbox.com/s/tslpzq6b1mpp8v1/hw4_1.pt?dl=0 -O hw4_1.pt

--2022-01-02 03:34:48--  https://www.dropbox.com/s/tslpzq6b1mpp8v1/hw4_1.pt?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.3.18, 2620:100:601b:18::a27d:812
Connecting to www.dropbox.com (www.dropbox.com)|162.125.3.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/tslpzq6b1mpp8v1/hw4_1.pt [following]
--2022-01-02 03:34:49--  https://www.dropbox.com/s/raw/tslpzq6b1mpp8v1/hw4_1.pt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucc81c4ecb0761b155a4cb302a6b.dl.dropboxusercontent.com/cd/0/inline/Bc8Fc1CLWLRtYSZQ25uD1xViCjHAT-ZM3ANliXyeACHEXJW251nqdQ7WL0_3JauOoza_KRgSE_W0aeqsdMQjPfnpxK3gJU3Ce4kIeIC0b8mVIy2Hmw1VPpmrXkR_sJzoch_7f73LhowrjQaiQgUpWYPH/file# [following]
--2022-01-02 03:34:49--  https://ucc81c4ecb0761b155a4cb302a6b.dl.dropboxusercontent.com/cd/0/inline/Bc8Fc1CLWLRtYSZQ25uD1xViCjHAT-ZM3ANliXyeACHEXJW251nqdQ7WL0_3JauOoza_KRgSE_W0aeqsdMQjPfnpxK3gJU3Ce

# Calculate Accuracy

In [None]:
%mv hw4_1_2_cos.pt hw4_1.pt

In [None]:
!python ./hw4_1_test.py --test_csv_path './hw4_data/mini/val.csv' --test_img_repo './hw4_data/mini/val' --testcase_csv './hw4_data/mini/val_testcase.csv' --output_csv './output.csv'

In [None]:
!git clone https://YDBX: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@github.com/DLCV-Fall-2021/hw4-YDBX.git
%cd hw4-YDBX

Cloning into 'hw4-YDBX'...
remote: Enumerating objects: 24, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 24 (delta 6), reused 20 (delta 3), pack-reused 0[K
Unpacking objects: 100% (24/24), done.
/content/hw4-YDBX


In [None]:
!bash get_dataset.sh

--2022-01-03 16:26:51--  https://docs.google.com/uc?export=download&confirm=&id=1gNFhiaidM26gzXJCw1GQxeoAIeuK9X4N
Resolving docs.google.com (docs.google.com)... 74.125.203.138, 74.125.203.100, 74.125.203.113, ...
Connecting to docs.google.com (docs.google.com)|74.125.203.138|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘hw4_data.zip’

hw4_data.zip            [ <=>                ]   3.07K  --.-KB/s    in 0s      

2022-01-03 16:26:51 (38.4 MB/s) - ‘hw4_data.zip’ saved [3139]

Archive:  ./hw4_data.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of ./hw4_data.zip or
        ./hw4_data.zip.zip, and cannot find ./hw4_data.zip.ZIP, period.


In [None]:
!bash hw4_download.sh

--2022-01-03 16:25:59--  https://www.dropbox.com/s/uf67z6k0s851qgx/hw4_1.pt?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.85.18, 2620:100:6031:18::a27d:5112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.85.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/uf67z6k0s851qgx/hw4_1.pt [following]
--2022-01-03 16:25:59--  https://www.dropbox.com/s/raw/uf67z6k0s851qgx/hw4_1.pt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucb76584aaf227a38252495ce8cc.dl.dropboxusercontent.com/cd/0/inline/BdFSHzXKagh3z28MCMF8duO2YepD00Bdvu8LAUZeiglUTbOxxf4XhkwmFpBbpwCe8vAnyR54_ECzyrnOYYnPjOud2QDcuNdGYpzxLfBSbJsQlee8yTtozU13vZQ9AGSwIIRAVm6mU6LCRhkwbR-qS83I/file# [following]
--2022-01-03 16:26:00--  https://ucb76584aaf227a38252495ce8cc.dl.dropboxusercontent.com/cd/0/inline/BdFSHzXKagh3z28MCMF8duO2YepD00Bdvu8LAUZeiglUTbOxxf4XhkwmFpBbpwCe8vAnyR54_ECzyrnOYYnPjOud2QDcuN

In [None]:
# Delete the training image folders and training CSVs of both datasets.
# NOTE(review): presumably done so the inference scripts run below are checked
# with validation data only — confirm.
!rm -r ./hw4_data/mini/train ./hw4_data/office/train
!rm ./hw4_data/mini/train.csv ./hw4_data/office/train.csv

In [None]:
!bash hw4_p1.sh './hw4_data/mini/val.csv' './hw4_data/mini/val' './hw4_data/mini/val_testcase.csv' './output.csv'

In [None]:
!bash hw4_p2.sh './hw4_data/office/val.csv' './hw4_data/office/val' './output2.csv'

In [None]:
!python eval.py './output.csv' './hw4_data/mini/val_testcase_gt.csv'

Accuracy: 42.48 +- 0.79 %


In [None]:
import csv
import numpy as np

# Predictions: every data row of output.csv without its first column.
with open('./output.csv', 'r', newline='') as f:
    test_a = [row[1:] for row in list(csv.reader(f))[1:]]

# Ground truth: same layout, read from the reference file.
with open('./hw4_data/mini/val_testcase_gt.csv', 'r', newline='') as f:
    test_b = [row[1:] for row in list(csv.reader(f))[1:]]

# Element-wise match, averaged per row, then summarised over all rows.
x = np.array(test_a) == np.array(test_b)
accs = np.mean(x, axis=1)
print(f'mean = {accs.mean():.5f}, std = {accs.std():.5f}')
# print((np.array(test_a) == np.array(test_b)))

mean = 0.42478, std = 0.09873


In [None]:
import csv

# Third column of every data row in the prediction file.
with open('./output2.csv', 'r', newline='') as f:
    test_a = [row[2] for row in list(csv.reader(f))[1:]]

# Third column of every data row in the reference file.
with open('./hw4_data/office/val.csv', 'r', newline='') as f:
    test_b = [row[2] for row in list(csv.reader(f))[1:]]

# Compare position by position, driven by the number of predictions.
total = len(test_a)
correct = sum(1 for i, predicted in enumerate(test_a) if predicted == test_b[i])
print(f'acc = {correct/total:.5f}')

acc = 0.37438
