# 패키지 설치

In [None]:
#@title
# !git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
# %cd Mecab-ko-for-Google-Colab
# !bash install_mecab-ko_on_colab190912.sh

!pip install konlpy transformers Korpora sentencepiece

Collecting konlpy
  Downloading konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 717 kB/s 
[?25hCollecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 51.6 MB/s 
[?25hCollecting Korpora
  Downloading Korpora-0.2.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 5.1 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 46.0 MB/s 
Collecting beautifulsoup4==4.6.0
  Downloading beautifulsoup4-4.6.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.2 MB/s 
[?25hCollecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 65.2 MB/s 
Collecting colorama
  Downloading colorama-0.4.

# 기본 패키지 로드

In [None]:
import os, sys
from google.colab import drive

from easydict import EasyDict

import math
import random
import time
import re

from tqdm.notebook import tqdm

import pandas as pd
import numpy as np

import torch
import torch.nn.functional as F
import torch.utils.data as tud
import torch.nn as nn
import torch.optim as optim
import gc


from Korpora import Korpora
from Korpora import ModuNewsKorpus
import transformers
from transformers import BertModel, DistilBertModel

import konlpy
from konlpy.tag import Kkma, Komoran, Hannanum, Okt
from konlpy.utils import pprint
from konlpy.tag import Mecab
from collections import Counter


from cProfile import Profile
from pstats import Stats

# torch.backends.cudnn.benchmark = True

# 기본 설정

In [None]:
# ?: https://stackoverflow.com/questions/60002128/failed-to-assign-a-dict-with-key-of-type-int-to-easydict
# Notebook-wide settings, grouped by topic and read as C.<group>.<option>.
C = EasyDict({
    'gdrive': {
        # Where Google Drive is mounted, and the project folder inside "My Drive".
        'mount_path': '/gdrive',
        'root': '/cs492i',
    },
    'corpus': {
        # Load the small news corpus instead of the full one.
        'use_smallset': True,
    },
    'dataset': {
        # Fraction of the samples assigned to the training split.
        'split_train': 0.8,
        # Mini-batch size used by the data loaders.
        'batch_size': 64,  # 64
    },
    'torch': {
        # Seed for torch's random generators (None = do not seed).
        'manual_seed': None,
        # Use the GPU whenever one is available.
        'use_gpu': True,
    },
})



# 설정 적용 및 잡일

In [None]:
# Seed torch's CPU and CUDA generators only when a seed is configured.
torch_seed = C.torch.manual_seed
if torch_seed is not None:
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(torch_seed)
    print('Using torch manual seed:', torch_seed)

# CUDA is chosen only when it is both available and enabled in the config.
cuda_enabled = torch.cuda.is_available() and C.torch.use_gpu
device = torch.device("cuda" if cuda_enabled else "cpu")
print('Using device:', device.type)

Using device: cuda


In [None]:
# Mount Google Drive, then build the path of the project folder under "My Drive".
drive.mount(C.gdrive.mount_path)
root = C.gdrive.mount_path + '/My Drive' + C.gdrive.root


Mounted at /gdrive


In [None]:
# Work from the project folder on Drive so the .py files stored there can be imported.
%cd $root
%pwd

/gdrive/My Drive/cs492i


'/gdrive/My Drive/cs492i'

# 드라이브에서 파일 임포트

In [None]:
# load .py files from google drive
from tokenization_kobert import KoBertTokenizer

# 유틸 정의

In [None]:
class NewsDateDataset(tud.Dataset):
    """News with Date dataset.

    Each item is a ``(paragraph, date)`` pair built from ``corpus.train``.
    """

    def __init__(self, corpus, news_transform=None, date_transform=None):
        """
        Args:
            corpus (Corpus): Korpora corpus with news object.
            news_transform (callable, optional): Optional transform to be applied
                on a sample's paragraph.
            date_transform (callable, optional): Optional transform to be applied
                on the ``(year, month)`` tuple parsed from a sample's date.
        """
        self.corpus = corpus
        self.news_transform = news_transform
        self.date_transform = date_transform

    def __len__(self):
        return len(self.corpus.train)

    def __getitem__(self, idx):
        """Return the (optionally transformed) paragraph and date of sample ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Always read from self.corpus so the dataset never depends on a
        # module-level ``corpus`` variable.
        article = self.corpus.train[idx]

        paragraph = article.paragraph
        if self.news_transform is not None:
            paragraph = self.news_transform(paragraph)

        # The date string starts with YYYYMM: characters 0-3 are the year, 4-5 the month.
        raw_date = article.date
        date = (int(raw_date[0:4]), int(raw_date[4:6]))
        if self.date_transform is not None:
            date = self.date_transform(date)

        return paragraph, date


In [None]:
# tokenizer = tk_hanbert.HanBertTokenizer.from_pretrained('HanBert-54kN-torch')
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
# Tokenizer cheat sheet (standard Hugging Face tokenizer methods; presumably
# inherited by KoBertTokenizer -- confirm against tokenization_kobert.py):
#   tokenizer.tokenize(text)                   -> list of tokens
#   tokenizer.convert_tokens_to_ids(tokens)    -> list of vocabulary ids
#   tokenizer.convert_tokens_to_string(tokens) -> detokenized text

# bert_model = BertModel.from_pretrained('monologg/kobert')

# Pretrained DistilKoBERT encoder; the SanctiMoly model uses it as its encoder.
distilbert_model = DistilBertModel.from_pretrained('monologg/distilkobert')


Downloading:   0%|          | 0.00/363k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/76.0k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.


Downloading:   0%|          | 0.00/441 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/108M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/distilkobert were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# One-hot encoding tensor: row i is the float32 indicator vector of class i (120 classes).

one_hot = F.one_hot(torch.arange(120), num_classes=120).float()
print(one_hot)

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]])


In [None]:
class Compose(object):
    """Chain several callables into one transform (copied from torchvision.transforms.Compose)."""

    def __init__(self, transforms):
        # Callables are applied in list order by __call__.
        self.transforms = transforms

    def __call__(self, img):
        result = img
        for transform in self.transforms:
            result = transform(result)
        return result

    def __repr__(self):
        # One indented line per transform, wrapped in "ClassName(" ... ")".
        body = ''.join('\n    {0}'.format(transform) for transform in self.transforms)
        return self.__class__.__name__ + '(' + body + '\n)'


# 엄청 느릴듯
class RegexSubstitution(object):
    """Transform that applies a regex replacement twice to a string, or to each string of a list."""

    def __init__(self, regex, sub=''):
        # Accept either a pre-compiled pattern or a pattern string.
        self.regex = regex if isinstance(regex, re.Pattern) else re.compile(regex)
        self.sub = sub

    def _substitute_twice(self, text):
        # The second pass catches matches that only appear after the first pass removed text.
        first_pass = self.regex.sub(self.sub, text)
        return self.regex.sub(self.sub, first_pass)

    def __call__(self, target):
        if not isinstance(target, list):
            return self._substitute_twice(target)
        return list(map(self._substitute_twice, target))

class SquashParagraph(object):
    """Merge a sequence of text lines into one newline-separated string."""

    def __call__(self, sample):
        separator = '\n'
        return separator.join(sample)

class ExcludeTitle(object):
    """Drop the first element of a sample (the title) and keep the rest."""

    def __call__(self, sample):
        body = sample[1:]
        return body

class PreprocessString(object):
    """Wrap a string as "[CLS] <text> [SEP]"."""

    def __call__(self, sample):
        prefix, suffix = "[CLS] ", " [SEP]"
        return prefix + sample + suffix

class PackToFloat(object):
    """Map a (year, month) pair to a float; dates inside the covered span land in [0, 1).

    Args:
        base_year: Year whose January maps to 0.0 (default 2009).
        n_months: Number of months in the covered span, used as the divisor
            (default 120, i.e. ten years).
    """

    def __init__(self, base_year=2009, n_months=120):
        self.base_year = base_year
        self.n_months = n_months

    def __call__(self, date):
        year, month = date
        # Months are 1-based in the input (1~12); shift to 0-based (0~11) before scaling.
        month_index = 12 * (year - self.base_year) + month - 1
        return month_index / self.n_months

class PackToInt(object):
    """Map a (year, month) pair to the number of months elapsed since January of the base year.

    Args:
        base_year: Year whose January maps to 0 (default 2009).
    """

    def __init__(self, base_year=2009):
        self.base_year = base_year

    def __call__(self, date):
        year, month = date
        # Months are 1-based in the input (1~12); shift to 0-based (0~11).
        return 12 * (year - self.base_year) + month - 1

class OneHotEnc(object):
    """Map a (year, month) pair to its row of the module-level ``one_hot`` table.

    Args:
        base_year: Year whose January maps to row 0 (default 2009).

    Raises:
        IndexError: If the date falls outside the rows of ``one_hot``.
    """

    def __init__(self, base_year=2009):
        self.base_year = base_year

    def __call__(self, date):
        year, month = date
        # Months are 1-based in the input (1~12); shift to 0-based (0~11).
        index = 12 * (year - self.base_year) + month - 1
        # A negative index would silently wrap around to the last rows, so reject it explicitly.
        if not 0 <= index < len(one_hot):
            raise IndexError('date %r is outside the one-hot table' % (date,))
        return one_hot[index]

def bert_tokenizer_old(sent, MAX_LEN=3000):
    """Encode texts one at a time with the module-level ``tokenizer``.

    Each text is encoded with special tokens, truncation enabled and
    ``padding='max_length'`` at ``max_length=MAX_LEN``.

    Returns:
        (input_ids, attention_masks, token_type_ids): three lists with one entry per text.
    """
    encoded = [
        tokenizer.encode_plus(
            text=text,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_attention_mask=True,
            truncation=True)
        for text in sent
    ]
    input_ids = [enc['input_ids'] for enc in encoded]
    attention_masks = [enc['attention_mask'] for enc in encoded]
    token_type_ids = [enc['token_type_ids'] for enc in encoded]
    return input_ids, attention_masks, token_type_ids

def bert_tokenizer(sent, MAX_LEN=512):
    """Batch-encode texts for BERT with the module-level ``tokenizer``.

    Args:
        sent: Batch of texts passed to ``tokenizer.batch_encode_plus``.
        MAX_LEN: ``max_length`` used for truncation and ``'max_length'`` padding.

    Returns:
        (input_ids, attention_mask, token_type_ids) as returned by the tokenizer.
    """
    encoded_dict = tokenizer.batch_encode_plus(
        batch_text_or_text_pairs=sent,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_attention_mask=True,
        truncation=True)
    input_ids = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']
    token_type_ids = encoded_dict['token_type_ids']

    # Return the local that was just read; the previous return named an undefined variable.
    return input_ids, attention_mask, token_type_ids

def distilbert_tokenizer(sent, MAX_LEN=512):
    """Batch-encode texts for DistilBERT with the module-level ``tokenizer``.

    Returns:
        (input_ids, attention_mask) as returned by the tokenizer; no token type ids.
    """
    encoding_options = dict(
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
    )
    encoded = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sent, **encoding_options)
    return encoded['input_ids'], encoded['attention_mask']

def text_pipeline(text):
    """Tokenize ``text`` with the module-level ``tokenizer`` and return the token ids."""
    tokens = tokenizer.tokenize(text)
    return tokenizer.convert_tokens_to_ids(tokens)

# https://androidkt.com/create-dataloader-with-collate_fn-for-variable-length-input-in-pytorch/
def collate_batch_old(batch):
    """Collate (text, label) pairs into padded token ids and int64 labels, both moved to ``device``.

    Returns:
        (text_ids, labels): ids are padded per batch with the tokenizer's pad token id.
    """
    samples = list(batch)
    labels = torch.tensor([label for _, label in samples], dtype=torch.int64)
    id_tensors = [torch.tensor(text_pipeline(text), dtype=torch.int64) for text, _ in samples]
    padded_ids = nn.utils.rnn.pad_sequence(
        id_tensors, batch_first=True, padding_value=tokenizer.pad_token_id)
    return padded_ids.to(device), labels.to(device)

def collate_batch_pack(batch):
    """Collate (text, label) pairs into BERT inputs plus float32 labels, all moved to ``device``.

    Returns:
        (input_ids, attention_mask, token_type_ids, labels)
    """
    samples = list(batch)
    texts = [text for text, _ in samples]
    labels = torch.as_tensor([label for _, label in samples], dtype=torch.float32)
    input_ids, attention_mask, token_type_ids = map(torch.as_tensor, bert_tokenizer(texts))
    return (input_ids.to(device), attention_mask.to(device),
            token_type_ids.to(device), labels.to(device))

def collate_batch_OH(batch):
    """Collate (text, label tensor) pairs into BERT inputs plus stacked labels, all moved to ``device``.

    Labels are combined with ``torch.stack``, so each one must already be a tensor.

    Returns:
        (input_ids, attention_mask, token_type_ids, label_tensor)
    """
    samples = list(batch)
    texts = [text for text, _ in samples]
    label_tensor = torch.stack([label for _, label in samples])
    input_ids, attention_mask, token_type_ids = map(torch.as_tensor, bert_tokenizer(texts))
    return (input_ids.to(device), attention_mask.to(device),
            token_type_ids.to(device), label_tensor.to(device))

def collate_batch(batch):
    """Collate (text, label) pairs into DistilBERT inputs and float32 labels.

    Nothing is moved to ``device`` here; the caller does that.

    Returns:
        (input_ids, attention_mask, labels)
    """
    samples = list(batch)
    texts = [text for text, _ in samples]
    labels = torch.as_tensor([label for _, label in samples], dtype=torch.float32)
    input_ids, attention_mask = map(torch.as_tensor, distilbert_tokenizer(texts))
    return input_ids, attention_mask, labels

# 데이터셋 로드

In [None]:
# Load the corpus; skipped when one is already in memory unless a reload is forced.
corpus_force_load = False
corpus_missing = ('corpus' not in globals()) or (corpus is None)
if corpus_missing or corpus_force_load:
    # Reset first so a failed load leaves ``corpus`` as None.
    corpus = None
    corpus_subdir = '/smallnews' if C.corpus.use_smallset else '/news'
    corpus = ModuNewsKorpus(root_dir=root + corpus_subdir, force_download=False, load_light=False)
else:
    print('Corpus already loaded')


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    모두의 말뭉치는 문화체육관광부 산하 국립국어원에서 제공하는 말뭉치로
    총 13 개의 말뭉치로 이뤄져 있습니다.

    해당 말뭉치를 이용하기 위해서는 국립국어원 홈페이지에 가셔서 "회원가입 > 말뭉치 신청 > 승인"의
    과정을 거치셔야 합니다.

    https://corpus.korean.go.kr/#none

    모두의 말뭉치는 승인 후 다운로드 가능 기간 및 횟수 (3회) 에 제한이 있습니다.

    로그인 기능 및 Korpora 패키지에서의 다운로드 기능을 제공하려 하였지만,
    국립국어원에서 위의 이유로 이에 대한 기능은 제공이 불가함을 확인하였습니다.

    Korpora==0.2.0 에서는 "개별 말뭉치 신청 > 승인"이 완료되었다고 가정,
    로컬에 다운로드 된 말뭉치를 손쉽게 로딩하는 기능만 제공합니다

    (Korpora 개발진 lovit@github, ratsgo@github)

    # License
    모두의 말뭉치의 모든 저작권은 `문화체육관광부 국립국어원
    (National Institute of Korean Language)` 에 귀속됩니다.

    소유권을 포함한 전문은 다음의 주소에서 확인하실 수 있습니다.

    https://corpus.korean.go.kr/boards/termsInfo.do

    제13조 (소유권)
    ① 누리집이 제공하는 서비스, 그에 필요한 소프트웨어, 이미지, 마크, 로고, 디자인

Loading ModuNews: 100%|██████████| 10/10 [00:25<00:00,  2.55s/it]


In [None]:
# Text cleanup: drop the title, join the lines, then strip parenthesised text and symbols.
text_cleanup = Compose([
    ExcludeTitle(),
    SquashParagraph(),
    RegexSubstitution(r'\([^()]+\)|[<>\'"△▲□■]'),
    # PreprocessString()
])
nds_dataset = NewsDateDataset(corpus, news_transform=text_cleanup, date_transform=PackToInt())

n_samples = len(nds_dataset)
print("Total news:", n_samples)

# The training split takes the configured fraction; validation gets the remainder.
n_train = math.floor(C.dataset.split_train * n_samples)
n_val = n_samples - n_train
print("Dataset split to (train, val) = (%d, %d)" % (n_train, n_val))

train_dataset, valid_dataset = tud.random_split(nds_dataset, [n_train, n_val])

# Both loaders share everything except shuffling.
loader_options = dict(
    batch_size=C.dataset.batch_size,
    drop_last=False,
    collate_fn=collate_batch,
)
train_loader = tud.DataLoader(train_dataset, shuffle=True, **loader_options)
valid_loader = tud.DataLoader(valid_dataset, shuffle=False, **loader_options)


Total news: 115080
Dataset split to (train, val) = (92064, 23016)


In [None]:
# TODO: sorting batch?
# 2009~2018 : (12*(year-2009)+month)/12*10
# max_len = -1
# for i_batch, sample_batched in tqdm(enumerate(train_loader)):
#     # print(i_batch, sample_batched)
#     max_len = sample_batched[0].size(0) if max_len < sample_batched[0].size(0) else max_len
#     print(sample_batched[3])
# print(max_len)

In [None]:
# Scratch check: a product requires grad as soon as one operand does.
# The tensor is named ``inputs`` so the built-in input() is not shadowed in later cells.
inputs = torch.tensor([0.0, 1.0, 2.0], requires_grad=True)
w = torch.tensor([[1.0], [1.0], [1.0]], requires_grad=False)
(w*inputs).requires_grad

True

# 모델

In [None]:
class SanctiMoly(nn.Module):
    """DistilKoBERT encoder followed by an MLP head that outputs one scalar per input."""

    def __init__(self, freeze_bert = True):
        super().__init__()
        # Reuses the module-level pretrained encoder rather than loading another copy.
        self.encoder = distilbert_model
        # Head: two FC -> BatchNorm -> Tanh -> Dropout stages, then a projection to one value.
        head_layers = [
            nn.Linear(768, 1024), nn.BatchNorm1d(1024), nn.Tanh(), nn.Dropout(),
            nn.Linear(1024, 768), nn.BatchNorm1d(768), nn.Tanh(), nn.Dropout(),
            nn.Linear(768, 1),
        ]
        self.linear = nn.Sequential(*head_layers)

        # Encoder weights are trainable unless freeze_bert compares equal to True.
        encoder_trainable = not (freeze_bert == True)
        for param in self.encoder.parameters():
            param.requires_grad = encoder_trainable

    def forward(self, input_ids, attention_mask):  # token_type_ids are not needed
        hidden_states = self.encoder(input_ids, attention_mask).last_hidden_state
        # Only the hidden state at position 0 (presumably the [CLS] token) feeds the head.
        first_token_state = hidden_states[:, 0, :]
        # print(output.shape)
        return self.linear(first_token_state).squeeze(1)

# nn.Seq?

In [None]:
# Build the model with the encoder frozen and move it to the selected device.
model = SanctiMoly(freeze_bert=True).to(device)


# for param in model.parameters():
#     print(param.requires_grad)

# print(model.named_parameters)
# for name, param in model.named_parameters():
#     if name in ['linear.0.weight', 'linear.0.bias', 'linear.3.weight', 'linear.3.bias', 'linear.6.weight', 'linear.6.bias']:
#         param.requires_grad = True
#         print(name)
#     else:
#         param.requires_grad = False

# bert_model.to(device)
# print(next(bert_model.parameters()).device)

In [None]:
print(torch.cuda.memory_allocated()/1024/1024)
!nvidia-smi

113.16259765625
Fri Dec  3 22:31:01 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    32W / 250W |   1247MiB / 16280MiB |      7%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------

# 학습

In [None]:
# Loss function read by run_epoch; run_experiment assigns the actual criterion.
criterion = None

def save_model(model, mode="last"):
    """Write the model's state_dict to ``<root>/Result/<ClassName>_<mode>.ckpt``.

    The Result directory is created when it does not exist yet.
    """
    result_dir = root + '/Result'
    os.makedirs(result_dir, exist_ok=True)
    ckpt_path = result_dir + f'/{type(model).__name__}_{mode}.ckpt'
    torch.save(model.state_dict(), ckpt_path)
    
def run_epoch(epoch, model, optimizer, is_train=True, dataloader=None):
    """Run one pass over a dataloader, training or evaluating ``model``.

    Args:
        epoch: Epoch number, used only in the log lines.
        model: Called as ``model(input_ids, attention_mask)``.
        optimizer: Stepped after every batch when ``is_train``; unused otherwise.
        is_train: True to train, False to evaluate without gradients.
        dataloader: Yields ``(input_ids, attention_mask, labels)`` batches. Defaults to
            the module-level ``train_loader`` / ``valid_loader`` according to ``is_train``.

    Returns:
        Mean per-sample loss over the whole epoch.
    """
    total_loss = 0.0
    n_total = 0
    n_correct = 0
    # batch_size = C.dataset.batch_size
    if dataloader is None:
        dataloader = train_loader if is_train else valid_loader
    if is_train:
        model.train()
    else:
        model.eval()

    # With reduction='mean' the criterion already returns the batch average, so it has to be
    # multiplied by the batch size before being summed and divided by the sample count.
    loss_is_batch_mean = getattr(criterion, 'reduction', 'mean') == 'mean'

    for i_batch, sample_batched in enumerate(tqdm(dataloader)):
        s_a, s_b, sampled_label = sample_batched

        if is_train:
            pred = model(s_a.to(device), s_b.to(device))
        else:
            with torch.no_grad():
                pred = model(s_a.to(device), s_b.to(device))
        # print(pred, sampled_label)
        loss = criterion(pred, sampled_label.to(device))

        n_targets = sampled_label.size(0)
        n_total += n_targets
        batch_loss = float(loss)
        batch_loss_sum = batch_loss * n_targets if loss_is_batch_mean else batch_loss
        total_loss += batch_loss_sum

        # One-hot targets would compare argmax instead:
        # n_correct += (pred.argmax(-1) == sampled_label.argmax(-1)).long().sum().item()

        # Integer targets: a prediction is correct when its floor equals the label's floor.
        # The comparison runs on detached CPU copies.
        b_correct = (torch.floor(pred.detach().cpu()) == torch.floor(sampled_label.cpu())).long().sum().item()
        n_correct += b_correct

        if i_batch % 50 == 0:
            # print(torch.cuda.memory_allocated()/1024/1024, torch.cuda.memory_reserved()/1024/1024)
            print(pred)
            print('[Batch {}]'.format(i_batch), 'Train' if is_train else 'Valid',
                  "Loss", batch_loss_sum / n_targets, "Accuracy", b_correct * 100 / n_targets, "%")
        if is_train:
            optimizer.zero_grad()
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            optimizer.step()
    total_loss /= n_total
    accu = n_correct * 100 / n_total
    print("Epoch", epoch, 'Train' if is_train else 'Valid',
          "Loss", total_loss, "Accuracy", accu, "%")
    return total_loss

def run_experiment(model, n_epochs=5, lr=0.001):
    """Train ``model`` with MSE loss, validating and checkpointing after every epoch.

    Sets the module-level ``criterion`` used by ``run_epoch``. After each epoch the model is
    saved as 'last', and additionally as 'best' when the validation loss improved.

    Args:
        model: Model to train; only parameters with ``requires_grad`` are optimized.
        n_epochs: Number of train/validate epochs (default 5).
        lr: Initial AdamW learning rate (default 0.001).
    """
    global criterion
    criterion = nn.MSELoss(reduction='mean')
    # criterion = nn.CrossEntropyLoss().cuda()

    # Only hand trainable parameters to the optimizer (matters when the encoder is frozen).
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.AdamW(trainable_params, lr=lr)  # tried: 0.001, 0.1, 3e-5

    # Cut the learning rate to a quarter when validation loss stalls for more than one epoch.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.25, patience=1, threshold=0.0001,
        threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08)
    best_val_loss = np.inf

    for epoch in tqdm(range(n_epochs)):
        run_epoch(epoch, model, optimizer, is_train=True)
        with torch.no_grad():
            val_loss = run_epoch(epoch, model, None, is_train=False)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            save_model(model, 'best')
        save_model(model)

        scheduler.step(val_loss)
    

In [None]:
run_experiment(model)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1439 [00:00<?, ?it/s]

tensor([-0.7262, -0.6024, -0.9827,  0.0342,  0.2585, -0.5359,  0.1630, -0.0339,
         0.7568, -0.1546, -0.8336, -0.0620,  0.0986,  0.8214, -0.2183, -0.2985,
        -0.9270,  0.2598,  0.6329,  0.4137,  0.4886,  0.1104,  0.8013,  0.6984,
        -0.0197, -0.3545, -0.3753,  0.5932, -1.2671, -0.4536, -0.4945,  0.1947,
         0.0682,  1.3892,  0.0442,  0.6519, -0.0939,  0.0077,  0.0215,  0.6234,
        -0.0977,  0.3124,  0.4915,  0.1799, -0.1723,  0.2686,  0.1583, -0.2600,
         0.2418, -0.5476, -1.1552,  0.0974,  0.1916,  0.4026, -0.9003,  0.3837,
        -1.3666,  0.5019, -0.4247,  0.0522, -0.5103, -0.2290,  0.0496, -0.2638],
       device='cuda:0', grad_fn=<SqueezeBackward1>)
[Batch 0] Train Loss 60.73143768310547 Accuracy 0.0 %
tensor([  6.0514,   3.8644,   4.0467,   9.8643,  -2.4395, -14.8514,  16.5585,
         -2.7472,  20.4860,  18.9301,  10.2519,   2.4374,  -5.1630,  -6.7843,
         -8.6636,  22.6365, -10.7223, -13.5925,   2.2345,   2.2838,  17.6940,
         -5.6008,  

  0%|          | 0/360 [00:00<?, ?it/s]

tensor([73.1940, 62.3465, 58.3217, 55.7579, 61.4262, 40.5101, 73.2608, 72.3897,
        56.4422, 71.9347, 55.7099, 64.1019, 51.1325, 50.3618, 47.8634, 65.1404,
        34.7859, 72.5275, 44.7751, 54.8573, 60.9784, 71.8854, 30.7948, 50.9403,
        73.4920, 81.3428, 55.4480, 67.2916, 73.8446, 58.5088, 69.1577, 56.1525,
        39.1646, 71.9575, 34.7356, 42.9461, 45.2502, 57.4877, 55.5973, 37.4089,
        47.7574, 72.9824, 65.1288, 33.6258, 58.0889, 50.1217, 39.3458, 71.6141,
        62.7232, 62.8200, 68.7806, 57.1760, 59.4653, 58.0892, 53.9300, 34.6613,
        71.0461, 73.0729, 62.8883, 82.3876, 48.8666, 72.1968, 42.2977, 39.7101],
       device='cuda:0')
[Batch 0] Valid Loss 15.737303733825684 Accuracy 0.0 %
tensor([44.6912, 62.0719, 13.3178, 74.7311, 65.3854, 48.9387, 75.8470, 60.8989,
        46.2722, 66.6509, 70.0561, 43.0727, 63.1746, 71.2756, 85.1599, 62.4997,
        66.0820, 51.4846, 54.5053, 42.4820, 58.2512, 39.0621, 65.4723, 44.2929,
        90.0711, 58.5235, 61.5928, 42.47

  0%|          | 0/1439 [00:00<?, ?it/s]

tensor([62.7648, 34.1523, 67.3523, 53.4739, 69.2859, 63.0716, 70.0416, 51.0552,
        46.3622, 46.0490, 51.9588, 80.7219, 38.2259, 53.4674, 51.0002, 50.1829,
        55.2945, 49.9367, 55.6702, 43.5570, 55.8363, 87.4597, 82.4013, 62.2593,
        66.6291, 47.8063, 52.8591, 52.8466, 53.3615, 85.8031, 85.8839, 48.5368,
        50.1352, 80.6071, 77.0904, 65.2311, 41.3527, 52.5937, 58.1121, 82.9632,
        53.5217, 48.5881, 29.8102, 55.0204, 67.9978, 15.8121, 60.2901, 69.6395,
        64.0024, 40.6783, 53.7291, 54.7458, 43.9188, 43.0276, 73.9257, 78.0725,
        59.7220, 52.5177, 54.1183, 32.5588, 61.2875, 64.4772, 44.3022, 70.9337],
       device='cuda:0', grad_fn=<SqueezeBackward1>)
[Batch 0] Train Loss 14.743603706359863 Accuracy 0.0 %
tensor([43.5096, 53.1674, 77.1827, 52.9207, 51.4942, 50.0997, 68.7465, 42.1283,
        59.2138, 73.5373, 75.7391, 64.3572, 82.3585, 46.3438, 55.0179, 41.7234,
        28.2962, 81.5401, 38.8296, 51.7344, 71.2228, 55.6207, 55.1245, 73.7123,
        68.8

  0%|          | 0/360 [00:00<?, ?it/s]

tensor([73.6567, 63.1385, 60.0810, 61.0233, 58.1808, 42.8284, 72.7631, 66.7377,
        54.2985, 70.1795, 57.1016, 58.7035, 49.7481, 54.1470, 52.9260, 71.0008,
        35.9869, 71.5901, 40.7724, 59.7157, 66.1117, 71.2196, 24.9733, 50.5001,
        75.4374, 83.1469, 52.0776, 74.2188, 76.6983, 59.3464, 70.7788, 57.1236,
        38.0692, 70.0611, 50.3141, 60.5984, 40.7786, 62.9707, 46.0943, 47.8617,
        55.4399, 66.0699, 66.4622, 41.7455, 58.6340, 54.6303, 32.9603, 77.6645,
        64.4908, 66.9133, 57.4329, 61.5582, 66.8237, 62.1357, 59.7123, 36.2810,
        64.6375, 79.6408, 57.4226, 88.5313, 49.5187, 85.9912, 38.5271, 36.7809],
       device='cuda:0')
[Batch 0] Valid Loss 15.084159851074219 Accuracy 0.0 %
tensor([53.6548, 60.7159, 14.7494, 79.6985, 60.6578, 54.8207, 68.5435, 66.0797,
        43.0201, 61.3024, 75.6241, 44.2209, 54.0964, 63.1225, 90.7691, 68.7604,
        56.3704, 56.9975, 63.1610, 47.5318, 67.4243, 39.0787, 57.1280, 41.7111,
        87.6151, 54.1565, 69.9413, 42.56

  0%|          | 0/1439 [00:00<?, ?it/s]

tensor([ 57.7677,  70.6541,  39.1469, 103.7065,  42.9514,  52.8247,  56.7868,
         45.1488,  35.9217,  82.7452,  59.8632,  65.9913,  49.3757,  45.0473,
         37.2486,  56.2504,  82.0790,  44.7727,  44.0913,  34.6482,  67.5828,
         51.2763,  82.0395,  73.0627,  66.0153,  60.2681,  59.9258,  41.7584,
         88.6263,  36.4976,  48.8296,  57.9965,  60.5884,  57.8958,  33.3352,
         60.3701,  51.3137,  81.8886, 103.5171,  67.2640,  57.4826,  51.6712,
         40.4620,  54.1666,  57.0181,  50.7883,  65.7972,  30.0891,  69.4072,
         46.1420,  40.8655,  61.4527,  33.2297,  82.0989,  40.2072,  68.6185,
         84.9554,  48.5216,  17.0822,  38.9512,  50.2095,  25.4469,  50.3252,
         76.2317], device='cuda:0', grad_fn=<SqueezeBackward1>)
[Batch 0] Train Loss 14.566547393798828 Accuracy 3.125 %
tensor([ 61.1155, 103.1770,  41.6034,  75.3948,  76.4493,  69.2756,  58.5398,
         65.8022,  70.5055,  90.0420,  52.1780,  73.7409,  60.8013,  49.1235,
         70.3137,  64

  0%|          | 0/360 [00:00<?, ?it/s]

tensor([70.5367, 62.8812, 62.3386, 64.2156, 50.7801, 41.6033, 73.3062, 69.9892,
        48.2357, 66.2705, 57.0443, 68.0102, 50.4835, 49.5391, 50.1847, 71.1590,
        35.0267, 64.0477, 44.3555, 62.7697, 64.6342, 80.4126, 20.6569, 59.5814,
        78.2552, 83.8934, 54.3799, 73.2410, 81.1600, 68.4682, 67.7336, 55.5481,
        40.0510, 73.7340, 33.7773, 54.1290, 39.9997, 59.3805, 46.3150, 38.8874,
        49.7612, 68.7345, 57.8572, 41.7358, 57.0837, 46.7635, 31.5151, 75.6096,
        67.6103, 62.6496, 64.8046, 53.7466, 73.7178, 58.0384, 64.0275, 33.6830,
        67.3794, 78.0580, 57.8801, 87.9211, 47.2459, 71.7215, 39.9521, 43.5180],
       device='cuda:0')
[Batch 0] Valid Loss 13.80466365814209 Accuracy 0.0 %
tensor([55.1958, 68.1200,  9.5915, 77.5461, 58.8804, 44.7636, 77.7640, 65.9561,
        43.7438, 64.3414, 67.1322, 37.9984, 56.0573, 58.9696, 84.9204, 72.3083,
        65.9935, 53.5164, 63.2568, 47.9441, 62.4489, 37.0340, 63.0880, 45.0103,
        81.5130, 58.8791, 66.8306, 36.244

  0%|          | 0/1439 [00:00<?, ?it/s]

tensor([24.0419, 40.5579, 78.0627, 67.4966, 69.2211, 86.0461, 26.8010, 46.9198,
        60.9890, 73.6407, 48.5236, 21.5618, 61.1106, 57.7559, 24.2441, 70.5301,
        65.5280, 89.3440, 55.6854, 67.9464, 59.1557, 23.1422, 60.9971, 43.4515,
        59.7345, 40.9769, 69.6317, 69.7489, 51.3404, 63.8426, 61.6743, 54.8267,
        73.8894, 84.3151, 61.8216, 74.4258, 54.4623, 49.5043, 43.6075, 37.9170,
        93.9979, 68.2072, 56.1577, 55.9991, 38.3404, 76.8201, 73.5438, 39.1528,
        49.8350, 63.1712, 66.2758, 75.2305, 74.0290, 53.8819, 41.4414, 39.8484,
        64.1206, 43.5239, 38.1874, 46.7263, 62.8513, 25.1191, 72.1893, 34.0337],
       device='cuda:0', grad_fn=<SqueezeBackward1>)
[Batch 0] Train Loss 13.29916000366211 Accuracy 3.125 %
tensor([66.3137, 49.7579, 47.9476, 78.1624, 59.3836, 48.8655, 59.4184, 24.5203,
        68.1720, 35.0650, 35.7815, 72.3857, 64.0141, 31.4428, 62.7109, 24.5617,
        42.5268, 34.6463, 47.0667, 60.6190, 53.5555, 71.3751, 64.9861, 77.1541,
        55.

In [None]:
print(torch.cuda.memory_allocated()/1024/1024)
!nvidia-smi

In [None]:
# Put the model in evaluation mode before inspecting its predictions.
model.eval()

def i2ym(fl):
    """Convert a month index (0 = January 2009) to an integer ``(year, month)`` pair.

    Fractional indices, such as raw model predictions, are floored to the month they fall
    in. Non-finite values (nan/inf) cannot be floored and are returned as ``(fl, fl)``.
    """
    import math  # local import keeps this helper self-contained

    if isinstance(fl, float) and not math.isfinite(fl):
        return (fl, fl)
    years_since_base, month_zero_based = divmod(math.floor(fl), 12)
    return (years_since_base + 2009, month_zero_based + 1)

# Print label vs. prediction for the first validation batch only.
with torch.no_grad():
    for batch in enumerate(tqdm(valid_loader)):
        i_batch, sample_batched = batch
        s_a, s_b, sampled_label = sample_batched

        pred = model(s_a.to(device), s_b.to(device))

        # Iterate over the samples actually present: a batch can be shorter than
        # C.dataset.batch_size, and indexing past its end would raise IndexError.
        expected_values = sampled_label.tolist()
        predicted_values = pred.tolist()
        n_samples = len(expected_values)

        total = 0
        count = 0
        for i, (expected, got) in enumerate(zip(expected_values, predicted_values)):
            l1_dist = abs(expected - got)
            print(i, ": expected", i2ym(expected), "| got", i2ym(got), "L1 dist", l1_dist)
            total += l1_dist
            # Count predictions that are at most 6 months away from the label.
            if l1_dist <= 6:
                count += 1
        # Share of predictions within 6 months (percent) and the mean absolute error in months.
        print(count*100/n_samples, total/n_samples)
        break
