In [None]:
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
import pandas as pd
from tqdm import tqdm
from transformers import logging
logging.set_verbosity_error()
from transformers import BertModel, BertTokenizer, BertForMaskedLM, pipeline
from sentence_transformers import SentenceTransformer, util

# Environment sanity check: CUDA visibility first, then the installed torch version.
print(torch.cuda.is_available(), torch.__version__, sep="\n")

import os
from pathlib import Path

# Switch the working directory to the one named by the MASTER environment
# variable, so relative paths used further down (e.g. the checkpoint under
# "save/") resolve against it. A missing variable raises KeyError here.
working_dir = Path(os.environ["MASTER"])
os.chdir(working_dir)

In [None]:
# Example sentence used to inspect the word-piece tokenisation.
s = "Das bedeutet: Er hat sehr viele schlimme Verletzungen."

# Hugging Face checkpoint id. Kept under its own name so that re-running this
# cell never overwrites the `model` object that holds the TVAE network.
model_name = "deepset/gbert-base"
tokenizer = BertTokenizer.from_pretrained(model_name)

# encode() returns vocabulary ids and, by default, adds the special tokens
# ([CLS]/[SEP] for BERT); tokenize() returns only the word-piece strings.
# The two printed sequences are therefore not the same length.
encoding = tokenizer.encode(s)
tokens = tokenizer.tokenize(s)
print(encoding)
print(tokens)

In [None]:
from utils.datasets import DatasetWordPiece
from models.tvae_trainer import TVAETrainer
from models.tvae_model import TVAE
import torch

# Dataset and trainer are only needed here for their helpers: `dataset.encode`
# turns the sentence into ids and `trainer.process_batch_data` builds the
# model inputs and masks.
dataset = DatasetWordPiece(large=False, max_length=128)
model = TVAE(ntoken=dataset.vocab_size)
trainer = TVAETrainer(dataset=dataset, model=model)

# Optional debugging aid: report allocated GPU memory before/after moving the
# model to the device (bytes / 1024**2 is MiB).
# def f(s):
#     mem = torch.cuda.memory_allocated()
#     print(f"{s}: {mem/(1024**2)} MiB ({mem} bytes)")
# f("Before")
# model.to("cuda")
# f("After model")

s = "Das bedeutet: Er hat sehr viele schlimme Verletzungen."
# Relative path: resolved against the current working directory.
path_model = Path(
    "save/2023-01-18_Pin_down_Lr/checkpoints/2023-01-20_20:44:29_TVAE_German/2023-01-20_20:44:29_TVAE_RegTrue/model.pt")

# NOTE(review): `model` is rebound to a fresh instance here, so `trainer`
# still holds the untrained instance created above. Only
# `trainer.process_batch_data` is used below, which looks intentional - confirm.
model = TVAE(ntoken=dataset.vocab_size)
# Load the weights onto the CPU first; the whole module is moved to the GPU in
# one step afterwards instead of materialising the checkpoint on the device it
# was saved from.
model.load_state_dict(torch.load(path_model, map_location="cpu"))
model.cuda()
# NOTE(review): the model is left in its default (training) mode, exactly as
# before. If deterministic outputs are wanted, consider `model.eval()` -
# confirm against the TVAE implementation first.

# Build a batch of one sentence. Token ids are created directly as int64
# rather than going through a float32 tensor.
t = torch.as_tensor(dataset.encode(s), dtype=torch.long).reshape(1, -1)
# Label tensor expected alongside the ids; the meaning of the value 1 is not
# visible from this notebook - TODO confirm.
lbl = torch.IntTensor([[1]])
batch = trainer.process_batch_data((t, lbl))

# Name the positional entries of the processed batch.
# Assumes process_batch_data returns them in exactly this order and on the
# same device as the model - TODO confirm.
batch_keys = ["src", "tgt", "tgt_true", "tgt_mask", "memory_mask",
              "src_key_padding_mask", "tgt_key_padding_mask", "labels"]
d = {key: batch[idx] for idx, key in enumerate(batch_keys)}

# Pure inference: no autograd graph is needed for a single forward pass.
with torch.no_grad():
    output = model(
        src=d["src"],
        tgt=d["tgt"],
        tgt_mask=d["tgt_mask"],
        memory_mask=d["memory_mask"],
        src_key_padding_mask=d["src_key_padding_mask"],
        tgt_key_padding_mask=d["tgt_key_padding_mask"]
    )

# The first element of the model output is arg-maxed over its last dimension
# to obtain one predicted id per position; the first (only) batch entry is
# converted to a plain list of Python ints.
prob = output[0]
out_tokens = torch.argmax(prob, dim=-1)[0].tolist()


def acc(weights, targets):
    """Accuracy of the arg-max predictions over all non-padding targets.

    Args:
        weights: Tensor of scores; the last dimension is reduced with
            ``argmax`` to obtain the predicted ids. After that reduction it
            must be broadcast-compatible with ``targets``.
        targets: Tensor of reference ids. Entries below 0.5 (the [PAD]
            label, id 0) are excluded from the calculation.

    Returns:
        A 0-dim tensor: correctly predicted positions divided by the number
        of positions that were counted. There is no guard for the case where
        every target is padding (the divisor is then 0).
    """
    predicted_ids = weights.argmax(dim=-1)
    # Positions that hold a real token, i.e. anything except [PAD] (== 0).
    keep = targets >= 0.5
    kept_targets = torch.masked_select(targets, keep)
    kept_predictions = torch.masked_select(predicted_ids, keep)
    n_correct = (kept_targets == kept_predictions).sum()
    return n_correct / kept_targets.numel()

# print(dataset.tokenizer.decode(out_tokens))
# print(acc(prob, d["tgt_true"]))
# print(list(d["tgt_true"][0].cpu().numpy()))
# print(out_tokens)
