In [1]:
import json
import logging
import os
import shutil
from pathlib import Path
from pymongo import MongoClient
import pandas as pd
# from megatron.data.dataset_utils import get_indexed_dataset_
import sys
sys.path.append('..')
from torch.nn import CrossEntropyLoss

import torch
import horovod.torch as hvd
from dotenv import load_dotenv
import torch
import numpy as np
from torch.utils.data import DataLoader, DistributedSampler, RandomSampler
import datasets
from huggingface_hub import hf_hub_download
from sklearn.metrics import f1_score, accuracy_score

from lm_experiments_tools import TrainerArgs
from lm_experiments_tools.trainer import Trainer
from torch.utils.data import Dataset, DataLoader

In [2]:
model_path = "../runs/t5-base/ilpc-large/lr5e-06_constant_with_warmup_adamw_wd1e-02_512-512_bs128_iters150000_pretrained_2sep_enum_nodesc/run_1/"

In [3]:
from transformers import T5ForConditionalGeneration, AutoModel

In [5]:
model = T5ForConditionalGeneration.from_pretrained('t5-small')



In [6]:
# cpt = torch.load(model_path + 'model_best.pth', map_location='cpu')
# model.load_state_dict(cpt['model_state_dict'])

In [7]:
# path = '/home/bulatov/bulatov/datasets/ilpc22/small-valid.csv'
# path = '/home/bulatov/bulatov/datasets/ilpc22/large_2sep/large_test.csv'
# valid_path = '/home/bulatov/bulatov/datasets/ilpc22/large_2sep_enum/large_valid.csv'
# test_path = '/home/bulatov/bulatov/datasets/ilpc22/large_2sep_enum/large_test.csv'

valid_path = '/home/bulatov/bulatov/datasets/ilpc22/small_2sep_enum/small_valid.csv'
test_path = '/home/bulatov/bulatov/datasets/ilpc22/small_2sep_enum/small_test.csv'

In [8]:
class KGLMLocalDataset(Dataset):
    """Map-style dataset over a CSV file of verbalized knowledge-graph triplets.

    Every row is expected to provide the columns ``verbalization`` (input text),
    ``verbalized_tail`` (target text) and ``tail`` (id of the target entity).
    """

    def __init__(self, path, neighborhood=True, description=True, sep='[SEP]', sep2=' [SEP-2] '):
        """Load the CSV at ``path`` and remember how the texts should be trimmed.

        Args:
            path: location of the CSV file, passed to ``pd.read_csv``.
            neighborhood: when False, inputs are cut by ``drop_neighborhood``.
            description: when False, targets are cut by ``drop_description``.
            sep: separator used to split the input text.
            sep2: separator used to split the target text.
        """
        self.df = pd.read_csv(path)
        self.neighborhood, self.description = neighborhood, description
        self.sep, self.sep2 = sep, sep2

    def __len__(self):
        """Return the number of rows in the loaded table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return a dict with keys ``input``, ``outputs`` and ``output_id`` for row ``idx``."""
        row = self.df.iloc[idx]

        source_text = row.verbalization
        if not self.neighborhood:
            source_text = self.drop_neighborhood(source_text)

        target_text = row['verbalized_tail']
        if not self.description:
            target_text = self.drop_description(target_text)

        return {"input": source_text, "outputs": target_text, "output_id": row["tail"]}

    def drop_neighborhood(self, text):
        """Keep only the first two ``sep``-delimited pieces of ``text``, re-terminated with ``sep``."""
        leading_pieces = text.split(self.sep, 2)[:2]
        return self.sep.join(leading_pieces) + self.sep

    def drop_description(self, text):
        """Return the part of ``text`` that precedes the first ``sep2``."""
        return text.split(self.sep2, 1)[0]

In [9]:
# train_dataset = KGLMLocalDataset(path, neighborhood=True, description=False)

In [10]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('t5-small')
# Register the separator markers immediately, before anything is tokenized.
# The batches built by collate_fn and the later `tokenizer.encode('[SEP-2]')[0]`
# lookup both rely on '[SEP-2]' being a single token id; registering the tokens
# only after the first batch has been drawn makes the result depend on leftover
# kernel state. Calling add_tokens again later is harmless (it then adds 0 tokens).
num_added_tokens = tokenizer.add_tokens(['[SEP]', '[SEP-2]'])

In [11]:
class Holder:
    """Empty attribute bag; this notebook instantiates it as ``args`` and attaches settings to it."""

    def __init__(self):
        """Create the holder without setting any attributes."""

In [12]:
# Settings object read by collate_fn (and mutated again by the dataloader cell).
args = Holder()
# Maximum number of target tokens; collate_fn passes it as max_length for the labels
# and also pre-cuts target strings to 10x this many characters.
args.target_seq_len = 512
# Maximum number of input tokens; used the same way for the input strings.
args.input_seq_len = 512
# Text prepended to every input by collate_fn; the empty string disables the prefix.
args.input_prefix = ''

In [13]:
# NOTE(review): not read by any code visible in this notebook — presumably kept for
# parity with the training script; confirm before removing.
global_attention_first_token = False  # should be True for LED
# Extra keyword arguments forwarded to tokenizer.batch_encode_plus inside collate_fn.
encode_plus_kwargs = {'truncation': True, 'padding': 'longest', 'pad_to_multiple_of': 1}
# NOTE(review): not used by the visible cells (model.generate is called without it).
generate_kwargs = {}

def collate_fn(batch):
    """Tokenize a list of dataset items into one padded batch.

    Each item must contain ``'input'`` and either ``'outputs'`` or ``'output'``
    (the first item decides which key is read); ``'output_id'`` is optional.

    Returns:
        The encoding produced by the tokenizer for the inputs, extended with
        ``'labels'`` (target token ids with pad positions set to -100),
        ``'target_text'`` (the raw, uncut target strings) and, when the first
        item has it, ``'output_id'``.

    Raises:
        RuntimeError: if the tokenizer returned a ``'global_attention_mask'``.
    """
    # if we have more than 1 label per example (only in valid) take only one of them
    # to compute loss on valid
    target_key = 'outputs' if 'outputs' in batch[0] else 'output'

    # cut too long strings because they may slow down tokenization
    source_texts = [example['input'][:args.input_seq_len * 10] for example in batch]
    target_texts = [example[target_key][:args.target_seq_len * 10] for example in batch]

    if args.input_prefix:
        source_texts = [args.input_prefix + text for text in source_texts]

    features = tokenizer.batch_encode_plus(
        list(source_texts),
        max_length=args.input_seq_len,
        return_tensors='pt',
        **encode_plus_kwargs,
    )
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer.batch_encode_plus(
            list(target_texts),
            max_length=args.target_seq_len,
            return_tensors='pt',
            **encode_plus_kwargs,
        )
    label_ids = encoded_targets.input_ids
    # -100 is the value the loss is told to ignore, so padding does not contribute
    label_ids[label_ids == tokenizer.pad_token_id] = -100
    features['labels'] = label_ids

    features['target_text'] = [example[target_key] for example in batch]
    if 'output_id' in batch[0]:
        features['output_id'] = [example['output_id'] for example in batch]

    if 'global_attention_mask' in features:
        raise RuntimeError('What global attention mask for Longformer and LongformerEncoder-Decoder should be?')
    return features

In [35]:
# Batch size shared by the validation and test loaders.
bs = 64
# Text-trimming switches; they are inverted below into the dataset's
# `neighborhood` / `description` flags.
args.drop_neighborhood = False
args.drop_description = True

# Extra DataLoader keyword arguments (the num_workers option is left commented out).
kwargs = {'pin_memory': True}#, 'num_workers': args.data_n_workers}
valid_dataset = KGLMLocalDataset(valid_path, neighborhood=not args.drop_neighborhood, description=not args.drop_description)

# RandomSampler draws items in random order, so the batches differ between runs.
valid_sampler = RandomSampler(valid_dataset)
valid_dataloader = DataLoader(valid_dataset, batch_size=bs, sampler=valid_sampler,
                              collate_fn=collate_fn, **kwargs)


# NOTE(review): the test set is built with description=True regardless of
# args.drop_description (the original expression is commented out at the end of the
# line) — presumably so the targets keep their '[SEP-2]' part for the loss-weighting
# experiment below; confirm this is intentional.
test_dataset = KGLMLocalDataset(test_path, neighborhood=not args.drop_neighborhood, description=True)#not args.drop_description)
test_sampler = RandomSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, batch_size=bs, sampler=test_sampler,
                              collate_fn=collate_fn, **kwargs)

In [36]:
# train_sampler = RandomSampler(train_dataset)
# kwargs = {'pin_memory': True, }
# train_dataloader = DataLoader(train_dataset, batch_size=16, sampler=train_sampler,
#                               collate_fn=collate_fn, **kwargs)

In [38]:
gen = iter(test_dataloader)
sample = next(gen)

target_text = sample.pop('target_text')
output_id = sample.pop('output_id')
labels = sample.pop('labels')

In [39]:
tokenizer.add_tokens(['[SEP]', '[SEP-2]'])

0

In [40]:
decoder_input_ids = model._shift_right(labels)
with torch.no_grad():
    out = model(**sample, decoder_input_ids=decoder_input_ids)

In [41]:
# Forward pass with explicit decoder inputs: the labels shifted right by one position.
decoder_input_ids = model._shift_right(labels)
with torch.no_grad():
    out = model(**sample, decoder_input_ids=decoder_input_ids)

# Token id of the '[SEP-2]' marker (first id of its encoding).
sep2_id = tokenizer.encode('[SEP-2]')[0]

# Per-token loss: reduction='none' keeps one value per flattened label position so
# that individual positions can be re-weighted afterwards.
lm_logits = out['logits']
loss_fct = CrossEntropyLoss(ignore_index=-100, reduction='none')
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))

# Give weight `weight` to the label tokens that precede the FIRST '[SEP-2]' of each
# row and weight 1 to everything else. torch.nonzero returns indices sorted by row
# and then by column, so the first pair seen for a row is its first separator.
# Later separators in the same row are skipped; multiplying once per separator
# would compound the weight to weight**k for a row with k separators.
weight = 10
entity_mask = torch.ones_like(labels)
weighted_rows = set()
for row, start_ind in torch.nonzero(labels == sep2_id).tolist():
    if row in weighted_rows:
        continue
    weighted_rows.add(row)
    entity_mask[row, :start_ind] = weight

# Flattened to line up with the flattened per-token `loss`.
flat_entity_mask = entity_mask.view(-1)

In [48]:
sep2_id = tokenizer.encode('[SEP-2]')[0]

In [58]:
# Give weight `weight` to the label tokens that precede the FIRST '[SEP-2]' of each
# row and weight 1 to everything else. torch.nonzero returns indices sorted by row
# and then by column, so the first pair seen for a row is its first separator.
# Later separators in the same row are skipped; multiplying once per separator
# would compound the weight to weight**k for a row with k separators.
weight = 10
entity_mask = torch.ones_like(labels)
weighted_rows = set()
for row, start_ind in torch.nonzero(labels == sep2_id).tolist():
    if row in weighted_rows:
        continue
    weighted_rows.add(row)
    entity_mask[row, :start_ind] = weight

# Flattened to line up with the flattened per-token loss.
flat_entity_mask = entity_mask.view(-1)

In [59]:
weighted_loss = loss * flat_entity_mask 

In [60]:
loss

tensor([26.9937, 34.8106,  8.5109,  ...,  0.0000,  0.0000,  0.0000])

In [61]:
weighted_loss

tensor([269.9374,  34.8106,   8.5109,  ...,   0.0000,   0.0000,   0.0000])

In [47]:
sep2_id

32101

In [44]:
test_dataset[0]

{'input': 'predict [SEP] Roy Clark [SEP-2] American country music musician and performer cause of death [SEP] place of death Tulsa [SEP-2] county seat of Tulsa County, Oklahoma, United States [SEP] instrument banjo [SEP-2] musical instrument [SEP] record label Capitol Records [SEP-2] American record label; imprint of Capitol Records, Inc. [SEP] record label Four Star Records [SEP-2] record label [SEP]',
 'outputs': 'pneumonia [SEP-2] inflammatory condition of the lung',
 'output_id': 'Q12192'}

In [24]:
loss.shape

torch.Size([704])

In [18]:
with torch.no_grad():
    out1 = model(**sample, labels = labels)
out1['loss']

tensor(7.7404)

In [None]:
1/0

In [None]:
out.keys()

odict_keys(['logits', 'past_key_values', 'encoder_last_hidden_state'])

In [None]:
out['loss']

tensor(3.0198, grad_fn=<NllLossBackward>)

### Calculate hits

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import json
import time


class HitsCalculator:
    """Score generated texts with Hits@1/3/5/10 using nearest-neighbour search in a FAISS index."""

    def __init__(self, emb_model=None, drop_description=False):
        """Set up the text encoder and load the entity index from disk.

        Args:
            emb_model: object with an ``encode(list_of_str)`` method. When None
                (the default) a ``SentenceTransformer('all-MiniLM-L6-v2')`` is
                created here.
            drop_description: when True load ``faiss/entities.index``, otherwise
                ``faiss/entities+desc.index``.
        """
        # The default encoder is created lazily. Putting the constructor call in the
        # signature would load the model once, at class-definition time, even when a
        # caller supplies its own encoder, and would share that one instance.
        if emb_model is None:
            emb_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.emb_model = emb_model

        index_path = "faiss/entities.index" if drop_description else "faiss/entities+desc.index"
        self.index = faiss.read_index(index_path)

    def hits(self, outputs, labels, tokens_to_replace=(" [SEP-2]", " [SEP-3]")):
        """Return the fraction of examples whose target id is among the top-k retrieved ids.

        Args:
            outputs: list of generated strings, one per example.
            labels: entity ids such as ``'Q30'``; the first character is dropped
                and the rest is parsed as an integer.
            tokens_to_replace: substrings removed from every output before encoding.

        Returns:
            Dict with keys ``'Hits@1'``, ``'Hits@3'``, ``'Hits@5'``, ``'Hits@10'``,
            each the hit count divided by ``len(labels)``.
        """
        for token in tokens_to_replace:
            outputs = [text.replace(token, "") for text in outputs]

        vectors = self.emb_model.encode(outputs)
        _, indices = self.index.search(vectors, 10)

        cutoffs = (1, 3, 5, 10)
        hit_counts = {k: 0 for k in cutoffs}

        for i, label in enumerate(labels):
            target = int(label[1:])
            retrieved = [int(entity) for entity in indices[i][:10]]
            if target not in retrieved:
                continue
            # A target found at position `rank` counts for every cutoff larger than it.
            rank = retrieved.index(target)
            for k in cutoffs:
                if rank < k:
                    hit_counts[k] += 1

        return {"Hits@{}".format(k): hit_counts[k] / len(labels) for k in cutoffs}

2022-12-28 17:36:24,282 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2022-12-28 17:36:24,636 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cuda


### Debug Hits@k

In [None]:
gen_outputs = model.generate(sample['input_ids'])

In [None]:
tokenizer.batch_decode(gen_outputs)

['<pad> musician [SEP-2] person who performs or composes music</s><pad><pad><pad><pad>',
 '<pad> UK [SEP-2] citizen [SEP-2] citizen in the Kingdom of the Netherlands',
 '<pad> Christian Church [SEP-2] consists of the Latin Church and 23 Eastern Catholic Churche',
 '<pad> Johann Sebastian Bach [SEP-2] 0 [SEP-2] 0 [S',
 '<pad> John James [SEP-2] inverse of genre John James [SEP-2] ',
 '<pad> John S. Kennedy [SEP-2] inverse of country of citizenship</s><pad><pad><pad>',
 '<pad> economist [SEP-2] a person who writes and publishes poetry</s><pad><pad>',
 '<pad> Paul Walker [SEP-2] inverse of languages spoken, written or signed John Wayne [',
 '<pad> Jean-François Pisier [SEP-2] French businessman, politician, and industrial',
 '<pad> William H. McMillan [SEP-2] inverse of educated at William',
 '<pad> [SEP-2] 0 president of the United States employer [SEP-2] ',
 '<pad> Johann Sebastian Bach [SEP-2] 0 [SEP-2] 0 [S',
 '<pad> Aziz Nesin [SEP-2] German writer, writer, and political column',
 '<

In [None]:
'abc'.replace(

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import json
import time


class HitsCalculator:
    """Score generated texts with Hits@1/3/5/10 using nearest-neighbour search in an index."""

    def __init__(self, emb_model, index):
        """Store the text encoder (``encode``) and the search index (``search``)."""
        self.index = index
        self.emb_model = emb_model

    def hits(self, outputs, labels, tokens_to_replace=(" [SEP-2]", " [SEP-3]")):
        """Return the fraction of examples whose target id is among the top-k retrieved ids.

        ``labels`` hold ids such as ``'Q30'``: the first character is dropped and the
        rest is parsed as an integer, then looked up among the 10 ids returned by
        ``index.search`` for the corresponding output text. Every substring listed in
        ``tokens_to_replace`` is removed from the outputs before they are encoded.
        """
        cleaned = outputs
        for marker in tokens_to_replace:
            cleaned = [text.replace(marker, "") for text in cleaned]

        embeddings = self.emb_model.encode(cleaned)
        _, neighbours = self.index.search(embeddings, 10)

        cutoffs = (("Hits@1", 1), ("Hits@3", 3), ("Hits@5", 5), ("Hits@10", 10))
        counters = {name: 0 for name, _ in cutoffs}

        for position, label in enumerate(labels):
            wanted = int(label[1:])
            candidates = [int(entity) for entity in neighbours[position][:10]]
            if wanted in candidates:
                # position of the first match decides which cutoffs are satisfied
                found_at = candidates.index(wanted)
                for name, limit in cutoffs:
                    if found_at < limit:
                        counters[name] += 1

        return {name: counters[name] / len(labels) for name, _ in cutoffs}

In [None]:
index = faiss.read_index("../faiss/entities.index") 
emb_model = SentenceTransformer('all-MiniLM-L6-v2')

hits_calculator = HitsCalculator(emb_model=emb_model, index=index)

2022-12-28 16:51:35,898 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2022-12-28 16:51:36,233 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cuda


In [None]:
gen = iter(train_dataloader)

In [None]:
sample = next(gen)
target_text = sample.pop('target_text')
output_ids = sample.pop('output_id')

In [None]:
target_text

['Nothing Records',
 'United States of America',
 'Climate Alliance',
 'writer',
 'Maria Rowohlt',
 'Universal Postal Union',
 'Albert Béguin',
 'Germany',
 'executive producer',
 'Alberto Fujimori',
 'Conrad Roland',
 'Peter Capell',
 'writer',
 'voice',
 'Svetlana Bondarchuk',
 'John Cho']

In [None]:
output_ids

['Q2085119',
 'Q30',
 'Q1768108',
 'Q36180',
 'Q110685',
 'Q17495',
 'Q124251',
 'Q183',
 'Q1053574',
 'Q133040',
 'Q102980',
 'Q97944',
 'Q36180',
 'Q17172850',
 'Q4093262',
 'Q312705']

In [None]:
hits_calculator.hits(target_text, output_ids)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'Hits@1': 1.0, 'Hits@3': 1.0, 'Hits@5': 1.0, 'Hits@10': 1.0}

In [None]:
model_outputs = tokenizer.batch_decode(gen_outputs, skip_special_tokens=True)
model_outputs

['musician [SEP-2] person who performs or composes music',
 'UK [SEP-2] citizen [SEP-2] citizen in the Kingdom of the Netherlands',
 'Christian Church [SEP-2] consists of the Latin Church and 23 Eastern Catholic Churche',
 'Johann Sebastian Bach [SEP-2] 0 [SEP-2] 0 [S',
 'John James [SEP-2] inverse of genre John James [SEP-2] ',
 'John S. Kennedy [SEP-2] inverse of country of citizenship',
 'economist [SEP-2] a person who writes and publishes poetry',
 'Paul Walker [SEP-2] inverse of languages spoken, written or signed John Wayne [',
 'Jean-François Pisier [SEP-2] French businessman, politician, and industrial',
 'William H. McMillan [SEP-2] inverse of educated at William',
 '[SEP-2] 0 president of the United States employer [SEP-2] ',
 'Johann Sebastian Bach [SEP-2] 0 [SEP-2] 0 [S',
 'Aziz Nesin [SEP-2] German writer, writer, and political column',
 'United States of America [SEP-2] federal republic in North America [SEP-2]',
 'French philosopher and philosopher [SEP-2] influenced by 

In [None]:
hits_calculator.hits(model_outputs, output_ids)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'Hits@1': 0.0, 'Hits@3': 0.0, 'Hits@5': 0.0, 'Hits@10': 0.0}

In [None]:
sample['labels'].shape, gen_outputs.shape

(torch.Size([2, 10]), torch.Size([2, 19]))

### Weight the loss

In [None]:
out.keys()

odict_keys(['loss', 'logits', 'past_key_values', 'encoder_last_hidden_state'])

In [None]:
out.logits.shape

torch.Size([2, 10, 32128])

In [None]:
lm_logits = out.logits
labels = sample['labels']

loss_fct = CrossEntropyLoss(ignore_index=-100)
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))

In [None]:
lm_logits.shape, labels.shape

(torch.Size([2, 10, 32128]), torch.Size([2, 10]))

In [None]:
lm_logits.view(-1, lm_logits.size(-1)).shape, labels.view(-1).shape

(torch.Size([20, 32128]), torch.Size([20]))

In [None]:
lm_logits[:, 0].view(-1, lm_logits.size(-1)).shape, labels[:, 0].view(-1).shape

(torch.Size([2, 32128]), torch.Size([2]))

In [None]:
lm_logits[0, 0].shape

torch.Size([32128])

In [None]:
labels[:1, :1].view(-1)

tensor([10001])

In [None]:

# Per-element loss: reduction='none' replaces the deprecated reduce=False flag and
# returns one loss value per flattened label position instead of a single scalar.
loss_fct = CrossEntropyLoss(ignore_index=-100, reduction='none')
loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)).shape

torch.Size([20])

In [None]:
loss_fct(lm_logits[0, 0], labels[0, :1])

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [None]:
out['loss']

tensor(5.0220, grad_fn=<NllLossBackward>)

In [None]:
# Display the label ids of the current batch (a stray trailing character made this cell a SyntaxError).
sample['labels']

In [None]:
tokenizer(sample['target_text'])

In [None]:
tokenizer.batch_decode(sample['labels'])

In [None]:
batch = [train_dataset[0]]

In [None]:
inputs = [b['input'][:args.input_seq_len * 10] for b in batch]
if 'outputs' in batch[0]:
    # if we have more than 1 label per example (only in valid) take only one of them
    # to compute loss on valid
    labels = [b['outputs'][:args.target_seq_len * 10] for b in batch]
else:
    labels = [b['output'][:args.target_seq_len * 10] for b in batch]
if args.input_prefix:
    inputs = [args.input_prefix + inp for inp in inputs]
features = tokenizer.batch_encode_plus(list(inputs), max_length=args.input_seq_len, return_tensors='pt',
                                       **encode_plus_kwargs)
with tokenizer.as_target_tokenizer():
    labels = tokenizer.batch_encode_plus(list(labels), max_length=args.target_seq_len, return_tensors='pt',
                                         **encode_plus_kwargs).input_ids
print(labels)
labels[labels == tokenizer.pad_token_id] = -100
features['labels'] = labels
# features['id'] = [b['id'] for b in batch]
if 'outputs' in batch[0]:
    features['target_text'] = [b['outputs'] for b in batch]
else:
    features['target_text'] = [b['output'] for b in batch]
if 'global_attention_mask' in features:
    raise RuntimeError('What global attention mask for Longformer and LongformerEncoder-Decoder should be?')


tensor([[936,   1]])


In [None]:
labels = [b['outputs'][:args.target_seq_len * 10] for b in batch]
tokenizer.batch_encode_plus(labels)

{'input_ids': [[936, 1]], 'attention_mask': [[1, 1]]}

In [None]:
labels

['human']

In [None]:
# tokenizer.batch_decode(labels)

In [None]:
# cpt_folder = "/home/bulatov/bulatov/KGLM/runs/t5-small/lr5e-05_constant_with_warmup_adamw_wd1e-03_512-512_bs64_iters1000000_baseline/run_0/"
# cpt_folder = "/share/home/export/rmt_internship/kglm/lr5e-05_constant_with_warmup_adamw_wd1e-03_512-512_bs64_iters1000000/run_0"
# cpt_folder = "/home/bulatov/bulatov/KGLM/runs/t5-small/lr5e-05_constant_with_warmup_adamw_wd1e-03_512-512_bs32_iters1500000_baseline/run_0"
cpt_folder = "/home/bulatov/bulatov/KGLM/runs/t5-small/lr5e-05_constant_with_warmup_adamw_wd1e-03_512-512_bs64_iters4000000/run_1/"
# cpt_folder = "/home/bulatov/bulatov/KGLM/tests/runs/test_t5_pretrain/"

In [None]:
import os


In [None]:
# !ls /share/home/export/rmt_internship/kglm/

In [None]:
model_cpt = os.path.join(cpt_folder, 'model_best.pth')
config_path = os.path.join(cpt_folder, 'config.json')

In [None]:
cpt = torch.load(model_cpt, map_location='cpu')

In [None]:
from transformers import T5ForConditionalGeneration, T5Config, AutoConfig

In [None]:
# Load the configuration stored in the checkpoint folder. Calling T5Config(config_path)
# would bind the path string to the constructor's first positional parameter instead
# of reading the file.
config = T5Config.from_pretrained(config_path)

In [None]:
model_cfg = T5Config.from_pretrained(config_path)
# model_cfg

You are using a model of type encoder-decoder to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


In [None]:
model_cfg = AutoConfig.from_pretrained('t5-small')
# model_cfg

T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translation_en_to_fr": {
      "early_stopping": true,

In [None]:
# Paths of the saved weights and of the model configuration inside the run folder.
model_cpt = os.path.join(cpt_folder, 'model_best.pth')
config_path = os.path.join(cpt_folder, 'config.json')

# Build the model from the run's own config (not the stock 't5-small' one), so it
# starts with freshly initialised weights that are then overwritten by the checkpoint.
# model_cfg = AutoConfig.from_pretrained('t5-small')
model_cfg = T5Config.from_pretrained(config_path)
model = T5ForConditionalGeneration(config=model_cfg)

# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
# map_location='cpu' loads the tensors onto the CPU.
cpt = torch.load(model_cpt, map_location='cpu')
# The checkpoint is a dict; the weights are stored under 'model_state_dict'.
model.load_state_dict(cpt['model_state_dict'])

You are using a model of type encoder-decoder to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


<All keys matched successfully>

In [None]:
model.load_state_dict(cpt['model_state_dict'])

<All keys matched successfully>