In [1]:
import pandas as pd
from pymongo import MongoClient
from tqdm import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import time
# import fasttext
import random

In [16]:
def get_neighbourhood(collection, node_id, relation_id=None, tail_id=None):

    neigh = []

    # relation_id and tail_id excluded for test dataset
    if relation_id == None or tail_id == None:
        for doc in collection.find({'head': node_id}, {'_id': False}):
            neigh.append(doc)

        for doc in collection.find({'tail': node_id}, {'_id': False}):
            doc['relation'] = 'inverse of ' + doc['relation']
            neigh.append(doc)

    # relation_id and tail_id included for train dataset verbalization to hide the target node
    else:
        for doc in collection.find({'head': node_id, 'tail': {'$ne': tail_id}, 'relation': {'$ne': relation_id}}, {'_id': False}):
            neigh.append(doc)

        for doc in collection.find({'tail': node_id, 'head': {'$ne': tail_id}, 'relation': {'$ne': relation_id}}, {'_id': False}):
            doc['relation'] = 'inverse of ' + doc['relation']
            neigh.append(doc)

    return neigh

In [17]:
client = MongoClient('localhost', 15017)
collection = client["KGLM-inductive"]['train']
collection.find_one()

{'_id': ObjectId('637ca5613c82960d48908165'),
 'head': 'Q5142631',
 'relation': 'P159',
 'tail': 'Q3141'}

In [18]:
entity = 'Q5142631'

In [19]:
get_neighbourhood(collection, entity)

[{'head': 'Q5142631', 'relation': 'P159', 'tail': 'Q3141'},
 {'head': 'Q5142631', 'relation': 'P31', 'tail': 'Q1589009'}]

In [21]:
client = MongoClient('localhost', 15017)
collection = client["KGLM-inductive"]['test']
collection.find_one()

In [22]:
db = client["KGLM-inductive"]
list(db.list_collections())

[{'name': 'train',
  'type': 'collection',
  'options': {},
  'info': {'readOnly': False,
   'uuid': Binary(b'M \x11\xa6\x8a\xc1N}\xb9\x0e\xb7h\xecEd]', 4)},
  'idIndex': {'v': 2, 'key': {'_id': 1}, 'name': '_id_'}}]

In [1]:
import json
import logging
import os
import shutil
from pathlib import Path
from pymongo import MongoClient

# from megatron.data.dataset_utils import get_indexed_dataset_
import sys
sys.path.append('..')

import horovod.torch as hvd
from dotenv import load_dotenv
import torch
import numpy as np
from torch.utils.data import DataLoader, DistributedSampler, RandomSampler
import datasets
from huggingface_hub import hf_hub_download
from sklearn.metrics import f1_score, accuracy_score

from lm_experiments_tools import TrainerArgs
from lm_experiments_tools.trainer_refactor import Trainer
from torch.utils.data import Dataset

In [2]:
class KGLMDataset(Dataset):
    def __init__(self, port, db, collection, neighborhood=True):
        self.client = MongoClient('localhost', port)
        self.db_name = db
        self.collection_name = collection
        self.collection = self.client[db][collection]
        self.length = self.client[self.db_name].command("collstats", self.collection_name)['count']
        self.neighborhood = neighborhood

    def  __getitem__(self, idx):
        item = {}
        doc = self.collection.find_one({'_id': str(idx)})
        print(doc)
        
        if self.neighborhood:
            item["input"] = doc['verbalization']
        else:
            verbalization = doc['verbalization']
            inp = '[SEP]'.join(verbalization.split('[SEP]')[:2])
            item["input"] = inp
            
        item["outputs"] = doc['target']
        return item
        
    def __len__(self):
        return self.length

In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('t5-small')



In [4]:
class Holder:
    def __init__(self):
        pass

In [5]:
args = Holder()
args.target_seq_len = 512
args.input_seq_len = 512
args.input_prefix = ''

In [6]:
encode_plus_kwargs = {'truncation': True, 'padding': 'longest', 'pad_to_multiple_of': 1}

In [7]:
def collate_fn(batch):
    # cut too long strings because they may slow down tokenization
    inputs = [b['input'][:args.input_seq_len * 10] for b in batch]
    print('inputs', inputs)
    if 'outputs' in batch[0]:
        # if we have more than 1 label per example (only in valid) take only one of them
        # to compute loss on valid
        labels = [b['outputs'][0][:args.target_seq_len * 10] for b in batch]
    else:
        labels = [b['output'][:args.target_seq_len * 10] for b in batch]
    if args.input_prefix:
        inputs = [args.input_prefix + inp for inp in inputs]
    features = tokenizer.batch_encode_plus(list(inputs), max_length=args.input_seq_len, return_tensors='pt',
                                           **encode_plus_kwargs)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer.batch_encode_plus(list(labels), max_length=args.target_seq_len, return_tensors='pt',
                                             **encode_plus_kwargs).input_ids
    labels[labels == tokenizer.pad_token_id] = -100
    features['labels'] = labels
    # features['id'] = [b['id'] for b in batch]
    if 'outputs' in batch[0]:
        features['target_text'] = [b['outputs'] for b in batch]
    else:
        features['target_text'] = [b['output'] for b in batch]
    if 'global_attention_mask' in features:
        raise RuntimeError('What global attention mask for Longformer and LongformerEncoder-Decoder should be?')
    return features

In [10]:
DB_PORTPORT = 15015
train_dataset = KGLMDataset(DB_PORT, 'KGLM', 'train', neighborhood=True)

In [11]:
train_dataset.collection.find_one()

{'_id': '0',
 'head': 'Q29387131',
 'relation': 'P31',
 'tail': 'Q5',
 'verbalization': 'predict [SEP] Lalit Kumar Goel instance of [SEP] ',
 'target': 'human'}

In [9]:
train_sampler = RandomSampler(train_dataset)
kwargs = {'pin_memory': True, }
train_dataloader = DataLoader(train_dataset, batch_size=2, sampler=train_sampler,
                              collate_fn=collate_fn, **kwargs)

In [22]:
list(client.list_databases())

[{'name': 'KGLM', 'sizeOnDisk': 5886840832, 'empty': False},
 {'name': 'KGLM-inductive', 'sizeOnDisk': 1347354624, 'empty': False},
 {'name': 'admin', 'sizeOnDisk': 40960, 'empty': False},
 {'name': 'config', 'sizeOnDisk': 73728, 'empty': False},
 {'name': 'local', 'sizeOnDisk': 73728, 'empty': False}]

In [25]:
client = MongoClient('localhost', 15017)
collection = client["KGLM-inductive"]['train']
collection.find_one()

{'_id': ObjectId('637ca5613c82960d48908165'),
 'head': 'Q5142631',
 'relation': 'P159',
 'tail': 'Q3141'}

In [24]:
client = MongoClient('localhost', 15017)
collection = client["KGLM"]['train']
collection.find_one()

{'_id': '0',
 'head': 'Q29387131',
 'relation': 'P31',
 'tail': 'Q5',
 'verbalization': 'predict [SEP] Lalit Kumar Goel instance of [SEP] ',
 'target': 'human'}

In [10]:
# b = train_dataset[10]

In [11]:
gen = iter(train_dataloader)
sample = next(gen)

{'_id': '19599398', 'head': 'Q16325376', 'relation': 'P17', 'tail': 'Q794', 'verbalization': 'predict [SEP] Nazhvan Recreational Complex country [SEP] ', 'target': 'Iran'}
{'_id': '20226282', 'head': 'Q3313042', 'relation': 'P2416', 'tail': 'Q165704', 'verbalization': 'predict [SEP] Miguel Ángel Sancho sports discipline competed in [SEP] sport athletics [SEP] country of citizenship Spain [SEP] instance of human [SEP] place of birth Valencia [SEP]', 'target': 'high jump'}
inputs ['predict [SEP] Nazhvan Recreational Complex country ', 'predict [SEP] Miguel Ángel Sancho sports discipline competed in ']


In [None]:
train_dataset[1]

In [None]:
sample['labels']

In [None]:
tokenizer(sample['target_text'])

In [None]:
tokenizer.batch_decode(sample['labels'])

In [None]:
batch = [train_dataset[0]]

In [19]:
inputs = [b['input'][:args.input_seq_len * 10] for b in batch]
if 'outputs' in batch[0]:
    # if we have more than 1 label per example (only in valid) take only one of them
    # to compute loss on valid
    labels = [b['outputs'][:args.target_seq_len * 10] for b in batch]
else:
    labels = [b['output'][:args.target_seq_len * 10] for b in batch]
if args.input_prefix:
    inputs = [args.input_prefix + inp for inp in inputs]
features = tokenizer.batch_encode_plus(list(inputs), max_length=args.input_seq_len, return_tensors='pt',
                                       **encode_plus_kwargs)
with tokenizer.as_target_tokenizer():
    labels = tokenizer.batch_encode_plus(list(labels), max_length=args.target_seq_len, return_tensors='pt',
                                         **encode_plus_kwargs).input_ids
print(labels)
labels[labels == tokenizer.pad_token_id] = -100
features['labels'] = labels
# features['id'] = [b['id'] for b in batch]
if 'outputs' in batch[0]:
    features['target_text'] = [b['outputs'] for b in batch]
else:
    features['target_text'] = [b['output'] for b in batch]
if 'global_attention_mask' in features:
    raise RuntimeError('What global attention mask for Longformer and LongformerEncoder-Decoder should be?')


tensor([[936,   1]])


In [20]:
labels = [b['outputs'][:args.target_seq_len * 10] for b in batch]
tokenizer.batch_encode_plus(labels)

{'input_ids': [[936, 1]], 'attention_mask': [[1, 1]]}

In [21]:
labels

['human']

In [22]:
# tokenizer.batch_decode(labels)

In [23]:
# cpt_folder = "/home/bulatov/bulatov/KGLM/runs/t5-small/lr5e-05_constant_with_warmup_adamw_wd1e-03_512-512_bs64_iters1000000_baseline/run_0/"
# cpt_folder = "/share/home/export/rmt_internship/kglm/lr5e-05_constant_with_warmup_adamw_wd1e-03_512-512_bs64_iters1000000/run_0"
# cpt_folder = "/home/bulatov/bulatov/KGLM/runs/t5-small/lr5e-05_constant_with_warmup_adamw_wd1e-03_512-512_bs32_iters1500000_baseline/run_0"
cpt_folder = "/home/bulatov/bulatov/KGLM/runs/t5-small/lr5e-05_constant_with_warmup_adamw_wd1e-03_512-512_bs64_iters4000000/run_1/"
# cpt_folder = "/home/bulatov/bulatov/KGLM/tests/runs/test_t5_pretrain/"

In [24]:
import os


In [25]:
# !ls /share/home/export/rmt_internship/kglm/

In [26]:
model_cpt = os.path.join(cpt_folder, 'model_best.pth')
config_path = os.path.join(cpt_folder, 'config.json')

In [27]:
cpt = torch.load(model_cpt, map_location='cpu')

In [28]:
from transformers import T5ForConditionalGeneration, T5Config, AutoConfig

In [29]:
config = T5Config(config_path)

In [38]:
model_cfg = T5Config.from_pretrained(config_path)
# model_cfg

You are using a model of type encoder-decoder to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


In [36]:
model_cfg = AutoConfig.from_pretrained('t5-small')
# model_cfg

T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translation_en_to_fr": {
      "early_stopping": true,

In [34]:
model_cpt = os.path.join(cpt_folder, 'model_best.pth')
config_path = os.path.join(cpt_folder, 'config.json')

# model_cfg = AutoConfig.from_pretrained('t5-small')
model_cfg = T5Config.from_pretrained(config_path)
model = T5ForConditionalGeneration(config=model_cfg)

cpt = torch.load(model_cpt, map_location='cpu')
model.load_state_dict(cpt['model_state_dict'])

You are using a model of type encoder-decoder to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


<All keys matched successfully>

In [32]:
model.load_state_dict(cpt['model_state_dict'])

<All keys matched successfully>