# Practice with different Hugging Face models

In [None]:
import torch

## RAG

In [None]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration

In [None]:
# Initialize model
# The tokenizer, retriever and generator are all loaded from one checkpoint id.
rag_checkpoint = "facebook/rag-sequence-nq"
tokenizer = RagTokenizer.from_pretrained(rag_checkpoint)
retriever = RagRetriever.from_pretrained(
    rag_checkpoint,
    index_name="exact",
    use_dummy_dataset=True,
)
# The retriever is handed to the model so both are available from `model`.
model = RagSequenceForGeneration.from_pretrained(rag_checkpoint, retriever=retriever)

In [4]:
# Initialize data
# Two questions and their two reference answers, tokenized with the same options.
questions = ["How many people live in Paris?", 'how old are you?']
answers = ["In Paris, there are 10 million people.", 'I am 22 years old.']
encode_options = dict(return_tensors="pt", padding=True, truncation=True)

inputs = tokenizer(questions, **encode_options)
# The answers are tokenized inside the tokenizer's target-tokenizer context.
with tokenizer.as_target_tokenizer():
    targets = tokenizer(answers, **encode_options)

input_ids, labels = inputs["input_ids"], targets["input_ids"]
print(input_ids.size())

torch.Size([2, 9])


In [5]:
# 1. Encode
# Run the model's question encoder on the token ids and keep element 0 of its output.
encoder_outputs = model.question_encoder(input_ids)
question_hidden_states = encoder_outputs[0]
print(question_hidden_states.size())

torch.Size([2, 768])


In [8]:
# 2. Retrieve
# The retriever is called with NumPy copies of the token ids and question vectors.
docs_dict = retriever(
    input_ids.numpy(),
    question_hidden_states.detach().numpy(),
    return_tensors="pt",
)

# Batched matrix product of each question vector with its retrieved document
# embeddings; the singleton dimension added by unsqueeze(1) is squeezed away.
query_batch = question_hidden_states.unsqueeze(1)
doc_embeds_t = docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
doc_scores = torch.bmm(query_batch, doc_embeds_t).squeeze(1)

# Show the size of every tensor involved, each preceded by its label.
print('question_hidden_states.unsqueeze(1)', query_batch.shape, sep='\n')
print('docs_dict["context_input_ids"]')
print(docs_dict['context_input_ids'].shape)
print('docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)', doc_embeds_t.shape, sep='\n')
print('doc_scores', doc_scores.shape, sep='\n')

question_hidden_states.unsqueeze(1)
torch.Size([2, 1, 768])
docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
torch.Size([2, 768, 5])
doc_scores
torch.Size([2, 5])


In [None]:
# 3. Forward to generator
# Pass the retrieved contexts, their scores and the target ids to the model.
generator_kwargs = {
    "context_input_ids": docs_dict["context_input_ids"],
    "context_attention_mask": docs_dict["context_attention_mask"],
    "doc_scores": doc_scores,
    "decoder_input_ids": labels,
}
outputs = model(**generator_kwargs)

## BERT

In [None]:
from transformers import BertTokenizer, BertModel

In [None]:
# Initialize model
# NOTE: this rebinds `tokenizer` and `model`, replacing the RAG objects above.
bert_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

In [None]:
# Initialize data
# Tokenize the two example sentences as one padded, truncated batch.
sentences = ["How many people live in Paris?", 'how old are you?']
inputs = tokenizer(
    sentences,
    return_tensors="pt",
    padding=True,
    truncation=True,
)
input_ids = inputs["input_ids"]
print(input_ids.size())

In [None]:
# Run the model on the tokenized batch, requesting hidden states in the output.
outputs = model(output_hidden_states=True, **inputs)
print(outputs)

In [None]:
# Scratch tensor: 2 rows by 3 columns of random values from torch.rand.
a = torch.rand(2, 3)

In [None]:
# Concatenate `a` with itself along dimension 0 (the default for torch.cat).
torch.cat([a, a], dim=0)