In [1]:
from transformers import RagTokenizer, RagSequenceForGeneration
import rag
import torch

# Determine the device to use for running the generator.
device = rag.get_desired_computation_device(use_gpu_if_available=True)

model_name = "facebook/rag-sequence-base"
model = RagSequenceForGeneration.from_pretrained(model_name).to(device)
tokenizer = RagTokenizer.from_pretrained(model_name)

# Example input
question = "What is Andrew's favorite color?"
inputs = tokenizer(question, return_tensors="pt").to(device)

context_inputs = tokenizer("Andrew's favorite color is blue.", return_tensors="pt").to(device)

# Generate the answer
outputs = model.generate(
    input_ids=inputs.input_ids,
    context_input_ids=context_inputs.input_ids,
    max_new_tokens=50,
    num_return_sequences=1,
    temperature=None,
    top_k=None,
    do_sample=False,
)

# Decode the model output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Clear memory appropriately.
del model, tokenizer, inputs, context_inputs, outputs  # delete objects to free memory
torch.cuda.empty_cache()

print(response)

2025-03-08 03:31:29.799545: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 03:31:30.041467: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741422690.132890   44170 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741422690.157837   44170 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 03:31:30.389866: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

LOGGER: 2025-03-08 03:31:33,722 - root - INFO - For computation we are using the device: NVIDIA GeForce RTX 4050 Laptop GPU.
LOGGER: 2025-03-08 03:31:33,725 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443
LOGGER: 2025-03-08 03:31:33,920 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /facebook/rag-sequence-base/resolve/main/config.json HTTP/1.1" 200 0
LOGGER: 2025-03-08 03:31:33,967 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /facebook/rag-sequence-base/resolve/main/config.json HTTP/1.1" 200 0
LOGGER: 2025-03-08 03:31:34,019 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /facebook/rag-sequence-base/resolve/main/model.safetensors HTTP/1.1" 404 0
LOGGER: 2025-03-08 03:31:34,021 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443
LOGGER: 2025-03-08 03:31:34,118 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "GET /api/models/facebook/

Some weights of the model checkpoint at facebook/rag-sequence-base were not used when initializing RagSequenceForGeneration: ['rag.question_encoder.question_encoder.bert_model.pooler.dense.bias', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing RagSequenceForGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RagSequenceForGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


LOGGER: 2025-03-08 03:31:37,642 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /facebook/rag-sequence-base/resolve/main/generation_config.json HTTP/1.1" 404 0


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.


LOGGER: 2025-03-08 03:31:40,122 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /facebook/rag-sequence-base/resolve/main/config.json HTTP/1.1" 200 0
LOGGER: 2025-03-08 03:31:40,167 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /facebook/rag-sequence-base/resolve/main/question_encoder_tokenizer/tokenizer_config.json HTTP/1.1" 200 0
LOGGER: 2025-03-08 03:31:40,313 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /facebook/rag-sequence-base/resolve/main/generator_tokenizer/tokenizer_config.json HTTP/1.1" 200 0


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizerFast'.


AssertionError: Make sure that `context_input_ids` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.

In [3]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
import torch

# Determine the device to use for running the generator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "facebook/rag-sequence-base"
model = RagSequenceForGeneration.from_pretrained(model_name).to(device)
tokenizer = RagTokenizer.from_pretrained(model_name)
retriever = RagRetriever.from_pretrained(model_name, use_dummy_dataset=True)

# Example input
question = "What is Andrew's favorite color?"
inputs = tokenizer(question, return_tensors="pt").to(device)

# Retrieve documents
question_hidden_states = model.question_encoder(inputs.input_ids)[0]
retrieved_docs = retriever(input_ids=inputs.input_ids.numpy(), question_hidden_states=question_hidden_states.detach().numpy(), return_tensors="pt")

# Generate the answer
outputs = model.generate(
    input_ids=inputs.input_ids,
    context_input_ids=retrieved_docs["context_input_ids"],
    context_attention_mask=retrieved_docs["context_attention_mask"],
    doc_scores=retrieved_docs["doc_scores"],
    max_new_tokens=50,
    num_return_sequences=1,
    temperature=None,
    top_k=None,
    do_sample=False,
)

# Decode the model output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Clear memory appropriately.
del model, tokenizer, inputs, retrieved_docs, outputs  # delete objects to free memory
torch.cuda.empty_cache()

print(response)

Some weights of the model checkpoint at facebook/rag-sequence-base were not used when initializing RagSequenceForGeneration: ['rag.question_encoder.question_encoder.bert_model.pooler.dense.bias', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing RagSequenceForGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RagSequenceForGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this functio

ImportError: 
RagRetriever requires the faiss library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [1]:
from transformers import RagTokenizer, RagSequenceForGeneration
import torch

# Determine the device to use for running the generator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "facebook/rag-sequence-base"
model = RagSequenceForGeneration.from_pretrained(model_name).to(device)
tokenizer = RagTokenizer.from_pretrained(model_name)

# Example input
question = "What is Andrew's favorite color?"
inputs = tokenizer(question, return_tensors="pt").to(device)

# Manually set the context
context = "Andrew's favorite color is blue."
context_inputs = tokenizer(context, return_tensors="pt").to(device)

# Generate the answer
outputs = model.generate(
    input_ids=inputs.input_ids,
    context_input_ids=context_inputs.input_ids.repeat(inputs.input_ids.size(0), 1),
    context_attention_mask=context_inputs.attention_mask.repeat(inputs.input_ids.size(0), 1),
    max_new_tokens=50,
    num_return_sequences=1,
    temperature=None,
    top_k=None,
    do_sample=False,
)

# Decode the model output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Clear memory appropriately.
del model, tokenizer, inputs, context_inputs, outputs  # delete objects to free memory
torch.cuda.empty_cache()

print(response)

2025-03-08 03:50:44.583880: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 03:50:44.814956: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741423844.901090   49502 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741423844.929338   49502 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 03:50:45.156429: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

AssertionError: Make sure that `context_input_ids` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.

In [1]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
import torch

# Determine the device to use for running the generator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "facebook/rag-sequence-base"
model = RagSequenceForGeneration.from_pretrained(model_name).to(device)
tokenizer = RagTokenizer.from_pretrained(model_name)
retriever = RagRetriever.from_pretrained(model_name, use_dummy_dataset=True)

# Set the retriever for the model
model.set_retriever(retriever)

# Example input
question = "What is Andrew's favorite color?"
inputs = tokenizer(question, return_tensors="pt").to(device)

# Manually set the context
context = "Andrew's favorite color is blue."
context_inputs = tokenizer(context, return_tensors="pt").to(device)

# Generate the answer
outputs = model.generate(
    input_ids=inputs.input_ids,
    context_input_ids=context_inputs.input_ids.repeat(inputs.input_ids.size(0), 1),
    context_attention_mask=context_inputs.attention_mask.repeat(inputs.input_ids.size(0), 1),
    doc_scores=torch.tensor([[1.0]]).to(device),  # Dummy doc_scores
    max_new_tokens=50,
    num_return_sequences=1,
    temperature=None,
    top_k=None,
    do_sample=False,
)

# Decode the model output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Clear memory appropriately.
del model, tokenizer, inputs, context_inputs, outputs  # delete objects to free memory
torch.cuda.empty_cache()

print(response)

2025-03-08 03:45:54.502142: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 03:45:54.512299: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741423554.524411   48903 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741423554.528094   48903 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 03:45:54.542396: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

ImportError: 
RagRetriever requires the faiss library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [1]:
from transformers import RagTokenizer, RagSequenceForGeneration
import torch

# Determine the device to use for running the generator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "facebook/rag-sequence-base"
model = RagSequenceForGeneration.from_pretrained(model_name).to(device)
tokenizer = RagTokenizer.from_pretrained(model_name)

# Example input
question = "What is Andrew's favorite color?"
inputs = tokenizer(question, return_tensors="pt").to(device)

# Manually set the context
context = "Andrew's favorite color is blue."
context_inputs = tokenizer(context, return_tensors="pt").to(device)

# Generate the answer
outputs = model.generate(
    input_ids=inputs.input_ids,
    context_input_ids=context_inputs.input_ids,
    context_attention_mask=context_inputs.attention_mask,
    max_new_tokens=50,
    num_return_sequences=1,
    temperature=None,
    top_k=None,
    do_sample=False,
)

# Decode the model output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Clear memory appropriately.
del model, tokenizer, inputs, context_inputs, outputs  # delete objects to free memory
torch.cuda.empty_cache()

print(response)

2025-03-08 03:56:04.062202: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 03:56:04.072668: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741424164.085542   50998 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741424164.089301   50998 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 03:56:04.104544: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

AssertionError: Make sure that `context_input_ids` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.

In [1]:
from transformers import RagTokenizer, RagSequenceForGeneration
import torch

# Determine the device to use for running the generator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "facebook/rag-sequence-base"
model = RagSequenceForGeneration.from_pretrained(model_name).to(device)
tokenizer = RagTokenizer.from_pretrained(model_name)

# Example input
question = "What is Andrew's favorite color?"
inputs = tokenizer(question, return_tensors="pt").to(device)

# Manually set the context
context = "Andrew's favorite color is blue."
context_inputs = tokenizer(context, return_tensors="pt").to(device)

# Generate the answer
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    context_input_ids=context_inputs.input_ids,
    context_attention_mask=context_inputs.attention_mask,
    max_new_tokens=50,
    num_return_sequences=1,
    temperature=None,
    top_k=None,
    do_sample=False,
)

# Decode the model output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Clear memory appropriately.
del model, tokenizer, inputs, context_inputs, outputs  # delete objects to free memory
torch.cuda.empty_cache()

print(response)

2025-03-08 03:58:17.618774: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 03:58:17.625985: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741424297.633842   51516 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741424297.636179   51516 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 03:58:17.645950: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

AssertionError: Make sure that `context_input_ids` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.

In [1]:
from transformers import RagTokenizer, RagSequenceForGeneration
import torch

# Determine the device to use for running the generator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "facebook/rag-sequence-base"
model = RagSequenceForGeneration.from_pretrained(model_name).to(device)
tokenizer = RagTokenizer.from_pretrained(model_name)

# Example input
question = "What is Andrew's favorite color?"
inputs = tokenizer(question, return_tensors="pt").to(device)

# Manually set the context
context = "Andrew's favorite color is blue."
context_inputs = tokenizer(context, return_tensors="pt").to(device)

# Ensure the context inputs have the correct shape
batch_size = inputs.input_ids.size(0)
n_docs = model.config.n_docs
max_combined_length = context_inputs.input_ids.size(1)

context_input_ids = context_inputs.input_ids.unsqueeze(0).expand(batch_size, n_docs, max_combined_length).reshape(batch_size * n_docs, max_combined_length)
context_attention_mask = context_inputs.attention_mask.unsqueeze(0).expand(batch_size, n_docs, max_combined_length).reshape(batch_size * n_docs, max_combined_length)

# Generate the answer
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    context_input_ids=context_input_ids,
    context_attention_mask=context_attention_mask,
    max_new_tokens=50,
    num_return_sequences=1,
    temperature=None,
    top_k=None,
    do_sample=False,
)

# Decode the model output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Clear memory appropriately.
del model, tokenizer, inputs, context_inputs, outputs  # delete objects to free memory
torch.cuda.empty_cache()

print(response)

2025-03-08 04:06:30.593751: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 04:06:30.600959: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741424790.608794   53129 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741424790.611117   53129 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 04:06:30.622255: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

AssertionError: Make sure that `context_input_ids` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.

In [1]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")

prompt = (
    "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
    "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
    "researchers was the fact that the unicorns spoke perfect English."
)

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.9,
    max_length=100,
)
gen_text = tokenizer.batch_decode(gen_tokens)[0]

2025-03-08 04:07:47.956973: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 04:07:47.967122: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741424867.979604   53428 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741424867.983438   53428 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 04:07:47.997827: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [2]:
gen_text

'In a shocking finding, scientists discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.\n\nAccording to experts, only 100 to 200 people live in the valley, and that’s just a single herd, which is why this is the first time researchers have found out the existence of these wild unicorns in the world’s largest country.\n'

In [1]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
import torch

# Determine the device to use for running the generator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "facebook/rag-sequence-base"
model = RagSequenceForGeneration.from_pretrained(model_name).to(device)
tokenizer = RagTokenizer.from_pretrained(model_name)
retriever = RagRetriever.from_pretrained(model_name, use_dummy_dataset=True)

# Example input
question = "What is Andrew's favorite color?"
inputs = tokenizer(question, return_tensors="pt").to(device)

# Retrieve documents
question_hidden_states = model.question_encoder(inputs.input_ids)[0]
retrieved_docs = retriever(input_ids=inputs.input_ids.numpy(), question_hidden_states=question_hidden_states.detach().numpy(), return_tensors="pt")

# Generate the answer
outputs = model.generate(
    input_ids=inputs.input_ids,
    context_input_ids=retrieved_docs["context_input_ids"],
    context_attention_mask=retrieved_docs["context_attention_mask"],
    doc_scores=retrieved_docs["doc_scores"],
    max_new_tokens=50,
    num_return_sequences=1,
    temperature=None,
    top_k=None,
    do_sample=False,
)

# Decode the model output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Clear memory appropriately.
del model, tokenizer, inputs, retrieved_docs, outputs  # delete objects to free memory
torch.cuda.empty_cache()

print(response)

2025-03-08 04:17:13.351866: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 04:17:13.376903: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741425433.390956   55332 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741425433.395362   55332 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 04:17:13.419494: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

README.md:   0%|          | 0.00/14.9k [00:00<?, ?B/s]

wiki_dpr.py:   0%|          | 0.00/8.63k [00:00<?, ?B/s]

ValueError: Loading wiki_dpr requires you to execute the dataset script in that repo on your local machine. Make sure you have read the code there to avoid malicious use, then set the option `trust_remote_code=True` to remove this error.

In [1]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
import torch

# Determine the device to use for running the generator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "facebook/rag-sequence-base"
model = RagSequenceForGeneration.from_pretrained(model_name).to(device)
tokenizer = RagTokenizer.from_pretrained(model_name)

# Manually set the context
context = "Andrew's favorite color is blue."
retriever = RagRetriever.from_pretrained(model_name, index_name="custom", passages=[{"text": context}])

# Example input
question = "What is Andrew's favorite color?"
inputs = tokenizer(question, return_tensors="pt").to(device)

# Retrieve documents
question_hidden_states = model.question_encoder(inputs.input_ids)[0]
retrieved_docs = retriever(inputs.input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")

# Generate the answer
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    context_input_ids=retrieved_docs["context_input_ids"],
    context_attention_mask=retrieved_docs["context_attention_mask"],
    doc_scores=retrieved_docs["doc_scores"],
    max_new_tokens=50,
    num_return_sequences=1,
    temperature=None,
    top_k=None,
    do_sample=False,
)

# Decode the model output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Clear memory appropriately.
del model, tokenizer, inputs, retrieved_docs, outputs, retriever  # delete objects to free memory
torch.cuda.empty_cache()

print(response)

2025-03-08 04:20:05.017223: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 04:20:05.030166: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741425605.044626   55936 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741425605.048912   55936 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 04:20:05.067489: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

ValueError: Please provide `dataset_path` and `index_path` after calling `dataset.save_to_disk(dataset_path)` and `dataset.get_index('embeddings').save(index_path)`.

In [1]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
from datasets import Dataset
import torch

# Determine the device to use for running the generator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "facebook/rag-sequence-base"
model = RagSequenceForGeneration.from_pretrained(model_name).to(device)
tokenizer = RagTokenizer.from_pretrained(model_name)

# Manually set the context
context = "Andrew's favorite color is blue."
# Create a dataset with the context document
data = {"title": ["context"], "text": [context]}
dataset = Dataset.from_dict(data)

# Save the dataset to disk
dataset_path = "./context_dataset"
dataset.save_to_disk(dataset_path)

# Index the dataset
dataset.add_faiss_index(column="text", device=device)
index_path = "./context_index"
dataset.get_index("text").save(index_path)

# Load the retriever with the indexed dataset
retriever = RagRetriever.from_pretrained(model_name, index_name="custom", passages_path=dataset_path, index_path=index_path)

# Example input
question = "What is Andrew's favorite color?"
inputs = tokenizer(question, return_tensors="pt").to(device)

# Retrieve documents
question_hidden_states = model.question_encoder(inputs.input_ids)[0]
retrieved_docs = retriever(inputs.input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")

# Generate the answer
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    context_input_ids=retrieved_docs["context_input_ids"],
    context_attention_mask=retrieved_docs["context_attention_mask"],
    doc_scores=retrieved_docs["doc_scores"],
    max_new_tokens=50,
    num_return_sequences=1,
    temperature=None,
    top_k=None,
    do_sample=False,
)

# Decode the model output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Clear memory appropriately.
del model, tokenizer, inputs, retrieved_docs, outputs, retriever, dataset  # delete objects to free memory
torch.cuda.empty_cache()

print(response)

2025-03-08 04:21:19.331065: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 04:21:19.547378: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741425679.630188   56186 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741425679.653766   56186 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 04:21:19.862121: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

ValueError: Wrong feature type for column 'text'. Expected 1d array, got Value(dtype='string', id=None)

In [None]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration, DPRContextEncoder, DPRContextEncoderTokenizerFast
from datasets import Dataset, Features, Sequence, Value
import torch

# Determine the device to use for running the generator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "facebook/rag-sequence-base"
model = RagSequenceForGeneration.from_pretrained(model_name).to(device)
tokenizer = RagTokenizer.from_pretrained(model_name)

# Manually set the context
context = "Andrew's favorite color is blue."
# Create a dataset with the context document
data = {"title": ["context"], "text": [context]}
dataset = Dataset.from_dict(data)

# Compute embeddings for the context document
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base").to(device)
ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

def embed(documents):
    inputs = ctx_tokenizer(documents["text"], truncation=True, padding="longest", return_tensors="pt")
    embeddings = ctx_encoder(**inputs.to(device)).pooler_output
    return {"embeddings": embeddings.detach().cpu().numpy()}

dataset = dataset.map(embed, batched=True, batch_size=1)

# Define the features for the dataset
features = Features({
    "text": Value("string"),
    "title": Value("string"),
    "embeddings": Sequence(Value("float32"))
})

dataset = dataset.cast(features)

# Save the dataset to disk
dataset_path = "./context_dataset"
dataset.save_to_disk(dataset_path)

# Index the dataset
dataset.add_faiss_index(column="embeddings")
index_path = "./context_index"
dataset.get_index("embeddings").save(index_path)

# Load the retriever with the indexed dataset
retriever = RagRetriever.from_pretrained(model_name, index_name="custom", passages_path=dataset_path, index_path=index_path)

# Example input
question = "What is Andrew's favorite color?"
inputs = tokenizer(question, return_tensors="pt").to(device)

# Retrieve documents
question_hidden_states = model.question_encoder(inputs.input_ids)[0]
retrieved_docs = retriever(inputs.input_ids.cpu().numpy(), question_hidden_states.detach().cpu().numpy(), return_tensors="pt")

# Generate the answer
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    context_input_ids=retrieved_docs["context_input_ids"],
    context_attention_mask=retrieved_docs["context_attention_mask"],
    doc_scores=retrieved_docs["doc_scores"],
    max_new_tokens=50,
    num_return_sequences=1,
    temperature=None,
    top_k=None,
    do_sample=False,
)

# Decode the model output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Clear memory appropriately.
del model, tokenizer, inputs, retrieved_docs, outputs, retriever, dataset  # delete objects to free memory
torch.cuda.empty_cache()

print(response)

2025-03-08 04:32:49.859234: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 04:32:49.866308: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741426369.873859   59314 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741426369.876116   59314 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 04:32:49.885216: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

KeyError: 'doc_scores'

In [1]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration, DPRContextEncoder, DPRContextEncoderTokenizerFast
from datasets import Dataset, Features, Sequence, Value
import torch

# Determine the device to use for running the generator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "facebook/rag-sequence-base"
model = RagSequenceForGeneration.from_pretrained(model_name).to(device)
tokenizer = RagTokenizer.from_pretrained(model_name)

# Manually set the context
context = "Andrew's favorite color is blue."
# Create a dataset with the context document
data = {"title": ["context"], "text": [context]}
dataset = Dataset.from_dict(data)

# Compute embeddings for the context document
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base").to(device)
ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

def embed(documents):
    inputs = ctx_tokenizer(documents["text"], truncation=True, padding="longest", return_tensors="pt")
    embeddings = ctx_encoder(**inputs.to(device)).pooler_output
    return {"embeddings": embeddings.detach().cpu().numpy()}

dataset = dataset.map(embed, batched=True, batch_size=1)

# Define the features for the dataset
features = Features({
    "text": Value("string"),
    "title": Value("string"),
    "embeddings": Sequence(Value("float32"))
})

dataset = dataset.cast(features)

# Save the dataset to disk
dataset_path = "./context_dataset"
dataset.save_to_disk(dataset_path)

# Index the dataset
dataset.add_faiss_index(column="embeddings")
index_path = "./context_index"
dataset.get_index("embeddings").save(index_path)

# Load the retriever with the indexed dataset
retriever = RagRetriever.from_pretrained(model_name, index_name="custom", passages_path=dataset_path, index_path=index_path)

# Example input
question = "What is Andrew's favorite color?"
inputs = tokenizer(question, return_tensors="pt").to(device)

# Retrieve documents
question_hidden_states = model.question_encoder(inputs.input_ids)[0]
retrieved_docs = retriever(inputs.input_ids.cpu().numpy(), question_hidden_states.detach().cpu().numpy(), return_tensors="pt")

# Compute doc_scores
doc_scores = torch.bmm(question_hidden_states.unsqueeze(1), retrieved_docs["retrieved_doc_embeds"].float().transpose(1, 2)).squeeze(1)

# Generate the answer
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    context_input_ids=retrieved_docs["context_input_ids"],
    context_attention_mask=retrieved_docs["context_attention_mask"],
    doc_scores=doc_scores,
    max_new_tokens=50,
    num_return_sequences=1,
    temperature=None,
    top_k=None,
    do_sample=False,
)

# Decode the model output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Clear memory appropriately.
del model, tokenizer, inputs, retrieved_docs, outputs, retriever, dataset  # delete objects to free memory
torch.cuda.empty_cache()

print(response)

2025-03-08 04:35:55.310194: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 04:35:55.317538: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741426555.325144   60012 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741426555.327568   60012 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 04:35:55.337081: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)

In [1]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration, DPRContextEncoder, DPRContextEncoderTokenizerFast
from datasets import Dataset, Features, Sequence, Value
import torch

# Determine the device to use for running the generator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "facebook/rag-sequence-base"
model = RagSequenceForGeneration.from_pretrained(model_name).to(device)
tokenizer = RagTokenizer.from_pretrained(model_name)

# Manually set the context
context = "Andrew's favorite color is blue."
# Create a dataset with the context document
data = {"title": ["context"], "text": [context]}
dataset = Dataset.from_dict(data)

# Compute embeddings for the context document
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base").to(device)
ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

def embed(documents):
    inputs = ctx_tokenizer(documents["text"], truncation=True, padding="longest", return_tensors="pt")
    embeddings = ctx_encoder(**inputs.to(device)).pooler_output
    return {"embeddings": embeddings.detach().cpu().numpy()}

dataset = dataset.map(embed, batched=True, batch_size=1)

# Define the features for the dataset
features = Features({
    "text": Value("string"),
    "title": Value("string"),
    "embeddings": Sequence(Value("float32"))
})

dataset = dataset.cast(features)

# Save the dataset to disk
dataset_path = "./context_dataset"
dataset.save_to_disk(dataset_path)

# Index the dataset
dataset.add_faiss_index(column="embeddings")
index_path = "./context_index"
dataset.get_index("embeddings").save(index_path)

# Load the retriever with the indexed dataset
retriever = RagRetriever.from_pretrained(model_name, index_name="custom", passages_path=dataset_path, index_path=index_path)

# Example input
question = "What is Andrew's favorite color?"
inputs = tokenizer(question, return_tensors="pt").to(device)

# Retrieve documents
question_hidden_states = model.question_encoder(inputs.input_ids)[0]
retrieved_docs = retriever(inputs.input_ids.cpu().numpy(), question_hidden_states.detach().cpu().numpy(), return_tensors="pt")

# Move retrieved document embeddings to the same device as question_hidden_states
retrieved_doc_embeds = retrieved_docs["retrieved_doc_embeds"].to(device)

# Compute doc_scores
doc_scores = torch.bmm(question_hidden_states.unsqueeze(1), retrieved_doc_embeds.float().transpose(1, 2)).squeeze(1)

# Generate the answer
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    context_input_ids=retrieved_docs["context_input_ids"].to(device),
    context_attention_mask=retrieved_docs["context_attention_mask"].to(device),
    doc_scores=doc_scores,
    max_new_tokens=50,
    num_return_sequences=1,
    temperature=None,
    top_k=None,
    do_sample=False,
)

# Decode the model output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Clear memory appropriately.
del model, tokenizer, inputs, retrieved_docs, outputs, retriever, dataset  # delete objects to free memory
torch.cuda.empty_cache()

print(response)

2025-03-08 04:37:53.583324: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 04:37:53.590596: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741426673.598320   60486 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741426673.600686   60486 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 04:37:53.610069: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

AssertionError: Make sure that `context_input_ids` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.

In [1]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration, DPRContextEncoder, DPRContextEncoderTokenizerFast
from datasets import Dataset, Features, Sequence, Value
import torch

# Determine the device to use for running the generator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "facebook/rag-sequence-base"
model = RagSequenceForGeneration.from_pretrained(model_name).to(device)
tokenizer = RagTokenizer.from_pretrained(model_name)

# Manually set the context
context = "Andrew's favorite color is blue."
# Create a dataset with the context document
data = {"title": ["context"], "text": [context]}
dataset = Dataset.from_dict(data)

# Compute embeddings for the context document
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base").to(device)
ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

def embed(documents):
    inputs = ctx_tokenizer(documents["text"], truncation=True, padding="longest", return_tensors="pt")
    embeddings = ctx_encoder(**inputs.to(device)).pooler_output
    return {"embeddings": embeddings.detach().cpu().numpy()}

dataset = dataset.map(embed, batched=True, batch_size=1)

# Define the features for the dataset
features = Features({
    "text": Value("string"),
    "title": Value("string"),
    "embeddings": Sequence(Value("float32"))
})

dataset = dataset.cast(features)

# Save the dataset to disk
dataset_path = "./context_dataset"
dataset.save_to_disk(dataset_path)

# Index the dataset
dataset.add_faiss_index(column="embeddings")
index_path = "./context_index"
dataset.get_index("embeddings").save(index_path)

# Load the retriever with the indexed dataset
retriever = RagRetriever.from_pretrained(model_name, index_name="custom", passages_path=dataset_path, index_path=index_path)

# Set the retriever for the model
model.set_retriever(retriever)

# Example input
question = "What is Andrew's favorite color?"
inputs = tokenizer(question, return_tensors="pt").to(device)

# Generate the answer
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=50,
    num_return_sequences=1,
    temperature=None,
    top_k=None,
    do_sample=False,
)

# Decode the model output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Clear memory appropriately.
del model, tokenizer, inputs, outputs, retriever, dataset  # delete objects to free memory
torch.cuda.empty_cache()

print(response)

2025-03-08 04:40:43.021099: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 04:40:43.037574: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741426843.054439   61109 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741426843.059759   61109 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 04:40:43.085439: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

 spendingwordpress Constantin coercive�oxicity233 childcare garden garden garden garden garden garden garden garden garden garden Guinea gou responded Acting undrafted healer Actingjump garden Actingjump garden garden garden garden garden Acting 330riminal ozriminal oz TG TG TG TG TG TG TG TG TG


In [1]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration, DPRContextEncoder, DPRContextEncoderTokenizerFast
from datasets import Dataset, Features, Sequence, Value
import torch

# Determine the device to use for running the generator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "facebook/rag-sequence-base"
model = RagSequenceForGeneration.from_pretrained(model_name).to(device)
tokenizer = RagTokenizer.from_pretrained(model_name)

# Manually set the context
context = "Andrew's favorite color is blue."
# Create a dataset with the context document
data = {"title": ["context"], "text": [context]}
dataset = Dataset.from_dict(data)

# Compute embeddings for the context document
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base").to(device)
ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

def embed(documents):
    inputs = ctx_tokenizer(documents["text"], truncation=True, padding="longest", return_tensors="pt")
    embeddings = ctx_encoder(**inputs.to(device)).pooler_output
    return {"embeddings": embeddings.detach().cpu().numpy()}

dataset = dataset.map(embed, batched=True, batch_size=1)

# Define the features for the dataset
features = Features({
    "text": Value("string"),
    "title": Value("string"),
    "embeddings": Sequence(Value("float32"))
})

dataset = dataset.cast(features)

# Save the dataset to disk
dataset_path = "./context_dataset"
dataset.save_to_disk(dataset_path)

# Index the dataset
dataset.add_faiss_index(column="embeddings")
index_path = "./context_index"
dataset.get_index("embeddings").save(index_path)

# Load the retriever with the indexed dataset
retriever = RagRetriever.from_pretrained(model_name, index_name="custom", passages_path=dataset_path, index_path=index_path)

# Set the retriever for the model
model.set_retriever(retriever)

# Example input
question = "What is Andrew's favorite color?"
inputs = tokenizer(question, return_tensors="pt").to(device)

# Generate the answer
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=50,
    num_return_sequences=1,
    temperature=None,
    top_k=None,
    do_sample=False,
)

# Decode the model output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Clear memory appropriately.
del model, tokenizer, inputs, outputs, retriever, dataset  # delete objects to free memory
torch.cuda.empty_cache()

print(response)

2025-03-08 04:42:49.906224: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 04:42:49.913146: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741426969.920862   61620 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741426969.923138   61620 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 04:42:49.932050: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

 operators unheard imperialist deleting hospitalized hospitalized hospitalized hospitalized hospitalized Establishment adventurer hospitalized hospitalized hospitalized Establishment adventurer hospitalized hospitalized hospitalized hospitalized motive hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized hospitalized Rebeccaitect extremely


In [1]:
import os
import torch
from datasets import load_dataset
from transformers import DPRContextEncoder, \
                         DPRContextEncoderTokenizer, \
                         RagTokenizer, \
                         RagRetriever, \
                         RagSequenceForGeneration, \
                         logging
 
torch.set_grad_enabled(False)
 
# Suppress Warnings
logging.set_verbosity_error()
 
# Initialize context encoder & decoder model
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
 
dataset_name = "rony/soccer-dialogues"
localfile_name = dataset_name.split('/')[-1]
 
# load 100 rows from the dataset
ds = load_dataset(dataset_name, split='train[:100]')
def transforms(examples):
    """
    Transform dataset to be passed
    as an input to the RAG model
    """
    inputs = {}
    inputs['text'] = examples['text'].replace('_',' ')
    inputs['embeddings'] = ctx_encoder(**ctx_tokenizer(inputs['text'], return_tensors="pt"))[0][0].numpy()
    inputs['title'] = 'soccer'
    return inputs
ds = ds.map(transforms)
 
# Add faiss index to the dataset, it is needed for DPR
ds.add_faiss_index(column='embeddings')
 
# Initialize retriever and model
rag_model = "facebook/rag-sequence-nq"
tokenizer = RagTokenizer.from_pretrained(rag_model)
retriever = RagRetriever.from_pretrained(rag_model, indexed_dataset=ds)
model = RagSequenceForGeneration.from_pretrained(rag_model, retriever=retriever)
 
# Generate output for questions
question = "How old is Granit Xhaka"
input_dict = tokenizer(question, return_tensors="pt")
generated = model.generate(input_ids=input_dict["input_ids"], max_new_tokens=50)
 
print(f"{question}?")
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

2025-03-08 04:49:41.066969: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 04:49:41.074472: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741427381.082344   62964 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741427381.084717   62964 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 04:49:41.095173: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

train.txt:   0%|          | 0.00/10.7M [00:00<?, ?B/s]

validation.txt:   0%|          | 0.00/648k [00:00<?, ?B/s]

test.txt:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/330409 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/19919 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/45620 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

How old is Granit Xhaka?
kun Dock Andre tidalsaf Ny driving Sle 118 <!--="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="#="# laboratory


In [7]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")

# prompt = (
#     "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
#     "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
#     "researchers was the fact that the unicorns spoke perfect English."
# )

prompt = (
    "Andrew's favorite color is blue. What is his favorite color?"
)

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

gen_tokens = model.generate(
    input_ids,
    # do_sample=True,
    do_sample=False,
    temperature=0.9,
    max_length=100,
)
gen_text = tokenizer.batch_decode(gen_tokens)[0]

print(gen_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Andrew's favorite color is blue. What is his favorite color?

Andrew's favorite color is blue. What is his favorite color?

Andrew's favorite color is blue. What is his favorite color?

Andrew's favorite color is blue. What is his favorite color?

Andrew's favorite color is blue. What is his favorite color?

Andrew's favorite color is blue. What is his favorite color?

Andrew's favorite color is blue. What is his


In [5]:
print(gen_text)

Andrew's favorite color is blue. And his favorite foods are blue. And he even thinks it's time to have a blue birthday.

"I really think my birthday is coming up," he said. "Blue is the color of the sky. This is the color of the sky."

His favorite book is called "The Hobbit." He loves "The Lion King" and "Pocahontas" and "War Horse," and his favorite cartoon is "Peppa Pig."


In [29]:
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

# Load pre-trained model and tokenizer
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

# Define context and question
# context = "Andrew's favorite color is blue."
# context = "Andrew's favorite color is blue and Laura's is red."
# question = "What is Laura's favorite color?"
# context = "In ultimate frisbee, a player may not hold the disc for more than 10 seconds. If they do, it is a turnover."
# question = "How long can a player hold the disc in ultimate frisbee?"
# context = "A player on the throwing team may not touch the pull in the air before a member of the receiving team touches it. If this violation occurs, the receiving team may request a re-pull immediately."
# question = "What happens if a player on the throwing team touches the pull in the air before a member of the receiving team touches it?"
context = "7.B.4.c. The player who had possession of the disc when the team timeout was called restarts play with a check at the pivot spot, and the marker resumes the stall count with the word “stalling” followed by the last number uttered before the timeout plus one or 9 if over 8, however 15.A.4 applies."
question = "Who starts with the disc at the end of a timeout?"

# Tokenize input
inputs = tokenizer.encode_plus(question, context, return_tensors="pt")

# Get model outputs
outputs = model(**inputs)

# Extract answer
answer_start_scores = outputs.start_logits
answer_end_scores = outputs.end_logits

# Find the tokens with the highest `start` and `end` scores
answer_start = torch.argmax(answer_start_scores)
answer_end = torch.argmax(answer_end_scores) + 1

# Convert tokens to answer
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))

print(f"Answer: {answer}")

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Answer: the player


In [None]:
# This doesn't crash though the output is poor!

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

model_name = "deepseek-ai/deepseek-llm-7b-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")

model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

# text = "An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is"
text = "Andrew's favorite color is violet and Laura's favorite color is green. What is Laura's favorite color?"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

2025-03-08 06:03:38.977530: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 06:03:39.029941: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741431819.043954   77375 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741431819.048305   77375 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 06:03:39.078468: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Andrew's favorite color is violet and Laura's favorite color is green. What is Laura's favorite color?

A. Violet
B. Green
C. Blue
D. Red
E. Yellow

____________________

____________________

____________________

____________________

____________________

____________________

____________________

____________________

____________________

____________________

____________________

____________________

____________________

____________________

____________________

____________________

____________________

____________________

____________________

________________


In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

model_name = "deepseek-ai/deepseek-llm-7b-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")

model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

context = "Andrew's favorite color is violet and Laura's favorite color is green. They both enjoy painting and often use their favorite colors in their artwork. Andrew prefers abstract art, while Laura loves painting landscapes. They recently participated in an art exhibition where Andrew's violet-themed abstract piece won first place, and Laura's green landscape painting received a special mention."
question = "What is Laura's favorite color?"
inputs = tokenizer(context + " " + question, return_tensors="pt")
outputs = model.generate(
    **inputs.to(model.device),
    max_new_tokens=50,
    temperature=0.2,
    top_p=1.0,
    top_k=0,
    repetition_penalty=1.0,
    do_sample=True
)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

2025-03-08 06:19:15.305235: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 06:19:15.338613: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741432755.348248   82505 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741432755.350988   82505 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 06:19:15.365701: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Andrew's favorite color is violet and Laura's favorite color is green. They both enjoy painting and often use their favorite colors in their artwork. Andrew prefers abstract art, while Laura loves painting landscapes. They recently participated in an art exhibition where Andrew's violet-themed abstract piece won first place, and Laura's green landscape painting received a special mention. What is Laura's favorite color?

A. Violet

B. Green

C. Blue

D. Red

E. Yellow

## 15.3

Andrew and Laura are both artists. Andrew's favorite color is violet,


In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

model_name = "deepseek-ai/deepseek-llm-7b-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")

model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

context = "Andrew's favorite color is violet and Laura's favorite color is green. They both enjoy painting and often use their favorite colors in their artwork. Andrew prefers abstract art, while Laura loves painting landscapes. They recently participated in an art exhibition where Andrew's violet-themed abstract piece won first place, and Laura's green landscape painting received a special mention."
question = "What is Laura's favorite color?"
inputs = tokenizer(context + " " + question, return_tensors="pt")
outputs = model.generate(
    **inputs.to(model.device),
    max_new_tokens=50,
    do_sample=False  # Turn off sampling for deterministic generation
)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

2025-03-08 06:21:01.810116: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 06:21:01.843765: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741432861.853536   83060 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741432861.856522   83060 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 06:21:01.871336: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Andrew's favorite color is violet and Laura's favorite color is green. They both enjoy painting and often use their favorite colors in their artwork. Andrew prefers abstract art, while Laura loves painting landscapes. They recently participated in an art exhibition where Andrew's violet-themed abstract piece won first place, and Laura's green landscape painting received a special mention. What is Laura's favorite color?

A. Violet

B. Green

C. Blue

D. Red

E. Yellow

## 10.10

## 10.10

## 10.


In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

model_name = "deepseek-ai/deepseek-llm-7b-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")

model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

context = "Andrew's favorite color is violet and Laura's favorite color is green."
question = "What is Laura's favorite color?"
inputs = tokenizer(context + " " + question, return_tensors="pt")
outputs = model.generate(
    **inputs.to(model.device),
    max_new_tokens=50,
    do_sample=False  # Turn off sampling for deterministic generation
)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

2025-03-08 06:23:44.843141: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 06:23:44.853545: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741433024.860584   83919 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741433024.862689   83919 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 06:23:44.872447: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Andrew's favorite color is violet and Laura's favorite color is green. What is Laura's favorite color?

A. Violet
B. Green
C. Blue
D. Red
E. Yellow

____________________

____________________

____________________

____________________

____________________

____________________

____________________



In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

model_name = "deepseek-ai/deepseek-llm-7b-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")

model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

context = "Andrew's favorite color is violet and Laura's favorite color is green."
question = "What is Laura's favorite color? Provide a direct answer."
inputs = tokenizer(context + " " + question, return_tensors="pt")
outputs = model.generate(
    **inputs.to(model.device),
    max_new_tokens=10,
    do_sample=False  # Turn off sampling for deterministic generation
)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

2025-03-08 06:26:05.633716: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 06:26:05.666430: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741433165.675668   84643 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741433165.678494   84643 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 06:26:05.697729: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Andrew's favorite color is violet and Laura's favorite color is green. What is Laura's favorite color? Provide a direct answer.

### What is the value of the variable
