In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, RagTokenizer, RagRetriever, RagSequenceForGeneration
import logging, json
import spacy
from datasets import load_dataset
from torch.utils.data import DataLoader, SequentialSampler
from tqdm import tqdm
import json
import os

In [5]:
# Report the installed PyTorch build and whether a CUDA device is visible.
print(torch.__version__)
print(torch.cuda.is_available())  # Expected to be False on a CPU-only build (the recorded run shows "2.4.0+cpu")

2.4.0+cpu
False


Since LLaMA is a causal language model, you should use the AutoModelForCausalLM class instead of AutoModelForSeq2SeqLM.

In [2]:
# Hub id shared by the tokenizer and the causal-LM weights.
LLAMA_CHECKPOINT = "openlm-research/open_llama_7b_v2"

# use_fast=False explicitly requests the non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(LLAMA_CHECKPOINT, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(LLAMA_CHECKPOINT)

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
Loading checkpoint shards: 100%|██████████| 2/2 [01:16<00:00, 38.31s/it]


In [3]:
# Root logging configuration: INFO and above, each record stamped with the
# time, level and source location (file:line) of the call.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d : %(message)s",
)

# Logger used by the cells below.
logger = logging.getLogger(__name__)

# spaCy pipeline that preprocess_text uses for POS and dependency tags.
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Run the module-level spaCy pipeline on ``text``.

    Args:
        text: String handed to ``nlp``.

    Returns:
        A ``(tokens, pos_tags, dep_tags)`` tuple of three parallel lists
        holding, for every token of the parsed document, its ``text``,
        ``pos_`` and ``dep_`` attribute respectively.
    """
    tokens, pos_tags, dep_tags = [], [], []
    # Single pass over the parsed document, filling the three lists in step.
    for tok in nlp(text):
        tokens.append(tok.text)
        pos_tags.append(tok.pos_)
        dep_tags.append(tok.dep_)
    return tokens, pos_tags, dep_tags



In [6]:
# Keep all computation on the CPU (the environment check above reports no CUDA device).
device = torch.device("cpu")
# Trailing expression: the notebook echoes the module returned by .to(), hence the printed model structure.
model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-06)
      

In [7]:
# Load the tokenizer for RAG
# NOTE: this rebinds `tokenizer`, which the LLaMA loading cell assigned earlier.
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")

# Load the dataset with the correct configuration and trust_remote_code=True
# NOTE(review): the recorded output of this cell ends in `MemoryError: std::bad_alloc`.
# Presumably this dataset (or the index built from it) does not fit in RAM on this
# machine — confirm which call actually fails before re-running.
dataset = load_dataset("wiki_dpr", "psgs_w100.nq.exact", split="train", trust_remote_code=True)

# Load the retriever using the dataset
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq",
    index_name="exact",
    use_dummy_dataset=False,  # use the actual dataset
    indexed_dataset=dataset
)

# Load the RAG sequence model
# NOTE: this rebinds `model`, replacing the LLaMA model that was moved to `device` earlier.
model = RagSequenceForGeneration.from_pretrained(
    "facebook/rag-sequence-nq", 
    retriever=retriever
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

MemoryError: std::bad_alloc

Alternative to the code above: manual tokenizer configuration to avoid the tokenizer-class warnings

In [8]:
# Load the RAG components. The retriever is built with use_dummy_dataset=True
# rather than the full dataset used in the previous cell.
rag_tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True)
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)

# Reuse the two sub-tokenizers that RagTokenizer already holds instead of
# loading them again. This keeps them on the same checkpoint as the model
# (the previous code pulled them from "facebook/rag-token-nq") and avoids
# referencing DPRQuestionEncoderTokenizer / BartTokenizer, which this notebook
# never imports.
question_encoder_tokenizer = rag_tokenizer.question_encoder
generator_tokenizer = rag_tokenizer.generator

# Attach them to the model's components as plain attributes.
# NOTE(review): nothing in this notebook reads these attributes back; they are
# kept only so code expecting `model.<component>.tokenizer` keeps working.
model.question_encoder.tokenizer = question_encoder_tokenizer
model.generator.tokenizer = generator_tokenizer


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

: 

In [None]:
# Ensure everything is set to CPU
# `model` is whichever model the most recently executed loading cell bound to that name.
device = torch.device("cpu")
# Trailing expression: the notebook displays the module returned by .to().
model.to(device)

In [None]:
def load_and_preprocess_dataset(dataset_name="squad", split="train[:10%]"):
    """Load a dataset split and annotate every example.

    For each example the ``context`` and ``question`` fields are tagged with
    ``preprocess_text`` and the (question, context) pair is encoded with the
    module-level ``tokenizer``.

    Args:
        dataset_name: Name forwarded to ``load_dataset``.
        split: Split expression forwarded to ``load_dataset``.

    Returns:
        A list with one dict per example containing ``input_ids`` plus the
        token, POS and dependency lists of both the context and the question.
    """
    records = []

    for example in tqdm(load_dataset(dataset_name, split=split)):
        question_text = example['question']
        context_text = example['context']

        # Linguistic annotations: context first, then question.
        ctx_tokens, ctx_pos, ctx_dep = preprocess_text(context_text)
        q_tokens, q_pos, q_dep = preprocess_text(question_text)

        # Encode the pair as PyTorch tensors; only the ids are retained.
        encoding = tokenizer(
            question_text,
            context_text,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )

        records.append(dict(
            input_ids=encoding.input_ids,
            context_tokens=ctx_tokens,
            context_pos=ctx_pos,
            context_dep=ctx_dep,
            question_tokens=q_tokens,
            question_pos=q_pos,
            question_dep=q_dep,
        ))

    return records



In [None]:
# Load and preprocess the data
# `split` is not passed, so the function's default of "train[:10%]" applies.
dataset_name = "squad"
preprocessed_data = load_and_preprocess_dataset(dataset_name=dataset_name)


In [1]:
def evaluate(model, preprocessed_data, device):
    """Generate an answer for every preprocessed example.

    Args:
        model: Object exposing ``eval()`` and ``generate()``.
        preprocessed_data: Iterable of dicts providing ``input_ids`` and
            ``question_tokens`` entries.
        device: Target handed to ``input_ids.to``.

    Returns:
        A list of ``{"question", "generated_answer"}`` dicts. The question is
        the example's tokens joined with single spaces; the answer is the
        first generated sequence decoded with the module-level ``tokenizer``.
    """
    model.eval()

    def _answer(example):
        # Beam search (5 beams, at most 50 tokens) without gradient tracking.
        ids_on_device = example['input_ids'].to(device)
        with torch.no_grad():
            sequences = model.generate(input_ids=ids_on_device, max_length=50, num_beams=5)
        decoded = tokenizer.decode(sequences[0], skip_special_tokens=True)
        return {
            "question": ' '.join(example['question_tokens']),
            "generated_answer": decoded,
        }

    return [_answer(example) for example in tqdm(preprocessed_data, desc="Evaluating")]

In [2]:
# Evaluate the model on the preprocessed data
# NOTE: the recorded run of this cell raised "NameError: name 'model' is not defined";
# `model`, `preprocessed_data` and `device` must be created by earlier cells in the same kernel session.
results = evaluate(model, preprocessed_data, device)

# Print a few results
# Only the first five results are logged, numbered from 1.
for i, result in enumerate(results[:5]):
    logger.info(f"Question {i+1}: {result['question']}")
    logger.info(f"Generated Answer {i+1}: {result['generated_answer']}\n")

NameError: name 'model' is not defined

In [3]:
def load_json_files(json_files):
    """Collect question documents from a list of JSON files.

    A file may contain a single JSON value or a JSON array. Every JSON object
    that has a ``'question'`` key yields one document of the form::

        {'text': <question>, 'pos_tags': [...], 'dep_tags': [...]}

    where ``pos_tags`` and ``dep_tags`` are taken from the optional
    ``'question_pos_tokens'`` and ``'question_dep_ids'`` keys (default ``[]``).

    Missing files, files that cannot be decoded, and entries that are not
    objects with a ``'question'`` key are reported on stdout and skipped.

    Args:
        json_files: Iterable of file paths.

    Returns:
        List of document dicts, in file order and then entry order.
    """
    documents = []
    for file in json_files:
        if not os.path.exists(file):
            print(f"Warning: {file} not found. Skipping.")
            continue

        # Read as UTF-8 explicitly: open()'s default encoding is locale
        # dependent, which can mis-decode non-ASCII questions on Windows.
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            print(f"Error: Could not decode JSON from {file}. Skipping.")
            continue

        # Anything that is not an array (a single object, or a bare scalar or
        # string) is treated as a one-element collection.
        if not isinstance(data, list):
            data = [data]

        for item in data:
            # Only JSON objects can carry a 'question' field. The isinstance
            # check stops a bare string containing "question" from passing the
            # membership test and then failing on item['question'].
            if isinstance(item, dict) and 'question' in item:
                documents.append({
                    'text': item['question'],
                    'pos_tags': item.get('question_pos_tokens', []),
                    'dep_tags': item.get('question_dep_ids', [])
                })
            else:
                print(f"Warning: No suitable field found in {file}. Skipping this item.")
    return documents

# Directories of the datasets to ingest
dataset_dirs = [
    'data/lcquad2',
    'data/qald9',
    'data/vquanda'
]

# File names looked up inside every dataset directory, in load order
DATASET_FILE_NAMES = (
    'dep_mapping.json',
    'pos_mapping.json',
    'test.json',
    'train.json',
    'val.json',
)

all_documents = []

# Visit each dataset directory and append its documents to the master list
for dataset_dir in dataset_dirs:
    json_files = [os.path.join(dataset_dir, file_name) for file_name in DATASET_FILE_NAMES]

    # Files that are missing or hold no question entries are reported and skipped by the loader
    documents = load_json_files(json_files)
    all_documents.extend(documents)

# all_documents now holds the combined data from all listed datasets
print(f"Total documents loaded: {len(all_documents)}")

# Example: Putting data into RAG (This part will depend on your specific RAG setup)
# Assuming you have a function like this:
# rag_model.add_documents(all_documents)


NameError: name 'os' is not defined

In [None]:
# Load a pre-trained RAG model
# NOTE: rebinds `tokenizer` and `model`; the model is created without a retriever here.
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq")

# Initialize the retriever with the combined documents
# NOTE(review): `all_documents` is a plain list of dicts with 'text', 'pos_tags' and 'dep_tags' keys.
# The custom-index options described in the RagRetriever documentation are `passages_path`/`index_path`
# or `indexed_dataset` — confirm that a `passages=` keyword is actually accepted before relying on this call.
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq",
    index_name="custom",
    passages=all_documents  # Use combined documents
)

# Save the retriever for later use
# NOTE(review): "path/to/save/retriever" looks like a placeholder path — replace with a real directory.
retriever.save_pretrained("path/to/save/retriever")


In [4]:
# Load the RAG tokenizer and retriever (dummy retrieval dataset)
rag_tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
rag_retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)

# Initialize the RAG model. No retriever is attached, so retrieval is done
# explicitly below and its results are passed to generate().
rag_model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq")

# Tokenize the questions and contexts
# NOTE(review): `data` is not defined in this cell; it is assumed to provide
# 'question' and 'context' entries holding lists of strings — confirm upstream.
inputs = rag_tokenizer(data['question'], return_tensors="pt", padding=True, truncation=True)
contexts = rag_tokenizer(data['context'], return_tensors="pt", padding=True, truncation=True)

with torch.no_grad():
    # 1. Encode the questions. The retriever needs these embeddings in
    #    addition to the token ids; calling it with ids alone fails.
    question_hidden_states = rag_model.question_encoder(
        inputs['input_ids'], attention_mask=inputs['attention_mask']
    )[0]

    # 2. Retrieve the top-k documents using FAISS (the retriever takes NumPy inputs)
    retrieved_docs = rag_retriever(
        inputs['input_ids'].numpy(),
        question_hidden_states.detach().numpy(),
        return_tensors="pt",
    )

    # 3. Score each retrieved document against its question (inner product)
    doc_scores = torch.bmm(
        question_hidden_states.unsqueeze(1),
        retrieved_docs['retrieved_doc_embeds'].float().transpose(1, 2),
    ).squeeze(1)

    # 4. Generate the answers using the retrieved documents. Without an
    #    attached retriever the model needs the context ids, their attention
    #    mask and the document scores together.
    generated_ids = rag_model.generate(
        context_input_ids=retrieved_docs['context_input_ids'],
        context_attention_mask=retrieved_docs['context_attention_mask'],
        doc_scores=doc_scores,
    )
generated_texts = rag_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

for question, generated_text in zip(data['question'], generated_texts):
    print(f"Question: {question}")
    print(f"Generated Answer: {generated_text}\n")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in 

ImportError: 
RagRetriever requires the 🤗 Datasets library but it was not found in your environment. You can install it with:
```
pip install datasets
```
In a notebook or a colab, you can install it by executing a cell with
```
!pip install datasets
```
then restarting your kernel.

Note that if you have a local folder named `datasets` or a local python file named `datasets.py` in your current
working directory, python may try to import this instead of the 🤗 Datasets library. You should rename this folder or
that python file if that's the case. Please note that you may need to restart your runtime after installation.


In [None]:
# Define training arguments
# Evaluation runs once per epoch, for 3 epochs, with a batch size of 2 per device;
# at most 3 checkpoints are kept under ./results.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
)
# Define the trainer
# NOTE(review): the only top-level assignment to `dataset` in this notebook is the wiki_dpr load,
# whose recorded run ended in MemoryError — confirm which dataset is intended here.
# NOTE(review): the same object is used for both training and evaluation, and no tokenizer or
# data collator is passed — verify this is deliberate.
trainer = Seq2SeqTrainer(
    model=rag_model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
)
# Fine-tune the model
trainer.train()