In [None]:
import json

import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

# Load POS and DEP mappings
def load_mappings(dep_path, pos_path):
    """Load the dependency and part-of-speech mappings from two JSON files.

    Args:
        dep_path: Path of the JSON file holding the DEP mapping.
        pos_path: Path of the JSON file holding the POS mapping.

    Returns:
        A ``(dep_mapping, pos_mapping)`` tuple with the decoded JSON content
        of each file, in that order.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # default locale encoding when decoding the files.
    with open(dep_path, encoding="utf-8") as dep_file:
        dep_mapping = json.load(dep_file)
    with open(pos_path, encoding="utf-8") as pos_file:
        pos_mapping = json.load(pos_file)
    return dep_mapping, pos_mapping

# Custom dataset class
class CustomRAGDataset(Dataset):
    """Map-style dataset that pairs a tokenized question with DEP/POS tag rows.

    Each element of ``data`` is read through the keys ``'question'``, ``'id'``
    and ``'answer'``. The tag sequences for an example are looked up in
    ``dep_mapping`` / ``pos_mapping`` by the example's ``'id'``.

    Args:
        data: Indexable collection of example dicts.
        tokenizer: Callable invoked as
            ``tokenizer(question, truncation=True, padding='max_length',
            max_length=...)``; its result must expose ``'input_ids'`` and
            ``'attention_mask'``.
        dep_mapping: Mapping from example id to a list of dependency tag ids.
        pos_mapping: Mapping from example id to a list of POS tag ids.
        max_length: Length that token and tag sequences are padded or
            truncated to.
    """

    def __init__(self, data, tokenizer, dep_mapping, pos_mapping, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.dep_mapping = dep_mapping
        self.pos_mapping = pos_mapping
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    @staticmethod
    def _lookup_tags(mapping, item_id):
        """Return the tags stored for ``item_id``, or ``[]`` when there are none."""
        tags = mapping.get(item_id)
        if tags is None and not isinstance(item_id, str):
            # Mappings decoded from JSON always have string keys, so an
            # integer id would otherwise never match and the example would
            # silently receive all-zero tags.
            tags = mapping.get(str(item_id))
        return [] if tags is None else tags

    def _fit_to_length(self, tags):
        """Truncate ``tags`` to ``max_length`` and right-pad with zeros."""
        clipped = list(tags[:self.max_length])
        return clipped + [0] * (self.max_length - len(clipped))

    def __getitem__(self, idx):
        """Build the tensors for example ``idx``.

        Returns:
            A dict with the tensors ``'input_ids'``, ``'attention_mask'``,
            ``'dep_tags'``, ``'pos_tags'`` and ``'answer'``.

        Raises:
            KeyError: If the example lacks ``'question'``, ``'id'`` or
                ``'answer'``.
        """
        item = self.data[idx]
        tokenized_input = self.tokenizer(
            item['question'],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
        )

        item_id = item['id']
        dep_tags = self._fit_to_length(self._lookup_tags(self.dep_mapping, item_id))
        pos_tags = self._fit_to_length(self._lookup_tags(self.pos_mapping, item_id))

        return {
            'input_ids': torch.tensor(tokenized_input['input_ids']),
            'attention_mask': torch.tensor(tokenized_input['attention_mask']),
            'dep_tags': torch.tensor(dep_tags),
            'pos_tags': torch.tensor(pos_tags),
            # Assumes item['answer'] is already numeric (e.g. token ids) --
            # TODO confirm. It is neither padded nor truncated here.
            'answer': torch.tensor(item['answer']),
        }

# Load dataset
def load_dataset(data_path):
    """Load the dataset stored as JSON at ``data_path``.

    Args:
        data_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale default,
    # so non-ASCII text loads identically on every platform.
    with open(data_path, encoding="utf-8") as data_file:
        return json.load(data_file)

# Example usage
tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b_v2", use_fast=False)
if tokenizer.pad_token is None:
    # padding='max_length' in the dataset needs a pad token; reuse EOS when
    # the checkpoint does not define one (no-op otherwise).
    tokenizer.pad_token = tokenizer.eos_token

dep_mapping, pos_mapping = load_mappings('path/to/dep_mapping.json', 'path/to/pos_mapping.json')
dataset = load_dataset('path/to/dataset.json')

# Create dataset and dataloaders
train_dataset = CustomRAGDataset(dataset, tokenizer, dep_mapping, pos_mapping)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=4)  # Adjust batch_size as needed

# Example usage in training loop
# Prefer the GPU, but fall back to the CPU so the cell still runs without one.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForCausalLM.from_pretrained("openlm-research/open_llama_7b_v2")
model.to(device)

num_epochs = 3  # Example: 3 epochs

# NOTE(review): transformers' AdamW is deprecated upstream in favour of
# torch.optim.AdamW -- confirm against the installed transformers version.
optimizer = AdamW(model.parameters(), lr=5e-5)
# The scheduler is stepped once per batch in every epoch, so its horizon must
# cover all epochs; otherwise the learning rate hits zero after the first one.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * num_epochs,
)

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # NOTE(review): batch['dep_tags'] / batch['pos_tags'] are produced by
        # the dataset but are not passed to the model below.
        # NOTE(review): presumably labels must match input_ids in shape for
        # the causal-LM loss -- confirm how 'answer' is prepared.
        labels = batch['answer'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()


In [None]:
def evaluate_model(model, eval_dataloader):
    """Compute, print and return the mean loss of ``model`` over ``eval_dataloader``.

    The model is switched to evaluation mode and left in that mode.

    Args:
        model: Module called as
            ``model(input_ids=..., attention_mask=..., labels=...)`` whose
            output exposes a ``loss`` attribute.
        eval_dataloader: Iterable of batches; each batch is a dict holding
            the tensors ``'input_ids'``, ``'attention_mask'`` and ``'answer'``.

    Returns:
        The average per-batch loss as a float, or ``nan`` when the dataloader
        yields no batches.
    """
    # Send batches to wherever the model lives instead of assuming 'cuda'.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for batch in eval_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['answer'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()
            # Count batches explicitly so iterables without __len__ work too.
            num_batches += 1

    if num_batches == 0:
        # Avoid a ZeroDivisionError on an empty evaluation set.
        print("Average Evaluation Loss: nan (no evaluation batches)")
        return float('nan')

    avg_eval_loss = total_loss / num_batches
    print(f"Average Evaluation Loss: {avg_eval_loss}")
    return avg_eval_loss

# Example evaluation: wrap `dataset` again, walk it in its stored order
# (SequentialSampler) in batches of 4, and report the average loss.
eval_dataset = CustomRAGDataset(
    dataset,
    tokenizer,
    dep_mapping,
    pos_mapping,
)
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=4,
    sampler=eval_sampler,
)

evaluate_model(model, eval_dataloader)
