<a href="https://colab.research.google.com/github/aswinaus/Reinforcement-Learning/blob/main/Agentic_RAG_RewardFunction_GRPO_W%26B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install llama-index -q
!pip install langchain -q
!pip install langchain_experimental -q

In [None]:
import os
import nest_asyncio
nest_asyncio.apply()

In [None]:
from google.colab import userdata
# Read the OpenAI API key through google.colab's `userdata` helper and export it
# as an environment variable for this process.
os.environ["OPENAI_API_KEY"] =  userdata.get('OPENAI_API_KEY')

In [None]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
# Configure the shared LlamaIndex `Settings` object used for indexing the documents:
# the LLM, the embedding model, and the chunk size.
Settings.llm = OpenAI(model='gpt-4o-mini', temperature=0.2)
Settings.embed_model = OpenAIEmbedding(model='text-embedding-3-small')
Settings.chunk_size = 1024

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')
# Root folder for the notebook's data; other cells read and persist files under
# "<data_dir>/RAG/data/".
data_dir = '/content/drive/MyDrive' # Input a data dir path from your mounted Google Drive

In [None]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector
from llama_index.core import SimpleDirectoryReader
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core import VectorStoreIndex, SummaryIndex

In [None]:
# To avoid repeated embedding/LLM calls, each document index is persisted to disk
# and reloaded on later runs instead of being rebuilt.
# The trailing "/" is kept because the index name is appended directly. No leading
# "/" is added: data_dir is used as given, matching the other data paths.
PERSIST_INDEX_DIR = f"{data_dir}/RAG/data/"


def get_index(index_name, doc_file_path):
    """Return the vector index for one document, building it only when needed.

    If a directory for ``index_name`` already exists under ``PERSIST_INDEX_DIR``
    the index is loaded from it. Otherwise the document is read, indexed with
    ``VectorStoreIndex`` and persisted to that directory.

    Args:
        index_name (str): Sub-directory name under ``PERSIST_INDEX_DIR`` that
            identifies the persisted index.
        doc_file_path (str): Path of the single document to index when no
            persisted index exists yet.

    Returns:
        The loaded or newly built index.
    """
    # Build the location once so the existence check, load and persist all agree.
    persist_dir = f"{PERSIST_INDEX_DIR}{index_name}/"

    if os.path.exists(persist_dir):
        # Load index from disk
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
        return load_index_from_storage(storage_context)

    # Load the document, index it, then store the index to disk for later runs.
    documents = SimpleDirectoryReader(input_files=[doc_file_path]).load_data()
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist(persist_dir)
    return index

In [None]:
# Load OECD guidelines documents for Transfer Pricing
docs_OECD_guidelines = SimpleDirectoryReader(f"{data_dir}/RAG/data/OECD/").load_data()
# Load Form990 guidelines documents
docs_Form990_guidelines = SimpleDirectoryReader(f"{data_dir}/RAG/data/Form990/").load_data()

In [None]:
# Split the loaded documents into nodes with the node parser configured on Settings,
# then register the nodes in one shared storage context.
# Split the OECD documents into multiple nodes
oecd_nodes = Settings.node_parser.get_nodes_from_documents(docs_OECD_guidelines)
# Split the Form990 documents into multiple nodes
form990_nodes = Settings.node_parser.get_nodes_from_documents(docs_Form990_guidelines)

storage_context = StorageContext.from_defaults()

storage_context.docstore.add_documents(oecd_nodes)
storage_context.docstore.add_documents(form990_nodes)
# Set up a Summary index and a Vector index over the OECD nodes, both using the
# shared storage context.
# NOTE(review): form990_nodes are added to the docstore above but are not passed
# to either index here -- confirm this is intended.
summary_index = SummaryIndex(oecd_nodes, storage_context=storage_context)
vector_index = VectorStoreIndex(oecd_nodes, storage_context=storage_context)

# Set up the per-document indices. To avoid repeated calls to LLMs, get_index loads
# a persisted index when one is present and otherwise creates and stores it.
OECD_index = get_index("OECDTPGuidelines",f"{data_dir}/RAG/data/OECD/OECD_Transfer_Pricing_Guidelines.pdf")
form990_guidelines_index = get_index("Form990Guidelines",f"{data_dir}/RAG/data/Form990/Form990_Guidelines.pdf")

In [None]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector

# One query engine per persisted index; each returns the 3 most similar chunks.
OECD_engine = OECD_index.as_query_engine(similarity_top_k=3)
form990_guidelines_engine = form990_guidelines_index.as_query_engine(similarity_top_k=3)

# Expose each engine as a named tool. The description is the text a selector or
# agent sees when deciding which tool should answer a question.
OECD_query_tool = QueryEngineTool(
    metadata=ToolMetadata(
        description="Provides information about Transfer Pricing Guidelines for Organization from OECD for year 2022",
        name="OECD_QueryEngineTool_2022",
    ),
    query_engine=OECD_engine,
)
Form990_query_tool = QueryEngineTool(
    metadata=ToolMetadata(
        description="Provides information about Form990 filling guidelines for Non-Profit Organization only from the index which was set for Form990_Guidelines.pdf ",
        name="form990_2022",
    ),
    query_engine=form990_guidelines_engine,
)

tools = [OECD_query_tool, Form990_query_tool]

# Router that sends each query to exactly one of the tools, chosen by an LLM selector.
filing_engine = RouterQueryEngine(
    query_engine_tools=tools,
    selector=LLMSingleSelector.from_defaults(),
)

In [None]:
# Agentic Router RAG: build an OpenAI agent from both query-engine tools so the
# agent can call whichever tool(s) a question needs.
from llama_index.agent.openai import OpenAIAgent
agent = OpenAIAgent.from_tools(tools=tools, verbose=True)
# Uncomment and use the below call for interactive session
#agent.chat_repl()
# Ask one sample comparison question and print the agent's reply.
response = agent.chat("What is Form990 EZ and when should an organiaztion complete Form990 EZ form? And how is it different from Schedule H? Can you show the results in side by side comparison table with headers and also link to the document?")
print (response)

In [None]:
from llama_index.agent.openai import OpenAIAssistantAgent
# Rebind `agent` to a new OpenAI Assistant agent built from the same tools, with
# explicit instructions about sourcing and formatting. Cells that call
# `agent.chat` after this point use this assistant agent.
agent = OpenAIAssistantAgent.from_new(
          name = "OECD and Form990 Agent",
          instructions= "You are an assistant that provides answers to questions on OECD and Form990. And make sure the answers are retreived form the OECD and Form990 pdf's only. No data from open Internet. Whenever there is comparison make sure the results are in side by side comparison table with headers and add links to the document.",
          tools=tools,
          verbose=True,
          run_retrieve_sleep_time=1.0
        )
# Ask one sample question and print the agent's reply.
response = agent.chat("What does Articles 9 and 25 of the OECD Model Tax Convention state?")
print (response)

In [None]:
# Evaluation set written as (question, expected answer) pairs. Unzipping the pairs
# yields two index-aligned lists, so questions[i] always belongs to ground_truth[i].
questions, ground_truth = (
    list(column)
    for column in zip(
        (
            "What does Articles 9 of the OECD Model Tax Convention state?",
            "addresses corresponding adjustments in transfer pricing",
        ),
        (
            "What does Articles 25 of the OECD Model Tax Convention state?",
            "outlines the mutual agreement procedure, which resolves disputes related to the application of double tax conventions.",
        ),
        (
            "What does Allocation of Taxing Rights mean in OECD Model Tax Convention state?",
            "principles that determine how different jurisdictions can tax income generated by multinational enterprises (MNEs).",
        ),
        (
            "How is Mutual Agreement Procedure(MAP) help in resolving disputes between countries when there's a conflict in interpreting the treaty?",
            "serves as a mechanism for tax administrations to consult and resolve disputes related to the interpretation and application of double tax conventions. It is particularly useful in situations where there is taxation not in accordance with the provisions of the Convention.",
        ),
        (
            "As per OECD Model Tax Convention States what does Residence and Source Country mean?",
            "Resident country: The country where the taxpayer lives, Source country: The country where the income originates may also have taxing rights but often with limits.",
        ),
    )
)

In [None]:
!pip install datasets --quiet
from datasets import Dataset

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def cosine_similarity_reward(retrieved_context, ground_truth):
    """
    Calculates a reward based on cosine similarity between the retrieved context
    and the ground truth using TF-IDF vectorization.

    The TF-IDF vocabulary is fitted on just these two texts, so the score
    reflects their term overlap relative to each other only.

    Args:
        retrieved_context (str): The text from the retrieved documents.
        ground_truth (str): The ground truth text.

    Returns:
        float: A score between 0 and 1 representing the cosine similarity.
            0.0 is returned when either text is empty, None, whitespace-only,
            or contains no tokens the vectorizer can use.
    """
    # Handle missing / empty strings
    if not retrieved_context or not ground_truth:
        return 0.0
    # Whitespace-only text is truthy but carries no terms to compare.
    if not retrieved_context.strip() or not ground_truth.strip():
        return 0.0

    # Create TF-IDF vectors (fit and transform in a single pass)
    try:
        vectors = TfidfVectorizer().fit_transform([retrieved_context, ground_truth])
    except ValueError:
        # Raised when no usable tokens remain (empty vocabulary), e.g. the texts
        # are punctuation only. Nothing can match, so the reward is zero.
        return 0.0

    # Calculate cosine similarity
    similarity_score = cosine_similarity(vectors[0], vectors[1])[0][0]

    # Return a plain Python float, clamped so rounding error cannot push the
    # reward outside the documented [0, 1] range.
    return float(min(1.0, max(0.0, similarity_score)))

# Example usage (assuming 'contexts' and 'ground_truth' are defined):
# combined_context = " ".join(contexts[0]) # Combine retrieved contexts
# reward = cosine_similarity_reward(combined_context, ground_truth[0])
# print(f"Cosine Similarity Reward: {reward}")

In [None]:
# Accumulators, one entry per question and index-aligned with `questions`.
answers  = []
contexts = []
cosine_similarity_rewards = [] # List to store cosine similarity rewards


# Traverse each question, ask the agent for an answer, and separately retrieve
# context for scoring.
# Define the retriever from the OECD index
# NOTE(review): this retriever is queried independently of the agent, so the
# contexts recorded below are not necessarily the ones the agent used -- confirm.
retriever = OECD_index.as_retriever()

for i, question in enumerate(questions):
    response = agent.chat(question)
    answers.append(response.response) # Extract the string response
    retrieved_docs = retriever.retrieve(question)
    # Keep only the raw text of each retrieved node.
    context_texts = [docs.node.text for docs in retrieved_docs]
    contexts.append(context_texts)

    # Calculate cosine similarity reward
    # Combine retrieved contexts into a single string for similarity calculation.
    # The reward compares the retrieved context (not the agent's answer) with the
    # ground truth for the same question index.
    combined_context = " ".join(context_texts)
    cosine_similarity_reward_score = cosine_similarity_reward(combined_context, ground_truth[i])
    cosine_similarity_rewards.append(cosine_similarity_reward_score)


# Preparing the dataset: one column per list, all lists index-aligned.
data = {
    "question": questions,
    "answer": answers,
    "ground_truth": ground_truth,
    "contexts": contexts, # Add the contexts to the dataset
    "cosine_similarity_reward": cosine_similarity_rewards, # Add the cosine similarity rewards
}

# Convert dict to dataset; the trailing expression displays it as a DataFrame.
dataset = Dataset.from_dict(data)
dataset.to_pandas()

In [None]:
!pip install ragas --quiet
import ragas

In [None]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

# Score the dataset built earlier with the four imported ragas metrics.
result = evaluate(
    dataset=dataset,
    metrics=[
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
    ],
)

# Convert the evaluation result to a pandas DataFrame; the bare `df` on the last
# line makes the notebook display it.
df = result.to_pandas()
df

# Task
Explain the selected code, discuss the next steps in RL after calculating rewards based on cosine similarity, explain how to update the policy directly in GRPO for the current situation, and define the policy network for this example, specifically referring to Group Relative Policy Optimization (GRPO) in the context of a RAG system where the reward is based on cosine similarity.

## Understand the role of the policy network

### Subtask:
Clarify what aspects of the RAG system the policy network will control (e.g., retrieval parameters, generation parameters, or both).


## Choose a suitable neural network architecture

### Subtask:
Select a type of neural network (e.g., a feedforward network, recurrent network, or transformer-based model) that can process the input question and output the control signals for the RAG system.


**Reasoning**:
To select a neural network architecture suitable for processing text input (the question) and outputting control signals for the RAG system, considering the sequential nature of text and the need to capture semantic meaning, a transformer-based model is a good choice due to its effectiveness in handling sequential data and complex relationships within text.



In [None]:
# Explanation of the chosen neural network architecture

# For this task, where the input is a text question and the output needs to be control signals
# for a RAG system (e.g., parameters for retrieval or generation), a transformer-based model
# is a suitable choice.

# Justification:
# 1. Text Input: Transformers are specifically designed to process sequential data like text.
# 2. Semantic Understanding: They excel at capturing long-range dependencies and semantic relationships
#    within the input question, which is crucial for determining appropriate RAG control signals.
# 3. Contextual Embeddings: Transformer models generate rich contextual embeddings of the input text,
#    allowing the policy network to make informed decisions based on the nuances of the question.
# 4. Adaptability: Transformer architectures can be adapted with output layers to produce numerical
#    or categorical control signals as needed by the RAG system.

# Examples of transformer-based models that could be adapted for this purpose include
# models based on BERT, RoBERTa, or smaller, task-specific transformer variants.
# The specific output layer would depend on the exact control signals being predicted (e.g.,
# a dense layer for numerical parameters or a softmax layer for categorical choices).

# This choice aligns with the task of learning a policy based on textual input to influence
# the RAG system's behavior for better performance according to the defined reward.

## Define the input and output layers

### Subtask:
Define the format of the input (the question) and the output (the parameters or actions that influence the RAG system) for the chosen transformer-based policy network.


**Reasoning**:
Describe the input and output format of the chosen transformer-based policy network based on the previous subtask.



In [None]:
# 1. Input Layer Format:
# The input to the policy network will be the user's question, which is a string of text.
# Before being fed into the transformer layers, this text will undergo standard NLP preprocessing steps:
# - Tokenization: The text will be broken down into a sequence of tokens (words or sub-word units) using a tokenizer appropriate for the chosen transformer model (e.g., WordPiece for BERT, BPE for RoBERTa).
# - Embedding: The sequence of tokens will be converted into a sequence of numerical embeddings. Transformer models typically use learned token embeddings, positional embeddings (to capture token order), and potentially segment embeddings. The input to the transformer layers will be a tensor of shape (batch_size, sequence_length, embedding_dim), where:
#   - batch_size: The number of questions processed in parallel.
#   - sequence_length: The maximum number of tokens in a question (padded or truncated).
#   - embedding_dim: The dimensionality of the token embeddings.

# 2. Output Layer(s) Format:
# The output layer(s) of the policy network will produce control signals for the RAG system. Based on the understanding that the policy network controls retrieval parameters (like similarity_top_k) and potentially influences generation, the output could be structured as follows:
# - For a numerical parameter like `similarity_top_k`: A single dense layer with one output neuron, potentially followed by an activation function (e.g., ReLU to ensure non-negativity) and possibly scaled to a reasonable range. The output would be a tensor of shape (batch_size, 1).
# - For influencing generation (less direct control in this setup, but conceptually): This could be represented as a vector influencing attention mechanisms or providing context to the generation model. However, focusing on retrieval parameters as the primary policy output in this GRPO context is more straightforward.
# - For simplicity and direct control over a key retrieval parameter, let's define the output as a single numerical value representing `similarity_top_k`. The output layer will be a dense layer with 1 output neuron.

# Therefore, the output of the policy network will be a tensor of shape (batch_size, 1), representing the predicted value for `similarity_top_k` for each question in the batch.

# 3. Interpretation and Usage of the Policy Network's Output:
# The output of the policy network (the predicted `similarity_top_k` value) will be used to configure the retrieval step of the RAG system for the given question.
# - During training: The predicted `similarity_top_k` will be used to perform retrieval. The retrieved context, along with the question, will then be passed to the generation model to produce an answer. This answer will be compared to the ground truth to calculate the cosine similarity reward. This reward will be used by the GRPO algorithm to update the policy network's weights, encouraging it to predict `similarity_top_k` values that lead to higher rewards.
# - During inference: The policy network will predict `similarity_top_k` for a new question, and this value will be used directly in the retrieval process to gather context for generating the final answer.
# The predicted numerical output for `similarity_top_k` might need to be post-processed (e.g., rounded to an integer, clipped to a valid range) before being used by the RAG system's retriever.

## Implement the policy network

### Subtask:
Implement the policy network using a deep learning framework. This involves defining the transformer layers and the output layer(s) based on the input and output formats defined in the previous steps.


**Reasoning**:
Implement the policy network using PyTorch, defining the transformer layers and the output layer as specified in previous steps.

In essence, this network takes a question, processes it through a pre-trained transformer to understand its context, and then uses a simple linear layer to predict a non-negative numerical value intended to represent the optimal similarity_top_k for retrieving documents for that question.

In the context of Reinforcement Learning (RL), a policy network is a neural network that learns to map states (in our case, the user's question) to actions (the parameters or decisions that control the RAG system).

**State:** The input is the user's question. The policy network processes this question to understand its meaning and context.

**Action:** The output of the policy network is a value (or values) that influences how the RAG system operates. In the code we just discussed, the policy network's action space was initially simplified to predicting a single value: similarity_top_k, which determines how many relevant documents are retrieved. In the modified code, it predicts parameters for a distribution from which similarity_top_k is sampled.

The goal of training the policy network using an algorithm like GRPO is to adjust its internal parameters (the weights and biases of the neural network) so that, when presented with a question, it predicts/samples actions (like a specific similarity_top_k) that lead to higher rewards (in our case, higher cosine similarity between the generated answer and the ground truth).

So, the policy network's role is to learn the optimal strategy for configuring the RAG system based on the input question to maximize the desired outcome (answer quality, measured by the reward).



## Summary:

### Data Analysis Key Findings

*   The policy network is designed to control the `similarity_top_k` parameter in the retrieval step of the RAG system.
*   A transformer-based model (like BERT) is chosen as the architecture for the policy network due to its effectiveness in processing text input (the user question).
*   The input to the policy network is the tokenized and embedded user question, and the output is a single numerical value representing the predicted `similarity_top_k`.
*   The implemented policy network uses a pre-trained BERT model and a linear output layer with a ReLU activation to predict a non-negative `similarity_top_k`.
*   The policy network's output is integrated into the RAG query process by dynamically setting the `similarity_top_k` of the retriever used by the query engine.
*   The training process involves iteratively: predicting `similarity_top_k` using the policy, executing the RAG system, calculating the cosine similarity reward between the generated answer and the ground truth, and updating the policy network's weights using a policy gradient method (aligned with GRPO principles) to maximize the expected reward.

### Insights or Next Steps

*   The predicted `similarity_top_k` value should be carefully post-processed (e.g., rounded, clipped to a valid range based on the document index size) before being used in the retriever to ensure it is a valid and effective parameter.
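*   Implementing the full GRPO algorithm would involve more components than a basic policy gradient: sampling a group of actions for each question from the current policy, comparing their rewards within the group (normalizing each reward by the group's mean and standard deviation to obtain a relative advantage, so no separate value network is required), and constraining updates with a PPO-style clipped probability ratio and a KL penalty toward a reference policy rather than an explicit trust-region solve. A practical next step is to implement a policy gradient training loop with baseline subtraction and potentially explore using a continuous action space policy outputting the parameters of a distribution (like mean and variance) for `similarity_top_k`.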


# Task
**Implement the policy update rule for the GRPO algorithm using cosine similarity as the reward signal to adjust the policy network's parameters in the provided notebook.**

## Modify policy network output

### Subtask:
Adjust the `RAGPolicyNetwork` to output parameters for a distribution over `similarity_top_k`, such as the mean and log-variance of a Gaussian distribution.


**Reasoning**:
Modify the RAGPolicyNetwork class to output parameters for a Gaussian distribution (mean and log-variance) over `similarity_top_k`.



## Implement action sampling and log probability calculation

### Subtask:
Implement functions to sample `similarity_top_k` from the Gaussian distribution predicted by the policy network and calculate the log probability of the sampled action.


**Reasoning**:
Implement functions to sample the action (similarity_top_k) from the predicted Gaussian distribution and calculate the log probability of the sampled action, using the predicted mean and log-variance from the policy network.



## Implement baseline calculation

### Subtask:
Create a function to calculate a baseline for the rewards, such as the average reward in a batch.


**Reasoning**:
Define a function to calculate the mean of a list or tensor of rewards to be used as a baseline.



## Set up training loop

### Subtask:
Structure a training loop that iterates through the dataset, performs forward passes with the policy network, executes the RAG system with sampled actions, calculates rewards and advantages, and computes the policy gradient.


**Reasoning**:
Structure a training loop that iterates through the dataset, performs forward passes with the policy network, executes the RAG system with sampled actions, calculates rewards and advantages, and computes the policy gradient.



**Reasoning**:
Fix the `IndexError` by ensuring `sampled_k_processed[i]` is treated correctly as a tensor within the loop, likely by accessing its value using `.item()` after indexing, as suggested by the error message.



## Implement policy update

### Subtask:
Apply the calculated policy gradient to update the parameters of the policy network using an optimizer.


**Reasoning**:
Apply the calculated policy gradient to update the parameters of the policy network using the optimizer.



## Evaluate and refine

### Subtask:
After training, evaluate the performance of the policy-controlled RAG system and refine the implementation or hyperparameters as needed.


**Reasoning**:
Evaluate the performance of the policy-controlled RAG system after the training loop. This involves setting the policy network to evaluation mode, using a dataset (can be the same as training for demonstration), iterating through questions, predicting/sampling `similarity_top_k`, executing the RAG query, calculating the cosine similarity reward, and finally reporting the average reward.



**Reasoning**:
The previous command failed because the `policy_controlled_rag_query` function was expecting a tensor output from the policy network, but the modified policy network now returns a tuple of tensors (mean and log-variance). The `policy_controlled_rag_query` function needs to be updated to handle this new output format, sample an action (similarity_top_k) from the predicted distribution, and then use that sampled action.



## Summary:

### Data Analysis Key Findings

*   The `RAGPolicyNetwork` was successfully modified to output parameters (mean and log-variance) for a Gaussian distribution over `similarity_top_k`.
*   Functions were successfully implemented to sample `similarity_top_k` from the predicted Gaussian distribution and calculate the log probability of the sampled action, including handling the conversion from a continuous sample to a discrete, positive integer.
*   A function to calculate the mean reward as a baseline was successfully implemented and tested.
*   A training loop structure was successfully set up, integrating the policy network forward pass, action sampling, a placeholder for RAG execution and reward calculation (using a dummy reward for demonstration), baseline calculation, advantage computation, policy loss calculation, backpropagation, and optimizer steps.
*   The core policy update steps (`backward()` and `optimizer.step()`) were confirmed to be correctly integrated and executed within the training loop.
*   An evaluation process was implemented to test the trained policy, showing that the policy network predicted `similarity_top_k` values (10 and 11 in the example) and resulted in an average cosine similarity reward of 0.3299 on the evaluation set.

### Insights or Next Steps

*   The current low average reward (0.3299) suggests that the policy is not yet effectively learning to choose `similarity_top_k` values that maximize cosine similarity. Further refinement of the reward function, action space handling, or policy network architecture is needed.
*   Implementing the actual RAG execution and cosine similarity reward calculation within the training loop is the crucial next step to train the policy network on real rewards rather than a dummy reward. This will require integrating the RAG system (using `OECD_index` and `agent`) into the training loop's batch processing.


# Task
Implement observability metrics for the training process of a policy network using Weights and Biases.

## Identify key metrics

### Subtask:
Determine which metrics are most important to track for monitoring the training process of the policy network (e.g., epoch number, batch number, average policy loss per batch, average reward per batch, average predicted similarity_top_k per batch, average advantage per batch).


**Reasoning**:
Identify and list the key metrics for monitoring the training process of the policy network.



## Integrate weights & biases

### Subtask:
Install the `wandb` library and initialize a Weights & Biases run at the beginning of the training script.


**Reasoning**:
Install the wandb library and initialize a Weights & Biases run at the beginning of the training script as requested by the subtask.



**Reasoning**:
The previous command failed because the variables `questions` and `ground_truth` were not defined in the current code cell. These variables were defined in a previous cell and need to be accessible. The code block should be re-executed including the definitions of `questions` and `ground_truth`.



**Reasoning**:
The previous command failed because the `policy_network` variable was not defined. This variable was instantiated in a previous cell and needs to be accessible in the current cell for the optimizer to be created and for the training loop to run. The code block should be re-executed including the definition and instantiation of the `RAGPolicyNetwork` class and the `policy_network` object.



**Reasoning**:
The previous command failed because the function `sample_action_and_continuous` was not defined in the current code cell. This function, along with `calculate_baseline` and `calculate_log_prob`, are necessary for the training loop to execute. These functions need to be included in the current code block.



## Log hyperparameters

### Subtask:
Log the training hyperparameters (e.g., learning rate, batch size, number of epochs) to Weights & Biases.


**Reasoning**:
Define a dictionary containing the training hyperparameters and log it to the initialized Weights & Biases run.



**Reasoning**:
The error indicates that `wandb.init()` was not called before `wandb.config.update()` in the current execution environment. Although `wandb.init()` was in a previous cell, each cell is executed independently. To fix this, I need to call `wandb.init()` again in the same cell before `wandb.config.update()`. I will also redefine the necessary variables and functions to ensure they are available in this cell.



## Integrate metric calculation into the training loop

### Subtask:
Modify the training loop to calculate the chosen metrics for each batch and/or epoch.


**Reasoning**:
Modify the training loop to calculate the selected metrics for each batch and epoch, including average policy loss, average reward, average predicted similarity_top_k, average advantage, average mean, and average log-variance.



## Log metrics to weights & biases

### Subtask:
Add code to log the calculated metrics to Weights & Biases within the training loop. This will typically involve using `wandb.log()`.


**Reasoning**:
Add code to log the calculated metrics to Weights & Biases within the training loop as requested by the subtask. This involves using `wandb.log()` for both batch and epoch metrics.



## Visualize and analyze metrics in weights & biases

### Subtask:
Use the Weights & Biases dashboard to visualize the logged metrics, monitor training progress, and identify potential issues.


## Refine logging and metrics

### Subtask:
Based on the analysis in Weights & Biases, refine the set of metrics being tracked or the logging frequency for better insights.


**Reasoning**:
Based on the analysis in Weights & Biases, refine the set of metrics being tracked and their logging frequency in the training code. The current metrics are informative, but batch-level metrics can be noisy. Epoch-level metrics provide a smoother view of overall progress. We will keep logging both but ensure epoch metrics are clearly distinguished. We can also consider adding the standard deviation of `similarity_top_k` predictions at the epoch level to see how the policy's uncertainty evolves.



In [None]:
!pip install llama-index -q
!pip install langchain -q
!pip install langchain_experimental -q

The training loop includes the actual RAG execution using the OECD_index and the cosine_similarity_reward function to calculate the reward based on the generated answer and the ground truth for each question in the batch. This means the policy network is being trained using real rewards derived from the RAG system's performance.

In [None]:
import torch
import torch.nn as nn # Explicitly import torch.nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from torch.distributions import Normal
import wandb
import os # Import os to check for existing index
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, SummaryIndex, StorageContext, load_index_from_storage, Settings # Import necessary LlamaIndex components
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from google.colab import userdata
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoModel, AutoTokenizer # Import AutoModel and AutoTokenizer explicitly


# Redefine the settings and state this cell needs so that it can be run on its
# own (e.g. after a runtime restart) without re-running every earlier cell.

# NOTE: OPENAI_API_KEY must already be present in the environment (an earlier
# cell sets it from google.colab.userdata); it is intentionally not reset here.

# Setup OpenAI Model and Embeddings - Ensure these are set within this cell's execution
Settings.llm = OpenAI(model='gpt-4o-mini', temperature=0.2)
Settings.embed_model = OpenAIEmbedding(model='text-embedding-3-small')
Settings.chunk_size = 1024
print("LlamaIndex Settings configured.")

# Assuming Google Drive is mounted at /content/drive and data_dir is defined
data_dir = '/content/drive/MyDrive' # Input a data dir path from your mounted Google Drive
# data_dir is already an absolute path; prefixing it with another "/" used to
# produce "//content/...". Both spellings resolve to the same directory on
# Linux, so indexes persisted by earlier cells are still found.
PERSIST_INDEX_DIR = f"{data_dir.rstrip('/')}/RAG/data/"

# The index is only loaded here, never built. If it has not been created and
# persisted by the earlier get_index(...) cells, OECD_index is left as None and
# the training loop further down is skipped.
try:
    storage_context = StorageContext.from_defaults(persist_dir=f"{PERSIST_INDEX_DIR}OECDTPGuidelines/")
    OECD_index = load_index_from_storage(storage_context)
    print("Loaded OECD index from storage.")
except FileNotFoundError:
    print(f"OECD index not found at {PERSIST_INDEX_DIR}OECDTPGuidelines/. Please run the cells to create and persist the index first.")
    OECD_index = None # Sentinel checked before training starts
# Redefine cosine_similarity_reward function
def cosine_similarity_reward(retrieved_context, ground_truth):
    """
    Score how similar a piece of text is to the ground truth.

    Both texts are embedded with a TF-IDF vectorizer fitted on just these two
    strings, and the cosine similarity of the two vectors is returned.

    Args:
        retrieved_context (str): The text to score (the training loop passes
            the answer generated by the RAG query engine).
        ground_truth (str): The reference text.

    Returns:
        float: Cosine similarity in [0, 1]. 0.0 is returned when either text
        is missing, empty, whitespace-only, or contains no token the
        vectorizer can use.
    """
    texts = [retrieved_context, ground_truth]

    # Nothing to compare: treat missing/blank text as "no reward".
    if any(not text or not str(text).strip() for text in texts):
        return 0.0

    try:
        # fit_transform is equivalent to fit(...).transform(...) in one pass.
        vectors = TfidfVectorizer().fit_transform(texts)
    except ValueError:
        # Raised for an empty vocabulary (e.g. text made only of punctuation
        # or single characters, which the default tokenizer ignores).
        return 0.0

    # cosine_similarity returns a 1x1 matrix; convert the numpy scalar to a
    # plain Python float as promised by the docstring.
    return float(cosine_similarity(vectors[0], vectors[1])[0][0])

# Redefine sample_action_and_continuous function
def sample_action_and_continuous(mean, log_variance):
    """
    Sample an action from the Gaussian N(mean, exp(log_variance)).

    Args:
        mean (torch.Tensor): Mean of the distribution (scalar or batched).
        log_variance (torch.Tensor): Log-variance, same shape as ``mean``.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: ``(processed_action,
        continuous_sample)``. ``continuous_sample`` is the raw draw (use it
        for log-probabilities); ``processed_action`` is that draw mapped to a
        usable ``similarity_top_k``: absolute value, rounded to the nearest
        integer, and floored at 1. Both keep the dtype/device of the inputs.
    """
    std_dev = torch.exp(0.5 * log_variance)
    continuous_sample = Normal(mean, std_dev).sample()
    # clamp keeps the result on the sample's own dtype/device and avoids
    # allocating a throw-away tensor(1.0) on every call.
    processed_action = torch.clamp(torch.round(torch.abs(continuous_sample)), min=1.0)
    return processed_action, continuous_sample

# Redefine calculate_baseline function
def calculate_baseline(rewards):
    """
    Return the mean reward, used as the baseline for advantage estimation.

    Args:
        rewards: A tensor, or any sequence/array of numbers that
            ``torch.as_tensor`` can convert (list, tuple, numpy array).

    Returns:
        A 0-dim tensor holding the mean reward, or the float ``0.0`` when
        ``rewards`` is empty.
    """
    if not isinstance(rewards, torch.Tensor):
        rewards = torch.as_tensor(rewards, dtype=torch.float32)
    elif not (rewards.is_floating_point() or rewards.is_complex()):
        # torch.mean rejects integer/bool tensors; promote them first.
        rewards = rewards.float()

    if rewards.numel() == 0:
        return 0.0
    return torch.mean(rewards)

def calculate_log_prob(mean, log_variance, action):
    """
    Log-density of ``action`` under the Gaussian N(mean, exp(log_variance)).

    Args:
        mean (torch.Tensor): Mean of the distribution.
        log_variance (torch.Tensor): Log-variance of the distribution.
        action (torch.Tensor): Value(s) to evaluate.

    Returns:
        torch.Tensor: Element-wise log-probability density of ``action``.
    """
    # sigma = sqrt(exp(log_variance)) = exp(log_variance / 2)
    sigma = torch.exp(log_variance * 0.5)
    return Normal(loc=mean, scale=sigma).log_prob(action)


# Training data: `questions` and `ground_truth` are parallel lists, i.e.
# ground_truth[i] is the reference answer for questions[i].
# Each question is sent verbatim to the RAG query engine and the reference
# answer is what the generated answer is scored against, so editing any of
# these strings changes the rewards the policy is trained on.
questions = ["What does Articles 9 of the OECD Model Tax Convention state?",
             "What does Articles 25 of the OECD Model Tax Convention state?",
             "What does Allocation of Taxing Rights mean in OECD Model Tax Convention state?",
             "How is Mutual Agreement Procedure(MAP) help in resolving disputes between countries when there's a conflict in interpreting the treaty?",
             "As per OECD Model Tax Convention States what does Residence and Source Country mean?"]
ground_truth = ["addresses corresponding adjustments in transfer pricing",
                "outlines the mutual agreement procedure, which resolves disputes related to the application of double tax conventions.",
                "principles that determine how different jurisdictions can tax income generated by multinational enterprises (MNEs).",
                "serves as a mechanism for tax administrations to consult and resolve disputes related to the interpretation and application of double tax conventions. It is particularly useful in situations where there is taxation not in accordance with the provisions of the Convention.",
                "Resident country: The country where the taxpayer lives, Source country: The country where the income originates may also have taxing rights but often with limits."]

# Redefine RAGPolicyNetwork class
class RAGPolicyNetwork(nn.Module):
    """
    Policy network that maps a question to a Gaussian over ``similarity_top_k``.

    A pretrained transformer encodes the question text and a linear head turns
    the pooled sentence representation into two numbers per question: the mean
    and the log-variance of the action distribution.

    Args:
        transformer_model_name (str): Hugging Face model id passed to
            ``AutoTokenizer`` / ``AutoModel``.
        output_dim (int): Width of the linear head. Must be at least 2 because
            column 0 is read as the mean and column 1 as the log-variance.

    Raises:
        ValueError: If ``output_dim`` is smaller than 2.
    """

    def __init__(self, transformer_model_name="bert-base-uncased", output_dim=2):
        super().__init__()
        if output_dim < 2:
            raise ValueError("output_dim must be >= 2 (mean and log-variance)")
        self.tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)
        self.transformer = AutoModel.from_pretrained(transformer_model_name)
        transformer_output_dim = self.transformer.config.hidden_size
        self.output_layer = nn.Linear(transformer_output_dim, output_dim)

    def forward(self, questions):
        """
        Compute the action distribution parameters for a batch of questions.

        Args:
            questions (list[str]): Raw question strings.

        Returns:
            tuple[torch.Tensor, torch.Tensor]: ``(mean, log_variance)``, each
            with one entry per question.
        """
        encoded_input = self.tokenizer(questions, return_tensors='pt', padding=True, truncation=True)

        # The tokenizer always produces CPU tensors; move them next to the
        # model weights so the network also works after `.to("cuda")`.
        device = self.output_layer.weight.device
        encoded_input = {
            key: value.to(device) if isinstance(value, torch.Tensor) else value
            for key, value in encoded_input.items()
        }

        outputs = self.transformer(**encoded_input)

        # Not every encoder exposes a pooled output; fall back to the hidden
        # state of the first token when it is missing.
        pooled_output = getattr(outputs, "pooler_output", None)
        if pooled_output is None:
            pooled_output = outputs.last_hidden_state[:, 0]

        mean_and_log_variance = self.output_layer(pooled_output)
        mean = mean_and_log_variance[:, 0]
        log_variance = mean_and_log_variance[:, 1]
        return mean, log_variance

# Build a fresh policy network. The constructor loads the pretrained
# "bert-base-uncased" tokenizer/encoder and creates a new linear output head,
# so re-running this cell discards anything learned by a previous training run.
# Must come after the RAGPolicyNetwork class definition.
policy_network = RAGPolicyNetwork(transformer_model_name="bert-base-uncased")


# Redefine Dataset and DataLoader
class RAGDataset(Dataset):
    """
    Dataset of (question, ground-truth answer) pairs.

    Args:
        questions: Sequence of question strings.
        ground_truth: Sequence of reference answers, aligned index-by-index
            with ``questions``.

    Raises:
        ValueError: If the two sequences differ in length.
    """

    def __init__(self, questions, ground_truth):
        # Fail fast on misaligned data instead of raising IndexError (or
        # silently ignoring extra answers) in the middle of training.
        if len(questions) != len(ground_truth):
            raise ValueError(
                f"questions and ground_truth must have the same length "
                f"(got {len(questions)} and {len(ground_truth)})"
            )
        self.questions = questions
        self.ground_truth = ground_truth

    def __len__(self):
        """Return the number of question/answer pairs."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Return the ``(question, ground_truth)`` pair at position ``idx``."""
        return self.questions[idx], self.ground_truth[idx]

# --- Hyperparameters (also recorded in the W&B config below) ---
BATCH_SIZE = 8
NUM_EPOCHS = 100
LEARNING_RATE = 1e-4

# --- Data pipeline and optimizer ---
rag_dataset = RAGDataset(questions, ground_truth)
train_dataloader = DataLoader(rag_dataset, batch_size=BATCH_SIZE, shuffle=True)
optimizer = optim.Adam(policy_network.parameters(), lr=LEARNING_RATE)

# --- Weights & Biases run ---
# Close any run left open by a previous execution of this cell before
# starting a new one (reinit=True allows re-initialization in a notebook).
if wandb.run is not None:
    wandb.finish()
wandb.init(
    project="rag-policy-training",
    name="grpo-cosine-similarity-refined-metrics",
    reinit=True,
)

# Record the hyperparameters with the run so it can be reproduced/compared.
config = dict(
    learning_rate=LEARNING_RATE,
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    transformer_model="bert-base-uncased",
    output_dim=2,
)
wandb.config.update(config)

print("Training hyperparameters logged to Weights & Biases config.")

# --- Training Loop ---
# REINFORCE-style policy-gradient training with a batch-mean baseline:
# for every question the policy proposes a `similarity_top_k`, the RAG system
# is queried with it, and the cosine similarity between the generated answer
# and the ground truth is used as the reward.
print("Starting policy network training...")

# Total number of batches across all epochs (informational).
total_steps = NUM_EPOCHS * len(train_dataloader)
# Single, monotonically increasing W&B step; advanced once per batch.
global_step = 0


def _rag_reward(index, question, ground_truth_answer, top_k):
    """Query `index` with `similarity_top_k=top_k` and score the answer.

    Returns the cosine-similarity reward, or 0.0 if the query or the scoring
    fails, so that one bad request does not abort the whole training run.
    """
    try:
        engine = index.as_query_engine(similarity_top_k=top_k)
        generated_answer = engine.query(question).response
        return cosine_similarity_reward(generated_answer, ground_truth_answer)
    except Exception as e:  # best-effort by design: log and fall back to zero reward
        print(f"Error during RAG execution or reward calculation for question '{question}': {e}")
        return 0.0


try:
    # Training only makes sense if the OECD index was loaded successfully.
    if OECD_index is not None:
        # Epoch metrics are plotted against their own "epoch" axis. They are
        # still logged at the current global step: W&B requires the `step`
        # argument to be non-decreasing, so logging them at step=epoch+1 would
        # be dropped as soon as an epoch contains more than one batch.
        wandb.define_metric("epoch")
        wandb.define_metric("epoch/*", step_metric="epoch")

        num_samples = len(rag_dataset)

        for epoch in range(NUM_EPOCHS):
            policy_network.train()

            # Per-epoch accumulators. Loss is averaged over batches; all other
            # quantities are summed per sample and averaged over the dataset.
            total_epoch_loss = 0.0
            total_epoch_reward = 0.0
            total_epoch_predicted_top_k = 0.0
            total_epoch_advantage = 0.0
            total_epoch_mean = 0.0
            total_epoch_log_variance = 0.0
            epoch_predicted_top_ks = []  # individual top_k values, for the epoch std
            num_batches = 0

            for batch_questions, batch_ground_truth in train_dataloader:
                global_step += 1
                optimizer.zero_grad()

                # a. Forward pass: one (mean, log-variance) pair per question.
                mean_output, log_variance_output = policy_network(list(batch_questions))

                batch_sampled_k_processed = []   # integer-valued, >= 1 (what the retriever uses)
                batch_sampled_k_continuous = []  # raw Gaussian draws (what log-probs are computed on)
                batch_rewards = []

                for i, (question, ground_truth_answer) in enumerate(zip(batch_questions, batch_ground_truth)):
                    # b. Sample an action for this question.
                    sampled_k_processed_item, sampled_k_continuous_item = sample_action_and_continuous(
                        mean_output[i], log_variance_output[i]
                    )
                    batch_sampled_k_processed.append(sampled_k_processed_item)
                    batch_sampled_k_continuous.append(sampled_k_continuous_item)
                    epoch_predicted_top_ks.append(sampled_k_processed_item.item())

                    # c./d. Run the real RAG pipeline with the sampled top_k and score it.
                    predicted_top_k_int = int(sampled_k_processed_item.item())
                    reward = _rag_reward(OECD_index, question, ground_truth_answer, predicted_top_k_int)
                    batch_rewards.append(reward)

                batch_sampled_k_continuous_tensor = torch.stack(batch_sampled_k_continuous)
                batch_sampled_k_processed_tensor = torch.stack(batch_sampled_k_processed).float()
                batch_rewards_tensor = torch.tensor(batch_rewards, dtype=torch.float32)

                # e. Baseline = mean reward of the batch.
                baseline = calculate_baseline(batch_rewards_tensor)

                # f. Advantage: positive for better-than-average samples,
                #    negative for worse-than-average ones.
                advantage = batch_rewards_tensor - baseline

                # g. Log-density of the raw (continuous) samples under the
                #    current policy. This is a density, so it may be positive
                #    or negative.
                log_probs = calculate_log_prob(mean_output, log_variance_output, batch_sampled_k_continuous_tensor)

                # h. Policy-gradient loss. Minimizing -E[log_prob * advantage]
                #    raises the likelihood of actions with positive advantage
                #    and lowers it for actions with negative advantage.
                policy_loss = -torch.mean(log_probs * advantage)

                # i./j. Backpropagate and update the policy.
                policy_loss.backward()
                optimizer.step()

                # Batch metrics.
                batch_policy_loss = policy_loss.item()
                batch_average_reward = torch.mean(batch_rewards_tensor).item()
                batch_average_predicted_top_k = torch.mean(batch_sampled_k_processed_tensor).item()
                # Close to zero by construction (the baseline is the batch mean).
                batch_average_advantage = torch.mean(advantage).item()
                batch_average_mean = torch.mean(mean_output).item()
                batch_average_log_variance = torch.mean(log_variance_output).item()

                # Accumulate for the epoch averages.
                total_epoch_loss += batch_policy_loss
                total_epoch_reward += torch.sum(batch_rewards_tensor).item()
                total_epoch_predicted_top_k += torch.sum(batch_sampled_k_processed_tensor).item()
                total_epoch_advantage += torch.sum(advantage).item()
                total_epoch_mean += torch.sum(mean_output).item()
                total_epoch_log_variance += torch.sum(log_variance_output).item()
                num_batches += 1

                wandb.log({
                    "batch/policy_loss": batch_policy_loss,
                    "batch/average_reward": batch_average_reward,
                    "batch/average_predicted_top_k": batch_average_predicted_top_k,
                    "batch/average_advantage": batch_average_advantage,
                    "batch/average_mean": batch_average_mean,
                    "batch/average_log_variance": batch_average_log_variance,
                }, step=global_step)

            # Epoch metrics.
            avg_epoch_loss = total_epoch_loss / num_batches if num_batches > 0 else 0
            avg_epoch_reward = total_epoch_reward / num_samples if num_samples > 0 else 0
            avg_epoch_predicted_top_k = total_epoch_predicted_top_k / num_samples if num_samples > 0 else 0
            avg_epoch_advantage = total_epoch_advantage / num_samples if num_samples > 0 else 0
            avg_epoch_mean = total_epoch_mean / num_samples if num_samples > 0 else 0
            avg_epoch_log_variance = total_epoch_log_variance / num_samples if num_samples > 0 else 0
            # Spread of the sampled top_k values across the epoch.
            epoch_predicted_top_k_std = np.std(epoch_predicted_top_ks) if epoch_predicted_top_ks else 0.0

            print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Avg Loss: {avg_epoch_loss:.4f}, Avg Reward: {avg_epoch_reward:.4f}, Avg Predicted Top K: {avg_epoch_predicted_top_k:.2f}, Predicted Top K Std: {epoch_predicted_top_k_std:.2f}")

            # Logged at the same step as the epoch's last batch (W&B merges
            # both dicts into one row); "epoch" provides the x-axis for epoch/*.
            wandb.log({
                "epoch": epoch + 1,
                "epoch/average_loss": avg_epoch_loss,
                "epoch/average_reward": avg_epoch_reward,
                "epoch/average_predicted_top_k": avg_epoch_predicted_top_k,
                "epoch/average_advantage": avg_epoch_advantage,
                "epoch/average_mean": avg_epoch_mean,
                "epoch/average_log_variance": avg_epoch_log_variance,
                "epoch/predicted_top_k_std": epoch_predicted_top_k_std,
            }, step=global_step)

        print("Training finished.")

    else:
        print("Training skipped because OECD index was not loaded.")

finally:
    # Always close the W&B run (even if training raised or was interrupted)
    # so that everything logged so far is synced.
    if wandb.run is not None:
        wandb.finish()

## Summary:

### Data Analysis Key Findings

*   The training process successfully logged key metrics at both the batch and epoch levels to Weights & Biases, including policy loss, average reward, average predicted `similarity_top_k`, average advantage, average mean, and average log-variance of the predicted distribution.
*   Hyperparameters such as learning rate (1e-4), batch size (8), and number of epochs (100) were successfully logged to the Weights & Biases config.
*   The training loop executed for 100 epochs, with console output confirming the progress and epoch-average loss and reward.
*   A new epoch-level metric, the standard deviation of the predicted `similarity_top_k`, was successfully added and logged to provide insight into the variability of the policy's actions.

### Completed Steps

*   Visualized the logged metrics in the Weights & Biases dashboard to analyze trends, identify correlations between metrics (e.g., reward and predicted top\_k), and diagnose potential training issues such as instability or convergence problems.
*   Implemented the actual RAG system reward calculation, replacing the dummy reward function so that the policy learns from real retrieval performance.
