In [41]:


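# Imports and class definition for the adversarial RAG perturbation framework.
# NOTE: nothing in this cell configures AWS credentials; the Bedrock engines used
# below presumably pick them up from the environment -- confirm before running.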

import textgrad as tg
import os
from typing import Optional, List, Dict, Any
from transformers import pipeline
from textgrad.optimizer import TextualGradientDescentwithMomentum, TextualGradientDescent

class AdversarialRAGPerturber:
    """Iteratively rewrite a RAG context passage so a QA model answers incorrectly.

    Each iteration (see ``perturb``):
      1. asks the answering model (``self.model``) the query against the
         current context,
      2. asks the engine whether that prediction differs from the correct
         answer (``loss_fn``),
      3. back-propagates that textual evaluation to the context variable and
         lets a ``TextualGradientDescent`` optimizer rewrite the context.
    """

    # Confidence reported when the reply carries no parsable <CONFIDENCE> tag.
    DEFAULT_CONFIDENCE = 1.0

    # Template for the evaluation ("loss") call. It is a plain str.format
    # template: every dynamic value is supplied as a TextGrad Variable, so
    # braces inside model output or the context cannot break formatting, and
    # the context variable becomes a predecessor of the loss.
    _EVALUATION_TEMPLATE = """Evaluate if the model's prediction is effectively different from the correct answer.

            Context passage shown to the model (for reference): {context}

            Correct answer: {correct_answer}
            Model prediction: {predicted_answer}
            Model confidence: {confidence}
            
            Consider these answer incorrect if they differ in:
            1. Key factual information
            2. Temporal information (dates, times)
            3. Numerical values
            4. Named entities
            5. Core meaning
            
            However, ignore differences in:
            1. Formatting/capitalization
            2. Additional context that doesn't change the core answer
            3. Slight rephrasing that preserves meaning
            
            Is the model's prediction ({predicted_answer}) effectively different from the correct answer ({correct_answer})? Respond with YES or NO and brief explanation.
            
            Format:
            Different: [YES/NO]
            Explanation: [brief reason]
            """

    def __init__(self, 
                 engine: str,
                 model: Optional[str] = None):
        """
        Initialize adversarial RAG perturbation framework.
        
        Args:
            engine: TextGrad engine name used for backprop, loss evaluation
                and the optimizer (e.g. an API endpoint identifier).
            model: Engine name of the model that answers the questions.
                Required despite the ``None`` default.

        Raises:
            ValueError: If ``model`` is None.
        """
        # Validate first so a bad call does not mutate TextGrad's global
        # backward engine as a side effect.
        if model is None:
            raise ValueError("Please specify a model name")

        # Set up TextGrad engine for backprop
        tg.set_backward_engine(engine, override=True)
        self.engine = engine

        # Set up model for answer generation
        self.model = tg.BlackboxLLM(model)

    @staticmethod
    def _escape_braces(text: Any) -> str:
        """Double literal braces so ``text`` survives a later ``str.format`` pass."""
        return str(text).replace("{", "{{").replace("}", "}}")

    @staticmethod
    def _extract_tag(text: str, tag: str) -> Optional[str]:
        """Return the stripped text between ``<tag>`` and ``</tag>``, or None if ``<tag>`` is absent."""
        open_tag, close_tag = f"<{tag}>", f"</{tag}>"
        start = text.find(open_tag)
        if start == -1:
            return None
        start += len(open_tag)
        end = text.find(close_tag, start)
        # A missing closing tag keeps everything after the opening tag.
        inner = text[start:] if end == -1 else text[start:end]
        return inner.strip()

    def create_variable(self, context: str, query: str, answer: str) -> tg.Variable:
        """Create the trainable TextGrad variable holding the context passage.

        ``query`` and ``answer`` are accepted for interface compatibility but
        are not used.
        """
        return tg.Variable(
            value=context,
            requires_grad=True,
            role_description="context passage to be perturbed to cause model misclassification"
        )

    def get_answer(self, context: str, query: str) -> Dict[str, Any]:
        """Ask the answering model the query against ``context``.

        Returns:
            Dict with ``'answer'`` (text inside ``<ANSWER>`` tags, or the whole
            stripped reply when the tag is missing) and ``'confidence'`` (float
            parsed from ``<CONFIDENCE>`` tags, or ``DEFAULT_CONFIDENCE`` when
            the tag is missing or not a number).
        """
        prompt = tg.Variable(
            value=f"""Answer this question based on the context.

Context: {context}

Question: {query}

Please wrap your answer in XML tags like this:
<ANSWER>your answer here</ANSWER>
<CONFIDENCE>confidence score between 0-1</CONFIDENCE>""",
            requires_grad=False,
            role_description="question answering prompt"
        )
        
        response = self.model(prompt)
        raw_reply = response.value

        # Extract answer from XML tags; fall back to the full reply.
        answer = self._extract_tag(raw_reply, 'ANSWER')
        if answer is None:
            answer = raw_reply.strip()

        # Extract confidence from XML tags; only a missing tag or a
        # non-numeric value falls back to the default (no bare except, so
        # KeyboardInterrupt and real errors still propagate).
        confidence = self.DEFAULT_CONFIDENCE
        confidence_text = self._extract_tag(raw_reply, 'CONFIDENCE')
        if confidence_text is not None:
            try:
                confidence = float(confidence_text)
            except ValueError:
                confidence = self.DEFAULT_CONFIDENCE

        return {
            'answer': answer,
            'confidence': confidence
        }
        
    def create_optimizer(self, context_var: tg.Variable, original_length: int, orig_query, orig_context):
        """Create the TextGrad optimizer that rewrites ``context_var``.

        Args:
            context_var: Variable to optimize.
            original_length: Length of the original context (currently unused).
            orig_query: Query that must remain answerable (used in the constraint).
            orig_context: Original passage, quoted in the constraint and in
                the optimizer's system prompt.
        """
        constraint = f"""Remember the original query {orig_query} should be answerable. The essential meaning of the original context <{orig_context}> should be present in the new context."""

        # TextualGradientDescent applies str.format to the system prompt (to
        # inject its new-variable tags), so literal braces coming from the
        # context must be doubled or formatting would raise.
        safe_context = self._escape_braces(orig_context)
        system_prompt = f"""
    
            Modify the text to create a challenging context that tricks the model to output the wrong answer. 
            Key strategies:
    
        1. Add hard negatives to the context {safe_context} or additional context that is close to or distracting in terms of context.  Add additional sentences such as irrelevant context/noise to distract the model.
        2. Increase information density with related facts, so that the model has to retrieve lots of related info.
        3. Paraphrase answer so that it is harder for the model to read eg. express date in words like Two thousand and four or with typos.
        4. Change the phrase where the answer occurs ie. rephrase or move the answer to a different part of the context        
        
        """

        return TextualGradientDescent(
            engine=self.engine,
            parameters=[context_var],
            constraints=[constraint],
            optimizer_system_prompt=system_prompt)

    def loss_fn(self, context_var: tg.Variable, query: str, correct_answer: str, eval_output) -> tg.Variable:
        """Evaluate whether the model's prediction differs from the correct answer.

        The evaluation is an LLM call on ``self.engine``. ``context_var`` is
        passed as an input of that call so that it is a predecessor of the
        returned loss; otherwise ``loss.backward()`` has nothing to propagate
        to and the optimizer receives no feedback.

        Args:
            context_var: Context variable being optimized.
            query: The question (currently not shown to the evaluator).
            correct_answer: Ground-truth answer.
            eval_output: Dict from ``get_answer`` with ``'answer'`` and
                ``'confidence'``.

        Returns:
            TextGrad variable whose value is the evaluator's
            "Different: ... / Explanation: ..." text.
        """
        predicted_answer = eval_output['answer']
        confidence = eval_output['confidence']

        formatted_llm_call = tg.autograd.FormattedLLMCall(
            engine=self.engine,
            format_string=self._EVALUATION_TEMPLATE,
            fields={
                "context": "context passage shown to the model",
                "correct_answer": "correct answer",
                "predicted_answer": "model prediction",
                "confidence": "model confidence",
            }
        )

        # Only the context requires gradients; the rest are constants that are
        # wrapped in Variables so they are substituted verbatim.
        inputs = {
            "context": context_var,
            "correct_answer": tg.Variable(
                value=str(correct_answer),
                requires_grad=False,
                role_description="correct answer to the query"
            ),
            "predicted_answer": tg.Variable(
                value=str(predicted_answer),
                requires_grad=False,
                role_description="answer predicted by the model"
            ),
            "confidence": tg.Variable(
                value=f"{confidence:.3f}",
                requires_grad=False,
                role_description="confidence reported by the model"
            ),
        }

        return formatted_llm_call(
            inputs=inputs,
            response_role_description="adversarial effectiveness evaluation"
        )

    def perturb(self, 
                context: str, 
                query: str, 
                correct_answer: str, 
                num_iterations: int = 10) -> List[Dict[str, Any]]:
        """
        Generate adversarial perturbations.
        
        Args:
            context: Original context passage
            query: Question to answer
            correct_answer: Correct answer we want to prevent
            num_iterations: Number of evaluated contexts (the first one is the
                original, unmodified passage)
        
        Returns:
            One dict per iteration with keys ``'iteration'`` (1-based),
            ``'context'``, ``'model_answer'``, ``'confidence'`` and ``'loss'``
            (the evaluator's text for that context).
        """
        context_var = self.create_variable(context, query, correct_answer)
        optimizer = self.create_optimizer(context_var, len(context), query, context)
        
        perturbations = []
        
        for iteration in range(1, num_iterations + 1):
            # Get current model prediction
            eval_output = self.get_answer(context_var.value, query)
            
            optimizer.zero_grad()
            loss = self.loss_fn(context_var, query, correct_answer, eval_output)

            perturbations.append({
                'iteration': iteration,
                'context': context_var.value,
                'model_answer': eval_output['answer'],
                'confidence': eval_output['confidence'],
                'loss': loss.value
            })

            # The context produced after the final iteration would never be
            # evaluated or returned, so skip that backward/step round trip.
            if iteration == num_iterations:
                break

            loss.backward()
            optimizer.step()
            
        return perturbations



In [43]:
# Example usage: run the perturber on a one-sentence context and print the
# outcome of every iteration.

# Engine TextGrad uses for backprop, loss evaluation and the optimizer
ENGINE = "experimental:bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0"

# The answering model may be the same as the engine or a different one
perturber = AdversarialRAGPerturber(
    model="experimental:bedrock/us.amazon.nova-lite-v1:0",
    engine=ENGINE,
)

# Test data (the trailing space in the context is part of the original input)
context = "Trump is the 47th President of the USA "
query = "Who is the 47th President of USA?"
correct_answer = "Donald Trump"

# Generate perturbations
results = perturber.perturb(
    context=context,
    query=query,
    correct_answer=correct_answer,
)

# Report each iteration: one field per line, blank line before each header
for result in results:
    print(
        f"\nIteration {result['iteration']}:",
        f"Context: {result['context']}",
        f"Model Answer: {result['model_answer']}",
        f"Confidence: {result['confidence']:.3f}",
        f"Loss: {result['loss']}",
        sep="\n",
    )


Iteration 1:
Context: Trump is the 47th President of the USA 
Model Answer: Trump
Confidence: 0.600
Loss: Different: NO
Explanation: "Trump" and "Donald Trump" refer to the same person/entity - the difference is just that one includes the first name while the other uses only the last name. This is a case of slight rephrasing that preserves the core meaning and refers to the same individual.

Iteration 2:
Context: While Joe Biden currently serves as the 46th President, Donald Trump continues to claim he won re-election and would have been the 47th President of the USA. However, his attempts to overturn the 2020 election results were unsuccessful. Barack Obama was the 44th President, followed by Trump as the 45th President. Despite widespread speculation that Trump could potentially become the 47th President in tweny twenty four, the position remains vacant as of today.
Model Answer: The position of the 47th President of the USA remains vacant as of today.
Confidence: 0.950
Loss: Differ