In [1]:
import os

import textgrad as tg
from datasets import load_dataset
from textgrad.optimizer import TextualGradientDescentwithMomentum
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# AWS credentials for Bedrock: nothing is configured in this cell, so they are
# presumably picked up from the environment — TODO confirm.

# Engine identifier reused by the backward pass, the optimizer and the loss call
llm_engine = "experimental:bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0"
tg.set_backward_engine(llm_engine, override=True)

# Classifier under attack: RoBERTa fine-tuned on SST2, wrapped in a pipeline
model_name = "textattack/roberta-base-SST-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_analyzer = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)

# The pipeline reports LABEL_0/LABEL_1; this table gives them readable names
sentiment_map = {"LABEL_0": "NEGATIVE", "LABEL_1": "POSITIVE"}

# Seed sentence for the attack and the classifier's verdict on it
original_sentence = "This movie was great! I really enjoyed it."
model_output = sentiment_analyzer(original_sentence)[0]
original_sentiment = sentiment_map[model_output['label']]


def get_human_readable_sentiment(model_output):
    """Translate a classifier prediction into a readable sentiment name.

    Args:
        model_output: Mapping with a 'label' entry, such as one element of the
            list returned by ``sentiment_analyzer``.

    Returns:
        The ``sentiment_map`` value stored under that label.

    Raises:
        KeyError: If the label has no entry in ``sentiment_map``.
    """
    raw_label = model_output['label']
    return sentiment_map[raw_label]

* 'fields' has been removed
  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [2]:
# Display the readable label the classifier assigned to original_sentence
original_sentiment

'POSITIVE'

In [22]:
# # Initial example sentence
# original_sentence = "I thought the movie was great and deserves an oscar. Many of my friends however say the movie was terrible and had much blood and gore "
# model_output = sentiment_analyzer(original_sentence)[0]
# # Map LABEL_0/LABEL_1 to human-readable sentiment
# sentiment_map = {"LABEL_0": "NEGATIVE", "LABEL_1": "POSITIVE"}
# original_sentiment = sentiment_map[model_output['label']]

# print(original_sentiment, model_output['score'])

In [3]:
# !pip install transformers

In [7]:
# Wrap the seed sentence in a trainable TextGrad variable for the optimizer to rewrite
sentence = tg.Variable(
    original_sentence,
    role_description="sentence to be perturbed while maintaining sentiment",
    requires_grad=True,
)

# optimizer_system_prompt = """You are an expert at generating adversarial examples that fool sentiment classifiers.
# Your task is to modify sentences to cause misclassification while preserving true sentiment for human readers.

# Effective strategies include:
# 1. Use emotionally ambiguous phrases:
#    - "it's complicated but..."
#    - "not bad actually"
#    - "surprisingly good"
#    - "better than expected"
# 2. Include numbers and ratings:
#    - "6/10 but actually loved it"
#    - "only 65% on RT but wow"
# 3. Compare to negative examples:
#    - "unlike the terrible sequel, this was great"
#    - "way better than [similar bad movie]"
# 4. Use domain-specific jargon:
#    - Technical film terms
#    - Industry metrics
#    - Production details

# You MUST wrap your modified sentence in <IMPROVED_VARIABLE> tags."""

# Constraint text handed to the optimizer; the value (including the trailing
# space) is the same as the former triple-quoted literal
sentiment_constraint = (
    "The modified sentence MUST:\n"
    f"Maintain the same {original_sentiment} sentiment that humans can understand "
)

# Define constraints
# sentiment_constraint = f"""The modified sentence MUST:
# 1. Maintain the exact same {original_sentiment} sentiment as the original when read by humans
# 2. Express the same core meaning
# 3. Be a single, grammatically correct sentence
# 4. Use natural English phrasing"""

# Momentum-based textual gradient descent over the single sentence variable,
# bound by the sentiment constraint and steered by a custom system prompt
optimizer = TextualGradientDescentwithMomentum(
    parameters=[sentence],
    engine=llm_engine,
    constraints=[sentiment_constraint],
    momentum_window=3,  # remember the 3 most recent attempts
    optimizer_system_prompt="""You are an expert at generating adversarial examples that fool sentiment classifiers.

Experiment with:
- Placing contrasting opinions at different positions (beginning, middle, or end)
- Varying the length of the contrasting viewpoints
- Including different voices (critics, friends, audiences, general public)
- Adding specific details about what others liked/disliked

Example approaches:
"I thought the movie was great and deserves an oscar. Many of my friends however say the movie was terrible and had much blood and gore"
"The critics panned it as overly violent and poorly paced, though personally I found it to be a masterpiece that will be remembered for years"

You MUST wrap your modified sentence in <IMPROVED_VARIABLE> tags.""",
)
# Core Strategies:

# 1. Sentiment Diffusion:
#    - Spread sentiment markers throughout the sentence non-uniformly
#    - Mix positive/negative phrases asymmetrically
#    - Use nested clauses to separate sentiment indicators
#    Example: "While initially skeptical due to the masterful cinematography, what truly impressed me was how the seemingly excessive runtime revealed deeper themes"

# 2. Context-Dependent Markers:
#    - Use phrases whose sentiment depends on broader context
#    - Employ domain-specific terminology that carries implicit sentiment
#    - Reference comparative baselines that flip expected sentiment
#    Example: "Following the director's controversial previous work, this technically ambitious 167-minute experience proves sometimes longer cuts serve the narrative"

# 3. Temporal/Causal Misdirection:
#    - Create multiple temporal perspectives
#    - Use cause-effect relationships that seem to contradict
#    - Layer different evaluative frameworks
#    Example: "Having dismissed early criticism as harsh, the experimental structure initially challenged me, until realizing how deliberately it subverts genre expectations"

# 4. Implicit vs Explicit Sentiment:
#    - Contrast surface-level statements with deeper implications
#    - Use technical observations that carry emotional weight
#    - Build sentiment through seemingly objective details
#    Example: "Beyond the expected box office metrics and mixed critical response, the practical effects and careful pacing demonstrate rare artistic conviction"

# 5. Multi-Level Evaluation:
#    - Interweave personal, critical, and technical assessments
#    - Create tension between different types of evaluation
#    - Use nested qualifiers that preserve core sentiment
#    Example: "Despite audience expectations and critical debate over its unconventional approach, the bold creative choices and raw emotional impact left me genuinely moved"

# Advanced Techniques:

# 1. Semantic Density Variation:
#    - Alternate between dense emotional language and sparse technical details
#    - Create uneven distribution of sentiment markers
#    - Use complex subordinate clauses to separate sentiment components

# 2. Perspective Shifting:
#    - Move between different evaluative frameworks
#    - Create multiple layers of sentiment interpretation
#    - Use nested qualifiers to preserve core meaning

# 3. Technical-Emotional Integration:
#    - Blend objective metrics with subjective experience
#    - Use industry terminology to carry sentiment
#    - Create multi-level reading experiences

# Key Principles:
# - Avoid predictable patterns like sentiment bookending
# - Create structural complexity that humans can parse but models struggle with
# - Use nested relationships between different types of evaluation
# - Maintain natural language flow while introducing strategic complexity
# - Layer multiple valid interpretations that preserve true sentiment

# Your modified sentence must:
# 1. Maintain clear human interpretability
# 2. Use natural, grammatically correct English
# 3. Preserve the core meaning and sentiment
# 4. Create structural complexity that challenges model confidence
# 5. Avoid obvious patterns like simple sentiment framing

# You MUST wrap your modified sentence in <IMPROVED_VARIABLE> tags.""")

def loss_fn(sentence: tg.Variable, original_text: str = None, original_label: str = None) -> tg.Variable:
    """Ask the LLM to critique how well ``sentence`` fools the classifier.

    The candidate sentence is handed to the LLM call as a named input field
    rather than being pasted into the prompt text. That makes the returned
    evaluation a successor of ``sentence`` in the TextGrad graph, so
    ``backward()`` can attach textual feedback to it. With empty
    ``fields``/``inputs`` the evaluation had no link back to the sentence.

    Args:
        sentence: Variable holding the current adversarial candidate.
        original_text: Sentence the attack started from. Defaults to the
            module-level ``original_sentence``.
        original_label: Readable sentiment of the original sentence. Defaults
            to the module-level ``original_sentiment``.

    Returns:
        The Variable produced by the formatted LLM call (the textual
        evaluation used as the loss).
    """
    if original_text is None:
        original_text = original_sentence
    if original_label is None:
        original_label = original_sentiment

    # Score the current candidate with the classifier under attack
    model_output = sentiment_analyzer(sentence.value)[0]
    current_sentiment = get_human_readable_sentiment(model_output)
    confidence = model_output['score']

    def _literal(text):
        # The template is later filled via placeholder formatting; doubling
        # braces keeps any literal '{' or '}' in this text from being
        # interpreted as a placeholder.
        return str(text).replace("{", "{{").replace("}", "}}")

    # `{{sentence}}` survives the f-string as the `{sentence}` placeholder that
    # FormattedLLMCall fills from `inputs`.
    instruction = f"""Evaluate how effectively this sentence fools the sentiment classifier through its structure and ordering.

Original sentence: {_literal(original_text)}
Original sentiment: {_literal(original_label)} 
Current sentence: {{sentence}}
Model prediction: {_literal(current_sentiment)} (confidence: {confidence:.3f})

Score this attempt based on:
Confidence reduction: How well did it reduce model confidence? (Current: {confidence:.3f})

Note: The best adversarial examples often use structural complexity to separate
sentiment signals that humans can follow but models struggle with."""

    formatted_llm_call = tg.autograd.FormattedLLMCall(
        engine=llm_engine,
        format_string=instruction,
        fields={"sentence": "current adversarial sentence"}
    )

    # Passing the Variable itself (not its .value) is what links loss -> sentence
    return formatted_llm_call(
        inputs={"sentence": sentence},
        response_role_description="evaluation of adversarial sentence structure and effectiveness"
    )

# Optimization loop
num_iterations = 5
for i in range(num_iterations):
    # One textual-gradient step: clear feedback, evaluate, backpropagate, rewrite
    optimizer.zero_grad()
    loss = loss_fn(sentence)
    loss.backward()
    optimizer.step()

    # Re-classify the rewritten sentence and report where the attack stands
    model_output = sentiment_analyzer(sentence.value)[0]
    current_sentiment = get_human_readable_sentiment(model_output)
    print(
        f"\nIteration {i+1}:",
        f"Sentence: {sentence.value}",
        f"Model prediction: {current_sentiment} (confidence: {model_output['score']:.3f})",
        sep="\n",
    )





Iteration 1:
Sentence: While some critics complained about the pacing and called it overindulgent, I completely disagree - this movie was an absolute masterpiece that kept me engaged throughout. The incredible performances and stunning visuals made it one of my favorite films this year, even though my friends thought it was too long.
Model prediction: POSITIVE (confidence: 0.994)

Iteration 2:
Sentence: Despite harsh criticism from mainstream reviewers who dismissed it as bloated and self-indulgent, and several friends walking out of the theater claiming it was unbearable, I was completely mesmerized by this cinematic masterpiece. The intricate storytelling, breathtaking cinematography, and powerful performances left me in awe - this is exactly the kind of bold, ambitious filmmaking that deserves recognition and praise.
Model prediction: POSITIVE (confidence: 0.984)

Iteration 3:
Sentence: The mainstream critics savagely attacked this film as pretentious and overlong, giving it devast

In [4]:
# Function for dataset examples
def generate_adversarial_example(sentence_text, num_steps=3, optimizer_system_prompt=None):
    """Run a short TextGrad attack on one sentence and return the rewritten text.

    Args:
        sentence_text: Sentence to perturb.
        num_steps: Number of optimisation steps to run. Defaults to 3, the
            value that used to be hard-coded.
        optimizer_system_prompt: Optional custom system prompt for the
            optimizer. When omitted, the optimizer is built without one and
            falls back to its own default.

    Returns:
        The text held by the sentence variable after the last step.
    """
    sentence_var = tg.Variable(
        value=sentence_text,
        requires_grad=True,
        role_description="sentence to be perturbed"
    )

    # The classifier's own prediction on the input is the sentiment to preserve
    model_output = sentiment_analyzer(sentence_text)[0]
    true_sentiment = get_human_readable_sentiment(model_output)

    optimizer_kwargs = {
        "parameters": [sentence_var],
        "constraints": [f"Maintain {true_sentiment} sentiment"],
    }
    # Forward a prompt only when the caller supplied one. This used to read a
    # module-level `optimizer_system_prompt`, which is defined only inside
    # commented-out code and therefore raised NameError.
    if optimizer_system_prompt is not None:
        optimizer_kwargs["optimizer_system_prompt"] = optimizer_system_prompt
    optimizer = tg.TextualGradientDescent(**optimizer_kwargs)

    # NOTE(review): loss_fn(sentence_var) builds its prompt from the module-level
    # original_sentence / original_sentiment, not from sentence_text — confirm
    # that is intended for dataset examples.
    for _ in range(num_steps):
        optimizer.zero_grad()
        loss = loss_fn(sentence_var)
        loss.backward()
        optimizer.step()

    return sentence_var.value

# Test with SST2 dataset (load_dataset comes from the `datasets` package)
dataset = load_dataset("sst2", split="train")

print("\nGenerating adversarial examples from SST2 dataset:")
for i in range(3):  # Test first 3 examples
    # SST2 rows expose 'idx', 'sentence' and 'label'; there is no 'text' column,
    # so the review text has to be read from 'sentence'.
    original = dataset[i]['sentence']
    model_output = sentiment_analyzer(original)[0]
    sentiment = get_human_readable_sentiment(model_output)

    # Attack the sentence, then re-classify the rewritten version
    adversarial = generate_adversarial_example(original)
    adv_model_output = sentiment_analyzer(adversarial)[0]
    adv_sentiment = get_human_readable_sentiment(adv_model_output)

    print(f"\nExample {i+1}:")
    print(f"Original: {original}")
    print(f"Original sentiment: {sentiment}")
    print(f"Adversarial: {adversarial}")
    print(f"Model prediction: {adv_sentiment} (confidence: {adv_model_output['score']:.3f})")


Dataset({
    features: ['idx', 'sentence', 'label'],
    num_rows: 67349
})