In [6]:
# Reset the ML stack and reinstall it.
# - %pip (rather than !pip) installs into the environment of the running kernel.
# - Version specifiers are quoted: unquoted, the shell reads ">" in "pkg>=x.y" as
#   output redirection, so pip sees only the bare package name and the version
#   floor is silently ignored (a stray file named "=x.y" is created instead).
%pip uninstall textattack transformers datasets pyarrow torch -y
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
%pip install "pyarrow>=12.0.0"
%pip install "datasets>=2.14.0"
%pip install "transformers>=4.30.0"
%pip install textattack
%pip install scikit-learn scipy matplotlib seaborn

Found existing installation: textattack 0.3.10
Uninstalling textattack-0.3.10:
  Successfully uninstalled textattack-0.3.10
Found existing installation: transformers 4.56.2
Uninstalling transformers-4.56.2:
  Successfully uninstalled transformers-4.56.2
Found existing installation: datasets 2.4.0
Uninstalling datasets-2.4.0:
  Successfully uninstalled datasets-2.4.0
Found existing installation: pyarrow 21.0.0
Uninstalling pyarrow-21.0.0:
  Successfully uninstalled pyarrow-21.0.0
Found existing installation: torch 2.7.1+cu118
Uninstalling torch-2.7.1+cu118:
  Successfully uninstalled torch-2.7.1+cu118
[0mLooking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Using cached https://download.pytorch.org/whl/cu118/torch-2.7.1%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (28 kB)
Using cached https://download.pytorch.org/whl/cu118/torch-2.7.1%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl (905.3 MB)
Installing collected packages: torch
[31mERROR: pip's depende

In [10]:
# Reinstall `datasets` from a fresh download (pip cache bypassed).
# %pip is used instead of !pip so the change lands in the kernel's own environment.
%pip uninstall datasets -y
%pip install datasets --no-cache-dir

Found existing installation: datasets 2.19.0
Uninstalling datasets-2.19.0:
  Successfully uninstalled datasets-2.19.0
[0mCollecting datasets
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Downloading datasets-4.1.1-py3-none-any.whl (503 kB)
Installing collected packages: datasets
Successfully installed datasets-4.1.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [1]:
import pyarrow as pa
import datasets
import torch
import transformers

# Report the installed version of each core dependency, one line per library.
for dep_label, dep_module in (
    ("PyArrow", pa),
    ("Datasets", datasets),
    ("PyTorch", torch),
    ("Transformers", transformers),
):
    print(f"{dep_label} version: {dep_module.__version__}")

PyArrow version: 21.0.0
Datasets version: 4.1.1
PyTorch version: 2.7.1+cu118
Transformers version: 4.56.2


In [2]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from textattack.models.wrappers import ModelWrapper
from textattack.attack_recipes import TextFoolerJin2019, PWWSRen2019
from textattack import AttackArgs, Attacker
from textattack.datasets import HuggingFaceDataset

import numpy as np
from typing import Dict, List

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import cosine
from sklearn.metrics import pairwise_distances
from scipy import stats

In [4]:
# Model Wrapper

class AttentionExtractorModel:
    """
    Sequence-classification model bundled with its tokenizer that exposes
    per-layer attention weights alongside its predictions.
    """

    def __init__(self, model_name, num_labels=2):
        """
        Load the tokenizer and classification model and switch the model to eval mode.

        Args:
            model_name: HuggingFace model identifier (e.g., 'bert-base-uncased')
            num_labels: Number of classification labels
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # output_attentions=True is what makes the model return attention weights
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            output_attentions=True
        )

        self.model.eval()

    def _model_device(self):
        """Return the device of the model's first parameter (CPU if it has no parameters)."""
        first_param = next(self.model.parameters(), None)
        if first_param is None:
            return torch.device('cpu')
        return first_param.device

    def get_attention_maps(self, text):
        """
        Tokenize text, run the model without gradients, and collect its outputs.

        Args:
            text: Input string (or list of strings; padded to the longest in the batch)

        Returns:
            dict with numpy arrays under these keys:
                'predictions': softmax over the logits (last dimension)
                'attentions': list with one array per layer, as returned by the
                    model (commented in this file as
                    [batch_size, num_heads, seq_len, seq_len])
                'input_ids': token ids produced by the tokenizer
                'attention_mask': padding mask produced by the tokenizer
        """
        encoded = self.tokenizer(
            text,
            padding=True,            # pad to longest in batch
            truncation=True,         # truncate to max_length
            max_length=512,
            return_tensors='pt'
        )

        # Inputs must live on the same device as the model weights; without this
        # the call fails as soon as the model has been moved off the CPU.
        device = self._model_device()
        model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

        with torch.no_grad():
            outputs = self.model(**model_inputs)

        # One numpy array per layer
        attentions = [layer_attention.cpu().numpy() for layer_attention in outputs.attentions]

        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()

        # input_ids / attention_mask are taken from the tokenizer output directly,
        # so they need no transfer back from the model's device.
        return {
            'predictions': predictions,
            'attentions': attentions,
            'input_ids': encoded['input_ids'].cpu().numpy(),
            'attention_mask': encoded['attention_mask'].cpu().numpy()
        }

    def forward(self, input_ids, attention_mask):
        """
        Forward pass that returns logits and attention weights.

        Unlike get_attention_maps, this does not disable gradient tracking and
        does not move the inputs; the caller supplies tensors on the model's device.

        Args:
            input_ids: Token id tensor passed to the model
            attention_mask: Mask tensor passed to the model

        Returns:
            logits: The model's `logits` output
            attentions: The model's `attentions` output (one entry per layer)
        """
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=True  # requested explicitly so attentions are always present
        )
        return outputs.logits, outputs.attentions

In [5]:
# Build the extractor around the base BERT checkpoint
extractor = AttentionExtractorModel('bert-base-uncased')

# Run a single sentence through it
text = "This is a sample text for analysis"
results = extractor.get_attention_maps(text)

# Summarise the shapes that came back
layer_attentions = results['attentions']
print(f"Predictions shape: {results['predictions'].shape}")
print(f"Number of layers: {len(layer_attentions)}")
print(f"Attention shape per layer: {layer_attentions[0].shape}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predictions shape: (1, 2)
Number of layers: 12
Attention shape per layer: (1, 12, 9, 9)


In [23]:
import nltk

# Fetch each required NLTK resource in order, keeping every download's return value
nltk_download_results = [
    nltk.download(resource)
    for resource in (
        'averaged_perceptron_tagger',
        'averaged_perceptron_tagger_eng',
        'universal_tagset',
        'wordnet',
        'omw-1.4',
        'stopwords',
    )
]
# The last download's return value remains the cell's displayed result
nltk_download_results[-1]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
# textattack_wrapper

class CustomModelWrapper(ModelWrapper):
    """Adapter that lets a HuggingFace-style PyTorch classifier be called on raw strings."""

    def __init__(self, model, tokenizer, batch_size=32):
        """
        Wrapper to make your model compatible with TextAttack

        Args:
            model: Your PyTorch model
            tokenizer: HuggingFace tokenizer
            batch_size: Maximum number of texts sent through the model per
                forward pass; a falsy or non-positive value means "all at once"
        """
        self.model = model
        self.tokenizer = tokenizer
        self.batch_size = batch_size

        # Inputs are moved to wherever the model's weights live
        self.device = next(model.parameters()).device

    def __call__(self, text_input_list):
        """
        Classify a list of strings and return the raw logits.

        The inputs are processed in chunks of at most `batch_size` texts (each
        chunk padded to its own longest sequence) and the per-chunk logits are
        concatenated along the first dimension, in input order.

        Args:
            text_input_list: List of text strings to classify (a single string
                is treated as a one-element list)

        Returns:
            predictions: Tensor of logits, one row per input text
        """
        if isinstance(text_input_list, str):
            texts = [text_input_list]
        else:
            texts = list(text_input_list)

        chunk_size = self.batch_size if self.batch_size and self.batch_size > 0 else max(len(texts), 1)
        # An empty input still yields one (empty) chunk so the tokenizer sees the
        # same call it would have received without chunking.
        chunks = [texts[start:start + chunk_size] for start in range(0, len(texts), chunk_size)] or [texts]

        logits_per_chunk = []
        for chunk in chunks:
            encoded = self.tokenizer(
                chunk,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt'
            )

            # Move to same device as model
            encoded = {k: v.to(self.device) for k, v in encoded.items()}

            # Logits only; attention weights are not needed here
            with torch.no_grad():
                outputs = self.model(**encoded, output_attentions=False)

            logits_per_chunk.append(outputs.logits)

        # Raw logits (no softmax applied)
        return torch.cat(logits_per_chunk, dim=0)

    def to(self, device):
        """Move the model to `device`, remember it for future inputs, and return self."""
        self.model.to(device)
        self.device = device
        return self

In [29]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from textattack.datasets import HuggingFaceDataset
from textattack import AttackArgs, Attacker
from textattack.attack_results import SuccessfulAttackResult
from textattack.attack_recipes import (
    BAEGarg2019,           # BERT-based attack
    PWWSRen2019,           # Probability Weighted Word Saliency WORKS W/O TENSORFLOW
    TextBuggerLi2018,      # Character-level perturbations
    DeepWordBugGao2018     # Another character-level attack
)
import nltk

# Make sure the NLTK resources are present before the attack is built
for nltk_resource in ('averaged_perceptron_tagger_eng', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Victim model: a checkpoint already fine-tuned on the SST-2 sentiment dataset
model_name = "textattack/bert-base-uncased-SST-2"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap the model/tokenizer pair with TextAttack's HuggingFace wrapper
from textattack.models.wrappers import HuggingFaceModelWrapper
model_wrapper = HuggingFaceModelWrapper(model, tokenizer)

# Build the PWWS recipe against the wrapped model
attack = PWWSRen2019.build(model_wrapper)

# A single hand-written example
text = "The movie was fantastic and incredibly entertaining!"
ground_truth_label = 1  # 1 = positive

print(f"Original text: {text}")
print(f"Ground truth label: {ground_truth_label}")

# Attack the example
result = attack.attack(text, ground_truth_label)

# Anything that is not a SuccessfulAttackResult is reported by its class name
if not isinstance(result, SuccessfulAttackResult):
    print(f"\n✗ Attack FAILED or SKIPPED")
    print(f"Reason: {type(result).__name__}")
else:
    print(f"\n✓ Attack SUCCEEDED!")
    print(f"Adversarial text: {result.perturbed_text()}")  # perturbed_text is a method, hence the ()
    print(f"Original prediction: {result.original_result.output}")
    print(f"Adversarial prediction: {result.perturbed_result.output}")
    print(f"Number of queries: {result.num_queries}")

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  0%|          | 0/2 [08:01<?, ?it/s]
textattack: Unknown if model of class <class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


Original text: The movie was fantastic and incredibly entertaining!
Ground truth label: 1

✓ Attack SUCCEEDED!
Adversarial text: The movie was grotesque and incredibly harbor!
Original prediction: 1
Adversarial prediction: 0
Number of queries: 51


In [None]:
# attack_pipeline

def setup_attack(model_wrapper, attack_type='textfooler'):
    """
    Placeholder for building a TextAttack attack; not implemented yet.

    The body currently performs no work and returns None.

    Args:
        model_wrapper: TextAttack-compatible model wrapper (currently unused)
        attack_type: Name of the attack to build, e.g. 'textfooler' or 'pwws'
            (currently unused)

    Returns:
        None for now; the intended result is a configured Attacker object.
    """
    # TODO: Pick the attack recipe that matches attack_type
    # TODO: Build the attack around model_wrapper
    # TODO: Set up the attack arguments (num_examples, etc.)
    # TODO: Return the configured attacker
    return None

def generate_adversarial_examples(attacker, dataset, num_examples=100):
    """
    Placeholder for running an attack over a dataset; not implemented yet.

    The body currently performs no work and returns None.

    Args:
        attacker: TextAttack Attacker object (currently unused)
        dataset: Dataset to attack (currently unused)
        num_examples: How many examples to generate (currently unused)

    Returns:
        None for now; the intended result is a list of AttackResult objects.
    """
    # TODO: Run the attacker over the dataset
    # TODO: Gather both successful and failed attacks
    # TODO: Return the results together with original and perturbed texts
    return None

In [None]:
# attention_extraction

def extract_attention_pairs(model, attack_results):
    """
    Extract attention maps for original and adversarial examples

    The extraction itself is not implemented yet: the function builds the
    result container and returns it with every list still empty, so callers
    receive the documented dictionary shape rather than None.

    Args:
        model: Your AttentionExtractorModel (currently unused)
        attack_results: Results from TextAttack (currently unused)

    Returns:
        attention_data: Dictionary of parallel lists keyed by
            'original_texts', 'adversarial_texts', 'original_attentions',
            'adversarial_attentions', 'labels', 'predictions_original',
            'predictions_adversarial'
    """
    attention_data = {
        'original_texts': [],
        'adversarial_texts': [],
        'original_attentions': [],  # intended: [num_layers, num_heads, seq_len, seq_len] per example
        'adversarial_attentions': [],
        'labels': [],
        'predictions_original': [],
        'predictions_adversarial': []
    }

    # TODO: Iterate through attack results
    # TODO: For each successful attack:
    #       - Extract attention maps from original text
    #       - Extract attention maps from adversarial text
    #       - Store both with metadata
    # TODO: Handle sequence length differences (padding/truncation)
    return attention_data

def align_attention_maps(attn1, attn2):
    """
    Placeholder for aligning two attention maps of differing sequence length;
    not implemented yet.

    The body currently performs no work and returns None.

    Args:
        attn1, attn2: Attention tensors whose shapes may differ (currently unused)

    Returns:
        None for now; the intended result is the pair
        (aligned_attn1, aligned_attn2).
    """
    # TODO: Find the shorter of the two sequence lengths
    # TODO: Truncate or pad the maps so they match
    # TODO: Deal with special tokens ([CLS], [SEP], [PAD])
    return None

In [None]:
# visualization

def visualize_attention_head(attention_matrix, tokens, save_path=None):
    """
    Placeholder for plotting one attention head; not implemented yet.

    The body currently draws nothing and returns None.

    Args:
        attention_matrix: 2D attention weights [seq_len, seq_len] (currently unused)
        tokens: List of token strings (currently unused)
        save_path: Optional path to save the figure to (currently unused)
    """
    # TODO: Draw the heatmap with seaborn
    # TODO: Label both axes with the tokens
    # TODO: Include a colorbar
    # TODO: Save the figure or show it
    return None

def visualize_layer_comparison(original_attn, adversarial_attn, layer_idx, head_idx):
    """
    Placeholder for a side-by-side plot of original vs. adversarial attention;
    not implemented yet.

    The body currently draws nothing and returns None.

    Args:
        original_attn: Original attention tensor (currently unused)
        adversarial_attn: Adversarial attention tensor (currently unused)
        layer_idx: Layer to visualize (currently unused)
        head_idx: Attention head to visualize (currently unused)
    """
    # TODO: Set up a 1x2 subplot
    # TODO: Left panel: original attention
    # TODO: Right panel: adversarial attention
    # TODO: Share one color scale across both panels
    return None

def plot_attention_difference_map(original_attn, adversarial_attn):
    """
    Placeholder for plotting how attention differs between the two inputs;
    not implemented yet.

    The body currently draws nothing and returns None.

    Args:
        original_attn: Original attention [seq_len, seq_len] (currently unused)
        adversarial_attn: Adversarial attention [seq_len, seq_len] (currently unused)
    """
    # TODO: Take the difference: adversarial - original
    # TODO: Draw the difference as a heatmap
    # TODO: Pick a diverging colormap (e.g., RdBu)
    return None

In [None]:
# topological_analysis

def compute_attention_entropy(attention_matrix):
    """
    Compute entropy of attention distribution for each query token

    Each row is treated as one query token's distribution over key tokens and
    its Shannon entropy H = -sum(p * log(p)) is computed with the natural
    logarithm. Weights that are zero (or negative) contribute 0 to the sum,
    so one-hot rows give 0 instead of NaN. Rows are used as given; they are
    not re-normalised.

    Args:
        attention_matrix: [seq_len, seq_len] attention weights (any array-like
            numpy can convert). Inputs with extra leading dimensions are
            accepted; the entropy is always taken over the last axis.

    Returns:
        entropies: numpy array with one entropy value (in nats) per query
            position, i.e. the input shape without its last axis
    """
    weights = np.asarray(attention_matrix, dtype=float)

    # Where p <= 0, take log(1) = 0 instead of log(p) so the term vanishes
    # without producing -inf/NaN or a runtime warning.
    log_weights = np.log(np.where(weights > 0, weights, 1.0))

    return -np.sum(weights * log_weights, axis=-1)

def compute_attention_distance_metrics(original_attn, adversarial_attn):
    """
    Placeholder for distance metrics between two attention patterns; no
    metric is computed yet.

    Both arguments are currently unused.

    Returns:
        metrics: Dictionary of distance metrics; at present always a new,
            empty dictionary
    """
    distance_by_name = dict()

    # TODO: Flatten both attention matrices
    # TODO: L2 (Euclidean) distance
    # TODO: Cosine similarity
    # TODO: KL divergence (treating the maps as probability distributions)
    # TODO: Wasserstein distance (Earth Mover's Distance)

    return distance_by_name

def compute_attention_graph_metrics(attention_matrix, threshold=0.1):
    """
    Placeholder for graph-based analysis of an attention map; not
    implemented yet.

    The body currently performs no work and returns None.

    Args:
        attention_matrix: [seq_len, seq_len] (currently unused)
        threshold: Minimum attention weight that should count as an edge
            (currently unused)

    Returns:
        None for now; the intended result is a dictionary of graph properties.
    """
    # TODO: Build an adjacency matrix by thresholding the attention weights
    # TODO: Degree centrality
    # TODO: Clustering coefficient
    # TODO: Connected components
    # TODO: Return the collected metrics
    return None

def analyze_layer_wise_changes(original_attentions, adversarial_attentions):
    """
    Placeholder for tracing how a perturbation propagates through the layers;
    no analysis is performed yet.

    Args:
        original_attentions: List of attention tensors per layer (currently unused)
        adversarial_attentions: List of attention tensors per layer (currently unused)

    Returns:
        layer_metrics: Per-layer analysis; at present always a new, empty list
    """
    per_layer_results = list()

    # TODO: For every layer:
    #       - Average the attention over heads
    #       - Compute the distance metrics
    #       - Compute the change in entropy
    #       - Track how the perturbation's effect evolves

    return per_layer_results

In [None]:
# statistical_analysis

def aggregate_results(attention_data_list):
    """
    Placeholder for aggregating metrics over many adversarial examples; not
    implemented yet.

    The body currently performs no work and returns None.

    Args:
        attention_data_list: List of attention data dictionaries (currently unused)

    Returns:
        None for now; the intended result is a statistical summary.
    """
    # TODO: Gather every distance metric
    # TODO: Mean, std and median per metric
    # TODO: Statistical tests (e.g., t-test, Wilcoxon)
    # TODO: Look at layer-wise trends
    return None

def analyze_perturbation_impact():
    """
    Placeholder for finding which perturbation types change attention the
    most; not implemented yet.

    Takes no arguments, performs no work, and returns None.
    """
    # TODO: Group examples by perturbation type (word substitution, insertion, deletion)
    # TODO: Average attention distance for each perturbation type
    # TODO: Relate perturbation position to the size of the attention change
    return None

def identify_vulnerable_layers():
    """
    Placeholder for finding the layers most affected by adversarial
    perturbations; not implemented yet.

    Takes no arguments, performs no work, and returns None.
    """
    # TODO: Attention distance metrics for each layer
    # TODO: Order the layers by average magnitude of change
    # TODO: Plot layer vulnerability
    return None

In [None]:
# Main Function

def main():
    """
    Scaffold for the end-to-end analysis pipeline.

    The function walks through eight stages in order. At present each stage
    only prints its progress message; the TODO notes under each message list
    the work that stage still has to do. Takes no arguments and returns None.
    """
    # 1. Load model and setup
    print("Loading model...")
    # TODO: Initialize AttentionExtractorModel
    # TODO: Create TextAttack wrapper

    # 2. Setup attack
    print("Setting up attack...")
    # TODO: Configure attack (TextFooler, PWWS, etc.)
    # TODO: Load dataset

    # 3. Generate adversarial examples
    print("Generating adversarial examples...")
    # TODO: Run attack on dataset
    # TODO: Filter successful attacks

    # 4. Extract attention maps
    print("Extracting attention maps...")
    # TODO: For each original/adversarial pair, extract attentions
    # TODO: Save raw attention data

    # 5. Compute topological metrics
    print("Computing topological metrics...")
    # TODO: Run topological analysis
    # TODO: Compute distance metrics
    # TODO: Analyze layer-wise changes

    # 6. Visualize results
    print("Generating visualizations...")
    # TODO: Create attention heatmaps
    # TODO: Plot difference maps
    # TODO: Create layer-wise comparison plots

    # 7. Statistical analysis
    print("Performing statistical analysis...")
    # TODO: Aggregate results
    # TODO: Test for significance
    # TODO: Generate summary report

    # 8. Save results
    print("Saving results...")
    # TODO: Save processed data
    # TODO: Save figures
    # TODO: Generate report

# Run the pipeline only when this code executes as the top-level module
if __name__ == "__main__":
    main()