 # **"How do attention mechanisms in smaller language models (Gemma2 2B and LLaMa 3.1 1B) process semantic equivalence across English, Hindi, and Hinglish, and what specific Sparse Autoencoder (SAE) features activate during code-switching that might explain performance differences in multilingual contexts?"**


In [3]:
# !pip install 'accelerate>=0.26.0'
# !pip install --upgrade transformers
# !pip install imageio

## Loading Gemma2 2b model


In [4]:
import os
# Display the kernel's current working directory; the relative pickle paths
# used further down in this notebook resolve against it.
os.getcwd()

'/notebooks'

In [5]:
from huggingface_hub import login, whoami
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os

# Read the Hugging Face token from the environment instead of embedding it in
# the notebook: a token committed in a notebook is readable by anyone who can
# see the file or its history.
# NOTE: the token that was previously hardcoded here must be treated as leaked
# and revoked in the Hugging Face account settings.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    # Local import: only needed for the interactive fallback.
    from getpass import getpass
    HF_TOKEN = getpass("Hugging Face access token: ")

# Log in to Hugging Face Hub
login(token=HF_TOKEN)

# Verify login (optional)
print("Logged in as:", whoami(token=HF_TOKEN)["name"])


Logged in as: astroanand


In [6]:

# Set memory configuration to avoid fragmentation
# NOTE(review): this is set after `import torch`; confirm the allocator still
# picks it up in the installed PyTorch version (it must be set before the
# first CUDA allocation at the latest).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
torch.cuda.empty_cache()

# Completely disable torch dynamo to avoid compatibility issues
import torch._dynamo
torch._dynamo.config.disable = True

# Define your device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    # Query once so that the free and total figures come from the same snapshot.
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    print(f"GPU Memory: {free_bytes / 1024**2:.2f} MB free out of {total_bytes / 1024**2:.2f} MB total")


Using device: cuda
GPU Memory: 15379.12 MB free out of 16108.75 MB total


In [7]:

# Function to load model and tokenizer in FP16
def load_model_and_tokenizer(model_name, token=HF_TOKEN):
    """Load a tokenizer and a causal language model from the Hugging Face Hub.

    Args:
        model_name: Hub repository id passed to both ``from_pretrained`` calls.
        token: Hub access token; defaults to the notebook-level ``HF_TOKEN``
            as it was when this function was defined.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is requested in float16 with
        ``device_map="auto"`` and the eager attention implementation.
    """
    # Options shared by the tokenizer and the model loader.
    hub_kwargs = {"token": token, "trust_remote_code": True}

    # Fast tokenizer for better performance.
    tok = AutoTokenizer.from_pretrained(model_name, use_fast=True, **hub_kwargs)

    lm = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",            # let the loader place the weights
        torch_dtype=torch.float16,    # FP16 for efficiency
        low_cpu_mem_usage=True,       # reduce CPU memory usage while loading
        attn_implementation="eager",  # this notebook later requests output_attentions=True
        **hub_kwargs,
    )
    return tok, lm

# Load Gemma2 2B in FP16
print("Loading Gemma2 2B in FP16...")
tokenizer_2b, model_2b = load_model_and_tokenizer("google/gemma-2-2b")
print("Gemma2 2B loaded in FP16!")

# Move model to GPU explicitly only if device_map did not already place it.
# A model loaded with device_map carries an `hf_device_map`; calling .to() on
# such a model is redundant when it sits on one device and can fail when some
# modules were offloaded.
if getattr(model_2b, "hf_device_map", None) is None:
    model_2b.to(device)

# Function to inspect model architecture (layers and heads)
def inspect_model_architecture(model):
    """Report the layer count and attention head counts of a decoder model.

    The head counts are derived from the first layer only: each projection's
    ``out_features`` is integer-divided by that layer's ``head_dim``.

    Args:
        model: Object exposing ``model.layers``, where each layer has a
            ``self_attn`` with ``head_dim``, ``q_proj`` and ``k_proj``.

    Returns:
        ``(num_layers, q_heads, kv_heads)``.
    """
    decoder_layers = model.model.layers
    first_attn = decoder_layers[0].self_attn
    per_head_width = first_attn.head_dim

    # Query heads and key/value heads can differ (grouped-query attention),
    # so each projection is measured separately.
    n_query, n_key_value = (
        proj.out_features // per_head_width
        for proj in (first_attn.q_proj, first_attn.k_proj)
    )
    return len(decoder_layers), n_query, n_key_value

# Check model architecture
# Sanity check: report layer and head counts derived from the first layer's projections.
num_layers, q_heads, kv_heads = inspect_model_architecture(model_2b)
print(f"Gemma2-2B has {num_layers} layers with {q_heads} query heads and {kv_heads} key/value heads per layer.")

# Sample prompt for analysis
sample_prompt = "Will you please help me understand the concept of kinetic energy?"
inputs_2b = tokenizer_2b(sample_prompt, return_tensors="pt").to(device)

# Run a forward pass to capture attention on the inputs
# (no_grad: inference only, no gradients are needed)
with torch.no_grad():
    outputs = model_2b(**inputs_2b, output_attentions=True, return_dict=True)

# outputs.attentions is a tuple with one element per layer
attentions = outputs.attentions

# Print attention shapes for all layers and heads
# Each tensor is unpacked as (batch, heads, seq, seq); only heads and seq are printed.
print("\nExtracted attention shapes for all layers and heads:")
for layer_idx, attn in enumerate(attentions):
    batch_size, num_heads, seq_length, _ = attn.shape
    print(f"Layer {layer_idx}: {num_heads} heads, sequence length {seq_length}x{seq_length}")

Loading Gemma2 2B in FP16...


2025-02-27 09:49:30.441798: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-27 09:49:30.484890: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-27 09:49:30.484940: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-27 09:49:30.485883: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-27 09:49:30.491295: I tensorflow/core/platform/cpu_feature_guar

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Gemma2 2B loaded in FP16!
Gemma2-2B has 26 layers with 8 query heads and 4 key/value heads per layer.

Extracted attention shapes for all layers and heads:
Layer 0: 8 heads, sequence length 13x13
Layer 1: 8 heads, sequence length 13x13
Layer 2: 8 heads, sequence length 13x13
Layer 3: 8 heads, sequence length 13x13
Layer 4: 8 heads, sequence length 13x13
Layer 5: 8 heads, sequence length 13x13
Layer 6: 8 heads, sequence length 13x13
Layer 7: 8 heads, sequence length 13x13
Layer 8: 8 heads, sequence length 13x13
Layer 9: 8 heads, sequence length 13x13
Layer 10: 8 heads, sequence length 13x13
Layer 11: 8 heads, sequence length 13x13
Layer 12: 8 heads, sequence length 13x13
Layer 13: 8 heads, sequence length 13x13
Layer 14: 8 heads, sequence length 13x13
Layer 15: 8 heads, sequence length 13x13
Layer 16: 8 heads, sequence length 13x13
Layer 17: 8 heads, sequence length 13x13
Layer 18: 8 heads, sequence length 13x13
Layer 19: 8 heads, sequence length 13x13
Layer 20: 8 heads, sequence length

### Trilingual prompts

In [8]:
# Define trilingual prompts for later use
# Structure: {trio_name: {"english" | "hindi" | "hinglish": prompt_text}}.
# Each trio holds one sentence rendered in English, Hindi (Devanagari) and
# Hinglish (romanised Hindi mixed with English words).
#
# NOTE(review): the variants inside a trio are not strictly parallel, which may
# matter when comparing attention across languages — confirm this is intended:
#   - trio1: English ("please") and Hindi ("कृपया") carry a politeness marker;
#     the Hinglish variant has none.
#   - "kinetic energy" is "गतिज ऊर्जा" in trio1 Hindi but the transliteration
#     "काइनेटिक ऊर्जा" in trio3 Hindi.
#   - Hinglish spelling varies: "baare" (trio2) vs "bare" (trio4).
prompts = {
    "trio1": {
        "english": "Will you please help me understand the concept of kinetic energy?",
        "hindi": "क्या आप कृपया मुझे गतिज ऊर्जा की अवधारणा को समझने में मदद करेंगे?",
        "hinglish": "Kya aap mujhe kinetic energy ke concept ko samajhne mein help karoge?"
    },
    "trio2": {
        "english": "I want you to tell me a secret about the stars tonight.",
        "hindi": "मैं चाहता हूँ कि आप आज रात मुझे सितारों के बारे में एक रहस्य बताएँ।",
        "hinglish": "Main chahta hoon ki aaj raat aap mujhe stars ke baare mein ek secret batao."
    },
    "trio3": {
        "english": "I understand kinetic energy.",
        "hindi": "मुझे काइनेटिक ऊर्जा समझ आती है।",
        "hinglish": "Mujhe kinetic energy samajh aata hai."
    },
    "trio4": {
        "english": "Can you help me learn about gravity?",
        "hindi": "क्या आप मुझे गुरुत्वाकर्षण के बारे में सिखा सकते हैं?",
        "hinglish": "Kya aap mujhe gravity ke bare mein sikha sakte hain?"
    }
}

## Experiment 1

In [9]:
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [10]:
# Function to extract attention for all layers and heads and store model outputs
def extract_and_store_data(model, tokenizer, prompts, attention_path="attention_data_gemma2.pkl", output_path="output_data_gemma2.pkl", seed=None):
    """Capture per-head attention maps and one sampled completion per prompt.

    For every prompt the function runs a single forward pass with
    ``output_attentions=True`` and then samples a completion with
    ``model.generate``. Both results are returned and also pickled to disk.

    Args:
        model: Causal LM exposing ``eval()``, ``device``, ``__call__`` and
            ``generate``.
        tokenizer: Tokenizer matching ``model``.
        prompts: Mapping ``{trio_name: {language: prompt_text}}``.
        attention_path: Destination pickle file for the attention dictionary.
        output_path: Destination pickle file for the generated texts.
        seed: Optional integer passed to ``torch.manual_seed`` before any
            sampling so the stored generations can be reproduced. ``None``
            (the default) leaves the RNG state untouched.

    Returns:
        ``(attention_dict, output_dict)`` where

        * ``attention_dict[trio][lang]`` has keys ``"prompt"`` (str),
          ``"tokens"`` (list of decoded single tokens) and ``"attention"``
          (``{layer_idx: {head_idx: numpy array}}``);
        * ``output_dict[trio][lang]`` is the decoded generation.
    """
    model.eval()

    # Place inputs on the model's own device instead of relying on a
    # notebook-level `device` global that may not match the model passed in.
    target_device = model.device

    if seed is not None:
        torch.manual_seed(seed)

    attention_dict = {}
    output_dict = {}

    for trio_name, languages in prompts.items():
        attention_dict[trio_name] = {}
        output_dict[trio_name] = {}

        for lang, prompt in languages.items():
            # Tokenize the input
            inputs = tokenizer(prompt, return_tensors="pt").to(target_device)

            # Decode each id on its own so tokens line up with attention rows/columns.
            token_ids = inputs.input_ids[0].tolist()
            tokens = [tokenizer.decode([token_id], skip_special_tokens=True) for token_id in token_ids]

            with torch.no_grad():
                outputs = model(**inputs, output_attentions=True, return_dict=True)

                # Generate model's response for the prompt (sampling; see `seed`).
                generation_output = model.generate(
                    **inputs,
                    max_length=512,
                    temperature=0.7,
                    top_p=0.9,
                    do_sample=True
                )

            # Store the model's response in output_dict
            output_dict[trio_name][lang] = tokenizer.decode(generation_output[0], skip_special_tokens=True)

            # One device-to-host copy per layer, then split by head.
            attention_by_layer = {}
            for layer_idx, layer_attention in enumerate(outputs.attentions):
                # squeeze(0) drops the batch axis: (num_heads, seq_len, seq_len)
                per_head = layer_attention.squeeze(0).cpu().numpy()
                attention_by_layer[layer_idx] = {
                    head_idx: head_matrix for head_idx, head_matrix in enumerate(per_head)
                }

            attention_dict[trio_name][lang] = {
                "prompt": prompt,
                "tokens": tokens,
                "attention": attention_by_layer,
            }

    # Save attention data to pickle file
    with open(attention_path, 'wb') as f:
        pickle.dump(attention_dict, f)
    print(f"Attention data saved to {attention_path}")

    # Save model outputs to pickle file
    with open(output_path, 'wb') as f:
        pickle.dump(output_dict, f)
    print(f"Model outputs saved to {output_path}")

    return attention_dict, output_dict


# Run the extraction for every prompt trio with Gemma2 2B; the function also
# writes both result dictionaries to the pickle files named below.
print("Extracting and storing attention patterns and model outputs...")
attention_patterns, model_outputs = extract_and_store_data(
    model=model_2b,
    tokenizer=tokenizer_2b,
    prompts=prompts,
    attention_path="attention_data_gemma2.pkl",
    output_path="output_data_gemma2.pkl",
)
print("Data extraction and storage complete.")


Extracting and storing attention patterns and model outputs...
Attention data saved to attention_data_gemma2.pkl
Model outputs saved to output_data_gemma2.pkl
Data extraction and storage complete.
