 # **"How do attention mechanisms in smaller language models (Gemma2 2B and LLaMa 3.1 1B) process semantic equivalence across English, Hindi, and Hinglish, and what specific Sparse Autoencoder (SAE) features activate during code-switching that might explain performance differences in multilingual contexts?"**


In [1]:
# !pip install 'accelerate>=0.26.0'
# !pip install --upgrade transformers
# !pip install imageio


In [2]:
import os
os.getcwd()

'/notebooks'

In [3]:
from huggingface_hub import login, whoami
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Hardcoding HF_Token because IDK how to use Secrets in paperspace Gradient
HF_TOKEN = "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"

login(token=HF_TOKEN)

print("Logged in as: ", whoami(token=HF_TOKEN)['name'])

Logged in as:  astroanand


In [4]:

# Set memory configuration to avoid fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
torch.cuda.empty_cache()

# Completely disable torch dynamo to avoid compatibility issues
import torch._dynamo
torch._dynamo.config.disable = True

# Define your device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU Memory: {torch.cuda.mem_get_info()[0] / 1024**2:.2f} MB free out of {torch.cuda.mem_get_info()[1] / 1024**2:.2f} MB total")


Using device: cuda
GPU Memory: 8965.12 MB free out of 16108.75 MB total


### Loading Llama3.1 1B model

In [5]:
import os
import sys
import torch

# Completely remove TensorFlow from sys.modules before importing transformers
for module in list(sys.modules.keys()):
    if 'tensorflow' in module or 'tf_' in module or module == 'tf':
        sys.modules.pop(module, None)

# Set environment variables to prevent TensorFlow loading
os.environ["USE_TF"] = "0"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# Import AutoTokenizer instead of LlamaTokenizer
from transformers import AutoTokenizer, LlamaForCausalLM

print("Setting up device...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

print("Loading Llama3.2 1B model...")
model_id = "meta-llama/Llama-3.2-1B-Instruct"

# Load tokenizer using AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    use_fast=True
)
print("Tokenizer loaded successfully")

# Load model with minimal options
model = LlamaForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,  # Reduce CPU memory usage
    #trust_remote_code=True,
    attn_implementation="eager"    
)
print("Model loaded successfully")

# Count layers
num_layers = len(model.model.layers)
print(f"\nLlama3.2-1B has {num_layers} layers")

# Get head dimensions from first layer
head_dim = model.model.layers[0].self_attn.head_dim
q_heads = model.model.layers[0].self_attn.q_proj.out_features // head_dim
kv_heads = model.model.layers[0].self_attn.k_proj.out_features // head_dim
print(f"Query heads: {q_heads}, Key/Value heads: {kv_heads}")

# Sample text for attention analysis
text = "Explain the concept of kinetic energy."
inputs = tokenizer(text, return_tensors="pt").to(device)

# Generate with attention outputs
with torch.no_grad():
    outputs = model(**inputs, output_attentions=True)

# Print attention shapes
if outputs.attentions:
    print("\nExtracted attention shapes for all layers and heads:")
    for layer_idx, attn in enumerate(outputs.attentions):
        batch_size, num_heads, seq_length, _ = attn.shape
        print(f"Layer {layer_idx}: {num_heads} heads, sequence length {seq_length}x{seq_length}")
else:
    print("\nNo attention outputs were returned.")
    print("Manually inspecting attention configuration:")
    for layer_idx, layer in enumerate(model.model.layers):
        head_dim = layer.self_attn.head_dim
        q_heads = layer.self_attn.q_proj.out_features // head_dim
        kv_heads = layer.self_attn.k_proj.out_features // head_dim
        print(f"Layer {layer_idx}: {kv_heads} KV heads (GQA with {q_heads} query heads)")


Setting up device...
Using device: cuda
Loading Llama3.2 1B model...
Tokenizer loaded successfully
Model loaded successfully

Llama3.2-1B has 16 layers
Query heads: 32, Key/Value heads: 8

Extracted attention shapes for all layers and heads:
Layer 0: 32 heads, sequence length 9x9
Layer 1: 32 heads, sequence length 9x9
Layer 2: 32 heads, sequence length 9x9
Layer 3: 32 heads, sequence length 9x9
Layer 4: 32 heads, sequence length 9x9
Layer 5: 32 heads, sequence length 9x9
Layer 6: 32 heads, sequence length 9x9
Layer 7: 32 heads, sequence length 9x9
Layer 8: 32 heads, sequence length 9x9
Layer 9: 32 heads, sequence length 9x9
Layer 10: 32 heads, sequence length 9x9
Layer 11: 32 heads, sequence length 9x9
Layer 12: 32 heads, sequence length 9x9
Layer 13: 32 heads, sequence length 9x9
Layer 14: 32 heads, sequence length 9x9
Layer 15: 32 heads, sequence length 9x9


In [6]:
# Define trilingual prompts for later use
prompts = {
    "trio1": {
        "english": "Will you please help me understand the concept of kinetic energy?",
        "hindi": "क्या आप कृपया मुझे गतिज ऊर्जा की अवधारणा को समझने में मदद करेंगे?",
        "hinglish": "Kya aap mujhe kinetic energy ke concept ko samajhne mein help karoge?"
    },
    "trio2": {
        "english": "I want you to tell me a secret about the stars tonight.",
        "hindi": "मैं चाहता हूँ कि आप आज रात मुझे सितारों के बारे में एक रहस्य बताएँ।",
        "hinglish": "Main chahta hoon ki aaj raat aap mujhe stars ke baare mein ek secret batao."
    },
    "trio3": {
        "english": "I understand kinetic energy.",
        "hindi": "मुझे काइनेटिक ऊर्जा समझ आती है।",
        "hinglish": "Mujhe kinetic energy samajh aata hai."
    },
    "trio4": {
        "english": "Can you help me learn about gravity?",
        "hindi": "क्या आप मुझे गुरुत्वाकर्षण के बारे में सिखा सकते हैं?",
        "hinglish": "Kya aap mujhe gravity ke bare mein sikha sakte hain?"
    }
}

In [7]:
import pickle

# Function to extract attention for all layers and heads and store model outputs
def extract_and_store_data(model, tokenizer, prompts, attention_path="attention_data_Llama3.2.pkl", output_path="output_data_Llama3.2.pkl"):
    model.eval()
    attention_dict = {}
    output_dict = {}
    
    for trio_name, languages in prompts.items():
        attention_dict[trio_name] = {}
        output_dict[trio_name] = {}
        
        for lang, prompt in languages.items():
            attention_dict[trio_name][lang] = {}
            
            # Tokenize the input
            inputs = tokenizer(prompt, return_tensors="pt").to(device)
            
            # Store the original prompt
            attention_dict[trio_name][lang]["prompt"] = prompt
            
            # Get token IDs and decode them to get individual tokens
            token_ids = inputs.input_ids[0].cpu().numpy()
            tokens = [tokenizer.decode([token_id], skip_special_tokens=True) for token_id in token_ids]
            
            # Store the tokens
            attention_dict[trio_name][lang]["tokens"] = tokens
            
            with torch.no_grad():
                outputs = model(**inputs, output_attentions=True, return_dict=True)
                attentions = outputs.attentions  # Tuple of attention tensors, one per layer
                
                # Generate model's response for the prompt
                generation_output = model.generate(
                    **inputs,
                    max_length=512,
                    temperature=0.7,
                    top_p=0.9,
                    do_sample=True
                )
                model_response = tokenizer.decode(generation_output[0], skip_special_tokens=True)
                
                # Store the model's response in output_dict
                output_dict[trio_name][lang] = model_response
            
            # Create a sub-dictionary for attention matrices
            attention_dict[trio_name][lang]["attention"] = {}
            
            for layer_idx in range(len(attentions)):  # Iterate through all layers
                attention_dict[trio_name][lang]["attention"][layer_idx] = {}
                layer_attention = attentions[layer_idx].squeeze(0)  # Shape: (num_heads, seq_len, seq_len)
                for head_idx in range(layer_attention.shape[0]):  # Iterate through all heads
                    attention_dict[trio_name][lang]["attention"][layer_idx][head_idx] = layer_attention[head_idx].cpu().numpy()
    
    # Save attention data to pickle file
    with open(attention_path, 'wb') as f:
        pickle.dump(attention_dict, f)
    print(f"Attention data saved to {attention_path}")
    
    # Save model outputs to pickle file
    with open(output_path, 'wb') as f:
        pickle.dump(output_dict, f)
    print(f"Model outputs saved to {output_path}")

    return attention_dict, output_dict


# Extract and store attention patterns and model outputs
print("Extracting and storing attention patterns and model outputs...")
attention_patterns, model_outputs = extract_and_store_data(
    model, 
    tokenizer, 
    prompts, 
    attention_path="attention_data_Llama3.2.pkl", 
    output_path="output_data_Llama3.2.pkl"
)
print("Data extraction and storage complete.")


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Extracting and storing attention patterns and model outputs...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Attention data saved to attention_data_Llama3.2.pkl
Model outputs saved to output_data_Llama3.2.pkl
Data extraction and storage complete.
