## OpenAI

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from openai import OpenAI

import os
from dotenv import load_dotenv

# Load environment variables from .env file so the API key never lives in the notebook
load_dotenv()

# Get the OpenAI API key from the environment variable
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize the OpenAI client
client = OpenAI(api_key=openai_api_key)

# The two texts being compared; reused below so the plot title always matches the inputs
sentence_1 = "Joanne is a girl"
sentence_2 = "Joanne is a good girl"

# Get the embeddings for the two input texts
response1 = client.embeddings.create(input=sentence_1, model="text-embedding-3-small")
response2 = client.embeddings.create(input=sentence_2, model="text-embedding-3-small")

# Extract the embeddings as numpy arrays
embedding_1 = np.array(response1.data[0].embedding)
embedding_2 = np.array(response2.data[0].embedding)

# Element-wise difference between the two embeddings
embedding_diff = embedding_1 - embedding_2

# Cosine similarity = dot product divided by the product of the vector norms
dot_product = np.dot(embedding_1, embedding_2)
norm_1 = np.linalg.norm(embedding_1)
norm_2 = np.linalg.norm(embedding_2)
cosine_similarity = dot_product / (norm_1 * norm_2)

print(f"Cosine Similarity: {cosine_similarity}")

# Plot the differences with red-green coloring (red for negative, green for positive)
plt.figure(figsize=(12, 6))
colors = ['r' if diff < 0 else 'g' for diff in embedding_diff]

# Derive the x positions from the data instead of hard-coding the embedding width,
# so the plot keeps working if the model or its output dimension changes
plt.bar(range(len(embedding_diff)), embedding_diff, color=colors, alpha=0.7)

# Labels and title
plt.xlabel('Embedding Dimension Index')
plt.ylabel('Difference in Embedding Values')
plt.title(f'Difference Between the Two Embeddings ("{sentence_1}" vs "{sentence_2}")')

plt.show()

## Sentence Transformers

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for both inputs
model = SentenceTransformer('all-MiniLM-L6-v2')

# Texts to compare
sentence1 = "Joanne is a girl"
sentence2 = "Joanne is a good girl"

# One embedding vector per sentence
embedding1 = model.encode(sentence1)
embedding2 = model.encode(sentence2)

# Per-dimension difference, then the dimension order from largest to smallest |difference|
embedding_diff = embedding1 - embedding2
sorted_indices = np.argsort(np.abs(embedding_diff))[::-1]
ordered_diff = embedding_diff[sorted_indices]

# Red marks a negative difference, green a non-negative one (same order as the bars)
colors = ['r' if value < 0 else 'g' for value in ordered_diff]

# Bar chart of the differences, most significant first
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(range(len(sorted_indices)), ordered_diff, color=colors, alpha=0.7)
ax.set_xlabel('Embedding Dimension Index (sorted by magnitude)')
ax.set_ylabel('Difference in Embedding Values')
ax.set_title('Difference Between Embeddings ("Joanne is a girl" vs "Joanne is a good girl")')

plt.show()

## Animating Tokenizer State

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define the sentence and split it on whitespace so it can be rebuilt word by word
sentence = "If you're going to write history, you need to know how to read it."
words = sentence.split()

# Every cumulative prefix of the sentence: first word, first two words, ...
prefixes = [" ".join(words[:i]) for i in range(1, len(words) + 1)]

# One embedding per prefix
embeddings = [model.encode(prefix) for prefix in prefixes]

# Change in the embedding caused by appending each successive word
diffs = [current - previous for previous, current in zip(embeddings, embeddings[1:])]

# Global min and max so every frame shares the same y-axis scale
global_min = min(np.min(diff) for diff in diffs)
global_max = max(np.max(diff) for diff in diffs)

# Create and save individual frames; diffs[j] corresponds to prefixes[j + 1]
# because the first word alone has no predecessor to diff against
for j, diff in enumerate(diffs):
    plt.figure(figsize=(12, 6))

    # Sort dimensions by magnitude of change and colour them by sign
    sorted_indices = np.argsort(np.abs(diff))[::-1]
    sorted_diff = diff[sorted_indices]
    colors = ['r' if value < 0 else 'g' for value in sorted_diff]

    # Create the bar plot, titled with the prefix that produced this diff
    plt.bar(range(len(diff)), sorted_diff, color=colors, alpha=0.7)
    plt.xlabel('Embedding Dimension Index (sorted by magnitude)')
    plt.ylabel('Difference in Embedding Values')
    plt.title(prefixes[j + 1])

    # Set consistent y-axis limits for all frames
    plt.ylim(global_min, global_max)

    # Save frame with zero-padded index
    plt.savefig(f'frame_{j:03d}.png', bbox_inches='tight')
    plt.close()  # Important: close the figure to free memory

print(f"Generated {len(diffs)} frames as PNG files with consistent y-axis scaling")

import os  # needed for os.remove below; this cell must not rely on another cell's import

import imageio
from IPython.display import Image

# Read every saved frame into memory and delete its PNG once it has been read
images = []
for j in range(len(diffs)):
    frame_path = f'frame_{j:03d}.png'
    images.append(imageio.imread(frame_path))
    os.remove(frame_path)

# Write the frames out as a single animated GIF
imageio.mimsave('tokenizer.gif', images, duration=0.5)

# Last expression of the cell, so the notebook renders the GIF
Image(filename='tokenizer.gif')

## Output of GPT2

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import matplotlib.pyplot as plt
import imageio
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load GPT-2 model and tokenizer.
# No access token is passed: credentials must never be hard-coded in a notebook,
# and the "gpt2" checkpoint is loaded without one elsewhere in this file.
# NOTE(review): any token that was previously committed here should be revoked.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()  # inference mode: disables dropout

# Function to get hidden states
def get_hidden_states(text):
    """Tokenize ``text`` and run it through the module-level GPT-2 model.

    Args:
        text: Input string to tokenize.

    Returns:
        A ``(input_ids, hidden_states)`` pair: the token ids produced by the
        tokenizer and the ``hidden_states`` attribute of the model output
        (requested via ``output_hidden_states=True``).
    """
    # Put the inputs on whatever device the model lives on, so the call keeps
    # working if the model is moved to a GPU
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs, output_hidden_states=True)  # Get hidden states from all layers
    return inputs.input_ids, outputs.hidden_states  # Return tokenized input & hidden states

# Simulate streaming input by feeding the text one additional word at a time
def stream_tokens(text, cache_update_fn):
    """Yield ``(token_ids, cache)`` for every growing word-prefix of ``text``.

    The cache starts as the final-layer hidden state of the first prefix and
    is afterwards replaced by ``cache_update_fn(cache, final_layer_state)``
    for each longer prefix.
    """
    pieces = text.split()
    running_cache = None  # nothing cached before the first prefix
    for end in range(1, len(pieces) + 1):
        token_ids, hidden_states = get_hidden_states(" ".join(pieces[:end]))
        final_layer = hidden_states[-1]  # only the last layer is tracked

        if running_cache is None:
            running_cache = final_layer
        else:
            running_cache = cache_update_fn(running_cache, final_layer)

        yield token_ids, running_cache

# Diffusion-like cache update: move the cache part of the way towards the new state
def diffusion_update(old_cache, new_hidden_state, alpha=0.5):
    """Return a copy of ``old_cache`` blended towards ``new_hidden_state``.

    Only the trailing positions along dimension 1 that both tensors have in
    common are blended; each is moved by ``alpha`` times its difference from
    the new state. ``old_cache`` itself is left untouched.
    """
    overlap = min(old_cache.shape[1], new_hidden_state.shape[1])  # shared trailing length
    blended = old_cache.clone()
    tail = blended[:, -overlap:]  # view into `blended`, so the in-place add below updates it
    tail += alpha * (new_hidden_state[:, -overlap:] - old_cache[:, -overlap:])
    return blended

# Sentence that is streamed through the model word by word
sample_text = (
    "If you're going to write history, "
    "you need to know how to read it."
)

# Function to plot one diff frame with y-axis limits based on its mean and standard deviation
def plot_with_custom_ylimits(diff, j, sample_text, saved_frames):
    """Save one bar-chart frame of ``diff`` and record its file name.

    The values are flattened, ordered by decreasing absolute magnitude and
    drawn as bars coloured by sign (red = negative, green = non-negative).

    Args:
        diff: NumPy array of differences; it is flattened before plotting.
        j: Frame index. Used in the file name, and the first ``j + 1`` words
            of ``sample_text`` form the plot title.
        sample_text: Full sentence being streamed.
        saved_frames: List to which the written PNG file name is appended.
    """
    plt.figure(figsize=(12, 6))

    # Flatten to a 1D array of differences
    flat_diff = diff.flatten()

    # Order the values by absolute magnitude, largest first
    sorted_indices = np.argsort(np.abs(flat_diff))[::-1]
    sorted_diff = flat_diff[sorted_indices]

    # Colours must be derived from the *sorted* values so that each bar is
    # coloured by its own sign (red for negative, green otherwise)
    colors = np.where(sorted_diff < 0, 'r', 'g').tolist()

    plt.bar(range(len(sorted_diff)), sorted_diff, color=colors, alpha=0.7)

    # Add the sentence as the plot title
    partial_sentence = " ".join(sample_text.split()[:j + 1])  # Current sentence being processed
    plt.title(f"{partial_sentence}")

    # Axis labels
    plt.xlabel('Output Dimension Index (sorted by magnitude)')
    plt.ylabel('Difference in Output Layer Values')

    # Clip the y-axis to mean ± 2 standard deviations so a few outliers do not
    # flatten the rest of the chart
    mean_diff = np.mean(flat_diff)
    std_diff = np.std(flat_diff)
    if std_diff > 0:  # identical limits would be a degenerate axis; keep autoscale instead
        plt.ylim(mean_diff - 2 * std_diff, mean_diff + 2 * std_diff)

    # Save the frame
    frame_filename = f'frame_{j:03d}.png'
    plt.savefig(frame_filename, bbox_inches='tight')
    plt.close()  # Important: close the figure to free memory

    # Add the frame filename to the list
    saved_frames.append(frame_filename)

# State carried across the streaming loop
saved_frames = []  # file names of the frames written so far
cache = None  # previous cache as a NumPy array; None until the first prefix is seen

# Single pass over the stream: plot how the cache changed since the previous prefix
for j, (token_ids, updated_cache) in enumerate(stream_tokens(sample_text, diffusion_update)):
    updated_cache = updated_cache.detach().cpu().numpy()

    # The first prefix has nothing to be compared against, so it is only stored
    if cache is not None:
        diff = updated_cache - cache
        # y-axis limits are chosen per frame from the mean and standard deviation
        plot_with_custom_ylimits(diff, j, sample_text, saved_frames)

    cache = updated_cache  # becomes the reference for the next prefix

# Collect the rendered frames; each PNG is removed right after it has been read
images = []
for frame_path in saved_frames:
    frame_pixels = imageio.imread(frame_path)
    images.append(frame_pixels)
    os.remove(frame_path)

# Write all frames into one animated GIF
imageio.mimsave('model.gif', images, duration=0.5)

# Show the GIF as the cell output
from IPython.display import Image
Image(filename='model.gif')

## GPT2 All Layers

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from PIL import Image as Image
import imageio
import numpy as np
import matplotlib.pyplot as plt
import imageio
import os

# Prefer the GPU when one is available
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# GPT-2 tokenizer and model; the model is configured to return hidden states
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2", output_hidden_states=True)
model = model.to(device)
model.eval()

# Run a text through the model and collect its hidden states
def get_hidden_states(text):
    """Return ``(input_ids, hidden_states)`` for ``text``.

    The text is tokenized, moved to the module-level ``device`` and passed
    through the module-level ``model`` with gradient tracking disabled.
    """
    encoded = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        result = model(**encoded)
    token_ids = encoded.input_ids
    return token_ids, result.hidden_states

# Simulate streaming input by feeding the text one additional word at a time
def stream_tokens(text, cache_update_fn):
    """Yield ``(token_ids, cache)`` for every growing word-prefix of ``text``.

    ``cache`` is a list with ``model.config.n_layer + 1`` slots. A slot is
    filled with the matching hidden state the first time it is seen and is
    replaced by ``cache_update_fn(slot, hidden_state)`` on later prefixes.
    The same list object is yielded every time.
    """
    pieces = text.split()
    cache = [None] * (model.config.n_layer + 1)  # one slot per layer (including embeddings)
    for end in range(1, len(pieces) + 1):
        token_ids, hidden_states = get_hidden_states(" ".join(pieces[:end]))

        for layer_idx, layer_state in enumerate(hidden_states):
            previous = cache[layer_idx]
            if previous is None:
                cache[layer_idx] = layer_state  # first prefix: store as-is
            else:
                cache[layer_idx] = cache_update_fn(previous, layer_state)

        yield token_ids, cache

# Diffusion-like cache update: move the cache part of the way towards the new state
def diffusion_update(old_cache, new_hidden_state, alpha=0.5):
    """Blend the trailing positions of ``old_cache`` towards ``new_hidden_state``.

    The number of positions blended is the smaller of the two sizes along
    dimension 1. A new tensor with the shape of ``old_cache`` is returned and
    ``old_cache`` is not modified.
    """
    shared = min(old_cache.shape[1], new_hidden_state.shape[1])
    window = (slice(None), slice(-shared, None))  # trailing `shared` positions on dim 1

    step = alpha * (new_hidden_state[window] - old_cache[window])
    result = old_cache.clone()
    result[window] += step
    return result

# Sentence that is streamed through the model word by word
sample_text = " ".join([
    "If you're going to write history,",
    "you need to know how to read it.",
])

# Function to plot one per-layer diff frame with y-axis limits based on its mean and standard deviation
def plot_with_custom_ylimits(diff, j, sample_text, saved_frames, layer_idx, num_layers):
    """Save one bar-chart frame of ``diff`` for a single layer and record its file name.

    The flattened values are drawn in their original index order (no sorting),
    coloured by sign (red = negative, green = non-negative).

    Args:
        diff: NumPy array of differences; it is flattened before plotting.
        j: Frame index. Used in the file name, and the first ``j + 1`` words
            of ``sample_text`` form the plot title.
        sample_text: Full sentence being streamed.
        saved_frames: List of per-layer lists; the written PNG file name is
            appended to ``saved_frames[layer_idx]``.
        layer_idx: Layer index shown in the title and used in the file name.
        num_layers: Unused; kept so existing callers do not need to change.
    """
    plt.figure(figsize=(12, 6))

    # Flatten diff for easier visualization
    flat_diff = diff.flatten()

    # Red for negative and green for non-negative, in the same order as the bars
    colors = ['r' if val < 0 else 'g' for val in flat_diff]

    # Values are plotted in index order so the same x position means the same
    # dimension in every frame
    plt.bar(range(len(flat_diff)), flat_diff, color=colors, alpha=0.7)

    # Add the layer and sentence as the plot title
    partial_sentence = " ".join(sample_text.split()[:j + 1])  # Current sentence being processed
    plt.title(f"Layer {layer_idx}: {partial_sentence}")

    # Axis labels; the x-axis is NOT sorted, so the label must not claim it is
    plt.xlabel('Output Dimension Index')
    plt.ylabel('Difference in Output Layer Values')

    # Clip the y-axis to mean ± 2 standard deviations so a few outliers do not
    # flatten the rest of the chart
    mean_diff = np.mean(flat_diff)
    std_diff = np.std(flat_diff)
    if std_diff > 0:  # identical limits would be a degenerate axis; keep autoscale instead
        plt.ylim(mean_diff - 2 * std_diff, mean_diff + 2 * std_diff)

    # Save the frame
    frame_filename = f'frame_{j:03d}_layer_{layer_idx}.png'
    plt.savefig(frame_filename, bbox_inches='tight')
    plt.close()  # Important: close the figure to free memory

    # Add the frame filename to this layer's list
    saved_frames[layer_idx].append(frame_filename)

# Per-layer bookkeeping for the streaming loop
num_layers = model.config.n_layer + 1  # number of layers (including embeddings)
saved_frames = [[] for _ in range(num_layers)]  # frame file names, one list per layer
cache = [None] * num_layers  # previous cache per layer as NumPy arrays

# Single pass over the stream: for every layer, plot how its cache changed
for j, (token_ids, updated_cache) in enumerate(stream_tokens(sample_text, diffusion_update)):
    for layer_idx, layer_tensor in enumerate(updated_cache):
        current = layer_tensor.detach().cpu().numpy()

        # Swap the new state in and keep the old one for the comparison
        previous, cache[layer_idx] = cache[layer_idx], current

        # The first prefix has nothing to be compared against
        if previous is None:
            continue

        diff = current - previous
        # y-axis limits are chosen per frame from the mean and standard deviation
        plot_with_custom_ylimits(diff, j, sample_text, saved_frames, layer_idx, num_layers)

# Turn each layer's frames into its own GIF
for layer_idx in range(num_layers):
    layer_frame_files = saved_frames[layer_idx]

    # Load the frames; each PNG is removed right after it has been read
    images = []
    for frame_path in layer_frame_files:
        images.append(imageio.imread(frame_path))
        os.remove(frame_path)

    # GIF file names are 1-based while layer_idx is 0-based
    layer_gif_path = f'layer_{layer_idx + 1}.gif'
    imageio.mimsave(layer_gif_path, images, duration=0.5)

# Open all per-layer GIFs. The readers are closed in the `finally` below so no
# file handles are left open when the GIFs are deleted at the end of the cell
# (an open handle can prevent deletion on some platforms).
layer_gifs = [imageio.get_reader(f'layer_{i+1}.gif') for i in range(num_layers)]

# Create a list to store the combined frames in grid format
combined_frames_grid = []

try:
    # Use the shortest GIF so every layer can supply a frame for each grid frame
    num_frames = min(gif.get_length() for gif in layer_gifs)

    # Number of rows and columns for the grid (adjust these based on the number of layers)
    grid_rows = 4
    grid_cols = (num_layers + grid_rows - 1) // grid_rows  # Ensure all layers fit in the grid

    for frame_idx in range(num_frames):
        # Next frame of every layer, converted to PIL images
        frames = [Image.fromarray(gif.get_next_data()) for gif in layer_gifs]
        widths, heights = zip(*(frame.size for frame in frames))

        # Every grid cell is as large as the largest frame
        max_width = max(widths)
        max_height = max(heights)

        # Blank canvas large enough to hold the whole grid
        total_width = grid_cols * max_width
        total_height = grid_rows * max_height
        new_frame = Image.new('RGB', (total_width, total_height))

        # Place each layer's frame in row-major order
        for idx, frame in enumerate(frames):
            row, col = divmod(idx, grid_cols)
            new_frame.paste(frame, (col * max_width, row * max_height))

        combined_frames_grid.append(new_frame)
finally:
    # Release the file handles even if building the grid failed
    for gif in layer_gifs:
        gif.close()

# Save as final GIF (duration is given in milliseconds per frame here)
output_gif_grid = 'combined_layers_grid.gif'
combined_frames_grid[0].save(output_gif_grid, save_all=True, append_images=combined_frames_grid[1:], duration=500, loop=0)

# Cleanup individual layer GIFs now that they are combined and closed
for i in range(num_layers):
    os.remove(f'layer_{i+1}.gif')

print(f"Combined grid GIF saved as {output_gif_grid}")