<a href="https://colab.research.google.com/github/calicartels/XAI-in-LLMS/blob/main/XAI_in_LLMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install umap-learn plotly
!pip install datasets
!pip install transformers sentence-transformers


In [None]:
# Log in to the Hugging Face Hub with the token stored in Colab's secrets.
# `userdata` and `login` are imported here because this cell comes before the
# notebook's main import cell; on a fresh top-to-bottom run they would
# otherwise be undefined when this cell executes.
from google.colab import userdata
from huggingface_hub import login

# Kept as a module-level name: the embedding loaders pass it as `token=`.
my_secret_key = userdata.get("HF_TOKEN")
login(my_secret_key)

In [None]:
from google.colab import userdata
from huggingface_hub import login
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
from sentence_transformers import SentenceTransformer
import torch
from transformers import AutoTokenizer, AutoModel
import plotly.express as px
import pandas as pd
import plotly.io as pio
# Make 'colab' the default Plotly renderer for figures shown in this notebook.
pio.renderers.default = 'colab'


# Sample corpus to embed: 20 short English sentences, mostly on technology
# topics. main() passes this list to every embedding function and reuses it
# as the per-point text labels in the plots.
texts = [
    "The quick brown fox jumps over the lazy dog",
    "Machine learning is transforming the world",
    "Natural language processing is fascinating",
    "Deep learning models are becoming more powerful",
    "Artificial intelligence continues to evolve",
    "Data science helps us understand patterns",
    "Neural networks learn from examples",
    "Computer vision can recognize objects",
    "Robotics combines hardware and software",
    "Cloud computing enables scalability",
    "The Internet revolutionized communication",
    "Big data drives business decisions",
    "Quantum computing explores new frontiers",
    "Cybersecurity protects digital assets",
    "Edge computing reduces latency",
    "Blockchain ensures data integrity",
    "Virtual reality creates immersive experiences",
    "Augmented reality enhances reality",
    "5G networks enable faster connectivity",
    "Internet of Things connects devices"
]

# Model 1: NVIDIA's NV-Embed-v1 model

def get_nv_embeddings(texts):
    """Embed ``texts`` with ``nvidia/nv-embed-v1``, falling back to MiniLM.

    The model is loaded with the notebook-level ``my_secret_key`` token and
    ``trust_remote_code=True``. If loading or encoding raises, the error is
    printed and ``sentence-transformers/all-MiniLM-L6-v2`` is used instead.

    Args:
        texts: Sentences handed to ``SentenceTransformer.encode``.

    Returns:
        The result of ``encode(texts)`` from whichever model ended up running.
    """
    try:
        return SentenceTransformer(
            'nvidia/nv-embed-v1',
            token=my_secret_key,
            trust_remote_code=True,
        ).encode(texts)
    except Exception as err:
        # Best-effort: any failure (download, auth, memory, ...) lands here.
        print(f"Error with NV-Embed-v1: {err!s}")
        print("Falling back to all-MiniLM-L6-v2...")
        fallback = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        return fallback.encode(texts)

# Model 2: Linq AI Research's Linq-Embed-Mistral model

def get_linq_embeddings(texts):
    """Embed ``texts`` with Linq-Embed-Mistral, falling back to MiniLM.

    The model is loaded with the notebook-level ``my_secret_key`` token. If
    loading or encoding raises, the error is printed and
    ``sentence-transformers/all-MiniLM-L6-v2`` is used instead.

    Args:
        texts: Sentences handed to ``SentenceTransformer.encode``.

    Returns:
        The result of ``encode(texts)`` from whichever model ended up running.
    """
    try:
        # The model is published under the Linq-AI-Research organisation.
        # The previous id ('linkedin/linq-embed-mistral') does not match that
        # repository, so the fallback below would have been used instead.
        model = SentenceTransformer('Linq-AI-Research/Linq-Embed-Mistral',
                                    token=my_secret_key)
        embeddings = model.encode(texts)
        return embeddings
    except Exception as e:
        # Best-effort: any failure (download, auth, memory, ...) lands here.
        print(f"Error with Linq-Embed-Mistral: {str(e)}")
        print("Falling back to all-MiniLM-L6-v2...")
        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        return model.encode(texts)

# Model 3: BAAI (Beijing Academy of Artificial Intelligence) BGE model

def get_bge_embeddings(texts):
    """Embed ``texts`` with ``BAAI/bge-large-en-v1.5``.

    Encoding is requested with ``normalize_embeddings=True``. Unlike the other
    loaders in this notebook there is no fallback: errors propagate.

    Args:
        texts: Sentences handed to ``SentenceTransformer.encode``.

    Returns:
        The result of ``encode`` for ``texts``.
    """
    bge_model = SentenceTransformer('BAAI/bge-large-en-v1.5')
    return bge_model.encode(texts, normalize_embeddings=True)


In [None]:
def reduce_dimensions(embeddings):
    """Project ``embeddings`` to three dimensions with PCA, t-SNE and UMAP.

    Each reducer is fitted independently on the same input.

    Args:
        embeddings: Array-like of embedding vectors, one row per sample, as
            accepted by the reducers' ``fit_transform``.

    Returns:
        A ``(pca_result, tsne_result, umap_result)`` tuple holding the
        3-component output of each method.
    """
    # PCA
    pca = PCA(n_components=3)
    pca_result = pca.fit_transform(embeddings)

    # t-SNE
    # The iteration count is deliberately left at the library default (1000,
    # the value previously passed explicitly). The keyword was renamed from
    # `n_iter` to `max_iter` in newer scikit-learn releases, so naming either
    # one breaks on some versions; omitting it works on all of them.
    tsne = TSNE(n_components=3,
                random_state=42,
                perplexity=5)
    tsne_result = tsne.fit_transform(embeddings)

    # UMAP
    umap_reducer = umap.UMAP(n_components=3,
                             random_state=42,
                             n_neighbors=5)
    umap_result = umap_reducer.fit_transform(embeddings)

    return pca_result, tsne_result, umap_result

In [None]:
def create_visualization(reduced_data, texts, method_name, model_name):
    """Show a 3D scatter plot of reduced embeddings labelled with their text.

    Args:
        reduced_data: Three-column data (one row per text) used to build the
            plotting DataFrame.
        texts: Labels stored in the ``Text`` column and drawn next to points.
        method_name: Name of the reduction method, used in the plot title.
        model_name: Name of the embedding model, used in the plot title.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    axis_names = ['Dimension 1', 'Dimension 2', 'Dimension 3']
    x_axis, y_axis, z_axis = axis_names

    frame = pd.DataFrame(reduced_data, columns=axis_names)
    frame['Text'] = texts

    figure = px.scatter_3d(
        frame,
        x=x_axis,
        y=y_axis,
        z=z_axis,
        text='Text',
        title=f'{method_name} visualization of {model_name} embeddings',
        labels={'Text': 'Document'},
        hover_data=['Text'],
    )

    # Fixed square canvas, centred title, explicit axis captions.
    scene_settings = {
        'xaxis_title': x_axis,
        'yaxis_title': y_axis,
        'zaxis_title': z_axis,
    }
    title_settings = {
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    }
    figure.update_layout(
        width=800,
        height=800,
        showlegend=False,
        scene=scene_settings,
        title=title_settings,
    )

    # Marker appearance and label placement.
    figure.update_traces(
        marker={'size': 8, 'opacity': 0.8},
        textposition='top center',
        hoverinfo='text',
    )

    figure.show()


In [None]:
def main():
    """Embed ``texts`` with each model, reduce to 3D and plot every result.

    For every embedding function the module-level ``texts`` are embedded,
    passed through ``reduce_dimensions`` and shown once per reduction method
    with ``create_visualization``. A failure for one model is printed and the
    loop moves on to the next model.

    Note: the NV-Embed and Linq loaders fall back to MiniLM on error, so a
    plot title may name a model that was not the one actually used.

    Returns:
        None.
    """
    # Imported once here rather than on every loop iteration; `time` is not
    # part of the notebook's import cell.
    import time

    # Display name -> embedding function. The names end up in plot titles, so
    # the BGE entry is labelled after the checkpoint that is actually loaded
    # (bge-large-en-v1.5), not the unrelated BGE-EN-ICL model.
    models = {
        'NV-Embed-v1': get_nv_embeddings,
        'Linq-Embed-Mistral': get_linq_embeddings,
        'BGE-Large-EN-v1.5': get_bge_embeddings
    }

    for model_name, embedding_func in models.items():
        print(f"\nProcessing {model_name} embeddings...")

        try:
            # Generate embeddings
            embeddings = embedding_func(texts)

            # Apply dimensionality reduction
            pca_result, tsne_result, umap_result = reduce_dimensions(embeddings)

            # Create visualizations
            methods = {
                'PCA': pca_result,
                't-SNE': tsne_result,
                'UMAP': umap_result
            }

            print(f"\nGenerating visualizations for {model_name}...")
            for method_name, reduced_data in methods.items():
                print(f"Displaying {method_name} plot...")
                create_visualization(reduced_data, texts, method_name, model_name)
                print(f"Created {method_name} visualization for {model_name}")

                # Add a small delay between plots for better display
                time.sleep(1)

        except Exception as e:
            # Top-level boundary: report and keep going with the other models.
            print(f"Error processing {model_name}: {str(e)}")
            continue


if __name__ == "__main__":
    main()

1. Principal Component Analysis (PCA): it misses nuanced clusters in complex datasets because it only captures linear structure.
2. t-SNE: excellent for visualizing dense clusters and discovering hidden groupings, although it often distorts the global arrangement between clusters.
In my case, however, it is the more suitable method because the dataset is small.

3. UMAP: faster than t-SNE, scales well to larger datasets, and reveals relationships both within and between clusters. Here it shows large distances between the embeddings.

# References

[1] https://huggingface.co/docs/huggingface_hub/en/guides/cli#huggingface-cli-login

[2] https://www.youtube.com/watch?v=7iAe3DmIXJY

[3] Claude for error debugging