<a href="https://colab.research.google.com/github/besimorhino/ai-workshop/blob/main/text_vector_math.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Vector Math on Text Embeddings (No Kings Involved - LOL)

In [None]:
!pip install -q sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

In [None]:
# Instantiate the pretrained sentence-embedding model from its checkpoint name
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

In [10]:
# Short label -> descriptive text. The text (not the label) is what gets embedded,
# e.g. the vector stored under "USC Trojans" encodes "University of Southern California".
phrases = {
    "USC Trojans": "University of Southern California",
    "Los Angeles": "Los Angeles, California",
    "Oregon Ducks": "University of Oregon",
    "Eugene": "Eugene, Oregon",
}

# Embed each description on its own and key the resulting vector by its label
embeddings = {
    label: model.encode(text)
    for label, text in phrases.items()
}

In [None]:
# Preview each embedding (its shape plus the first five values) instead of
# dumping every dimension of every vector into the notebook output.
{label: (vector.shape, vector[:5]) for label, vector in embeddings.items()}

In [11]:
# Analogy by vector arithmetic: take "USC Trojans", subtract "Los Angeles",
# then add "Eugene" -- does the result land near Oregon Ducks?
school_vec = embeddings['USC Trojans']
old_city_vec = embeddings['Los Angeles']
new_city_vec = embeddings['Eugene']
result_vector = (school_vec - old_city_vec) + new_city_vec

In [None]:
# Show only the shape and the leading values; the full vector is too long to read
result_vector.shape, result_vector[:5]

In [12]:
# Candidate phrases that the analogy result vector will be ranked against.
# The first six entries are distractors (other universities and cities); the
# last three relate directly to the query. Entry order matters: later code
# maps ranking indices back to these strings via corpus[idx].
corpus = [
    "University of Washington",
    "Seattle, Washington",
    "Stanford University",
    "Palo Alto, California",
    "University of California, Berkeley",
    "Berkeley, California",
    "Oregon Ducks", # Literal label the query comment names as the expected answer
    "University of Oregon", # Text actually embedded under the 'Oregon Ducks' key in `phrases`
    "Eugene, Oregon" # NOT an expected answer: this is one of the query's own input terms
]
# Encode the whole corpus in a single call (one embedding per entry)
corpus_embeddings = model.encode(corpus)

In [13]:
# Compute cosine similarities
# util.cos_sim returns a 2-D torch tensor with one row per query vector; row 0
# holds the similarity of result_vector to every corpus entry.
scores = util.cos_sim(result_vector, corpus_embeddings)[0]

# Rank with the tensor's own argsort rather than passing a torch tensor through
# np.argsort, which only works via NumPy's implicit array coercion fallback.
sorted_indices = scores.argsort(descending=True)

print("Query: USC Trojans - Los Angeles + Eugene")
print("\nTop matching results:")
# .tolist() yields plain Python ints, which are safe to use as list indices;
# slicing (rather than top-k) tolerates a corpus with fewer than five entries.
for idx in sorted_indices[:5].tolist():
    print(f"{corpus[idx]} (score: {scores[idx]:.4f})")

Query: USC Trojans - Los Angeles + Eugene

Top matching results:
University of Oregon (score: 0.7655)
Eugene, Oregon (score: 0.6564)
Stanford University (score: 0.5306)
University of Washington (score: 0.4744)
University of California, Berkeley (score: 0.4595)
