In [13]:
!pip install sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.46.0-py3-none-any.whl.metadata (44 kB)
     ---------------------------------------- 0.0/44.1 kB ? eta -:--:--
     --------- ------------------------------ 10.2/44.1 kB ? eta -:--:--
     ----------------- -------------------- 20.5/44.1 kB 330.3 kB/s eta 0:00:01
     ----------------------------------- -- 41.0/44.1 kB 393.8 kB/s eta 0:00:01
     -------------------------------------- 44.1/44.1 kB 271.1 kB/s eta 0:00:00
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.5.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Using cached huggingface_hub-0.26.1-py3-none-any.whl.metadata (13 kB)
Collecting sympy==1.13.1 (from torch>=1.11.0->sentence-transformers)
  Using cached sympy-1.13.1-py3-none-any

In [14]:
#Import required libraries 
from sentence_transformers import SentenceTransformer
import numpy as np 

#Simple test to ensure everything works
print("✓ Imports successful!")

  from tqdm.autonotebook import tqdm, trange


✓ Imports successful!


In [33]:
#Creating our notes dataset
# Initialize our collection of notes (Random notes)
notes = [
    "Python lists are mutable sequences used to store collections of items. They can contain mixed types and are defined using square brackets.",
    "Lists in Python can be modified after creation. Common operations include append(), extend(), and insert().",
    "Data structures are fundamental building blocks in programming. They help organize and store data efficiently.",
    "Arrays in NumPy provide efficient storage and operations for numerical data. They are widely used in scientific computing.",
    "Object-oriented programming in Python uses classes and objects. Classes define the structure and behavior of objects.",
    "The pandas library is built on top of NumPy and provides powerful data manipulation tools through DataFrames.",
    "Version control with Git helps track changes in code. Common commands include commit, push, and pull.",
    "Python functions are defined using the def keyword. They can accept parameters and return values.",
    "Montreal is a multi-cultural city",
    "Peru offers an extreme variety of ethnicities with amazing culture and cultural heritage"
]

# Print the number of notes we have
print(f"Created {len(notes)} notes")

Created 10 notes


In [None]:
#Testing our dataset
# Print each note with its length
for i, note in enumerate(notes): #if we dont write enumerate we dont have index (i)
    print(f"\nNote {i+1} (Length: {len(note)} characters):")
    print(f"{'='*50}")
    print(note)

In [17]:
#Comprehension of list of loops in just one line without appending every single time
# Quick analysis of our notes
note_lengths = [len(note.split()) for note in notes]
# note.split() breaks each note (string) into a list of words by splitting it at spaces.
# len(note.split()) calculates the number of words in each note.
# The result is a list, note_lengths, where each entry corresponds to the word count of a note.

print(f"Average words per note: {sum(note_lengths)/len(note_lengths):.1f}")
print(f"Shortest note: {min(note_lengths)} words")
print(f"Longest note: {max(note_lengths)} words")

Average words per note: 15.4
Shortest note: 5 words
Longest note: 22 words


In [18]:
from sentence_transformers import SentenceTransformer

# Load the model - this may take a few seconds
model = SentenceTransformer('all-MiniLM-L6-v2')

print("Model loaded successfully!")

Model loaded successfully!


In [19]:
# UNDERSTANDING EMBEDDINGS
# Create an embedding for a simple sentence
test_sentence = "This is a test sentence to understand embeddings."
embedding = model.encode(test_sentence)

# Look at the embedding's properties
print(f"Embedding shape: {embedding.shape}")
print(f"First 5 values: {embedding[:5]}")

#Each embedding is a vector of 384 numbers that represents the semantic meaning of the text.

Embedding shape: (384,)
First 5 values: [ 0.03313288 -0.0281372   0.10922699  0.02421217  0.04646194]


In [20]:
#TESTING BATCH PROCESSING
test_sentences = [
    "Python is a programming language",
    "Programming languages are used to write software",
    "Pythons are large snakes"
]

# Generate embeddings for all sentences at once
embeddings = model.encode(test_sentences)

print(f"Number of embeddings: {len(embeddings)}")
print(f"Shape of each embedding: {embeddings[0].shape}")
#The output without the comma would be an int. Only () would still being tuples

Number of embeddings: 3
Shape of each embedding: (384,)


In [43]:
#TESTING SIMILARITY
import numpy as np

# Calculate similarities between sentences
def calculate_similarity(emb1, emb2):
    return np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))

# Get similarities between all pairs
for i in range(len(test_sentences)): #To loop through each item of "test_sentences"
    for j in range(i + 1, len(test_sentences)):
        similarity = calculate_similarity(embeddings[2], embeddings[0]) #Compare between each sentence
        print(f"\nSimilarity between:\n'{test_sentences[i]}' and\n'{test_sentences[j]}':\n{similarity:.3f}")
# print(similarity)


Similarity between:
'Python is a programming language' and
'Programming languages are used to write software':
0.497

Similarity between:
'Python is a programming language' and
'Pythons are large snakes':
0.497

Similarity between:
'Programming languages are used to write software' and
'Pythons are large snakes':
0.497


In [34]:
# Convert all notes to embeddings
note_embeddings = model.encode(notes)

print(f"Created embeddings for {len(notes)} notes")
print(f"Each embedding has shape: {note_embeddings[0].shape}")

Created embeddings for 10 notes
Each embedding has shape: (384,)


In [35]:
def search_notes(query, top_k=3):
    # Convert search query to embedding
    query_embedding = model.encode(query)
    
    # Calculate similarities with all notes
    similarities = np.dot(note_embeddings, query_embedding) / (
        np.linalg.norm(note_embeddings, axis=1) * np.linalg.norm(query_embedding)
    )
    
    # Get top k matches
    top_idx = np.argsort(similarities)[::-1][:top_k]
    
    # A list of dictionaries
    results = []
    for idx in top_idx:
        results.append({
            'note': notes[idx],
            'similarity': similarities[idx]
        })
        
    # Return matching notes with thier similarity scores
    return results

In [38]:
#Trying some searches
# Function to display search results nicely
def display_results(query, results):
    print(f"\nSearch Query: '{query}'")
    print("=" * 50)
    for i, r in enumerate(results, 1):
        print(f"\n{i}. Match ({r['similarity']:.2%} similar):")
        print(r['note'])

# Try some example searches
queries = [
    "Places that have extreme culture"
    # "How do Python lists work?",
    # "Tell me about data structures",
    # "What is object oriented programming?"
]

for query in queries:
    results = search_notes(query)
    display_results(query, results)


Search Query: 'Places that have extreme culture'

1. Match (48.76% similar):
Peru offers an extreme variety of ethnicities with amazing culture and cultural heritage

2. Match (41.51% similar):
Montreal is a multi-cultural city

3. Match (2.54% similar):
The pandas library is built on top of NumPy and provides powerful data manipulation tools through DataFrames.
