In [13]:
import json
import os
import time
from typing import Any, Dict, List

import faiss
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.datasets import SentencesDataset
from sklearn.metrics.pairwise import cosine_similarity

In [None]:

model = {
    'all-MiniLM-L6-v2': SentenceTransformer('all-MiniLM-L6-v2'),
}

In [None]:
print("Loaded embedding models:", list(model.keys()))

Loaded embedding models: ['all-MiniLM-L6-v2']


In [None]:
documents = [
    "Machine learning is a subset of artificial intelligence",
    "Deep learning uses neural networks with multiple layers",
    "Natural language processing helps computers understand text",
    "Computer vision enables machines to interpret visual data",
    "Python is a popular programming language for data science",
    "JavaScript is used for web development",
    "The weather is sunny today",
    "I love eating pizza and pasta"
]

queries = [
    "What is AI and machine learning?",
    "How do neural networks work?",
    "Programming languages for data",
    "Food and restaurants"
]

print("=== DOCUMENT EMBEDDINGS ===")

doc_embeddings = model['all-MiniLM-L6-v2'].encode(documents)
print(f"Document embeddings shape: {doc_embeddings.shape}")

print("\n=== QUERY TESTING ===")
for query in queries:
    print(f"\nQuery: '{query}'")
    
    query_embedding = model['all-MiniLM-L6-v2'].encode([query])
    
    similarities = cosine_similarity(query_embedding, doc_embeddings)[0]
    
    # Get top 3 most similar documents
    top_indices = np.argsort(similarities)[::-1][:3]
    
    print("Top 3 matches:")
    for i, idx in enumerate(top_indices):
        print(f"  {i+1}. Score: {similarities[idx]:.4f} - '{documents[idx]}'")

=== DOCUMENT EMBEDDINGS ===
Document embeddings shape: (8, 384)

=== QUERY TESTING ===

Query: 'What is AI and machine learning?'
Top 3 matches:
  1. Score: 0.7654 - 'Machine learning is a subset of artificial intelligence'
  2. Score: 0.4179 - 'Computer vision enables machines to interpret visual data'
  3. Score: 0.3716 - 'Deep learning uses neural networks with multiple layers'

Query: 'How do neural networks work?'
Top 3 matches:
  1. Score: 0.5788 - 'Deep learning uses neural networks with multiple layers'
  2. Score: 0.3732 - 'Computer vision enables machines to interpret visual data'
  3. Score: 0.3636 - 'Machine learning is a subset of artificial intelligence'

Query: 'Programming languages for data'
Top 3 matches:
  1. Score: 0.6775 - 'Python is a popular programming language for data science'
  2. Score: 0.3591 - 'Natural language processing helps computers understand text'
  3. Score: 0.2917 - 'JavaScript is used for web development'

Query: 'Food and restaurants'
Top 3 matc

In [None]:
wiki_sample = load_dataset("wikipedia", "20220301.simple", split="train[:100]", trust_remote_code=True)
documents = [doc['text'][:500] for doc in wiki_sample]  # First 500 chars

print(f"Loaded {len(documents)} Wikipedia documents")

Generating train split: 100%|██████████| 205328/205328 [00:00<00:00, 782675.20 examples/s]

Loaded 100 Wikipedia documents





In [14]:
documents

["April is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May. It is one of four months to have 30 days.\n\nApril always begins on the same day of week as July, and additionally, January in leap years. April always ends on the same day of the week as December.\n\nApril's flowers are the Sweet Pea and Daisy. Its birthstone is the diamond. The meaning of the diamond is innocence.\n\nThe Month \n\nApril comes between March and May, making it the fourth month o",
 'August (Aug.) is the eighth month of the year in the Gregorian calendar, coming between July and September. It has 31 days. It is named after the Roman emperor Augustus Caesar.\n\nAugust does not begin on the same day of the week as any other month in common years, but begins on the same day of the week as February in leap years. August always ends on the same day of the week as November.\n\nThe Month \n\nThis month was first called Sextilis in Latin, because it was the sixth mont

In [15]:
doc_embeddings = model['all-MiniLM-L6-v2'].encode(documents)
print(f"Wikipedia embeddings shape: {doc_embeddings.shape}")

test_queries = [
    "artificial intelligence",
    "history of computers", 
    "physics and chemistry",
    "sports and games"
]

for query in test_queries:
    query_embedding = model['all-MiniLM-L6-v2'].encode([query])
    similarities = cosine_similarity(query_embedding, doc_embeddings)[0]
    top_indices = np.argsort(similarities)[::-1][:3]
    
    print(f"\nQuery: '{query}'")
    for i, idx in enumerate(top_indices):
        print(f"  {i+1}. Score: {similarities[idx]:.4f}")
        print(f"     Text: {documents[idx][:100]}...")

Wikipedia embeddings shape: (100, 384)

Query: 'artificial intelligence'
  1. Score: 0.3552
     Text: A computer is a machine that uses electronics to input, process, store, and output data. Data is inf...
  2. Score: 0.3520
     Text: Computer science deals with the theoretical foundations of computation and practical techniques for ...
  3. Score: 0.2235
     Text: Adobe Illustrator is a computer program for making graphic design and illustrations. It is made by A...

Query: 'history of computers'
  1. Score: 0.4750
     Text: A computer is a machine that uses electronics to input, process, store, and output data. Data is inf...
  2. Score: 0.3906
     Text: Computer science deals with the theoretical foundations of computation and practical techniques for ...
  3. Score: 0.3084
     Text: Adobe Illustrator is a computer program for making graphic design and illustrations. It is made by A...

Query: 'physics and chemistry'
  1. Score: 0.5430
     Text: Chemistry is a branch of  scie

In [17]:
wiki_embeddings = load_dataset(
    "sentence-transformers/wikipedia-en-embeddings", 
    split="train[:1000]",
    trust_remote_code=True
)

embeddings = np.array(wiki_embeddings['embeddings'])
texts = wiki_embeddings['text']

query = "artificial intelligence"
query_embedding = model['all-MiniLM-L6-v2'].encode([query])

similarities = cosine_similarity(query_embedding, embeddings)[0]
top_indices = np.argsort(similarities)[::-1][:5]

print("Pre-computed Wikipedia results (all-MiniLM-L6-v2):")
for i, idx in enumerate(top_indices):
    print(f"{i+1}. Score: {similarities[idx]:.4f}")
    print(f"   Text: {texts[idx][:100]}...")

DatasetNotFoundError: Dataset 'sentence-transformers/wikipedia-en-embeddings' doesn't exist on the Hub or cannot be accessed.

In [19]:
from datasets import load_dataset_builder

builder = load_dataset_builder("maloyan/wikipedia-22-12-en-embeddings-all-MiniLM-L6-v2")

dataset_size_gb = builder.info.dataset_size / (1024**3)
download_size_gb = builder.info.download_size / (1024**3)

print(f"Dataset size: {dataset_size_gb:.2f} GB")
print(f"Download size: {download_size_gb:.2f} GB")

Dataset size: 67.17 GB
Download size: 79.98 GB


In [None]:
ds_small = load_dataset("maloyan/wikipedia-22-12-en-embeddings-all-MiniLM-L6-v2", split="train[:100]")
print(f"Loaded sample: {len(ds_small)} rows")

Downloading data:  12%|█▏        | 18/145 [09:44<1:08:27, 32.34s/files]

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = np.array(ds_small['embeddings'])
texts = ds_small['text']

def search(query, top_k=3):
    query_embedding = model.encode([query])
    similarities = cosine_similarity(query_embedding, embeddings)[0]
    top_indices = np.argsort(similarities)[::-1][:top_k]
    
    results = []
    for idx in top_indices:
        results.append({
            'text': texts[idx],
            'score': similarities[idx]
        })
    return results

query = "artificial intelligence"
results = search(query)

for i, result in enumerate(results):
    print(f"{i+1}. Score: {result['score']:.4f}")
    print(f"