In [1]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.2.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<6.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting tqdm (from sentence-transformers)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.9.1-cp312-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.8.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.16.3-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-1.2.3-py3-none-any.whl.metadata (13 kB)
Collecting typing_extensions>=4.5.0 (from sentence-transformers)
  Using cached typing_extensions-4.15.0-py3-none-any.whl.me

In [2]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle

# Configuration: input/output paths and the embedding model used by main()
CHUNKS_FILE = "chunks.json"  # Input JSON file; main() passes it to load_chunks()
OUTPUT_FILE = "embeddings_database.pkl"  # Pickle file that main() has save_database() write
MODEL_NAME = 'all-MiniLM-L6-v2'  # Default model_name for create_embeddings()

def load_chunks(chunks_file):
    """
    Parse the JSON document at ``chunks_file`` and return its contents.

    Progress is reported on stdout: the path being read, the number of
    top-level items, and (when the collection is non-empty) the first item
    pretty-printed and cut off at 300 characters.

    Args:
        chunks_file: Path of the UTF-8 encoded JSON file to read.

    Returns:
        The decoded JSON value, unchanged.
    """
    print(f"Loading chunks from {chunks_file}...")

    with open(chunks_file, 'r', encoding='utf-8') as handle:
        loaded = json.loads(handle.read())

    print(f"Loaded {len(loaded)} chunks")

    # Nothing to preview for an empty collection.
    if not loaded:
        return loaded

    # Show the shape of one record so the reader can see which keys exist.
    print("\nFirst chunk structure:")
    pretty_first = json.dumps(loaded[0], indent=2)
    print(pretty_first[:300] + "...")

    return loaded

def _extract_chunk_text(chunk):
    """Return the string to embed for one chunk (str, dict, or anything else)."""
    # Plain strings must be handled BEFORE any `key in chunk` test: on a str,
    # `'text' in chunk` is a substring check, and indexing the string with a
    # key afterwards would raise TypeError.
    if isinstance(chunk, str):
        return chunk
    if isinstance(chunk, dict):
        # First matching key wins, in this priority order.
        for key in ('text', 'content', 'chunk'):
            if key in chunk:
                return chunk[key]
    # Unknown structure: fall back to the string representation.
    return str(chunk)


def create_embeddings(chunks, model_name=MODEL_NAME):
    """
    Create embeddings for all chunks.

    Model options:
    - 'all-MiniLM-L6-v2': Fast, good balance (384 dimensions)
    - 'all-mpnet-base-v2': Better quality, slower (768 dimensions)
    - 'paraphrase-multilingual-MiniLM-L12-v2': For multilingual content

    Args:
        chunks: Iterable of chunks. Each chunk may be a dict carrying its text
            under 'text', 'content' or 'chunk' (checked in that order), a plain
            string, or any other object (embedded via ``str(chunk)``).
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        A tuple ``(embeddings, model, texts)``: the result of
        ``model.encode`` (requested as NumPy), the loaded model, and the list
        of extracted text strings in the same order as ``chunks``.
    """
    print(f"\nLoading embedding model: {model_name}")
    model = SentenceTransformer(model_name)

    # Extract text from chunks (adjust _extract_chunk_text for other JSON layouts)
    texts = [_extract_chunk_text(chunk) for chunk in chunks]

    print(f"\nCreating embeddings for {len(texts)} chunks...")
    print("This may take a few minutes...")

    # Create embeddings in batches
    embeddings = model.encode(
        texts,
        show_progress_bar=True,
        batch_size=32,
        convert_to_numpy=True
    )

    return embeddings, model, texts

def save_database(chunks, embeddings, texts, output_file, model_name=None):
    """
    Save chunks, texts, and embeddings to disk as a single pickle file.

    Args:
        chunks: Original chunk objects (with their metadata).
        embeddings: 2-D array-like exposing ``.shape``; ``shape[1]`` is stored
            as the embedding dimension.
        texts: Extracted text strings, one per chunk.
        output_file: Path of the pickle file to write (overwritten if present).
        model_name: Name of the model that produced ``embeddings``. Defaults
            to the module-level ``MODEL_NAME`` when omitted, so existing
            callers keep their behaviour; pass it explicitly whenever a
            non-default model was used so the stored metadata stays truthful.

    Note:
        The output is a pickle; only load it back from sources you trust.
    """
    if model_name is None:
        model_name = MODEL_NAME

    embedding_dim = embeddings.shape[1]

    database = {
        'chunks': chunks,  # Original chunk objects with metadata
        'texts': texts,    # Extracted text strings
        'embeddings': embeddings,
        'metadata': {
            'num_chunks': len(chunks),
            'embedding_dim': embedding_dim,
            'model': model_name
        }
    }

    with open(output_file, 'wb') as f:
        pickle.dump(database, f)
        # Position after the dump == bytes written; avoids pickling the whole
        # database a second time in memory just to measure its size.
        bytes_written = f.tell()

    file_size_mb = bytes_written / (1024 * 1024)

    print(f"\n{'='*60}")
    print("DATABASE SAVED SUCCESSFULLY!")
    print(f"{'='*60}")
    print(f"Output file: {output_file}")
    print(f"Total chunks: {len(chunks)}")
    print(f"Embedding dimensions: {embedding_dim}")
    print(f"Approximate file size: {file_size_mb:.2f} MB")
    print(f"{'='*60}")

def main():
    """
    Main pipeline to create embeddings from existing chunks.

    Steps: load chunks from ``CHUNKS_FILE``, embed them with
    ``create_embeddings``, write the result to ``OUTPUT_FILE`` with
    ``save_database``, then print a short sample. Errors are reported on
    stdout/stderr rather than propagated; the function always returns None.
    """
    print("="*60)
    print("CREATING EMBEDDINGS FROM CHUNKS.JSON")
    print("="*60)

    try:
        # Step 1: Load chunks
        print("\n[1/3] Loading chunks from JSON...")
        try:
            chunks = load_chunks(CHUNKS_FILE)
        except FileNotFoundError:
            # Handled here, around the load only, so that a FileNotFoundError
            # raised by a later step (e.g. an unwritable output path) is not
            # misreported as a missing chunks file.
            print(f"\nERROR: Could not find '{CHUNKS_FILE}'")
            print("Make sure the file is in the same directory as this notebook")
            return

        if not chunks:
            print("ERROR: No chunks found in the file!")
            return

        # Step 2: Create embeddings
        print("\n[2/3] Creating embeddings...")
        embeddings, model, texts = create_embeddings(chunks)

        # Step 3: Save database
        print("\n[3/3] Saving database...")
        save_database(chunks, embeddings, texts, OUTPUT_FILE)

        print("\n✓ PROCESSING COMPLETE!")
        print(f"\nYou can now use '{OUTPUT_FILE}' for semantic search")

        # Show sample (chunks is non-empty here, so index 0 exists)
        print("\n" + "="*60)
        print("SAMPLE CHUNK:")
        print("="*60)
        print(f"Text preview: {texts[0][:200]}...")
        print(f"Embedding shape: {embeddings[0].shape}")

    except Exception as e:
        # Any other failure: show the message plus the full traceback so the
        # failing step can be identified.
        print(f"\nERROR: {e}")
        import traceback
        traceback.print_exc()

# Entry-point guard: run the pipeline only when this code executes as the
# top-level module (__name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


CREATING EMBEDDINGS FROM CHUNKS.JSON

[1/3] Loading chunks from JSON...
Loading chunks from chunks.json...
Loaded 3827 chunks

First chunk structure:
{
  "video": "Ask_Dr_Tarek__Session_1.txt",
  "chunk_id": 0,
  "text": "so i have some questions that were sent i'm going to go over the questions i have them here i'll go over them one by one inshallah uh some of them will be short there's a couple of repeats uh one or two need a little bit of expl...

[2/3] Creating embeddings...

Loading embedding model: all-MiniLM-L6-v2

Creating embeddings for 3827 chunks...
This may take a few minutes...


Batches: 100%|████████████████████████████████| 120/120 [00:08<00:00, 14.53it/s]


[3/3] Saving database...

DATABASE SAVED SUCCESSFULLY!
Output file: embeddings_database.pkl
Total chunks: 3827
Embedding dimensions: 384
Approximate file size: 7.63 MB

✓ PROCESSING COMPLETE!

You can now use 'embeddings_database.pkl' for semantic search

SAMPLE CHUNK:
Text preview: so i have some questions that were sent i'm going to go over the questions i have them here i'll go over them one by one inshallah uh some of them will be short there's a couple of repeats uh one or t...
Embedding shape: (384,)



