In [1]:
import ollama               # Ollama
import redis                # Redis
import numpy as np          # Duh

import pypdf                # PDF reader
from tqdm import tqdm       # Progress bar bc I'm impatient
import os                   # Navigate folders
import time                 # Timing
import tracemalloc          # Memory Usage

import re                   # Text preprocessing stuff
import string               # More text preprocessing
import nltk                 # Tokenization

from sentence_transformers import SentenceTransformer       # Embedding Model
from collections import Counter     # Simple counting dictionary
from redis.commands.search.query import Query

VECTOR_DIM = 768
INDEX_NAME = "embedding_index"
DOC_PREFIX = "slides:"
DISTANCE_METRIC = "COSINE"
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

# Run these commands if stuff goes wacky
# docker run -d --name redis-stack -p 6379:6379 redis/redis-stack
# ollama pull nomic-embed-text

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

  from .autonotebook import tqdm as notebook_tqdm





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kibbl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kibbl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kibbl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Helper preprocessing functions

def normalize_text(text, case_senstive=False):

    # Normalizes case if need be
    if case_senstive:
        text = text.lower()

    # Removes whitespace
    text = text.replace('\n', ' ').strip()

    return text

def remove_stopwords(tokens):
    stop_words = set(nltk.corpus.stopwords.words("english"))
    return [token for token in tokens if token.lower() not in stop_words]

In [3]:
def preprocess_text(text, method='word'):

    # Very basic text normalization
    text = normalize_text(text)

    if method == 'word':
    
        # Tokenization
        tokens = nltk.tokenize.word_tokenize(text)

        # Remove stopwords if need be
        # tokens = remove_stopwords(tokens)

        # Replaces wacky symbols (like stylized bullets) with <SYM> token if need be
        # tokens = ["<SYM>" if re.fullmatch(r"[^\w\d" + re.escape(string.punctuation) + "]", token) else token for token in tokens]

        # Replaces words that show up only once with <UNK> token if need be
        # rare = [item[0] for item in Counter(tokens).items() if item[1] == 1]
        # tokens = ['<UNK>' if token in rare else token for token in tokens]

        # Replaces pure numbers with <NUM> token if need be
        # tokens = ['<NUM>' if token.isdigit() else token for token in tokens]

        # Removes punctuation marks
        # tokens = [token for token in tokens if token not in string.punctuation]

    elif method == 'sent':

        # Tokenization
        tokens = nltk.tokenize.sent_tokenize(text)

        # Preprocessing similar to regular word preprocessing if need be
        for i in range(len(tokens)):
            sent = tokens[i]
            sent = ' '.join(preprocess_text(sent))
            tokens[i] = sent

    return tokens

def chunk_text(text, chunk_size, overlap=0):
    chunks = []
    for start in range(len(text) // (chunk_size-overlap) + 1):
        chunks.append(text[start * (chunk_size-overlap) : (start+1) * (chunk_size-overlap)])
    return chunks

In [4]:
# used to clear the redis vector store
def clear_redis_store(redis_client):
    print("Clearing existing Redis store...")
    redis_client.flushdb()
    print("Redis store cleared.")

# Create an index in Redis
def create_hnsw_index(redis_client):
    try:
        redis_client.execute_command(f"FT.DROPINDEX {INDEX_NAME} DD")
    except redis.exceptions.ResponseError:
        pass

    redis_client.execute_command(
        f"""
        FT.CREATE {INDEX_NAME} ON HASH PREFIX 1 {DOC_PREFIX}
        SCHEMA text TEXT
        embedding VECTOR HNSW 6 DIM {VECTOR_DIM} TYPE FLOAT32 DISTANCE_METRIC {DISTANCE_METRIC}
        """
    )
    print("Index created successfully.")

# Generate an embedding using nomic-embed-text
def get_embedding(text: str, model) -> list:
    response = model.encode(text)
    return response

def store_embedding(redis_client, file: str, chunk: str, embedding: list):
    key = f"{DOC_PREFIX}{file}_chunk_{chunk}"
    redis_client.hset(
        key,
        mapping={
            "file": file,
            "chunk": chunk,
            "embedding": np.array(
                embedding, dtype=np.float32
            ).tobytes(),  
        },
    )

In [5]:
def search_embeddings(redis_client, embedding_model, query, top_k=3):

    query_embedding = get_embedding(query, embedding_model)

    # Convert embedding to bytes for Redis search
    query_vector = np.array(query_embedding, dtype=np.float32).tobytes()

    try:
        # Construct the vector similarity search query
        # Use a more standard RediSearch vector search syntax
        # q = Query("*").sort_by("embedding", query_vector)

        q = (
            Query("*=>[KNN 5 @embedding $vec AS vector_distance]")
            .sort_by("vector_distance")
            .return_fields("id", "file", "chunk", "vector_distance")
            .dialect(2)
        )

        # Perform the search
        results = redis_client.ft(INDEX_NAME).search(
            q, query_params={"vec": query_vector}
        )

        # Transform results into the expected format
        top_results = [
            {
                "file": result.file,
                "chunk": result.chunk,
                "similarity": result.vector_distance,
            }
            for result in results.docs
        ][:top_k]

        # Print results for debugging
        for result in top_results:
            print(
                f"---> File: {result['file']}, Page: {result['page']}, Chunk: {result['chunk']}"
            )

        return top_results

    except Exception as e:
        print(f"Search error: {e}")
        return []

In [6]:
# Client
redis_client = redis.Redis(host='localhost', port=6379, db=0)

# Model
embedding_model = SentenceTransformer(MODEL_NAME)

# Clear and create index
clear_redis_store(redis_client)
create_hnsw_index(redis_client)

# Start time / memory check
tracemalloc.start()
start_time = time.time()

# Loop over every slide
for doc in tqdm(os.listdir('Slides')):

    # Read text
    reader = pypdf.PdfReader(f'Slides/{doc}')
    text = ''
    for page in reader.pages:
        text += page.extract_text()

    # Create chunks
    tokens = preprocess_text(text)
    chunks = chunk_text(tokens, 500)

    # Add chunks to redis
    for i in range(len(chunks)):
        chunk = ' '.join(chunks[i])
        embed = get_embedding(chunk, embedding_model)
        store_embedding(redis_client, doc, i, embed)

# End time / memory check
elapsed = time.time() - start_time
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()

print(f'Time elapsed: {round(elapsed, 4)} seconds')
print(f"Peak memory usage: {peak / 1024**2:.2f} MiB")


Clearing existing Redis store...
Redis store cleared.
Index created successfully.


100%|██████████| 13/13 [00:23<00:00,  1.82s/it]

Time elapsed: 23.6348 seconds
Peak memory usage: 48.72 MiB





In [7]:
search_embeddings(redis_client, embedding_model, 'Binary Search Trees', top_k=3)

[]