In [2]:
import redis

# Client for the Redis Stack instance on localhost (responses are decoded for us)
client = redis.Redis(host='127.0.0.1', port=6379, decode_responses=True)

# Round-trip a PING so a connection problem surfaces here rather than in a later cell
try:
    response = client.ping()
except redis.exceptions.ConnectionError as err:
    print(f"Connection to Redis failed: {err}")
else:
    print("Connection to Redis successful!")
    print("Redis PING response:", response)

Connection to Redis successful!
Redis PING response: True


In [17]:
import json 
from pathlib import Path
def load_jsons_to_redis(directory_path, client):
    """
    Load all ``*.json`` files from a directory into Redis as JSON documents.

    Each file may contain either a single JSON object or an array of values.
    Every section is queued on one pipeline under a sequential key
    (``section:00001``, ``section:00002``, ...) and the pipeline is executed
    once at the end. Files are processed in sorted filename order so the key
    numbering is the same on every run.

    Args:
        directory_path (str): Path to directory containing JSON files
        client: Redis client instance; only ``client.pipeline()`` is used

    Returns:
        tuple: (number of documents inserted, list of ``(filename, reason)``
        tuples for files that could not be loaded). The count is 0 when
        nothing was queued or when pipeline execution fails.

    Raises:
        ValueError: If ``directory_path`` is not an existing directory.
    """
    # Convert to Path object for better path handling
    json_dir = Path(directory_path)

    # Ensure directory exists
    if not json_dir.is_dir():
        raise ValueError(f"Directory not found: {directory_path}")

    pipeline = client.pipeline()
    doc_count = 0
    failed_files = []

    # glob() yields entries in filesystem order; sort so key numbering is reproducible
    for json_file in sorted(json_dir.glob('*.json')):
        try:
            # Read as UTF-8 explicitly: the platform default encoding
            # (e.g. cp1252 on Windows) mis-decodes non-ASCII JSON text.
            with open(json_file, 'r', encoding='utf-8') as file:
                sections = json.load(file)

            # Handle both single objects and arrays of objects
            if isinstance(sections, dict):
                sections = [sections]
            elif not isinstance(sections, list):
                # A top-level string would otherwise be iterated character
                # by character and stored as bogus one-letter documents.
                raise ValueError(
                    f"expected a JSON object or array, got {type(sections).__name__}"
                )

            # Queue each section; numbering continues from the previous file
            for i, section in enumerate(sections, start=doc_count + 1):
                redis_key = f"section:{i:05}"
                pipeline.json().set(redis_key, "$", section)

            doc_count += len(sections)

        except json.JSONDecodeError as e:
            print(f"Invalid JSON in file {json_file.name}: {e}")
            failed_files.append((json_file.name, "Invalid JSON"))
        except Exception as e:
            # Best-effort load: record the failure and continue with the next file
            print(f"Error processing file {json_file.name}: {e}")
            failed_files.append((json_file.name, str(e)))

    # Execute pipeline if there are commands
    if doc_count > 0:
        try:
            res = pipeline.execute()
            print(f"Inserted {len(res)} documents successfully!")
        except Exception as e:
            print(f"Redis pipeline execution error: {e}")
            return 0, failed_files
    else:
        print("No JSON files found in directory")
        return 0, failed_files

    return doc_count, failed_files

# Directory holding the JSON files to load (absolute path on the author's machine)
registration_dir = r"C:\Users\benja\startup_projects\civgen\rag\dmv_site_data\vehicles\registration"
load_jsons_to_redis(registration_dir, client)

Inserted 21 documents successfully!


(21, [])

In [18]:
import numpy as np
from openai import OpenAI
import os
from dotenv import load_dotenv


# Read environment variables (including OPENAI_API_KEY) from the project's .env file
load_dotenv(dotenv_path="c:/Users/benja/startup_projects/civgen/.env")
openai_client = OpenAI(
  api_key=os.getenv("OPENAI_API_KEY")
)

# Retrieve keys from Redis (sorted so the processing order is deterministic)
keys = sorted(client.keys("section:*"))
if not keys:
    raise ValueError("No 'section:*' keys found in Redis; load the documents first.")

# One entry per key: a list of JSONPath matches, or None if the key is missing
raw_content = client.json().mget(keys, "$.text_content")

# Keep every text paired with the key it came from. Filtering the texts alone
# would shift the positions, and the embeddings written back by key later on
# would land on the wrong documents.
keyed_content = []
for key, matches in zip(keys, raw_content):
    text = matches[0] if isinstance(matches, list) and matches else None
    if isinstance(text, str) and text.strip():
        keyed_content.append((key, text))
    else:
        print(f"Skipping {key}: no usable text_content")

if not keyed_content:
    raise ValueError("No valid content available for generating embeddings.")

# From here on `keys` lists only documents that have text, aligned 1:1 with `content`
keys = [key for key, _ in keyed_content]
content = [text for _, text in keyed_content]

# Call OpenAI's embedding API
response = openai_client.embeddings.create(
    input=content,
    model="text-embedding-3-large"
)

# Extract embeddings using correct attribute access
embeddings = [item.embedding for item in response.data]

# Each key needs exactly one embedding; fail loudly rather than mis-assign vectors
if len(embeddings) != len(keys):
    raise RuntimeError(
        f"Expected {len(keys)} embeddings but received {len(embeddings)}"
    )

# Convert embeddings to float32 and list format
embeddings_array = np.array(embeddings, dtype=np.float32).tolist()

VECTOR_DIMENSION = len(embeddings_array[0]) if embeddings_array else 0
print(f"Generated {len(embeddings_array)} embeddings with dimension {VECTOR_DIMENSION}")


Generated 21 embeddings with dimension 3072


In [19]:
# zip() silently stops at the shorter input, so a length mismatch would leave
# some documents without a vector (or pair vectors with the wrong keys).
if len(keys) != len(embeddings):
    raise ValueError(
        f"Cannot store embeddings: {len(keys)} keys but {len(embeddings)} embeddings"
    )

# Queue one JSON.SET per document and send them in a single round trip
pipeline = client.pipeline()
for key, embedding in zip(keys, embeddings):
    # Note: the field is named "section_embeddings", not "content_embeddings"
    pipeline.json().set(key, "$.section_embeddings", embedding)
pipeline.execute()


[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

In [20]:
import redis
from redis.commands.search.field import (
    TextField,
    VectorField,
)
from redis.commands.search.indexDefinition import IndexDefinition, IndexType

# A vector field cannot be declared without a known, positive dimension
if not VECTOR_DIMENSION:
    raise ValueError("VECTOR_DIMENSION is 0; generate the embeddings before creating the index.")

# Define the schema fields based on the JSON structure of each section document
schema = (
    TextField("$.url", no_stem=True, as_name="url"),
    TextField("$.title", no_stem=True, as_name="title"),
    TextField("$.text_content", no_stem=True, as_name="text_content"),
    TextField("$.attachments.*", no_stem=True, as_name="attachment_paths"),
    VectorField(
        "$.section_embeddings",  # Field written when the embeddings are stored on each document
        "FLAT",
        {
            "TYPE": "FLOAT32",
            "DIM": VECTOR_DIMENSION,  # Must equal the length of the stored embedding vectors
            "DISTANCE_METRIC": "COSINE",
        },
        as_name="vector"
    )
)

# Define the index: every JSON document whose key starts with "section:"
definition = IndexDefinition(prefix=["section:"], index_type=IndexType.JSON)

# Create the index only if it is not there yet, so re-running this cell does
# not fail with an "index already exists" error.
index = client.ft("idx:section_vss")
try:
    index.info()
except redis.exceptions.ResponseError:
    # info() failed, so treat the index as absent and create it; a genuine
    # server-side problem will resurface from create_index itself.
    res = index.create_index(fields=schema, definition=definition)
else:
    res = None
    print("Index idx:section_vss already exists; skipping creation")

In [21]:
import numpy as np
from redis.commands.search.query import Query

# 1. Encode the query with the same embedding model used for the documents
query_text = "How do I register my vehicle in Virginia?"
query_vector = np.array(
    openai_client.embeddings.create(
        input=query_text,
        model="text-embedding-3-large"
    ).data[0].embedding,
    dtype=np.float32
)

# 2. KNN query over the "vector" field; sorted ascending by vector_score
query = (
    Query('(*)=>[KNN 10 @vector $query_vector AS vector_score]')
    .sort_by('vector_score')
    .return_fields(
        'vector_score',
        'url',         # From TextField("$.url")
        'title',       # From TextField("$.title")
        'text_content',# From TextField("$.text_content")
        'attachment_paths'  # From TextField("$.attachments.*")
    )
    .dialect(2)
)

# 3. Execute search; the vector parameter is passed as raw float32 bytes
result = client.ft('idx:section_vss').search(
    query,
    {'query_vector': query_vector.tobytes()}
)

# 4. Process results with schema-aligned fields.
# A hit that lacks an optional field (e.g. a section with no attachments) does
# not carry that attribute, so read those with a default instead of crashing.
for doc in result.docs:
    url = getattr(doc, 'url', '')
    title = getattr(doc, 'title', '')
    text_content = getattr(doc, 'text_content', '')
    attachment_paths = getattr(doc, 'attachment_paths', '')
    print(f"""
    Score: {doc.vector_score}
    URL: {url}
    Title: {title}
    Content: {text_content}
    Attachments: {attachment_paths}
    """)



    Score: 0.321078240871
    URL: https://www.dmv.virginia.gov/vehicles/registration/first-reg
    Title: First Time Vehicle Registration
    Content: You may register your vehicle at any DMV
customer service center
. DMV will issue you two license plates with decals showing the expiration date and a registration card.
Please follow the process below:
Place the decals on the plates.
Place the plates on the front and rear of your vehicle.
Keep the registration card in your vehicle at all times.
To purchase new Virginia license plates, complete form
VSA-14
.
You may choose to register your vehicle for one or two years. Two-year registration costs twice the annual fee and provides a convenient way for you to register with DMV. The two-year registration option is not available for vehicles with a gross vehicle weight of 55,000 lbs. or more, or vehicles registered under the International Registration Plan.
    Attachments: attachments\vsa14.pdf
    

    Score: 0.328221201897
    URL: htt