In [65]:
def convert_backslashes(input_string):
    """Return ``input_string`` with every backslash swapped for a forward slash."""
    # Splitting on the backslash and re-joining with "/" rewrites each separator.
    return "/".join(input_string.split("\\"))


# Windows-style relative path to the data, normalised to forward slashes.
data_path = convert_backslashes(r"dmv_site_data_v3\vehicles\registration")

In [66]:
import redis

# Client for the Redis Stack instance on the local machine (default port);
# decode_responses=True makes replies come back as str instead of bytes.
client = redis.Redis(host='127.0.0.1', port=6379, decode_responses=True)

# Smoke-test the connection with a PING and report the outcome.
try:
    response = client.ping()
except redis.exceptions.ConnectionError as e:
    print(f"Connection to Redis failed: {e}")
else:
    print("Connection to Redis successful!")
    print("Redis PING response:", response)

Connection to Redis successful!
Redis PING response: True


In [67]:
import json 
from pathlib import Path
def load_jsons_to_redis(directory_path, client, key_prefix=None):
    """
    Load all JSON files from a directory into Redis

    Each ``*.json`` file may contain a single JSON object or an array of
    objects. Every object ("section") is queued on one pipeline as its own
    JSON document under the key ``<key_prefix>:<NNNNN>``, numbered from 00001
    across all files. Files are processed in sorted path order so the
    numbering is reproducible between runs.

    Args:
        directory_path (str): Path to directory containing JSON files
        client: Redis client instance
        key_prefix (str, optional): Prefix for the generated keys. Defaults to
            ``directory_path`` with backslashes replaced by forward slashes.

    Returns:
        tuple: (number of successful insertions, list of failed files).
            Each failed entry is a ``(file name, reason)`` tuple. The count is
            0 when nothing was queued or when pipeline execution failed.

    Raises:
        ValueError: If ``directory_path`` is not an existing directory.
    """
    # Convert to Path object for better path handling
    json_dir = Path(directory_path)

    # Ensure directory exists
    if not json_dir.is_dir():
        raise ValueError(f"Directory not found: {directory_path}")

    # Derive the key prefix from the argument rather than from a notebook global
    if key_prefix is None:
        key_prefix = str(directory_path).replace("\\", "/")

    pipeline = client.pipeline()
    section_count = 0
    failed_files = []

    # glob() order is filesystem-dependent; sort for stable key numbering
    for json_file in sorted(json_dir.glob('*.json')):
        try:
            # Binary mode lets json detect the UTF-8/16/32 encoding itself
            # instead of relying on the platform's locale encoding.
            with open(json_file, 'rb') as file:
                sections = json.load(file)

            # Handle both single objects and arrays of objects
            if isinstance(sections, dict):
                sections = [sections]
            elif not isinstance(sections, list):
                # A bare string would otherwise be iterated character by character
                raise TypeError(
                    f"Expected a JSON object or array, got {type(sections).__name__}"
                )

            # Count per queued command so key numbers can never be reused,
            # even if a later section of this file raises.
            for section in sections:
                redis_key = f"{key_prefix}:{section_count + 1:05}"
                pipeline.json().set(redis_key, "$", section)
                section_count += 1

        except json.JSONDecodeError as e:
            print(f"Invalid JSON in file {json_file.name}: {e}")
            failed_files.append((json_file.name, "Invalid JSON"))
        except Exception as e:
            print(f"Error processing file {json_file.name}: {e}")
            failed_files.append((json_file.name, str(e)))

    # Nothing queued: distinguish "no input" from "every input failed"
    if section_count == 0:
        if failed_files:
            print("No documents loaded: every JSON file failed to process")
        else:
            print("No JSON files found in directory")
        return 0, failed_files

    try:
        res = pipeline.execute()
        print(f"Inserted {len(res)} documents successfully!")
    except Exception as e:
        print(f"Redis pipeline execution error: {e}")
        return 0, failed_files

    return section_count, failed_files

# Load the JSON files under data_path into Redis. As the cell's last
# expression, the returned (count, failed_files) tuple is displayed.
load_jsons_to_redis(data_path, client)

Inserted 21 documents successfully!


(21, [])

In [68]:
import numpy as np
from openai import OpenAI
import os
from dotenv import load_dotenv


# NOTE(review): absolute, machine-specific .env location; adjust it when
# running this notebook anywhere else.
load_dotenv(dotenv_path="c:/Users/benja/startup_projects/civgen/.env")
openai_client = OpenAI(
  api_key=os.getenv("OPENAI_API_KEY")
)

# Retrieve keys from Redis (sorted so the order is stable between runs)
candidate_keys = sorted(client.keys(f"{data_path}:*"))
if not candidate_keys:
    # An empty key list cannot be passed to mget; fail with a clear message
    raise ValueError(f"No Redis keys found matching '{data_path}:*'.")

# One reply entry per requested key, in the same order as candidate_keys
raw_content = client.json().mget(candidate_keys, "$.text_content")

# Filter keys and texts TOGETHER. `keys` is paired positionally with the
# embeddings later on, so dropping a text without dropping its key would
# attach every following embedding to the wrong document.
keys = []
content = []
for key, matches in zip(candidate_keys, raw_content):
    # A "$" path yields a list of matches; a missing key/path yields None or []
    text = matches[0] if isinstance(matches, list) and matches else None
    if isinstance(text, str) and text.strip():
        keys.append(key)
        content.append(text)
    else:
        print(f"Skipping {key}: no usable text_content")

if not content:
    raise ValueError("No valid content available for generating embeddings.")

# Call OpenAI's embedding API
response = openai_client.embeddings.create(
    input=content,
    model="text-embedding-3-large"
)

# Extract embeddings using correct attribute access
embeddings = [item.embedding for item in response.data]

# Every retained key must have exactly one embedding
if len(embeddings) != len(keys):
    raise RuntimeError(
        f"Expected {len(keys)} embeddings but received {len(embeddings)}"
    )

# Convert embeddings to float32 and list format
embeddings_array = np.array(embeddings, dtype=np.float32).tolist()

VECTOR_DIMENSION = len(embeddings_array[0]) if embeddings_array else 0
print(f"Generated {len(embeddings_array)} embeddings with dimension {VECTOR_DIMENSION}")


Generated 21 embeddings with dimension 3072


In [69]:
# zip() stops at the shorter sequence, so a length mismatch would silently
# attach embeddings to the wrong documents. Fail loudly instead.
if len(keys) != len(embeddings):
    raise ValueError(
        f"Cannot attach embeddings: {len(keys)} keys but {len(embeddings)} embeddings"
    )

pipeline = client.pipeline()
for key, embedding in zip(keys, embeddings):
    # Note: the field is named "section_embeddings", not "content_embeddings"
    pipeline.json().set(key, "$.section_embeddings", embedding)
pipeline.execute()


[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

In [70]:
import redis
from redis.commands.search.field import (
    TextField,
    VectorField,
)
from redis.commands.search.indexDefinition import IndexDefinition, IndexType

# Define the schema fields based on the stored JSON structure
schema = (
    TextField("$.url", no_stem=True, as_name="url"),
    TextField("$.title", no_stem=True, as_name="title"),
    TextField("$.text_content", no_stem=True, as_name="text_content"),
    TextField("$.attachments.*", no_stem=True, as_name="attachment_paths"),
    VectorField(
        "$.section_embeddings",  # Written to each document by the embedding cell
        "FLAT",
        {
            "TYPE": "FLOAT32",
            "DIM": VECTOR_DIMENSION,  # Dimension measured from the generated embeddings
            "DISTANCE_METRIC": "COSINE",
        },
        as_name="vector"
    )
)

# Define the index: JSON documents whose key starts with "<data_path>:"
index_name = f"idx:{data_path}_vss"
definition = IndexDefinition(prefix=[f"{data_path}:"], index_type=IndexType.JSON)

# Create the index. Redis keeps indexes across kernel restarts, so re-running
# this cell must tolerate an index that is already there.
try:
    res = client.ft(index_name).create_index(fields=schema, definition=definition)
except redis.exceptions.ResponseError as e:
    if "Index already exists" not in str(e):
        raise
    # If the schema or vector dimension changed, drop the index first with
    # client.ft(index_name).dropindex() and re-run this cell.
    res = None
    print(f"Index {index_name} already exists; leaving it unchanged")

In [71]:
import numpy as np
from redis.commands.search.query import Query

# 1. Encode the query with the same model used for the stored embeddings
query_text = "How do I register a low powered bike?"
query_vector = np.array(
    openai_client.embeddings.create(
        input=query_text,
        model="text-embedding-3-large"
    ).data[0].embedding, 
    dtype=np.float32
)

# 2. KNN query over the "vector" field, returning the schema's aliased fields
query = (
    Query('(*)=>[KNN 5 @vector $query_vector AS vector_score]')
    .sort_by('vector_score')  # ascending: smaller cosine distance = closer match
    .return_fields(
        'vector_score', 
        'url',         # From TextField("$.url")
        'title',       # From TextField("$.title")
        'text_content',# From TextField("$.text_content")
        'attachment_paths'  # From TextField("$.attachments.*")
    )
    .dialect(2)
)

# 3. Execute search; the vector parameter is passed as raw float32 bytes
result = client.ft(f'idx:{data_path}_vss').search(
    query,
    {'query_vector': query_vector.tobytes()}
)

# 4. Process results with schema-aligned fields
MISSING_FIELD = "N/A"
for doc in result.docs:
    # A field the document does not have (e.g. no attachments) is omitted from
    # the reply and never becomes an attribute on the result object, so read
    # each one with a fallback instead of raising AttributeError.
    score = getattr(doc, 'vector_score', MISSING_FIELD)
    url = getattr(doc, 'url', MISSING_FIELD)
    title = getattr(doc, 'title', MISSING_FIELD)
    text_content = getattr(doc, 'text_content', MISSING_FIELD)
    attachment_paths = getattr(doc, 'attachment_paths', MISSING_FIELD)
    print(f"""
    Score: {score}
    URL: {url}
    Title: {title}
    Content: {text_content}
    Attachments: {attachment_paths}
    """)



    Score: 0.569419503212
    URL: https://www.dmv.virginia.gov/vehicles/registration/moped
    Title: All About Mopeds
    Content: moped, Virginia, DMV, rules of the road, age 16, government-issued photo ID, helmet, face shield, safety glasses, goggles, interstate, license suspended, revoked, DUI, high-occupancy toll lanes, toll violations, title, register, ownership documents, title fee, annual registration fee, Moped Certification, Sales and Use Tax, SUT, safety tips, training course
    Attachments: attachments\vsa31.pdf
    

    Score: 0.61673438549
    URL: https://www.dmv.virginia.gov/vehicles/registration/alternative-vehicles
    Title: Electric Vehicles
    Content: Electric vehicles, Virginia DMV, registration, highway use fee, annual fee, renewal, clean special fuel plates, titling, converted electric vehicle, registration fees, online renewal, highway use, special fuel vehicles, plate options, original registration, electric motor vehicle, clean energy, DMV policies, fee