In [1]:
import json
import pprint
from pathlib import Path

# Method 1: Basic JSON loading
def load_and_explore_json(file_path):
    """Load JSON file and explore its structure"""
    
    # Load the JSON file
    with open(file_path, 'r') as file:
        data = json.load(file)
    
    # Basic exploration
    print(f"File: {file_path}")
    print(f"Type: {type(data)}")
    print(f"Size: {len(data) if hasattr(data, '__len__') else 'N/A'}")
    
    return data

# Method 2: Pretty print structure
def explore_json_structure(data, max_depth=2, current_depth=0):
    """Recursively explore JSON structure"""
    
    if current_depth > max_depth:
        return "..."
    
    if isinstance(data, dict):
        result = {}
        for key, value in data.items():
            if isinstance(value, (dict, list)):
                result[key] = f"{type(value).__name__}({len(value)} items)"
            else:
                result[key] = f"{type(value).__name__}: {str(value)[:50]}..."
        return result
    
    elif isinstance(data, list):
        if len(data) == 0:
            return "Empty list"
        elif len(data) == 1:
            return [explore_json_structure(data[0], max_depth, current_depth + 1)]
        else:
            return [
                explore_json_structure(data[0], max_depth, current_depth + 1),
                f"... and {len(data) - 1} more items"
            ]
    
    else:
        return f"{type(data).__name__}: {str(data)[:50]}..."

# Example usage:
file_path = "data/conversations.json"

# Load the data
data = load_and_explore_json(file_path)

# Explore structure
print("\n=== JSON Structure ===")
structure = explore_json_structure(data)
pprint.pprint(structure, width=80, depth=3)

# If it's a list, show first few items
if isinstance(data, list):
    print(f"\n=== First item details ===")
    if len(data) > 0:
        pprint.pprint(data[0], width=80, depth=2)

# If it's a dict, show keys and sample values
elif isinstance(data, dict):
    print(f"\n=== Dictionary keys ===")
    for key, value in data.items():
        print(f"{key}: {type(value).__name__}")
        if isinstance(value, (list, dict)) and len(value) > 0:
            print(f"  Sample: {str(value)[:100]}...")

File: data/conversations.json
Type: <class 'list'>
Size: 690

=== JSON Structure ===
[{'account': 'dict(1 items)',
  'chat_messages': 'list(8 items)',
  'created_at': 'str: 2024-05-28T13:05:48.783430Z...',
  'name': 'str: Monetizing Creativity with Blockchain AGI...',
  'updated_at': 'str: 2024-05-28T13:12:45.115241Z...',
  'uuid': 'str: b4fa5efd-9e1c-4e4b-a8fe-ae1dba7ef312...'},
 '... and 689 more items']

=== First item details ===
{'account': {'uuid': '2fc29045-9a0b-488f-af46-48e235f655ea'},
 'chat_messages': [{...}, {...}, {...}, {...}, {...}, {...}, {...}, {...}],
 'created_at': '2024-05-28T13:05:48.783430Z',
 'name': 'Monetizing Creativity with Blockchain AGI',
 'updated_at': '2024-05-28T13:12:45.115241Z',
 'uuid': 'b4fa5efd-9e1c-4e4b-a8fe-ae1dba7ef312'}


# Parse into Llamaindex document

In [7]:
from typing import List, Dict, Any

from llama_index.core import Document

def create_documents_per_turn(conversations_data: List[Dict]) -> List[Document]:
    """
    Create one document for each turn/message in conversations.
    
    Args:
        conversations_data: List of conversation dictionaries from JSON
        
    Returns:
        List of LlamaIndex Document objects (one per message)
    """
    docs = []
    
    for conversation in conversations_data:
        thread_id = conversation.get('uuid', 'unknown')
        conversation_name = conversation.get('name', 'Untitled Conversation')
        conversation_created = conversation.get('created_at')
        
        # Process each message as a separate document
        for turn_idx, message in enumerate(conversation.get('chat_messages', [])):
            # Extract text content from message
            text_content = ""
            content = message.get('content', [])
            
            if isinstance(content, list):
                text_parts = []
                for content_item in content:
                    if isinstance(content_item, dict) and 'text' in content_item:
                        text_parts.append(content_item['text'])
                text_content = "\n".join(text_parts)
            elif isinstance(content, str):
                text_content = content
            elif isinstance(content, dict) and 'text' in content:
                text_content = content['text']
            
            # Only create document if there's actual text content
            if text_content.strip():
                docs.append(
                    Document(
                        text=text_content,
                        metadata={
                            "thread": thread_id,
                            "thread_name": conversation_name,
                            "role": message.get('sender', 'unknown'),  # "human" / "assistant"
                            "turn_index": turn_idx,
                            "conversation_created": conversation_created,
                            "message_created": message.get('created_at'),
                            "source": "conversations.json"
                        }
                    )
                )
    
    print(f"Created {len(docs)} documents from individual turns")
    return docs

# Create documents - one per turn/message
docs = create_documents_per_turn(data)

# Show statistics
if docs:
    print(f"\nDocument Statistics:")
    print(f"Total documents: {len(docs)}")
    
    # Count by role
    role_counts = {}
    for doc in docs:
        role = doc.metadata.get('role', 'unknown')
        role_counts[role] = role_counts.get(role, 0) + 1
    
    print(f"Messages by role: {role_counts}")
    
    # Show sample document
    print(f"\nSample document:")
    sample_doc = docs[0]
    print(f"Role: {sample_doc.metadata['role']}")
    print(f"Thread: {sample_doc.metadata['thread_name']}")
    print(f"Text length: {len(sample_doc.text)}")
    print(f"Text preview: {sample_doc.text[:200]}...")
    print(f"Full metadata: {sample_doc.metadata}")

Created 11284 documents from individual turns

Document Statistics:
Total documents: 11284
Messages by role: {'human': 5655, 'assistant': 5629}

Sample document:
Role: human
Thread: Monetizing Creativity with Blockchain AGI
Text length: 215
Text preview: Right now can I scope this down or pick a single focused that’s in this area I can turn into a Soloprenuer or small startup.

I was thinking exploring LLms for writers and using ai to define and own a...
Full metadata: {'thread': 'b4fa5efd-9e1c-4e4b-a8fe-ae1dba7ef312', 'thread_name': 'Monetizing Creativity with Blockchain AGI', 'role': 'human', 'turn_index': 0, 'conversation_created': '2024-05-28T13:05:48.783430Z', 'message_created': '2024-05-28T13:07:32.757438Z', 'source': 'conversations.json'}


In [4]:
data[0]

{'uuid': 'b4fa5efd-9e1c-4e4b-a8fe-ae1dba7ef312',
 'name': 'Monetizing Creativity with Blockchain AGI',
 'created_at': '2024-05-28T13:05:48.783430Z',
 'updated_at': '2024-05-28T13:12:45.115241Z',
 'account': {'uuid': '2fc29045-9a0b-488f-af46-48e235f655ea'},
 'chat_messages': [{'uuid': 'b458f43f-95cb-4805-83b7-7266f04103ef',
   'text': 'Right now can I scope this down or pick a single focused that’s in this area I can turn into a Soloprenuer or small startup.\n\nI was thinking exploring LLms for writers and using ai to define and own a writing niche.',
   'content': [{'start_timestamp': '2024-05-28T13:07:32.757438Z',
     'stop_timestamp': '2024-05-28T13:07:32.757438Z',
     'type': 'text',
     'text': 'Right now can I scope this down or pick a single focused that’s in this area I can turn into a Soloprenuer or small startup.\n\nI was thinking exploring LLms for writers and using ai to define and own a writing niche.',
     'citations': []}],
   'sender': 'human',
   'created_at': '2024

In [9]:
docs[0]

Document(id_='200033be-6255-4d84-ba83-25fb9634fd46', embedding=None, metadata={'thread': 'b4fa5efd-9e1c-4e4b-a8fe-ae1dba7ef312', 'thread_name': 'Monetizing Creativity with Blockchain AGI', 'role': 'human', 'turn_index': 0, 'conversation_created': '2024-05-28T13:05:48.783430Z', 'message_created': '2024-05-28T13:07:32.757438Z', 'source': 'conversations.json'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Right now can I scope this down or pick a single focused that’s in this area I can turn into a Soloprenuer or small startup.\n\nI was thinking exploring LLms for writers and using ai to define and own a writing niche.', path=None, url=None, mimetype=None), image_resource=None, audio_resource=None, video_resource=None, text_template='{metadata_str}\n\n{content}')

# Embed and store in DuckDB

In [10]:
import duckdb, datetime, uuid


db = duckdb.connect("augmcp_v0.duckdb")
db.execute("""
CREATE TABLE IF NOT EXISTS raw_chunks(
  chunk_id  TEXT PRIMARY KEY,
  thread    TEXT,
  role      TEXT,
  ts_ingest TIMESTAMP,
  content   TEXT,
  embedding DOUBLE[]
)
""")


<duckdb.duckdb.DuckDBPyConnection at 0x12583b0f0>

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
from tqdm import tqdm

from dotenv import load_dotenv
load_dotenv("/Users/chris/repos/openaugi/keys.env")

embedder = OpenAIEmbedding(model="text-embedding-3-small")

for d in tqdm(docs):
    emb = embedder.get_text_embedding(d.text)
    db.execute(
        "INSERT INTO raw_chunks VALUES (?,?,?,?,?,?)",
        [str(uuid.uuid4()),
         d.metadata["thread"],
         d.metadata["role"],
         datetime.datetime.fromisoformat(d.metadata["message_created"]),
         d.text,
         emb]
    )

  8%|▊         | 857/11284 [03:33<1:21:33,  2.13it/s]