# Infinite Conversation Dataset Analysis

This notebook analyzes the Infinite Conversation dataset, which contains simulated conversations between public figures such as the philosopher Slavoj Žižek and the filmmaker Werner Herzog.

In [2]:
import pandas as pd
import json
import glob
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('..')

In [None]:
# Define the path to the dataset: <cwd>/../../data/infinite_conversation
data_dir = Path.cwd().parent.parent / 'data' / 'infinite_conversation'
print(f"Data directory: {data_dir}")

# List all JSON files in the directory.
# glob() does not guarantee any ordering, so sort the paths: this keeps
# index-based lookups such as `json_files[0]` reproducible between runs and
# machines. NOTE: the sort is lexicographic on the path, so
# "conversation_10" comes before "conversation_2".
json_files = sorted(data_dir.glob('conversation_*.json'))
print(f"Found {len(json_files)} conversation files")

In [None]:
# Function to load a conversation file
def load_conversation(file_path):
    """Load one conversation JSON file and return its parsed content.

    Args:
        file_path: Path (string or path-like) of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII text in the conversations does
    # not depend on the platform's locale default encoding.
    with open(file_path, encoding='utf-8') as f:
        return json.load(f)

# Read every discovered file into a {'id', 'data'} record
conversations = []
for file_path in json_files:
    # The record id is the file name without its extension
    conv_id, data = file_path.stem, load_conversation(file_path)
    conversations.append(dict(id=conv_id, data=data))

print(f"Loaded {len(conversations)} conversations")

In [None]:
# Preview the first loaded conversation: its id, then the first 79
# characters of every raw entry
print("First conversation data:")
print(f"ID: {conversations[0]['id']}")
print("\nRaw data:")
raw_entries = conversations[0]['data']
for key in raw_entries:
    value = raw_entries[key]
    print(f"{key}: {value[:79]}...")

In [None]:
# Function to extract messages from a conversation
import re

# A key segment that starts with "<digits>" or "<digits>.<digits>" and is then
# followed by a dot (e.g. the file extension) or the end of the segment, such
# as the "1704311301.66552" in "1704311301.66552.mp3".
_TIMESTAMP_RE = re.compile(r'\d+(?:\.\d+)?(?=\.|$)')


def _timestamp_from_key(key):
    """Return the numeric timestamp embedded in a message key, or 0 if none.

    Keys look like "/slavoj_1704311301.66552.mp3". Every "_"-separated
    segment after the first is inspected, so a name that itself contains an
    underscore ("/werner_herzog_170....mp3") and a timestamp without a
    fractional part ("/slavoj_1704311301.mp3") are both handled instead of
    sorting as 0 or raising ValueError.
    """
    for segment in key.split('_')[1:]:
        match = _TIMESTAMP_RE.match(segment)
        if match:
            return float(match.group())
    return 0


def extract_messages(conversation_data):
    """Turn a raw conversation mapping into a time-ordered list of messages.

    Args:
        conversation_data: Mapping of message key (the MP3 file name) to a
            string, normally of the form "Speaker: text".

    Returns:
        A list of dicts with the keys 'key', 'speaker' and 'text', sorted by
        the timestamp found in each key. Values without a ':' get the
        speaker "Unknown". Keys without a recognizable timestamp sort as 0
        and keep their original relative order (the sort is stable).
    """
    messages = []
    for key, value in conversation_data.items():
        # Split on the first ':' only, so colons inside the text survive.
        speaker, separator, text = value.partition(':')
        if not separator:
            speaker, text = "Unknown", value

        messages.append({
            'key': key,  # MP3 filename
            'speaker': speaker.strip(),
            'text': text.strip(),
        })

    # Sort messages by the timestamp extracted from their key
    messages.sort(key=lambda item: _timestamp_from_key(item['key']))

    return messages

# Attach the extracted message list to every loaded conversation
for conv in conversations:
    conv.update(messages=extract_messages(conv['data']))
    print(f"Conversation {conv['id']}: {len(conv['messages'])} messages")

In [None]:
# Show the first conversation as a preview: header, then the first 100
# characters of each message followed by a separator line
sample_conv = conversations[0]
print(f"Sample conversation: {sample_conv['id']}\n", f"{'='*79}", sep="\n")

for msg in sample_conv['messages']:
    print(
        f"{msg['speaker']}: {msg['text'][:100]}...",
        f"{'-'*79}",
        sep="\n",
    )

In [None]:
# Format conversation for display
def format_conversation(messages):
    """Render messages as "speaker: text" paragraphs.

    Args:
        messages: Iterable of dicts with 'speaker' and 'text' entries.

    Returns:
        One string with the paragraphs separated by a blank line; an empty
        string when there are no messages.
    """
    return "\n\n".join(f"{msg['speaker']}: {msg['text']}" for msg in messages)

# Print the complete first conversation beneath a ruled header
print(
    f"\n{'='*79}",
    f"Full Conversation: {conversations[0]['id']}",
    f"{'='*79}",
    sep="\n",
)
print(format_conversation(conversations[0]['messages']))

In [None]:
# Import the SimilarityAnalyzer
from src.babel_ai.analyzer import SimilarityAnalyzer

# Create the SimilarityAnalyzer instance, passing analyze_window=50
analyzer = SimilarityAnalyzer(analyze_window=50)

In [None]:
# Function to extract just the text for analysis
def extract_conversation_text(messages):
    """Return the 'text' entry of each message, in the given order."""
    texts = []
    for msg in messages:
        texts.append(msg['text'])
    return texts

# Run the analyzer over every conversation and collect per-message metrics
results = []
for conv in conversations:
    print(f"\nAnalyzing conversation {conv['id']} with {len(conv['messages'])} messages...")

    # Plain list of message texts, in conversation order
    messages = extract_conversation_text(conv['messages'])

    # One entry per message; each analysis receives the conversation up to
    # and including that message
    metrics = [
        {
            'iteration': i,
            'message': message,
            'analysis': analyzer.analyze(messages[:i+1]),
        }
        for i, message in enumerate(messages)
    ]

    results.append({'conversation_id': conv['id'], 'metrics': metrics})

In [None]:
# Plot results for each conversation
def _metric_series(metrics, name):
    """Return ``(x, y)`` lists for metric *name*, skipping missing values.

    Args:
        metrics: List of dicts with an 'iteration' index and an 'analysis'
            container of metric values.
        name: Key of the metric to read from each 'analysis'.

    Returns:
        Two equally long lists: the iteration indices at which the metric is
        present and not None, and the corresponding metric values.
    """
    xs, ys = [], []
    for m in metrics:
        analysis = m['analysis']
        value = analysis[name] if name in analysis else None
        if value is not None:
            xs.append(m['iteration'])
            ys.append(value)
    return xs, ys


for result in results:
    conv_id = result['conversation_id']
    metrics = result['metrics']

    # Create figure and axis objects
    fig, ax1 = plt.subplots(figsize=(12, 6))

    # Extract data. Every metric carries its own x values: the three metrics
    # may be missing at different messages, and sharing one filtered x list
    # would give x and y of different lengths (or misaligned points).
    lexical_x, lexical_sim = _metric_series(metrics, 'lexical_similarity')
    semantic_x, semantic_sim = _metric_series(metrics, 'semantic_similarity')
    surprise_x, semantic_surp = _metric_series(metrics, 'semantic_surprise')

    # Plot similarities
    ax1.set_xlabel('Message Index')
    ax1.set_ylabel('Similarity Score', color='tab:blue')
    ax1.plot(lexical_x, lexical_sim, label='Lexical Similarity',
             marker='o', color='tab:blue', alpha=0.6)
    ax1.plot(semantic_x, semantic_sim, label='Semantic Similarity',
             marker='s', color='tab:orange', alpha=0.6)
    ax1.tick_params(axis='y', labelcolor='tab:blue')
    ax1.set_ylim(0, 1)

    # Plot surprise on second y-axis
    ax2 = ax1.twinx()
    ax2.set_ylabel('Surprise Score', color='tab:red')
    ax2.plot(surprise_x, semantic_surp, label='Semantic Surprise',
             marker='^', color='tab:red', alpha=0.6)
    ax2.tick_params(axis='y', labelcolor='tab:red')

    # Add one combined legend for the lines of both axes
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

    # Add grid
    ax1.grid(True, which='major', linestyle='-', alpha=0.5)
    ax1.grid(True, which='minor', linestyle=':', alpha=0.2)

    plt.title(f'Analysis of Conversation {conv_id} ({len(metrics)} messages)')
    plt.tight_layout()
    plt.show()