# Infinite Conversation Dataset Analysis

This notebook analyzes the Infinite Conversation dataset, which contains simulated conversations between the philosopher Slavoj Žižek and the filmmaker Werner Herzog.

In [12]:
import glob
import json
import re
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

sys.path.append('../..')

In [None]:
# Define the path to the dataset (two directory levels above the current
# working directory, then data/infinite_conversation)
data_dir = Path.cwd().parent.parent / 'data' / 'infinite_conversation'
print(f"Data directory: {data_dir}")

# List all JSON files in the directory.
# The order in which Path.glob yields entries is not guaranteed, so sort the
# paths: this keeps `conversations[0]` and every printed listing reproducible
# across runs and machines.
json_files = sorted(data_dir.glob('conversation_*.json'))
print(f"Found {len(json_files)} conversation files")

In [None]:
# Function to load a conversation file
def load_conversation(file_path):
    """Read one conversation JSON file and return its parsed content.

    Args:
        file_path: Path (str or path-like) of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for that file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of the platform's locale encoding so
    # that non-ASCII text loads identically on every machine.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

# Load every discovered file into a list of {'id', 'data'} records;
# the id is the file name without its extension.
conversations = [
    {'id': json_path.stem, 'data': load_conversation(json_path)}
    for json_path in json_files
]

print(f"Loaded {len(conversations)} conversations")

In [None]:
# Preview the first loaded conversation: its id, then each raw entry
# truncated to 79 characters.
first_conversation = conversations[0]
print("First conversation data:")
print(f"ID: {first_conversation['id']}")
print("\nRaw data:")
for entry_key, entry_text in first_conversation['data'].items():
    preview = entry_text[:79]
    print(f"{entry_key}: {preview}...")

In [None]:
# Function to extract messages from a conversation
def extract_messages(conversation_data):
    """Turn a raw conversation mapping into a time-ordered list of messages.

    Args:
        conversation_data: Mapping from an audio file name such as
            "/slavoj_1704311301.66552.mp3" to a string of the form
            "Speaker: text". Values are assumed to be strings.

    Returns:
        A list of dicts with the keys 'key' (the original mapping key),
        'speaker' and 'text', sorted by the timestamp embedded in 'key'.
        Entries without a ':' get the speaker "Unknown". Entries whose key
        carries no parseable timestamp sort first, in their original order.
    """
    def get_timestamp(item):
        """Return the numeric timestamp embedded in the message key, or 0.0."""
        key = item['key']
        if '_' not in key:
            return 0.0
        # Take the token after the LAST underscore so that name parts that
        # themselves contain underscores (e.g. "/werner_herzog_17.5.mp3")
        # do not hide the timestamp.
        tail = key.rsplit('_', 1)[1]
        # Leading "digits[.digits]" is the timestamp; whatever follows
        # (e.g. ".mp3") is ignored. Non-numeric tails fall back to 0.0
        # instead of raising.
        match = re.match(r'\d+(?:\.\d+)?', tail)
        return float(match.group()) if match else 0.0

    messages = []
    for key, value in conversation_data.items():
        # Split on the first ':' only, so colons inside the text are kept.
        speaker, separator, text = value.partition(':')
        if not separator:
            speaker, text = "Unknown", value

        messages.append({
            'key': key,  # MP3 filename
            'speaker': speaker.strip(),
            'text': text.strip()
        })

    # list.sort is stable, so messages with equal timestamps keep input order.
    messages.sort(key=get_timestamp)

    return messages

# Attach the parsed, time-ordered message list to every conversation record
for conversation in conversations:
    parsed_messages = extract_messages(conversation['data'])
    conversation['messages'] = parsed_messages
    print(f"Conversation {conversation['id']}: {len(parsed_messages)} messages")

In [None]:
# Show the first conversation, one message per row, each text cut to 100 chars
sample_conv = conversations[0]
row_separator = '-' * 79

print(f"Sample conversation: {sample_conv['id']}\n")
print('=' * 79)

for entry in sample_conv['messages']:
    snippet = entry['text'][:100]
    print(f"{entry['speaker']}: {snippet}...")
    print(row_separator)

In [None]:
# Format conversation for display
def format_conversation(messages):
    """Render messages as "speaker: text" paragraphs separated by blank lines.

    Args:
        messages: Iterable of dicts with 'speaker' and 'text' keys.

    Returns:
        A single string; empty when there are no messages.
    """
    return "\n\n".join(
        f"{entry['speaker']}: {entry['text']}" for entry in messages
    )

# Print the first conversation in full, framed by a 79-character banner
banner = '=' * 79
first_conv = conversations[0]

print(f"\n{banner}")
print(f"Full Conversation: {first_conv['id']}")
print(banner)
print(format_conversation(first_conv['messages']))

In [19]:
# Import the SimilarityAnalyzer
from src.babel_ai.analyzer import SimilarityAnalyzer

# Build the SimilarityAnalyzer instance shared by the analysis cells below
ANALYZE_WINDOW = 50  # forwarded as `analyze_window`; exact meaning is defined by SimilarityAnalyzer — TODO confirm
analyzer = SimilarityAnalyzer(analyze_window=ANALYZE_WINDOW)

In [None]:
# Function to extract just the text for analysis
def extract_conversation_text(messages):
    """Return the 'text' field of every message, in order, as a new list."""
    texts = []
    for entry in messages:
        texts.append(entry['text'])
    return texts

# Run the analyzer over every conversation, message by message
results = []
for conv in conversations:
    print(f"\nAnalyzing conversation {conv['id']} with {len(conv['messages'])} messages...")

    texts = extract_conversation_text(conv['messages'])

    # For message k the analyzer receives the conversation prefix up to and
    # including that message.
    per_message = [
        {
            'iteration': position,
            'message': text,
            'analysis': analyzer.analyze(texts[:position + 1]),
        }
        for position, text in enumerate(texts)
    ]

    results.append({
        'conversation_id': conv['id'],
        'metrics': per_message,
    })

In [None]:
# Plot results for each conversation
METRIC_KEYS = ('lexical_similarity', 'semantic_similarity', 'semantic_surprise')

for result in results:
    conv_id = result['conversation_id']
    metrics = result['metrics']

    # Create figure and axis objects
    fig, ax1 = plt.subplots(figsize=(12, 6))

    # Keep only the messages for which ALL three metrics are present and not
    # None. Filtering each series on its own (or by lexical similarity only)
    # can produce x/y lists of different lengths, or values plotted against
    # the wrong message index.
    plottable = [
        m for m in metrics
        if all(key in m['analysis'] and m['analysis'][key] is not None
               for key in METRIC_KEYS)
    ]

    # Extract data (all four lists are aligned index-by-index)
    iterations = [m['iteration'] for m in plottable]
    lexical_sim = [m['analysis']['lexical_similarity'] for m in plottable]
    semantic_sim = [m['analysis']['semantic_similarity'] for m in plottable]
    semantic_surp = [m['analysis']['semantic_surprise'] for m in plottable]

    # Plot similarities
    ax1.set_xlabel('Message Index')
    ax1.set_ylabel('Similarity Score', color='tab:blue')
    ax1.plot(iterations, lexical_sim, label='Lexical Similarity',
             marker='o', color='tab:blue', alpha=0.6)
    ax1.plot(iterations, semantic_sim, label='Semantic Similarity',
             marker='s', color='tab:orange', alpha=0.6)
    ax1.tick_params(axis='y', labelcolor='tab:blue')
    ax1.set_ylim(0, 1)

    # Plot surprise on second y-axis
    ax2 = ax1.twinx()
    ax2.set_ylabel('Surprise Score', color='tab:red')
    ax2.plot(iterations, semantic_surp, label='Semantic Surprise',
             marker='^', color='tab:red', alpha=0.6)
    ax2.tick_params(axis='y', labelcolor='tab:red')

    # Add legends (merge the handles of both axes into a single legend)
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

    # Add grid
    ax1.grid(True, which='major', linestyle='-', alpha=0.5)
    ax1.grid(True, which='minor', linestyle=':', alpha=0.2)

    plt.title(f'Analysis of Conversation {conv_id} ({len(metrics)} messages)')
    plt.tight_layout()
    plt.show()
    # Release the figure so that looping over many conversations does not
    # accumulate open figures in memory.
    plt.close(fig)

# Combining Conversation Fragments into a Continuous Conversation

Based on our observations, the conversation files appear to be fragments of a longer, continuous conversation: each fragment ends with the same message that the next one starts with. In this section, we'll:

1. Identify the correct sequence of conversation fragments
2. Combine them into a single continuous conversation
3. Remove duplicated messages at the fragment boundaries
4. Analyze and plot the resulting continuous conversation

In [None]:
# Combine the conversations into a single continuous conversation
print("Combining conversations into a continuous sequence...")


def _natural_id_key(conv):
    """Sort key that orders conversation ids by their embedded numbers.

    Splits the id into alternating text / digit-run tokens and compares the
    digit runs numerically, so "conversation_2" sorts before
    "conversation_10" (a plain string sort puts it after).
    """
    tokens = re.split(r'(\d+)', conv['id'])
    # With a capturing group, re.split places the digit runs at odd positions.
    return [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]


# Order the fragments by conversation id, in natural (numeric-aware) order
sequence = sorted(conversations, key=_natural_id_key)

continuous_conversation = []

for i, conv in enumerate(sequence):
    if i == 0:
        # Include all messages from the first conversation
        continuous_conversation.extend(conv['messages'])
    else:
        # Skip the first message of subsequent conversations. This relies on
        # the observation that each fragment starts with the message the
        # previous one ended with; the check below reports any duplicates
        # that remain.
        continuous_conversation.extend(conv['messages'][1:])

print(f"Combined conversation has {len(continuous_conversation)} messages")

# Verify there are no duplicated consecutive messages
duplicates = 0
for i in range(1, len(continuous_conversation)):
    prev_msg = continuous_conversation[i-1]
    curr_msg = continuous_conversation[i]

    if prev_msg['text'] == curr_msg['text'] and prev_msg['speaker'] == curr_msg['speaker']:
        duplicates += 1
        print(f"Found duplicate: {prev_msg['speaker']}: {prev_msg['text'][:50]}...")

print(f"Found {duplicates} duplicated consecutive messages")

# Display the beginning and end of the continuous conversation
print("\nFirst 3 messages of the continuous conversation:")
for msg in continuous_conversation[:3]:
    print(f"{msg['speaker']}: {msg['text'][:79]}...")
    print(f"{'-'*79}")

print("\nLast 3 messages of the continuous conversation:")
for msg in continuous_conversation[-3:]:
    print(f"{msg['speaker']}: {msg['text'][:79]}...")
    print(f"{'-'*79}")

In [None]:
# Print the full continuous conversation, wrapped for readability
def _wrap_lines(text, width=79):
    """Yield `text` in chunks of at most `width` characters.

    Each chunk is cut at the last space inside the first `width` characters;
    when there is no space, the cut is made exactly at `width`. Leading
    whitespace of the remainder is dropped before continuing.
    """
    remaining = text
    while remaining:
        if len(remaining) <= width:
            yield remaining
            return
        cut = remaining[:width].rfind(' ')
        if cut == -1:
            cut = width
        yield remaining[:cut]
        remaining = remaining[cut:].lstrip()


print("Full continuous conversation:")
print("="*79)
for msg in continuous_conversation:
    # Speaker on its own line, followed by the wrapped message text
    print(f"{msg['speaker']}:")
    for wrapped_line in _wrap_lines(msg['text']):
        print(wrapped_line)

    print("-"*79)


In [None]:
# Run the analyzer over the combined conversation
print("Analyzing the continuous conversation...")

# Plain message texts, in conversation order
continuous_messages = extract_conversation_text(continuous_conversation)
total_messages = len(continuous_messages)

# One record per message; the analyzer sees the prefix ending at that message
continuous_metrics = []
for position in range(total_messages):
    # Progress line every 50 messages
    if position % 50 == 0:
        print(f"Analyzing message {position+1}/{total_messages}...")
    prefix = continuous_messages[:position + 1]
    continuous_metrics.append({
        'iteration': position,
        'message': continuous_messages[position],
        'analysis': analyzer.analyze(prefix),
    })

# Package the result in the same shape as the per-conversation results
continuous_result = {
    'conversation_id': 'continuous_conversation',
    'metrics': continuous_metrics,
}

print(f"Completed analysis of {len(continuous_metrics)} messages")

In [None]:
# Plot results for the continuous conversation
# Create figure and axis objects
fig, ax1 = plt.subplots(figsize=(15, 8))

# Keep only the messages for which ALL three metrics are present and not None,
# so that `iterations` and the three value lists stay aligned index-by-index.
# (These four names are also read by the smoothing cell further down.)
METRIC_KEYS = ('lexical_similarity', 'semantic_similarity', 'semantic_surprise')
complete_metrics = [
    m for m in continuous_metrics
    if all(key in m['analysis'] and m['analysis'][key] is not None
           for key in METRIC_KEYS)
]

# Extract data
iterations = [m['iteration'] for m in complete_metrics]
lexical_sim = [m['analysis']['lexical_similarity'] for m in complete_metrics]
semantic_sim = [m['analysis']['semantic_similarity'] for m in complete_metrics]
semantic_surp = [m['analysis']['semantic_surprise'] for m in complete_metrics]

# Plot similarities
ax1.set_xlabel('Message Index')
ax1.set_ylabel('Similarity Score', color='tab:blue')
ax1.plot(iterations, lexical_sim, label='Lexical Similarity',
         marker='o', color='tab:blue', alpha=0.6, markersize=3)
ax1.plot(iterations, semantic_sim, label='Semantic Similarity',
         marker='s', color='tab:orange', alpha=0.6, markersize=3)
ax1.tick_params(axis='y', labelcolor='tab:blue')
ax1.set_ylim(0, 1)

# Mark fragment boundaries with vertical lines.
# A boundary is the index (in the combined conversation) of the last message
# contributed by a fragment. The first fragment contributes all its messages;
# every later fragment contributes one fewer, because its first message was
# dropped as a duplicate when the fragments were combined.
boundary_indices = []
msg_count = 0
for position, conv in enumerate(sequence[:-1]):  # All except the last conversation
    fragment_len = len(conv['messages'])
    contributed = fragment_len if position == 0 else max(fragment_len - 1, 0)
    msg_count += contributed
    boundary_indices.append(msg_count - 1)  # -1 because we're 0-indexed

plotted_indices = set(iterations)  # set for fast membership tests
for idx in boundary_indices:
    if idx in plotted_indices:
        ax1.axvline(x=idx, color='gray', linestyle='--', alpha=0.7)

# Plot surprise on second y-axis
ax2 = ax1.twinx()
ax2.set_ylabel('Surprise Score', color='tab:red')
ax2.plot(iterations, semantic_surp, label='Semantic Surprise',
         marker='^', color='tab:red', alpha=0.6, markersize=3)
ax2.tick_params(axis='y', labelcolor='tab:red')

# Add legends (merge the handles of both axes into a single legend)
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

# Add grid
ax1.grid(True, which='major', linestyle='-', alpha=0.5)
ax1.grid(True, which='minor', linestyle=':', alpha=0.2)

plt.title('Analysis of Combined Continuous Conversation')
plt.tight_layout()
plt.show()

In [None]:
# Smooth the similarity and surprise series with a Savitzky-Golay filter and plot them
import numpy as np
from scipy.signal import savgol_filter

# Choose the smoothing window: about a tenth of the series, capped at 20,
# at least 3, and forced to be odd.
window_size = min(20, len(lexical_sim) // 10)  # Adaptive window size
if window_size < 3:
    window_size = 3  # Minimum window size
if window_size % 2 == 0:
    window_size += 1  # Ensure odd window size for Savitzky-Golay

# Use Savitzky-Golay filter for smoothing
poly_order = min(3, window_size - 1)  # Polynomial order must be less than window size


def _smooth(values):
    """Return the Savitzky-Golay smoothed series.

    savgol_filter (default mode) rejects a series shorter than the window,
    so such a series is returned unsmoothed instead of raising.
    """
    if len(values) < window_size:
        return list(values)
    return savgol_filter(values, window_size, poly_order)


smoothed_lexical = _smooth(lexical_sim)
smoothed_semantic = _smooth(semantic_sim)
smoothed_surprise = _smooth(semantic_surp)

# Create the plot
fig, ax1 = plt.subplots(figsize=(15, 8))

# Plot smoothed similarities
ax1.set_xlabel('Message Index')
ax1.set_ylabel('Smoothed Similarity Score', color='tab:blue')
ax1.plot(iterations, smoothed_lexical, label='Smoothed Lexical Similarity',
         color='tab:blue', linewidth=2)
ax1.plot(iterations, smoothed_semantic, label='Smoothed Semantic Similarity',
         color='tab:orange', linewidth=2)
ax1.tick_params(axis='y', labelcolor='tab:blue')
ax1.set_ylim(0, 1)

# Mark fragment boundaries with vertical lines
for idx in boundary_indices:
    if idx in iterations:
        ax1.axvline(x=idx, color='gray', linestyle='--', alpha=0.7)

# Plot smoothed surprise on second y-axis
ax2 = ax1.twinx()
ax2.set_ylabel('Smoothed Surprise Score', color='tab:red')
ax2.plot(iterations, smoothed_surprise, label='Smoothed Semantic Surprise',
         color='tab:red', linewidth=2)
ax2.tick_params(axis='y', labelcolor='tab:red')

# Add legends (merge the handles of both axes into a single legend)
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

# Add grid
ax1.grid(True, which='major', linestyle='-', alpha=0.5)
ax1.grid(True, which='minor', linestyle=':', alpha=0.2)

plt.title(f'Smoothed Analysis of Combined Conversation (Window Size: {window_size})')
plt.tight_layout()
plt.show()

## Key Findings from the Continuous Conversation Analysis

1. **Message Distribution**: The combined conversation contains messages across multiple fragments, with vertical lines in the plot marking the boundaries between original conversation files.

2. **Similarity Patterns**: The smoothed plots reveal trends in lexical and semantic similarity that might be obscured in the individual fragments.

3. **Surprise Metrics**: The semantic surprise metric shows how unexpected each message is compared to previous ones. High surprise values indicate topic shifts or unexpected statements in the conversation.

4. **Conversation Flow**: Observing the metrics across the full conversation provides insights into how the dialogue progresses and evolves over time, potentially revealing patterns that weren't visible in the fragmented analysis.