# ShareGPT Dataset Preview

This notebook loads the cleaned ShareGPT dataset into a pandas DataFrame, previews conversations with more than 100 messages, and plots similarity metrics for a few of them.

In [1]:
import ast
import json
from pathlib import Path

import pandas as pd

In [None]:
import sys
# Add the parent of the working directory to the module search path
# (presumably so the project-local `src` package can be imported — TODO confirm).
sys.path.append('..')

# Last expression of the cell: shows the current working directory, which the
# relative paths used in this notebook are resolved against.
Path.cwd()

In [None]:
# Define the path to the dataset (resolved relative to the current working directory)
data_path = Path.cwd().parent.parent / 'data' / 'human-ai_datasets' / 'sharegpt_clean.json'

# A conversation is kept only if it has MORE than this many messages
MIN_MESSAGES = 100

# json.load parses the entire file into memory; nothing is read partially here.
# The encoding is pinned so parsing does not depend on the platform's locale.
with open(data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Pull out each record's message list, then keep only the long conversations
items = [d['items'] for d in data]
items = [item for item in items if len(item) > MIN_MESSAGES]

# Convert to DataFrame: one row per conversation, one column per message position;
# conversations shorter than the longest one are padded with missing values
df = pd.DataFrame(items)

# Display the first few rows (not the whole frame)
print(f"Total rows loaded: {len(df)}")
display(df.head())

In [None]:
def format_conversation(row):
    """Render one conversation as a readable "Role: text" transcript.

    Args:
        row: Iterable of messages (for example one DataFrame row). Each
            message is a dict with 'from' and 'value' keys, or the string
            representation of such a dict. The first missing value
            (None/NaN) marks the end of the conversation.

    Returns:
        str: The messages joined by blank lines. A message is prefixed with
        "Human: " when its 'from' field equals 'human', otherwise with
        "Assistant: ".

    Raises:
        ValueError, SyntaxError: If a string message is not a valid Python
            literal.
    """
    conversation = []
    for message in row:
        # Rows are padded with missing values after the last real message
        if pd.isna(message):
            break
        if isinstance(message, str):
            # literal_eval only accepts Python literals, so text coming from
            # the dataset can never execute code (unlike eval)
            message = ast.literal_eval(message)
        role = "Human" if message['from'] == 'human' else "Assistant"
        conversation.append(f"{role}: {message['value']}")
    return "\n\n".join(conversation)

# Print the first 5 conversations, each under a banner with its 1-based number
separator = '=' * 80
for row_label, conversation_row in df.head(5).iterrows():
    print(f"\n{separator}", f"Conversation {row_label + 1}", separator, sep="\n")
    print(format_conversation(conversation_row))
    # Two trailing blank lines between conversations
    print("\n")

In [None]:
import sys
sys.path.append('..')

from src.babel_ai.analyzer import SimilarityAnalyzer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Initialize the SimilarityAnalyzer (constructed with no arguments)
analyzer = SimilarityAnalyzer()

# Function to extract conversation text
def extract_conversation_text(row):
    """Collect the text of every message in one conversation.

    Args:
        row: Iterable of messages (for example one DataFrame row). Each
            message is a dict with a 'value' key, or the string
            representation of such a dict. The first missing value
            (None/NaN) marks the end of the conversation.

    Returns:
        list: The 'value' field of each message, in conversation order.

    Raises:
        ValueError, SyntaxError: If a string message is not a valid Python
            literal.
    """
    messages = []
    for message in row:
        # Rows are padded with missing values after the last real message
        if pd.isna(message):
            break
        if isinstance(message, str):
            # literal_eval only accepts Python literals, so text coming from
            # the dataset can never execute code (unlike eval)
            message = ast.literal_eval(message)
        messages.append(message['value'])
    return messages

# Collect (row label, message texts) for every conversation whose row holds
# more than 100 non-missing messages
long_conversations = [
    (row_label, extract_conversation_text(conv_row))
    for row_label, conv_row in df.iterrows()
    if sum(1 for cell in conv_row if pd.notna(cell)) > 100
]

# Select 3 conversations to analyze: positions 3, 4 and 5 of the list above
first_selected = 3
n_selected = 3
selected_conversations = long_conversations[first_selected:first_selected + n_selected]

# Run the analyzer over every selected conversation
results = []
for conv_idx, messages in selected_conversations:
    print(f"\nAnalyzing conversation {conv_idx} with {len(messages)} messages...")

    # Entry i pairs message i with the analysis of the prefix messages[0..i]
    metrics = [
        {
            'iteration': position,
            'message': text,
            'analysis': analyzer.analyze(messages[:position + 1]),
        }
        for position, text in enumerate(messages)
    ]

    results.append({'conversation_id': conv_idx, 'metrics': metrics})

def _metric_series(metrics, key):
    """Return (iterations, values) for `key`, skipping entries where it is absent or None."""
    xs, ys = [], []
    for m in metrics:
        analysis = m['analysis']
        value = analysis[key] if key in analysis else None
        if value is not None:
            xs.append(m['iteration'])
            ys.append(value)
    return xs, ys


# Plot results for each conversation: both similarity scores on the left
# y-axis, semantic surprise on a second y-axis on the right
for result in results:
    conv_id = result['conversation_id']
    metrics = result['metrics']

    # Create figure and axis objects
    fig, ax1 = plt.subplots(figsize=(12, 6))

    # Extract data. Every metric keeps its own x values, so a metric that is
    # missing at some iterations can never end up with a different length
    # than the x values it is plotted against.
    lexical_iters, lexical_sim = _metric_series(metrics, 'lexical_similarity')
    semantic_iters, semantic_sim = _metric_series(metrics, 'semantic_similarity')
    surprise_iters, semantic_surp = _metric_series(metrics, 'semantic_surprise')

    # Plot similarities
    ax1.set_xlabel('Message Index')
    ax1.set_ylabel('Similarity Score', color='tab:blue')
    ax1.plot(lexical_iters, lexical_sim, label='Lexical Similarity',
             marker='o', color='tab:blue', alpha=0.6)
    ax1.plot(semantic_iters, semantic_sim, label='Semantic Similarity',
             marker='s', color='tab:orange', alpha=0.6)
    ax1.tick_params(axis='y', labelcolor='tab:blue')
    ax1.set_ylim(0, 1)

    # Plot surprise on second y-axis
    ax2 = ax1.twinx()
    ax2.set_ylabel('Surprise Score', color='tab:red')
    ax2.plot(surprise_iters, semantic_surp, label='Semantic Surprise',
             marker='^', color='tab:red', alpha=0.6)
    ax2.tick_params(axis='y', labelcolor='tab:red')

    # Add a single legend covering the lines of both axes
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

    # Add grid. Minor ticks are off by default on linear axes, so they have
    # to be enabled for the minor grid lines to be drawn at all.
    ax1.minorticks_on()
    ax1.grid(True, which='major', linestyle='-', alpha=0.5)
    ax1.grid(True, which='minor', linestyle=':', alpha=0.2)

    ax1.set_title(f'Analysis of Conversation {conv_id} ({len(metrics)} messages)')
    fig.tight_layout()
    plt.show()
    # Release the figure so figures do not accumulate across iterations
    plt.close(fig)