In [1]:
import os
import json
from datetime import datetime
import random

def timestamp_to_str(timestamp):
    """Format a POSIX timestamp as a short UTC string.

    Args:
        timestamp: seconds since the epoch (int or float), or None.

    Returns:
        The literal string 'None' when timestamp is None, otherwise the UTC
        time rendered as 'yy-mm-dd HH:MM'.
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timezone

    if timestamp is None:
        return 'None'
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; the
    # timezone-aware call below yields the same wall-clock fields in UTC.
    return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%y-%m-%d %H:%M')

# Load the exported conversations. JSON text is UTF-8, so the encoding is
# pinned explicitly instead of relying on the platform's locale default.
with open('../data/conversations.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [3]:

# Find small conversations: those whose mapping holds fewer than five nodes
small_conversations = [
    conversation for conversation in data if len(conversation['mapping']) < 5
]

# Print the number of small conversations and total conversations
print('Number of small conversations: ', len(small_conversations))
print('Total conversations: ', len(data))
print('-' * 50)

# Select up to 3 random conversations from the list of small conversations.
# random.sample raises ValueError when k exceeds the population size, so k is
# capped for exports that contain fewer than three small conversations.
random_conversations = random.sample(
    small_conversations, k=min(3, len(small_conversations))
)

# Dump the full record of one known conversation for inspection
for convo in small_conversations:
    if convo['title'] == 'Virtual Conferences Solution.':
        print(f"{convo['title']}: {convo['mapping']}")
        print("Conversation:")
        print(json.dumps(convo, indent=4))
        print('-' * 50)

# Print the titles of the selected conversations
for conversation in random_conversations:
    print(conversation['title'])

Number of small conversations:  173
Total conversations:  874
--------------------------------------------------
Virtual Conferences Solution.: {'0e7a81e7-608d-4120-8776-21f724e13733': {'id': '0e7a81e7-608d-4120-8776-21f724e13733', 'message': {'id': '0e7a81e7-608d-4120-8776-21f724e13733', 'author': {'role': 'system', 'name': None, 'metadata': {}}, 'create_time': 1681977336.611987, 'update_time': None, 'content': {'content_type': 'text', 'parts': ['']}, 'status': 'finished_successfully', 'end_turn': True, 'weight': 1.0, 'metadata': {}, 'recipient': 'all'}, 'parent': 'fb062b48-1505-4354-8362-9b8b338be06d', 'children': ['0ac6d621-f070-4451-826d-bb0a39ee3717']}, 'fb062b48-1505-4354-8362-9b8b338be06d': {'id': 'fb062b48-1505-4354-8362-9b8b338be06d', 'message': None, 'parent': None, 'children': ['0e7a81e7-608d-4120-8776-21f724e13733']}, '0ac6d621-f070-4451-826d-bb0a39ee3717': {'id': '0ac6d621-f070-4451-826d-bb0a39ee3717', 'message': {'id': '0ac6d621-f070-4451-826d-bb0a39ee3717', 'author': {'r

In [4]:
def print_tree_structure(mapping, node_id, indent="", fork_point=None):
    """Recursively print the subtree rooted at node_id, one line per node.

    Each node is shown as '[[Role: id (time)]]'. Once a node with more than one
    child has been seen, that node becomes the fork point, and every node from
    there downwards (the forking node included) is followed by a
    '[[Fork Point - ...]]' line naming the most recent fork above it.

    Args:
        mapping: dict of node id -> node dict with 'children' and 'message'.
        node_id: id of the node to print first.
        indent: whitespace prefix for this level; grows by four spaces per level.
        fork_point: id of the nearest forking ancestor, or None if there is none yet.
    """
    def _role_label(msg):
        # Nodes without a message (or without an author) are labelled 'Root'.
        if msg and msg.get('author'):
            return msg['author']['role'].capitalize()
        return 'Root'

    current = mapping[node_id]
    msg = current.get('message', None)
    label = _role_label(msg)
    stamp = timestamp_to_str(msg['create_time']) if msg else 'None'

    # A node with several children becomes the new fork point for its subtree
    offspring = current['children']
    if len(offspring) > 1:
        fork_point = node_id

    print(f"{indent}└── [[{label}: {node_id} ({stamp})]]")

    if fork_point:
        fork_label = _role_label(mapping[fork_point]['message'])
        print(f"{indent}    [[Fork Point - {fork_label}: {fork_point}]]")

    deeper = indent + "    "
    for child_id in offspring:
        print_tree_structure(mapping, child_id, deeper, fork_point)

In [6]:
def print_conversation_structure(convo):
    """Print a header for a conversation followed by its message tree.

    The tree is printed starting at the first node whose message author has the
    role 'system'. When no such node exists, only the header, the root id and
    the closing rule are printed. When no parentless (root) node exists, the
    function returns right after the header.

    Args:
        convo: dict with a 'title' and a 'mapping' of node id -> node dict
            holding 'parent', 'message' and 'children'.
    """
    def _is_system_node(node_data):
        # Tolerate nodes with no message and messages with no author, the same
        # cases print_tree_structure guards against when labelling a node.
        message = node_data['message']
        author = message.get('author') if message else None
        return bool(author) and author['role'] == 'system'

    print("=" * 40)
    print(f"Conversation: {convo['title']}")
    print("-" * 40)

    mapping = convo['mapping']
    root_node_id = next((node_id for node_id, node_data in mapping.items() if not node_data['parent']), None)
    if not root_node_id:
        return

    system_message_node_id = next((node_id for node_id, node_data in mapping.items() if _is_system_node(node_data)), None)

    print(f"Root Node: {root_node_id}")
    if system_message_node_id:
        print_tree_structure(mapping, system_message_node_id)
    print("=" * 40)
    print()

# Print the header and message tree of the first three small conversations
for convo in small_conversations[:3]:
    print_conversation_structure(convo)


Conversation: New chat
----------------------------------------
Root Node: aaa19d94-3fe1-425b-86ae-972ef7ec44f9
└── [[System: 2e1d064f-c760-4bd7-a7f8-0280da43ee5b (None)]]
    └── [[User: aaa2a7c0-882e-4090-a6ea-3c6340fb3d5f (23-08-28 03:34)]]
        └── [[Assistant: 1a495d80-e0f5-4f83-8fde-3f3a2b0a131d (23-08-28 03:34)]]

Conversation: Markdown Quotes Extraction
----------------------------------------
Root Node: aaa1c8af-888c-4640-85e5-2477b3993a92
└── [[System: 628b2957-a572-4e90-bf03-52bab0e3c04c (None)]]
    └── [[User: aaa2594c-131c-435d-be0a-1ad868c18068 (23-08-27 21:44)]]
        └── [[Assistant: 4bbfe999-4cd9-4393-b50a-d9b34d030a95 (23-08-27 21:44)]]

Conversation: Autonomous Robotics Job Opportunity
----------------------------------------
Root Node: 9d8cda6d-624e-4fc9-9ab2-61929a0d1475
└── [[System: dba67e1c-9ac0-47e1-8922-8ded7c5bc4d0 (None)]]
    └── [[User: 54a7e6a1-fec7-4a45-81d8-1492fc70c6e6 (23-08-24 01:35)]]
        └── [[Assistant: c46f7048-8cfb-4e54-953e-cc3e3523a2

In [7]:
def traverse_tree(mapping, node_id, level):
    """Measure the subtree rooted at node_id.

    Args:
        mapping: dict of node id -> node dict with a 'children' list.
        node_id: id of the subtree's root node.
        level: depth assigned to node_id; children are at level + 1.

    Returns:
        A (branchings, max_depth, total_leaves) tuple where
        - branchings is the sum of the child counts of every node that has
          children (so a single-child node contributes 1),
        - max_depth is the largest level reached in the subtree,
        - total_leaves is the number of nodes without children.
    """
    kids = mapping[node_id]['children']

    # A childless node is a leaf: nothing branches and the depth is its own level
    if not kids:
        return 0, level, 1

    # Measure each child subtree, then regroup the triples column-wise
    per_child = [traverse_tree(mapping, kid, level + 1) for kid in kids]
    sub_branchings, sub_depths, sub_leaves = zip(*per_child)

    return (
        len(kids) + sum(sub_branchings),
        max(level, *sub_depths),
        sum(sub_leaves),
    )

# Title and id of the conversation that examine_test_conversation() looks for
# and prints as a tree; a conversation must match both to be selected.
test_convo_title = 'GPT-3 Token Control.'
test_convo_id = 'b84bc33d-1495-49fd-805e-c42958209a06'

def _find_open_range(ranges, value):
    """Return the name of the first not-yet-sampled range containing value, else None."""
    for range_name, range_data in ranges.items():
        if range_data['sample']:
            continue
        low, high = range_data['range']
        if low <= value <= high:
            return range_name
    return None


def examine_test_conversation(data):
    """Sample one conversation per depth/branching/leaf range and print the test conversation.

    For every conversation that has a root node, the tree is measured with
    traverse_tree(). A conversation is recorded as the sample for at most one
    range: depth ranges are tried first, then branching ranges, then leaf
    ranges, and within a group the ranges are tried in declaration order. Each
    range keeps only the first conversation that lands in it.

    After the pass, all samples are printed, followed by the tree of the
    conversation matching the module-level test_convo_title / test_convo_id
    (printed from the first child of each root node).

    Args:
        data: iterable of conversation dicts with 'title', 'id' and 'mapping'.

    Returns:
        None. All results are printed.
    """
    test_convo = None
    # Ids of conversations already used as a sample (set: O(1) membership test)
    conversation_sampling_record = set()
    conversation_sampling_criteria = {
        'depth_ranges': {
            'shallow': {'range': (3, 5), 'sample': None},
            'intermediate': {'range': (6, 10), 'sample': None},
            'deep': {'range': (11, 20), 'sample': None},
            'profound': {'range': (21, 100), 'sample': None},
        },
        'branching_ranges': {
            'slight': {'range': (3, 5), 'sample': None},
            'moderate': {'range': (6, 10), 'sample': None},
            'considerable': {'range': (11, 20), 'sample': None},
            'extensive': {'range': (21, 100), 'sample': None},
        },
        'leaf_ranges': {
            'few': {'range': (3, 5), 'sample': None},
            'moderate': {'range': (6, 10), 'sample': None},
            'many': {'range': (11, 20), 'sample': None},
            'abundant': {'range': (21, 100), 'sample': None},
        },
    }

    for convo in data:
        root_node_id = next((node_id for node_id, node_data in convo['mapping'].items() if not node_data['parent']), None)

        if not root_node_id:
            print(f"Conversation {convo['title']} ({convo['id']}) has no root node!")
            continue

        # If the conversation is the test conversation, save it
        if convo['title'] == test_convo_title and convo['id'] == test_convo_id:
            test_convo = convo
            print(f"Found conversation: {convo['title']} | {convo['id']}")

        # Calculate the branchings, depth, and leaves of the conversation
        branchings, depth, leaves = traverse_tree(convo['mapping'], root_node_id, 0)

        # The order of the groups matters: a conversation is sampled only for
        # the first group in which it fills a still-empty range.
        metrics = (
            ('depth_ranges', depth, 'depth'),
            ('branching_ranges', branchings, 'branching'),
            ('leaf_ranges', leaves, 'leaves'),
        )
        for criteria_key, value, label in metrics:
            # A conversation that is already a sample is not considered again
            if convo['id'] in conversation_sampling_record:
                break

            ranges = conversation_sampling_criteria[criteria_key]
            range_name = _find_open_range(ranges, value)
            if range_name is None:
                continue

            ranges[range_name]['sample'] = f"{value:<2} {convo['title']} ({convo['id']})"
            conversation_sampling_record.add(convo['id'])
            print(f"Sampled: {convo['title']:<50} | {value} ({range_name} {label})")

    # Print the samples
    print("Conversation Samples:")
    for depth_range_name, depth_range_data in conversation_sampling_criteria['depth_ranges'].items():
        print(f"Depth range: {depth_range_name:<17} | {depth_range_data['sample']}")
    for branching_range_name, branching_range_data in conversation_sampling_criteria['branching_ranges'].items():
        print(f"Branching range: {branching_range_name:<13} | {branching_range_data['sample']}")
    for leaf_range_name, leaf_range_data in conversation_sampling_criteria['leaf_ranges'].items():
        print(f"Leaf range: {leaf_range_name:<18} | {leaf_range_data['sample']}")

    # Continue to printing the test conversation if it was found
    if not test_convo:
        print("Test conversation not found!")
        return

    # We should only have one root node
    test_mapping = test_convo['mapping']
    root_node_ids = [node_id for node_id, node_data in test_mapping.items() if not node_data['parent']]
    if len(root_node_ids) > 1:
        print("More than one root node!")

    # Print the test conversation, starting below each root at its first child
    for root_node_id in root_node_ids:
        print(f"Root Node: {root_node_id}")
        root_children = test_mapping[root_node_id]['children']
        if not root_children:
            # Indexing [0] on a childless root would raise IndexError
            print(f"Root node {root_node_id} has no children!")
            continue
        print_tree_structure(test_mapping, root_children[0])

# Run the sampling pass over every loaded conversation and print the test conversation's tree
examine_test_conversation(data)


Sampled: Import ChatGPT JSON Format                         | 5 (shallow depth)
Sampled: Atoms on Earth Estimate                            | 19 (deep depth)
Sampled: Customized Career Opportunity                      | 7 (intermediate depth)
Sampled: RNN Concepts and Basics                            | 5 (slight branching)
Sampled: Climate Change Presentation & Maps                 | 24 (profound depth)
Sampled: Enhance Wear OS Reminders                          | 7 (moderate branching)
Sampled: Full Workflow with Python                          | 25 (extensive branching)
Sampled: Function Docs & Tests                              | 4 (few leaves)
Sampled: Setting Up Python Project                          | 17 (considerable branching)
Sampled: Summarizing Request & Response                     | 16 (many leaves)
Sampled: Text Condensation: Techniques & Summary            | 10 (moderate leaves)
Found conversation: GPT-3 Token Control. | b84bc33d-1495-49fd-805e-c42958209a06
Sampled: CA

In [8]:


def print_test_convo_structure(test_convo):
    """Print a titled banner and the full tree of every parentless node.

    Args:
        test_convo: conversation dict with 'title', 'id' and a 'mapping' of
            node id -> node dict holding 'parent'.
    """
    rule = "=" * 40
    print(rule)
    print(f"Conversation: {test_convo['title']} | {test_convo['id']}")
    print("-" * 40)

    # Collect every node without a parent first, then print one tree per root
    nodes = test_convo['mapping']
    parentless_ids = []
    for candidate_id, candidate in nodes.items():
        if candidate['parent']:
            continue
        parentless_ids.append(candidate_id)

    for start_id in parentless_ids:
        print_tree_structure(nodes, start_id)

    print(rule)
    print()



def process_conversations(data, title='GPT-3 Token Control.', convo_id=None):
    """Find the test conversation in data and print its tree structure.

    Conversations without a parentless (root) node are skipped. When several
    conversations match, the last one in data is used.

    Args:
        data: iterable of conversation dicts with 'title', 'id' and 'mapping'.
        title: title the conversation must have.
        convo_id: optional id the conversation must also have. Titles are not
            guaranteed to be unique, so pass this to pin one exact conversation.
            None (the default) matches on title alone.

    Returns:
        None. Output is printed; nothing is printed when no conversation matches.
    """
    test_convo = None

    for convo in data:
        root_node_id = next((node_id for node_id, node_data in convo['mapping'].items() if not node_data['parent']), None)

        if not root_node_id:
            continue

        if convo['title'] != title:
            continue
        if convo_id is not None and convo['id'] != convo_id:
            continue

        test_convo = convo
        print(f"Found test conversation {convo['title']}: {convo['id']}")

    if not test_convo:
        return

    # A well-formed conversation has one root; warn but still print every tree
    root_node_ids = [node_id for node_id, node_data in test_convo['mapping'].items() if not node_data['parent']]

    if len(root_node_ids) > 1:
        print("More than one root node!")

    # Print the test conversation
    print_test_convo_structure(test_convo)


# Locate the test conversation in the loaded data and print its tree from the root node(s)
process_conversations(data)


Found test conversation GPT-3 Token Control.: b84bc33d-1495-49fd-805e-c42958209a06
Conversation: GPT-3 Token Control. | b84bc33d-1495-49fd-805e-c42958209a06
----------------------------------------
└── [[Root: a88eec46-80d6-4c3f-810d-f4c8868d84ad (None)]]
    └── [[System: 076adf85-94b6-42c2-8738-181964801053 (23-04-23 03:34)]]
        [[Fork Point - System: 076adf85-94b6-42c2-8738-181964801053]]
        └── [[User: 72fe4134-3519-484e-b5cd-16887affc285 (23-04-23 03:34)]]
            [[Fork Point - User: 72fe4134-3519-484e-b5cd-16887affc285]]
            └── [[Assistant: a17c71e2-1200-445a-b77f-9b5e4e111e5c (23-04-23 03:34)]]
                [[Fork Point - User: 72fe4134-3519-484e-b5cd-16887affc285]]
            └── [[Assistant: 15916ca2-e6eb-4f79-b806-57ccf9039bb2 (23-04-23 03:36)]]
                [[Fork Point - User: 72fe4134-3519-484e-b5cd-16887affc285]]
        └── [[User: cd07e965-e958-44ac-b8fc-3e4bc6b1882f (23-04-23 03:35)]]
            [[Fork Point - System: 076adf85-94b6-42c2-