# ConvFinQA Dataset Analysis

This notebook analyses the ConvFinQA dataset structure and content to understand:
- Available dataset splits (train/dev/test)
- Dataset statistics and characteristics  
- Sample conversations and data quality
- Evaluation recommendations


In [1]:
import json
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from pathlib import Path
import sys

sys.path.append('src')
pd.set_option('display.max_columns', None)


## Load Dataset


In [2]:
# Load the dataset
with open('../data/convfinqa_dataset.json', 'r') as f:
    dataset = json.load(f)

print("Dataset loaded successfully")
print(f"Available splits: {list(dataset.keys())}")

# Get basic counts
total_conversations = sum(len(conversations) for conversations in dataset.values())
print(f"Total conversations: {total_conversations}")


Dataset loaded successfully
Available splits: ['train', 'dev']
Total conversations: 3458


## Split Analysis


In [3]:
# Analyze each split
split_stats = {}

for split_name, conversations in dataset.items():
    num_conversations = len(conversations)
    
    # Count total turns
    total_turns = sum(len(conv['dialogue']['conv_questions']) for conv in conversations)
    avg_turns = total_turns / num_conversations if num_conversations > 0 else 0
    
    split_stats[split_name] = {
        'conversations': num_conversations,
        'turns': total_turns,
        'avg_turns': avg_turns
    }
    
    print(f"\n{split_name.upper()} split:")
    print(f"  Conversations: {num_conversations}")
    print(f"  Total turns: {total_turns}")
    print(f"  Average turns per conversation: {avg_turns:.2f}")

# Check test set availability
test_available = 'test' in dataset
print(f"\nTest set available: {test_available}")
if not test_available:
    print("Note: Test set not available. Use dev set for evaluation.")



TRAIN split:
  Conversations: 3037
  Total turns: 11104
  Average turns per conversation: 3.66

DEV split:
  Conversations: 421
  Total turns: 1490
  Average turns per conversation: 3.54

Test set available: False
Note: Test set not available. Use dev set for evaluation.


In [4]:
# Create summary table
df = pd.DataFrame.from_dict(split_stats, orient='index')
print("\nSplit Summary:")
print(df.round(2))

# Compare with paper statistics
print("\nPaper statistics:")
print("  Train: 3,037 conversations")
print("  Dev: 421 conversations") 
print("  Test: 434 conversations")

print("\nYour dataset matches paper splits:", 
      split_stats.get('train', {}).get('conversations') == 3037 and 
      split_stats.get('dev', {}).get('conversations') == 421)



Split Summary:
       conversations  turns  avg_turns
train           3037  11104       3.66
dev              421   1490       3.54

Paper statistics:
  Train: 3,037 conversations
  Dev: 421 conversations
  Test: 434 conversations

Your dataset matches paper splits: True


## Conversation Analysis


In [5]:
# Analyze conversation patterns
conv_lengths = []
simple_conversations = 0
hybrid_conversations = 0

for conversations in dataset.values():
    for conv in conversations:
        dialogue = conv['dialogue']
        num_turns = len(dialogue['conv_questions'])
        conv_lengths.append(num_turns)
        
        # Check conversation type
        qa_split = dialogue.get('qa_split', [])
        if qa_split and any(qa_split):
            hybrid_conversations += 1
        else:
            simple_conversations += 1

print("Conversation length statistics:")
print(f"  Average: {np.mean(conv_lengths):.2f} turns")
print(f"  Median: {np.median(conv_lengths):.0f} turns")
print(f"  Min: {min(conv_lengths)} turns")
print(f"  Max: {max(conv_lengths)} turns")

print(f"\nConversation types:")
total = simple_conversations + hybrid_conversations
print(f"  Simple (Type I): {simple_conversations} ({simple_conversations/total*100:.1f}%)")
print(f"  Hybrid (Type II): {hybrid_conversations} ({hybrid_conversations/total*100:.1f}%)")

# Length distribution
length_counts = Counter(conv_lengths)
print(f"\nLength distribution:")
for length in sorted(length_counts.keys()):
    count = length_counts[length]
    pct = count / len(conv_lengths) * 100
    print(f"  {length} turns: {count} conversations ({pct:.1f}%)")


Conversation length statistics:
  Average: 3.64 turns
  Median: 4 turns
  Min: 1 turns
  Max: 9 turns

Conversation types:
  Simple (Type I): 2448 (70.8%)
  Hybrid (Type II): 1010 (29.2%)

Length distribution:
  1 turns: 6 conversations (0.2%)
  2 turns: 858 conversations (24.8%)
  3 turns: 751 conversations (21.7%)
  4 turns: 940 conversations (27.2%)
  5 turns: 649 conversations (18.8%)
  6 turns: 184 conversations (5.3%)
  7 turns: 52 conversations (1.5%)
  8 turns: 16 conversations (0.5%)
  9 turns: 2 conversations (0.1%)


## Sample Data


In [6]:
# Look at a sample conversation
first_split = list(dataset.keys())[0]
sample_conv = dataset[first_split][0]

print(f"Sample conversation from {first_split} split:")
print(f"ID: {sample_conv['id']}")

# Document context
doc = sample_conv['doc']
print(f"\nDocument context:")
print(f"Pre-text: {doc['pre_text'][:100]}...")
print(f"Post-text: {doc['post_text'][:100]}...")

# Table info
table = doc['table']
print(f"\nTable structure:")
print(f"Columns: {len(table)}")
print(f"Column names: {list(table.keys())[:3]}...")

if table:
    first_col = list(table.keys())[0]
    print(f"Rows: {len(table[first_col])}")

# Show conversation
dialogue = sample_conv['dialogue']
print(f"\nConversation ({len(dialogue['conv_questions'])} turns):")

for i, (q, a, p) in enumerate(zip(
    dialogue['conv_questions'][:2],
    dialogue['conv_answers'][:2], 
    dialogue['turn_program'][:2]
)):
    print(f"\nTurn {i+1}:")
    print(f"  Q: {q}")
    print(f"  A: {a}")
    print(f"  Program: {p}")

if len(dialogue['conv_questions']) > 2:
    remaining = len(dialogue['conv_questions']) - 2
    print(f"\n  ... {remaining} more turns")


Sample conversation from train split:
ID: Single_JKHY/2009/page_28.pdf-3

Document context:
Pre-text: 26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business ...
Post-text: year ended june 30 , cash provided by operations increased $ 25587 to $ 206588 for the fiscal year e...

Table structure:
Columns: 3
Column names: ['Year ended June 30, 2009', '2008', '2007']...
Rows: 6

Conversation (4 turns):

Turn 1:
  Q: what is the net cash from operating activities in 2009?
  A: 206588
  Program: 206588

Turn 2:
  Q: what about in 2008?
  A: 181001
  Program: 181001

  ... 2 more turns


In [7]:
# Print first few record IDs from each split to understand the format
for split_name, conversations in dataset.items():
    print(f"\n{split_name.upper()} split record IDs (first 5):")
    for conv in conversations[:5]:
        print(f"  {conv['id']}")



TRAIN split record IDs (first 5):
  Single_JKHY/2009/page_28.pdf-3
  Single_RSG/2008/page_114.pdf-2
  Single_AAPL/2002/page_23.pdf-1
  Single_UPS/2009/page_33.pdf-2
  Double_UPS/2009/page_33.pdf

DEV split record IDs (first 5):
  Single_MRO/2007/page_134.pdf-1
  Double_HII/2017/page_104.pdf
  Single_IPG/2009/page_85.pdf-3
  Single_UNP/2008/page_77.pdf-2
  Double_RSG/2016/page_144.pdf


In [8]:
# Search for JKHY records
print("\nSearching for JKHY records:")
for split_name, conversations in dataset.items():
    jkhy_records = [conv['id'] for conv in conversations if 'JKHY' in conv['id']]
    if jkhy_records:
        print(f"\n{split_name.upper()} split JKHY records:")
        for record_id in jkhy_records:
            print(f"  {record_id}")



Searching for JKHY records:

TRAIN split JKHY records:
  Single_JKHY/2009/page_28.pdf-3
  Double_JKHY/2016/page_25.pdf
  Double_JKHY/2015/page_20.pdf
  Double_JKHY/2016/page_61.pdf
  Single_JKHY/2008/page_30.pdf-1
  Double_JKHY/2008/page_30.pdf
  Single_JKHY/2009/page_28.pdf-4

DEV split JKHY records:
  Single_JKHY/2014/page_30.pdf-2


In [9]:
# Print 5 random record IDs from each split
import random

print("Sample valid record IDs you can use:")
for split_name, conversations in dataset.items():
    sample_records = random.sample(conversations, min(5, len(conversations)))
    print(f"\n{split_name.upper()} split examples:")
    for record in sample_records:
        # Also print first question to understand the context
        first_question = record['dialogue']['conv_questions'][0]
        print(f"ID: {record['id']}")
        print(f"First question: {first_question}\n")


Sample valid record IDs you can use:

TRAIN split examples:
ID: Single_GS/2018/page_134.pdf-3
First question: what was the value of total financial assets in 2018?

ID: Single_LMT/2014/page_31.pdf-2
First question: what was the average price of the purchased shares in november 2014?

ID: Single_ETR/2004/page_163.pdf-5
First question: what is the difference in receivables from the money pool between 2001 and 2022?

ID: Single_ADI/2016/page_61.pdf-2
First question: what was the balance of goodwill at the end of 2016?

ID: Double_HWM/2015/page_87.pdf
First question: in the year of 2013, what were the intersegment sales as a percentage of the total sales?


DEV split examples:
ID: Single_SNA/2012/page_33.pdf-1
First question: what is the value of the investment in snap-onincorporated	in 2008?

ID: Single_AAP/2011/page_28.pdf-2
First question: what is the value of the s&p 500 index on january 3, 2009 less it at the end of 2006?

ID: Double_GPN/2017/page_77.pdf
First question: what portion o

In [10]:
# Search specifically for records related to JKHY and cash flow
print("\nSearching for JKHY records with cash flow information:")
for split_name, conversations in dataset.items():
    relevant_records = [
        conv for conv in conversations 
        if ('JKHY' in conv['id'] or 'cash' in ' '.join(conv['dialogue']['conv_questions']).lower())
    ]
    if relevant_records:
        print(f"\n{split_name.upper()} split relevant records:")
        for record in relevant_records[:5]:  # Show first 5 matches
            print(f"\nID: {record['id']}")
            print(f"First question: {record['dialogue']['conv_questions'][0]}")



Searching for JKHY records with cash flow information:

TRAIN split relevant records:

ID: Single_JKHY/2009/page_28.pdf-3
First question: what is the net cash from operating activities in 2009?

ID: Single_RSG/2012/page_145.pdf-1
First question: what was the total restricted cash and marketable securities in 2012?

ID: Single_WRK/2019/page_49.pdf-1
First question: what was the sum of net cash provided by operating activities and used for investing in 2019?

ID: Double_WRK/2018/page_53.pdf
First question: in the year of 2018, what was the total net cash used?

ID: Single_SWKS/2010/page_105.pdf-2
First question: what is the balance of cash and cash equivalents at the end of 2010?

DEV split relevant records:

ID: Single_UNP/2014/page_35.pdf-3
First question: what was the cash provided by operating activities in 2013?

ID: Single_IPG/2008/page_93.pdf-3
First question: what is the total of estimated future contingent acquisition obligations payable in cash in 2009?

ID: Single_LMT/2010/pa

## Document Analysis

Let's analyze all unique documents in the dataset and their characteristics.


In [11]:
# Create a dictionary to store unique documents
unique_docs = {}

# Iterate through all splits and conversations
for split_name, conversations in dataset.items():
    for conv in conversations:
        doc_id = conv['id'].split('-')[0]  # Get base document ID without conversation number
        
        if doc_id not in unique_docs:
            unique_docs[doc_id] = {
                'doc': conv['doc'],
                'conversation_count': 1,
                'splits': {split_name},
                'table_columns': len(conv['doc']['table']),
                'table_rows': len(next(iter(conv['doc']['table'].values()))),
                'pre_text_length': len(conv['doc']['pre_text']),
                'post_text_length': len(conv['doc']['post_text'])
            }
        else:
            unique_docs[doc_id]['conversation_count'] += 1
            unique_docs[doc_id]['splits'].add(split_name)

# Convert to DataFrame for better analysis
docs_df = pd.DataFrame([
    {
        'document_id': doc_id,
        'conversations': info['conversation_count'],
        'splits': ', '.join(sorted(info['splits'])),
        'table_columns': info['table_columns'],
        'table_rows': info['table_rows'],
        'pre_text_length': info['pre_text_length'],
        'post_text_length': info['post_text_length'],
        'total_text_length': info['pre_text_length'] + info['post_text_length']
    }
    for doc_id, info in unique_docs.items()
])

# Display summary statistics
print(f"Total unique documents: {len(docs_df)}")
print("\nDocument statistics:")
print(docs_df.describe().round(2))

print("\nSplit distribution:")
split_counts = docs_df['splits'].value_counts()
for split_type, count in split_counts.items():
    print(f"  {split_type}: {count} documents")

# Display first few documents with most conversations
print("\nTop 5 documents by number of conversations:")
print(docs_df.nlargest(5, 'conversations')[['document_id', 'conversations', 'splits', 'table_columns', 'table_rows']])


Total unique documents: 2462

Document statistics:
       conversations  table_columns  table_rows  pre_text_length  \
count         2462.0        2462.00     2462.00          2462.00   
mean             1.4           2.82        5.28          1730.94   
std              0.7           1.53        2.55          1383.42   
min              1.0           1.00        1.00             1.00   
25%              1.0           2.00        3.00           523.25   
50%              1.0           3.00        5.00          1397.50   
75%              2.0           3.00        7.00          2741.00   
max              6.0          10.00       19.00          7153.00   

       post_text_length  total_text_length  
count           2462.00            2462.00  
mean            1907.50            3638.45  
std             1495.59            1480.93  
min                1.00             103.00  
25%              649.50            2854.00  
50%             1743.00            3627.50  
75%             2974.

In [12]:
docs_df['document_id']

0       Single_JKHY/2009/page_28.pdf
1       Single_RSG/2008/page_114.pdf
2       Single_AAPL/2002/page_23.pdf
3        Single_UPS/2009/page_33.pdf
4        Double_UPS/2009/page_33.pdf
                    ...             
2457    Single_VRTX/2003/page_71.pdf
2458     Single_MRO/2006/page_33.pdf
2459    Double_ADBE/2018/page_86.pdf
2460     Double_DVN/2014/page_85.pdf
2461    Single_JKHY/2014/page_30.pdf
Name: document_id, Length: 2462, dtype: object

In [13]:
# Function to display document details
def display_document_details(doc_id):
    doc_info = unique_docs[doc_id]
    doc = doc_info['doc']
    
    print(f"Document ID: {doc_id}")
    print(f"Number of conversations: {doc_info['conversation_count']}")
    print(f"Present in splits: {', '.join(sorted(doc_info['splits']))}")
    
    print("\nTable Structure:")
    print(f"Columns ({len(doc['table'])}): {list(doc['table'].keys())}")
    print(f"Rows: {len(next(iter(doc['table'].values())))}")
    
    print("\nPre-text:")
    print(doc['pre_text'][:200] + "..." if len(doc['pre_text']) > 200 else doc['pre_text'])
    
    print("\nPost-text:")
    print(doc['post_text'][:200] + "..." if len(doc['post_text']) > 200 else doc['post_text'])

# Display details of the document with the most conversations
most_conv_doc = docs_df.nlargest(1, 'conversations').iloc[0]['document_id']
print("Details of the document with most conversations:")
display_document_details(most_conv_doc)


Details of the document with most conversations:
Document ID: Single_RE/2010/page_138.pdf
Number of conversations: 6
Present in splits: train

Table Structure:
Columns (3): ['2010', '2009', '2008']
Rows: 7

Pre-text:
a reconciliation of the beginning and ending amount of unrecognized tax benefits , for the periods indicated , is as follows: .

Post-text:
the entire amount of the unrecognized tax benefits would affect the effective tax rate if recognized . in 2010 , the company favorably settled a 2003 and 2004 irs audit . the company recorded a net ov...
