In [29]:
import json 
import os
# Load the per-document statistics file. The path is assembled with
# os.path.join instead of a hard-coded backslash so the cell also works on
# systems where "\" is an ordinary filename character rather than a separator.
with open(os.path.join('artifacts', 'ohr_data_stats.json'), encoding='utf-8') as f:
    data = json.load(f)

In [30]:
# Keep the entries whose 'table_count' differs from 0; an entry without that
# key is treated as having a count of 0 and is therefore dropped.
docs_with_table = list(filter(lambda entry: entry.get('table_count', 0) != 0, data))

print(f"Documents with at least one table: {len(docs_with_table)}")

# Show a short preview (first ten entries) so the result can be eyeballed
for doc in docs_with_table[:10]:
    preview_name = doc.get('doc_name', 'unknown')
    preview_count = doc.get('table_count', 0)
    print(f"{preview_name}: {preview_count} tables")

# Persist the filtered list as artifacts/havetable_docs.json
output_path = os.path.join('artifacts', 'havetable_docs.json')
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(docs_with_table, f, ensure_ascii=False, indent=2)

print(f"Filtered documents saved to {output_path}")

Documents with at least one table: 471
academic/2305.02437v3: 12 tables
academic/2305.14160v4: 6 tables
academic/2310.11511v1: 7 tables
academic/2402.03216v4: 14 tables
academic/2403.20330v2: 7 tables
academic/2404.10198v2: 5 tables
academic/2405.14458v1: 8 tables
academic/2405.14831v1: 17 tables
academic/2409.01704v1: 5 tables
academic/2409.16145v1: 5 tables
Filtered documents saved to artifacts\havetable_docs.json


In [34]:
# Re-load the table-filtered list written by the previous step. The path is
# built with os.path.join (matching the output path below) so it does not
# depend on the Windows-only "\" separator.
with open(os.path.join('artifacts', 'havetable_docs.json'), encoding='utf-8') as f:
    data = json.load(f)

# Keep only the entries whose 'lan' field is exactly 'en'; entries without a
# 'lan' key are dropped because .get() returns None for them.
english_docs = [doc for doc in data if doc.get('lan') == 'en']

output_path = os.path.join('artifacts', 'havetable_english_docs.json')
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(english_docs, f, ensure_ascii=False, indent=2)

print(f"English documents saved to {output_path}")





English documents saved to artifacts\havetable_english_docs.json


In [36]:
import json
import os
import shutil

# Read the filtered English documents
with open(os.path.join('artifacts', 'havetable_english_docs.json'), encoding='utf-8') as f:
    docs = json.load(f)

src_base = os.path.join('..', 'data', 'pdf')
dest_base = os.path.join('..', 'sample data', 'pdf')

# Counts only documents whose source PDF is missing on disk.
not_found = 0
for doc in docs:
    doc_name = doc.get('doc_name')
    if not doc_name:
        print("No doc_name found in document:", doc)
        continue

    # doc_name is split at the first "/" into a domain folder and a file stem.
    # partition() never raises, so a name without a slash is reported and
    # skipped instead of aborting the whole copy run with a ValueError.
    domain, slash, pdf_file = doc_name.partition('/')
    if not slash:
        print(f"Unexpected doc_name format (expected 'domain/name'): {doc_name}")
        continue

    # Join domain and file stem as separate components so the path uses the
    # platform separator throughout (no "..\data\pdf\academic/xyz.pdf" mix).
    src_pdf_path = os.path.join(src_base, domain, pdf_file + '.pdf')

    # Check the source first so that a missing PDF does not leave an empty
    # destination folder behind.
    if not os.path.exists(src_pdf_path):
        print(f"Source PDF does not exist: {src_pdf_path}")
        not_found += 1
        continue

    dest_domain_folder = os.path.join(dest_base, domain)
    dest_pdf_path = os.path.join(dest_domain_folder, pdf_file + '.pdf')

    # Create the domain folder on first use; exist_ok guards against the
    # folder appearing between the check and the creation.
    if not os.path.isdir(dest_domain_folder):
        os.makedirs(dest_domain_folder, exist_ok=True)
        print(f"Created folder: {dest_domain_folder}")

    # copy2 copies the file contents and attempts to preserve file metadata.
    shutil.copy2(src_pdf_path, dest_pdf_path)
    print(f"Copied {src_pdf_path} to {dest_pdf_path}")

print(f"Total not found: {not_found} out of {len(docs)}")

Created folder: ..\sample data\pdf\academic
Copied ..\data\pdf\academic/2305.02437v3.pdf to ..\sample data\pdf\academic\2305.02437v3.pdf
Copied ..\data\pdf\academic/2305.14160v4.pdf to ..\sample data\pdf\academic\2305.14160v4.pdf
Copied ..\data\pdf\academic/2310.11511v1.pdf to ..\sample data\pdf\academic\2310.11511v1.pdf
Copied ..\data\pdf\academic/2402.03216v4.pdf to ..\sample data\pdf\academic\2402.03216v4.pdf
Copied ..\data\pdf\academic/2403.20330v2.pdf to ..\sample data\pdf\academic\2403.20330v2.pdf
Copied ..\data\pdf\academic/2404.10198v2.pdf to ..\sample data\pdf\academic\2404.10198v2.pdf
Copied ..\data\pdf\academic/2405.14458v1.pdf to ..\sample data\pdf\academic\2405.14458v1.pdf
Copied ..\data\pdf\academic/2405.14831v1.pdf to ..\sample data\pdf\academic\2405.14831v1.pdf
Copied ..\data\pdf\academic/2409.01704v1.pdf to ..\sample data\pdf\academic\2409.01704v1.pdf
Copied ..\data\pdf\academic/2409.16145v1.pdf to ..\sample data\pdf\academic\2409.16145v1.pdf
Copied ..\data\pdf\academi

In [37]:
import json
import os
import shutil

# Load the list of English documents that contain tables
with open(os.path.join('artifacts', 'havetable_english_docs.json'), encoding='utf-8') as f:
    docs = json.load(f)

pdf_base = os.path.join('..', 'sample data', 'pdf')
gt_src_base = os.path.join('..', 'data', 'ground_truth')
gt_dest_base = os.path.join('..', 'sample data', 'ground_truth')

for doc in docs:
    doc_name = doc.get('doc_name')
    if doc_name:
        # doc_name is "{domain}/{file stem}"; split only at the first slash
        domain, pdf_file = doc_name.split('/', 1)

        # Sample PDF:            ../sample data/pdf/{domain}/{pdf_file}.pdf
        # Ground truth (source): ../data/ground_truth/{domain}/{pdf_file}.json
        pdf_path = os.path.join(pdf_base, domain, pdf_file + '.pdf')
        gt_src_path = os.path.join(gt_src_base, domain, pdf_file + '.json')

        if not os.path.exists(pdf_path):
            # Only documents whose PDF was copied into the sample get a ground truth file
            print(f"PDF does not exist in sample data: {pdf_path}")
        elif not os.path.exists(gt_src_path):
            print(f"Ground truth JSON does not exist: {gt_src_path}")
        else:
            # Destination: ../sample data/ground_truth/{domain}/{pdf_file}.json
            gt_dest_domain_folder = os.path.join(gt_dest_base, domain)
            gt_dest_path = os.path.join(gt_dest_domain_folder, pdf_file + '.json')
            if not os.path.exists(gt_dest_domain_folder):
                os.makedirs(gt_dest_domain_folder)
                print(f"Created folder: {gt_dest_domain_folder}")

            shutil.copy2(gt_src_path, gt_dest_path)
            print(f"Copied {gt_src_path} to {gt_dest_path}")
    else:
        print("No doc_name found in document:", doc)

Created folder: ..\sample data\ground_truth\academic
Copied ..\data\ground_truth\academic\2305.02437v3.json to ..\sample data\ground_truth\academic\2305.02437v3.json
Copied ..\data\ground_truth\academic\2305.14160v4.json to ..\sample data\ground_truth\academic\2305.14160v4.json
Copied ..\data\ground_truth\academic\2310.11511v1.json to ..\sample data\ground_truth\academic\2310.11511v1.json
Copied ..\data\ground_truth\academic\2402.03216v4.json to ..\sample data\ground_truth\academic\2402.03216v4.json
Copied ..\data\ground_truth\academic\2403.20330v2.json to ..\sample data\ground_truth\academic\2403.20330v2.json
Copied ..\data\ground_truth\academic\2404.10198v2.json to ..\sample data\ground_truth\academic\2404.10198v2.json
Copied ..\data\ground_truth\academic\2405.14458v1.json to ..\sample data\ground_truth\academic\2405.14458v1.json
Copied ..\data\ground_truth\academic\2405.14831v1.json to ..\sample data\ground_truth\academic\2405.14831v1.json
Copied ..\data\ground_truth\academic\2409.0

In [38]:
# Check which files are not matched between the sample PDF and ground-truth folders

import os

pdf_base = os.path.join('..', 'sample data', 'pdf')
gt_base = os.path.join('..', 'sample data', 'ground_truth')

# Every entry name found directly under either root is treated as a domain
domains = set()
for root in (pdf_base, gt_base):
    if os.path.exists(root):
        domains.update(os.listdir(root))


def _stems(folder, extension):
    """Return the extension-less names of the entries in `folder` ending in `extension`.

    The extension test is case-insensitive. A folder that does not exist
    yields an empty set.
    """
    if not os.path.exists(folder):
        return set()
    return {
        os.path.splitext(entry)[0]
        for entry in os.listdir(folder)
        if entry.lower().endswith(extension)
    }


def _preview(names, limit=10):
    """Join the first `limit` names (sorted), adding ' ...' when more exist."""
    tail = " ..." if len(names) > limit else ""
    return ", ".join(sorted(names)[:limit]) + tail


for domain in sorted(domains):
    pdf_dir = os.path.join(pdf_base, domain)
    gt_dir = os.path.join(gt_base, domain)
    pdf_files = _stems(pdf_dir, '.pdf')
    gt_files = _stems(gt_dir, '.json')

    # Set algebra on the file stems gives the matched and unmatched groups
    both = pdf_files & gt_files
    only_pdf = pdf_files - gt_files
    only_gt = gt_files - pdf_files

    print(f"\nDomain: {domain}")
    print(f"  PDFs: {len(pdf_files)}")
    print(f"  JSONs: {len(gt_files)}")
    print(f"  Matched pairs: {len(both)}")
    print(f"  PDFs without JSON: {len(only_pdf)}")
    if only_pdf:
        print("    " + _preview(only_pdf))
    print(f"  JSONs without PDF: {len(only_gt)}")
    if only_gt:
        print("    " + _preview(only_gt))


Domain: academic
  PDFs: 58
  JSONs: 58
  Matched pairs: 58
  PDFs without JSON: 0
  JSONs without PDF: 0

Domain: administration
  PDFs: 91
  JSONs: 91
  Matched pairs: 91
  PDFs without JSON: 0
  JSONs without PDF: 0

Domain: finance
  PDFs: 61
  JSONs: 61
  Matched pairs: 61
  PDFs without JSON: 0
  JSONs without PDF: 0

Domain: law
  PDFs: 40
  JSONs: 40
  Matched pairs: 40
  PDFs without JSON: 0
  JSONs without PDF: 0

Domain: manual
  PDFs: 59
  JSONs: 59
  Matched pairs: 59
  PDFs without JSON: 0
  JSONs without PDF: 0

Domain: news
  PDFs: 66
  JSONs: 66
  Matched pairs: 66
  PDFs without JSON: 0
  JSONs without PDF: 0

Domain: textbook
  PDFs: 41
  JSONs: 41
  Matched pairs: 41
  PDFs without JSON: 0
  JSONs without PDF: 0


In [39]:
#Statistics of the final sample dataset 

import json
from collections import defaultdict

# Load the per-document records of the English, table-bearing subset
with open('artifacts/havetable_english_docs.json', encoding='utf-8') as f:
    docs = json.load(f)

# One counter dict per domain, created the first time the domain is accessed
stats = defaultdict(lambda: dict(docs=0, tables=0, formulas=0, pages=0, tokens=0))

# Fold every document into its domain's counters. A missing 'domain' key is
# filed under 'unknown'; missing numeric fields contribute 0.
for doc in docs:
    bucket = stats[doc.get('domain', 'unknown')]
    bucket['docs'] += 1
    bucket['tables'] += doc.get('table_count', 0)
    bucket['formulas'] += doc.get('formula_count', 0)
    bucket['pages'] += doc.get('num_pages', 0)
    bucket['tokens'] += doc.get('total_token_count', 0)


def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is zero."""
    return numerator / denominator if denominator else 0


def _print_row(label, counts):
    """Print one aligned row: the five raw counters followed by the four averages."""
    print(
        f"{label:<18} {counts['docs']:>5} {counts['tables']:>7} {counts['formulas']:>9} "
        f"{counts['pages']:>7} {counts['tokens']:>10} "
        f"{_ratio(counts['tokens'], counts['docs']):>12.2f} "
        f"{_ratio(counts['tokens'], counts['pages']):>12.2f} "
        f"{_ratio(counts['tables'], counts['docs']):>12.2f} "
        f"{_ratio(counts['formulas'], counts['docs']):>12.2f}"
    )


# Header, one row per domain (in first-seen order), then the grand total
print(f"{'Domain':<18} {'Docs':>5} {'Tables':>7} {'Formulas':>9} {'Pages':>7} {'Tokens':>10} {'AvgTok/Doc':>12} {'AvgTok/Page':>12} {'AvgTab/Doc':>12} {'AvgFor/Doc':>12}")
print('-'*110)
for domain, s in stats.items():
    _print_row(domain, s)

total_docs = sum(s['docs'] for s in stats.values())
total_tables = sum(s['tables'] for s in stats.values())
total_formulas = sum(s['formulas'] for s in stats.values())
total_pages = sum(s['pages'] for s in stats.values())
total_tokens = sum(s['tokens'] for s in stats.values())

print('-'*110)
_print_row('TOTAL', {
    'docs': total_docs,
    'tables': total_tables,
    'formulas': total_formulas,
    'pages': total_pages,
    'tokens': total_tokens,
})

Domain              Docs  Tables  Formulas   Pages     Tokens   AvgTok/Doc  AvgTok/Page   AvgTab/Doc   AvgFor/Doc
--------------------------------------------------------------------------------------------------------------
academic              58     311      4862     765     584394     10075.76       763.91         5.36        83.83
administration        91     386      1988    1054    3039238     33398.22      2883.53         4.24        21.85
finance               61    1758     22040    2112    1953283     32021.03       924.85        28.82       361.31
law                   40     127       655     681     385783      9644.58       566.49         3.17        16.38
manual                59     479      2374    1628     566269      9597.78       347.83         8.12        40.24
news                  66      93       268     118     168233      2548.98      1425.70         1.41         4.06
textbook              41      62       475     172      94158      2296.54       547.43    

In [40]:
import json
from collections import defaultdict

# Load the data
with open('artifacts/havetable_english_docs.json', encoding='utf-8') as f:
    docs = json.load(f)

# Per-domain counters, created the first time a domain is accessed
stats = defaultdict(lambda: {
    'docs': 0,
    'tables': 0,
    'formulas': 0,
    'pages': 0,
    'tokens': 0
})

# Aggregate statistics; a missing 'domain' key is filed under 'unknown' and
# missing numeric fields contribute 0
for doc in docs:
    domain = doc.get('domain', 'unknown')
    stats[domain]['docs'] += 1
    stats[domain]['tables'] += doc.get('table_count', 0)
    stats[domain]['formulas'] += doc.get('formula_count', 0)
    stats[domain]['pages'] += doc.get('num_pages', 0)
    stats[domain]['tokens'] += doc.get('total_token_count', 0)

# Compute totals across all domains
total_docs = total_tables = total_formulas = total_pages = total_tokens = 0
for s in stats.values():
    total_docs += s['docs']
    total_tables += s['tables']
    total_formulas += s['formulas']
    total_pages += s['pages']
    total_tokens += s['tokens']

# The totals in the same shape as one per-domain entry, so the TOTAL row is
# produced by exactly the same code path as the domain rows.
totals = {
    'docs': total_docs,
    'tables': total_tables,
    'formulas': total_formulas,
    'pages': total_pages,
    'tokens': total_tokens,
}


def _averages(counts):
    """Return (tokens/doc, tokens/page, tables/doc, formulas/doc) for one counter dict.

    A zero denominator yields 0 instead of raising ZeroDivisionError.
    """
    n_docs = counts['docs']
    n_pages = counts['pages']
    return (
        counts['tokens'] / n_docs if n_docs else 0,
        counts['tokens'] / n_pages if n_pages else 0,
        counts['tables'] / n_docs if n_docs else 0,
        counts['formulas'] / n_docs if n_docs else 0,
    )


def _markdown_row(label, counts):
    """Format one Markdown table row (newline-terminated) for `label`."""
    avg_tok_doc, avg_tok_page, avg_tab_doc, avg_for_doc = _averages(counts)
    return (
        f"| {label} | {counts['docs']} | {counts['tables']} | {counts['formulas']} | "
        f"{counts['pages']} | {counts['tokens']} | {avg_tok_doc:.2f} | {avg_tok_page:.2f} | "
        f"{avg_tab_doc:.2f} | {avg_for_doc:.2f} |\n"
    )


def _latex_row(label, counts):
    """Format one LaTeX tabular row (terminated by '\\\\' and a newline) for `label`."""
    avg_tok_doc, avg_tok_page, avg_tab_doc, avg_for_doc = _averages(counts)
    return (
        f"{label} & {counts['docs']} & {counts['tables']} & {counts['formulas']} & "
        f"{counts['pages']} & {counts['tokens']} & {avg_tok_doc:.2f} & {avg_tok_page:.2f} & "
        f"{avg_tab_doc:.2f} & {avg_for_doc:.2f} \\\\\n"
    )


# Prepare Markdown table
md = "| Domain | Docs | Tables | Formulas | Pages | Tokens | AvgTok/Doc | AvgTok/Page | AvgTab/Doc | AvgFor/Doc |\n"
md += "|--------|------|--------|----------|-------|--------|------------|-------------|------------|------------|\n"
for domain, s in stats.items():
    md += _markdown_row(domain, s)
md += _markdown_row('TOTAL', totals)

with open('artifacts/final_sample_dataset_statistics.md', 'w', encoding='utf-8') as f:
    f.write(md)

# Prepare LaTeX table. The TOTAL row recomputes its averages from `totals`;
# previously it reused loop variables that still held the last domain's
# averages, so the LaTeX TOTAL row disagreed with the Markdown one.
latex = "\\begin{tabular}{lrrrrrrrrr}\n\\hline\n"
latex += "Domain & Docs & Tables & Formulas & Pages & Tokens & AvgTok/Doc & AvgTok/Page & AvgTab/Doc & AvgFor/Doc \\\\\n\\hline\n"
for domain, s in stats.items():
    latex += _latex_row(domain, s)
latex += "\\hline\n"
latex += _latex_row('TOTAL', totals)
latex += "\\hline\n\\end{tabular}\n"

with open('artifacts/final_sample_dataset_statistics.tex', 'w', encoding='utf-8') as f:
    f.write(latex)

print("Markdown and LaTeX tables have been saved to artifacts/final_sample_dataset_statistics.md and artifacts/final_sample_dataset_statistics.tex")

Markdown and LaTeX tables have been saved to artifacts/final_sample_dataset_statistics.md and artifacts/inal_sample_dataset_statistics.tex


In [48]:
import json
import os

# Read the candidate sample documents (English, with tables)
with open('artifacts/havetable_english_docs.json', encoding='utf-8') as f:
    sample_docs = json.load(f)

pdf_base = os.path.join('..', 'sample data', 'pdf')
gt_base = os.path.join('..', 'sample data', 'ground_truth')

# filtered_docs keeps full records; removed_docs keeps only the names
filtered_docs, removed_docs = [], []

for doc in sample_docs:
    doc_name = doc.get('doc_name')
    if doc_name:
        domain, pdf_file = doc_name.split('/', 1)
        pdf_path = os.path.join(pdf_base, domain, pdf_file + '.pdf')
        json_path = os.path.join(gt_base, domain, pdf_file + '.json')

        # A document stays in the sample only when both of its files exist
        if not (os.path.exists(pdf_path) and os.path.exists(json_path)):
            removed_docs.append(doc_name)
        else:
            filtered_docs.append(doc)

# Save the surviving records next to the sample data
output_path = os.path.join('..', 'sample data', 'final_sample_stats.json')
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(filtered_docs, f, ensure_ascii=False, indent=2)

print(f"Filtered sample doc list written to {output_path} with {len(filtered_docs)} docs.")
if not removed_docs:
    print("All docs in havetable_english_docs.json are present in both PDF and ground truth.")
else:
    print(f"Removed {len(removed_docs)} docs not present in both PDF and ground truth:")
    for doc_name in removed_docs:
        print(f"  {doc_name}")

Filtered sample doc list written to ..\sample data\final_sample_stats.json with 408 docs.
Removed 8 docs not present in both PDF and ground truth:
  academic/DUDE_06ea52c33812c8d04da69910ace53113
  news/05_10
  news/06_08
  textbook/biz-96816.pdf_48
  textbook/human-20608.pdf_383
  textbook/omnidocbench_notes_f7f010b78016aeebd76e56d9283eb67f_49
  textbook/omnidocbench_notes_f7f010b78016aeebd76e56d9283eb67f_50
  textbook/socialsci-145347.pdf_16


In [49]:
#Final Check for matches in the sample ground truth and pdf files and final_sample_stats...
import json
import os

pdf_base = os.path.join('..', 'sample data', 'pdf')
gt_base = os.path.join('..', 'sample data', 'ground_truth')
final_stats_path = os.path.join('..', 'sample data', 'final_sample_stats.json')


def _doc_names_in(base, extension):
    """Return '{domain}/{stem}' for every file in base/<domain>/ ending in `extension`.

    The extension test is case-insensitive. Entries directly under `base`
    that are not directories are ignored, and a missing `base` yields an
    empty set.
    """
    found = set()
    if not os.path.exists(base):
        return found
    for domain in os.listdir(base):
        domain_dir = os.path.join(base, domain)
        if not os.path.isdir(domain_dir):
            continue
        for fname in os.listdir(domain_dir):
            if fname.lower().endswith(extension):
                found.add(f"{domain}/{os.path.splitext(fname)[0]}")
    return found


# Doc names as seen on disk
pdf_doc_names = _doc_names_in(pdf_base, '.pdf')
gt_doc_names = _doc_names_in(gt_base, '.json')

# Doc names as listed in final_sample_stats.json
with open(final_stats_path, encoding='utf-8') as f:
    final_stats = json.load(f)
final_stats_doc_names = {doc['doc_name'] for doc in final_stats if 'doc_name' in doc}

# Pairwise differences between the three sources
pdf_not_in_gt = pdf_doc_names - gt_doc_names
gt_not_in_pdf = gt_doc_names - pdf_doc_names
pdf_not_in_stats = pdf_doc_names - final_stats_doc_names
stats_not_in_pdf = final_stats_doc_names - pdf_doc_names
gt_not_in_stats = gt_doc_names - final_stats_doc_names
stats_not_in_gt = final_stats_doc_names - gt_doc_names

print(f"Total PDFs: {len(pdf_doc_names)}")
print(f"Total ground truth JSONs: {len(gt_doc_names)}")
print(f"Total docs in final_sample_stats.json: {len(final_stats_doc_names)}")

# Each non-empty difference is listed under its own heading, in this order
mismatch_reports = [
    ("\nPDFs without matching ground truth JSON:", pdf_not_in_gt),
    ("\nGround truth JSONs without matching PDF:", gt_not_in_pdf),
    ("\nPDFs not in final_sample_stats.json:", pdf_not_in_stats),
    ("\nDocs in final_sample_stats.json not in PDFs:", stats_not_in_pdf),
    ("\nGround truth JSONs not in final_sample_stats.json:", gt_not_in_stats),
    ("\nDocs in final_sample_stats.json not in ground truth JSONs:", stats_not_in_gt),
]
for heading, names in mismatch_reports:
    if names:
        print(heading)
        for name in sorted(names):
            print(f"  {name}")

if any(names for _, names in mismatch_reports):
    print("\nThere are unmatched files as listed above.")
else:
    print("\nAll files in sample data/ground_truth, pdf, and final_sample_stats.json are fully matched!")

Total PDFs: 408
Total ground truth JSONs: 408
Total docs in final_sample_stats.json: 408

All files in sample data/ground_truth, pdf, and final_sample_stats.json are fully matched!


In [52]:
import json
import os

# Load the final sample doc list
with open(os.path.join('..', 'sample data', 'final_sample_stats.json'), encoding='utf-8') as f:
    sample_docs = json.load(f)

# Unique doc names in the order they appear in final_sample_stats.json.
# The retrieval loop below walks this list rather than the set: iterating a
# set of strings visits elements in an order that can change between kernel
# starts (string hashing is randomized), which made the record order of
# sample_qa.json differ from run to run.
ordered_sample_doc_names = list(dict.fromkeys(
    doc['doc_name'] for doc in sample_docs if 'doc_name' in doc
))
sample_doc_names = set(ordered_sample_doc_names)
num_sample_docs = len(sample_doc_names)

# Load the QA dataset
with open(os.path.join('..', 'data', 'qas_v2.json'), encoding='utf-8') as f:
    qa_data = json.load(f)

# If qa_data is a dict, get its values (in case it's a mapping)
if isinstance(qa_data, dict):
    qa_items = list(qa_data.values())
else:
    qa_items = qa_data

# Build doc_name -> list of QAs mapping for all QAs that carry a non-empty doc_name
qa_by_doc = {}
for qa in qa_items:
    doc_name = qa.get('doc_name')
    if doc_name:
        qa_by_doc.setdefault(doc_name, []).append(qa)

# Retrieve QAs for sample docs
sample_qa = []
docs_with_qa = set()
docs_with_chart_qa = set()
chart_qa = []
docs_with_chart_qa_not_retrieved = set()
all_chart_qa = []
docs_with_non_chart_qa = set()
non_chart_qa = []

for doc_name in ordered_sample_doc_names:
    qas = qa_by_doc.get(doc_name, [])
    if qas:
        docs_with_qa.add(doc_name)
        sample_qa.extend(qas)
        # Chart-evidence QAs for this doc
        chart_qas = [qa for qa in qas if qa.get('evidence_source') == 'chart']
        if chart_qas:
            docs_with_chart_qa.add(doc_name)
            chart_qa.extend(chart_qas)
        # Non-chart-evidence QAs for this doc (includes QAs without an 'evidence_source')
        non_chart_qas = [qa for qa in qas if qa.get('evidence_source') != 'chart']
        if non_chart_qas:
            docs_with_non_chart_qa.add(doc_name)
            non_chart_qa.extend(non_chart_qas)
    else:
        # qa_by_doc indexes every QA with a truthy doc_name, so this scan can
        # only find something when doc_name itself is falsy (e.g. '' or None).
        orig_chart_qas = [qa for qa in qa_items if qa.get('doc_name') == doc_name and qa.get('evidence_source') == 'chart']
        if orig_chart_qas:
            docs_with_chart_qa_not_retrieved.add(doc_name)
            all_chart_qa.extend(orig_chart_qas)

# For all QAs in the original dataset, count chart-evidence QAs for sample docs
all_chart_qa_for_sample = [qa for qa in qa_items if qa.get('doc_name') in sample_doc_names and qa.get('evidence_source') == 'chart']
all_non_chart_qa_for_sample = [qa for qa in qa_items if qa.get('doc_name') in sample_doc_names and qa.get('evidence_source') != 'chart']

# For all QAs in the original dataset, count unique doc_names
all_qa_doc_names = set(qa.get('doc_name') for qa in qa_items if 'doc_name' in qa)
all_chart_qa_doc_names = set(qa.get('doc_name') for qa in qa_items if qa.get('evidence_source') == 'chart')
all_non_chart_qa_doc_names = set(qa.get('doc_name') for qa in qa_items if qa.get('evidence_source') != 'chart')

# Docs in sample set with no QA
docs_without_qa = sorted(sample_doc_names - docs_with_qa)

print(f"Retrieved {len(sample_qa)} QA pairs for {len(docs_with_qa)} documents out of {num_sample_docs} documents in our sample dataset.")

if docs_without_qa:
    print(f"\nThere are {len(docs_without_qa)} documents in our sample set (out of {num_sample_docs} total) with no QA pairs in qas_v2.json:")
    for doc_name in docs_without_qa:
        print(f"  {doc_name}")

print(f"\nThere are {len(chart_qa)} chart-evidence QA pairs for {len(docs_with_chart_qa)} documents in our sample dataset with chart-evidence QAs retrieved.")

if docs_with_chart_qa_not_retrieved:
    # all_chart_qa holds exactly the chart QAs of the not-retrieved docs, so
    # its length is the figure to report; the previous version subtracted the
    # unrelated count of retrieved chart QAs from it.
    print(f"\nThere are {len(all_chart_qa)} chart-evidence QA pairs for {len(docs_with_chart_qa_not_retrieved)} documents in our sample dataset that were NOT retrieved (because those docs have no QA at all in the sample set):")
    for doc_name in docs_with_chart_qa_not_retrieved:
        print(f"  {doc_name}")

print(f"\nA total of {len(non_chart_qa)} QA pairs with no chart evidence were retrieved for {len(docs_with_non_chart_qa)} documents in our sample dataset.")
print(f"In the original QA dataset, there are {len(all_non_chart_qa_for_sample)} non-chart-evidence QA pairs for {len(all_non_chart_qa_doc_names & sample_doc_names)} documents in our sample set, and {len(all_non_chart_qa_doc_names)} total docs in qas_v2.json with non-chart-evidence QAs.")

print(f"\nIn the original QA dataset, there are {len(all_chart_qa_for_sample)} chart-evidence QA pairs for {len(all_chart_qa_doc_names & sample_doc_names)} documents in our sample set, and {len(all_chart_qa_doc_names)} total docs in qas_v2.json with chart-evidence QAs.")

print("\nSummary:")
print(f"  - Sample docs: {num_sample_docs}")
print(f"  - Sample docs with at least one QA: {len(docs_with_qa)}")
print(f"  - Sample docs with chart-evidence QA: {len(docs_with_chart_qa)}")
print(f"  - Sample docs with non-chart-evidence QA: {len(docs_with_non_chart_qa)}")
print(f"  - Sample docs with no QA: {len(docs_without_qa)}")
print(f"  - Total QA pairs retrieved: {len(sample_qa)}")
print(f"  - Chart-evidence QA pairs retrieved: {len(chart_qa)}")
print(f"  - Non-chart-evidence QA pairs retrieved: {len(non_chart_qa)}")

# Write the filtered QA set to the sample data folder
output_path = os.path.join('..', 'sample data', 'sample_qa.json')
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(sample_qa, f, ensure_ascii=False, indent=2)

print(f"\nSample QA dataset written to {output_path}")

Retrieved 5728 QA pairs for 408 documents out of 408 documents in our sample dataset.

There are 765 chart-evidence QA pairs for 35 documents in our sample dataset with chart-evidence QAs retrieved.

A total of 4963 QA pairs with no chart evidence were retrieved for 408 documents in our sample dataset.
In the original QA dataset, there are 4963 non-chart-evidence QA pairs for 408 documents in our sample set, and 1117 total docs in qas_v2.json with non-chart-evidence QAs.

In the original QA dataset, there are 765 chart-evidence QA pairs for 35 documents in our sample set, and 35 total docs in qas_v2.json with chart-evidence QAs.

Summary:
  - Sample docs: 408
  - Sample docs with at least one QA: 408
  - Sample docs with chart-evidence QA: 35
  - Sample docs with non-chart-evidence QA: 408
  - Sample docs with no QA: 0
  - Total QA pairs retrieved: 5728
  - Chart-evidence QA pairs retrieved: 765
  - Non-chart-evidence QA pairs retrieved: 4963

Sample QA dataset written to ..\sample dat

In [54]:
# Final check that the sample ground truth files, the PDF files, and sample_qa.json all cover the same documents
import json
import os

pdf_base = os.path.join('..', 'sample data', 'pdf')
gt_base = os.path.join('..', 'sample data', 'ground_truth')
qa_path = os.path.join('..', 'sample data', 'sample_qa.json')


def _doc_names_under(base, ext):
    """Return '{domain}/{stem}' for every file in base/<domain>/ whose name ends in `ext`.

    The suffix test is case-insensitive; non-directory entries directly
    under `base` are skipped and a missing `base` yields an empty set.
    """
    if not os.path.exists(base):
        return set()
    return {
        f"{domain}/{os.path.splitext(fname)[0]}"
        for domain in os.listdir(base)
        if os.path.isdir(os.path.join(base, domain))
        for fname in os.listdir(os.path.join(base, domain))
        if fname.lower().endswith(ext)
    }


# Doc names present on disk
pdf_doc_names = _doc_names_under(pdf_base, '.pdf')
gt_doc_names = _doc_names_under(gt_base, '.json')

# Doc names referenced by at least one record of sample_qa.json
with open(qa_path, encoding='utf-8') as f:
    qa_data = json.load(f)
qa_doc_names = {qa['doc_name'] for qa in qa_data if 'doc_name' in qa}

# Pairwise differences between the three sources
pdf_not_in_gt = pdf_doc_names - gt_doc_names
gt_not_in_pdf = gt_doc_names - pdf_doc_names
pdf_not_in_qa = pdf_doc_names - qa_doc_names
qa_not_in_pdf = qa_doc_names - pdf_doc_names
gt_not_in_qa = gt_doc_names - qa_doc_names
qa_not_in_gt = qa_doc_names - gt_doc_names

print(f"Total PDFs: {len(pdf_doc_names)}")
print(f"Total ground truth JSONs: {len(gt_doc_names)}")
print(f"Total docs in sample_qa.json: {len(qa_doc_names)}")

# Report each non-empty difference under its heading; any of them is an error
error = False
for heading, unmatched in (
    ("\nPDFs without matching ground truth JSON:", pdf_not_in_gt),
    ("\nGround truth JSONs without matching PDF:", gt_not_in_pdf),
    ("\nPDFs not in sample_qa.json:", pdf_not_in_qa),
    ("\nDocs in sample_qa.json not in PDFs:", qa_not_in_pdf),
    ("\nGround truth JSONs not in sample_qa.json:", gt_not_in_qa),
    ("\nDocs in sample_qa.json not in ground truth JSONs:", qa_not_in_gt),
):
    if not unmatched:
        continue
    print(heading)
    for name in sorted(unmatched):
        print(f"  {name}")
    error = True

if error:
    print("\nERROR: There are unmatched files as listed above. Please check your sample data consistency.")
else:
    print("\nAll files in sample data/ground_truth, pdf, and sample_qa.json are fully matched!")

Total PDFs: 408
Total ground truth JSONs: 408
Total docs in sample_qa.json: 408

All files in sample data/ground_truth, pdf, and sample_qa.json are fully matched!


In [56]:
import json
from collections import defaultdict, Counter

# Read the records of the final sample
with open('../sample data/final_sample_stats.json', encoding='utf-8') as f:
    docs = json.load(f)

FIELDS = ['docs', 'tables', 'formulas', 'pages', 'tokens']

# One zeroed counter dict per domain, created on first access
stats = defaultdict(lambda: dict.fromkeys(FIELDS, 0))

# Accumulate per-domain counters. A missing 'domain' key is filed under
# 'unknown'; missing numeric fields contribute 0.
for doc in docs:
    entry = stats[doc.get('domain', 'unknown')]
    entry['docs'] += 1
    entry['tables'] += doc.get('table_count', 0)
    entry['formulas'] += doc.get('formula_count', 0)
    entry['pages'] += doc.get('num_pages', 0)
    entry['tokens'] += doc.get('total_token_count', 0)

# Grand totals over all domains, field by field
total = defaultdict(int)
for k in FIELDS:
    for s in stats.values():
        total[k] += s[k]


def _format_row(label, counts):
    """Return one aligned row: the five raw counters followed by the four averages.

    Averages fall back to 0 when their denominator (docs or pages) is zero.
    """
    n_docs, n_pages = counts['docs'], counts['pages']
    avg_tok_doc = counts['tokens'] / n_docs if n_docs else 0
    avg_tok_page = counts['tokens'] / n_pages if n_pages else 0
    avg_tab_doc = counts['tables'] / n_docs if n_docs else 0
    avg_for_doc = counts['formulas'] / n_docs if n_docs else 0
    return (f"{label:<18} {counts['docs']:>5} {counts['tables']:>7} {counts['formulas']:>9} {counts['pages']:>7} {counts['tokens']:>10} "
            f"{avg_tok_doc:>12.2f} {avg_tok_page:>12.2f} {avg_tab_doc:>12.2f} {avg_for_doc:>12.2f}")


# Header, separator sized to the header, domain rows, separator, TOTAL row
header = (
    "Domain              Docs  Tables  Formulas   Pages     Tokens   AvgTok/Doc  AvgTok/Page   AvgTab/Doc   AvgFor/Doc"
)
sep = "-" * len(header)
print(header)
print(sep)
for domain, s in stats.items():
    print(_format_row(domain, s))
print(sep)
print(_format_row('TOTAL', total))

Domain              Docs  Tables  Formulas   Pages     Tokens   AvgTok/Doc  AvgTok/Page   AvgTab/Doc   AvgFor/Doc
-----------------------------------------------------------------------------------------------------------------
academic              57     310      4857     764     583805     10242.19       764.14         5.44        85.21
administration        91     386      1988    1054    3039238     33398.22      2883.53         4.24        21.85
finance               61    1758     22040    2112    1953283     32021.03       924.85        28.82       361.31
law                   40     127       655     681     385783      9644.58       566.49         3.17        16.38
manual                59     479      2374    1628     566269      9597.78       347.83         8.12        40.24
news                  64      91       263     116     163961      2561.89      1413.46         1.42         4.11
textbook              36      56       445     167      90525      2514.58       542.07 