In [1]:
import pandas as pd
import json

# Load the DOIs used in the research set.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's locale-dependent default encoding.
with open("data/research_dois_used.json", encoding="utf-8") as f:
    research_dois = json.load(f)  # presumably a JSON array of DOI strings; verify against the file

# A set removes duplicates; the printed number is the count of distinct entries.
dois_used = set(research_dois)
print(len(dois_used))

10027


In [None]:
# Read the combined findings file (JSON Lines: one JSON document per line)
# into a DataFrame and report how many rows were loaded.
findings = pd.read_json(path_or_buf="data/findings/combined.jsonl", lines=True)
print(findings.shape[0])

In [None]:
# Load the research records (JSON Lines) and normalise the publication date.
research = pd.read_json("data/research_used.jsonl", lines=True)
# Strip the dashes from each pubdate string and parse the remainder as an int
# (e.g. "2020-01-15" -> 20200115), giving a compact, sortable numeric key.
research["pubdate"] = research["pubdate"].apply(lambda x: int(x.replace("-", "")))

# build a lookup table for research by doi: {doi: {column: value, ...}}
# `to_dict(orient="index")` builds the row dicts directly, avoiding a full
# transpose of the frame (which is slow and funnels every column through one
# shared dtype). That orient requires a unique index, so duplicate DOIs are
# collapsed first; keep="last" matches the previous behaviour, where a later
# row for the same DOI overwrote an earlier one.
research_lookup = (
    research.drop_duplicates(subset="doi", keep="last")
    .set_index("doi")
    .to_dict(orient="index")
)
print(len(research_lookup))

In [None]:
# Report, for every distinct DOI in the findings, whether a matching research
# record exists in the lookup table.
for doi in findings["doi"].unique():
    if doi in research_lookup:
        # Known DOI: show the research record it maps to
        research_data = research_lookup[doi]
        print(f"DOI {doi} found: {research_data}")
        continue
    print(f"DOI {doi} not found in research lookup table.")

In [None]:
# Sanity-check the findings frame before flattening it: column names, a
# two-row preview, the shape of the "findings" cell in the first rows, and the
# total number of items we expect to end up with.
print("Columns in findings DataFrame:", findings.columns.tolist())
print("Sample of first few rows:", findings.head(2), sep="\n")

# Peek at the "findings" cell of up to the first three rows
print("\nSample findings data:")
for i in range(min(3, len(findings))):
    row_findings = findings.iloc[i]["findings"]
    print(
        f"Row {i}: type={type(row_findings)}, length={len(row_findings) if hasattr(row_findings, '__len__') else 'N/A'}"
    )
    # Only sized, non-empty values have a first item to show
    if not hasattr(row_findings, "__len__") or not len(row_findings):
        continue
    print(f"  First item: {row_findings[0]}")

# Sum the per-row item counts; values without a length contribute zero
total_expected = findings["findings"].map(lambda item: 0 if not hasattr(item, "__len__") else len(item)).sum()
print(f"\nExpected total records: {total_expected}")

In [None]:
# Flatten the findings into one record per finding, enriched with the
# publication date of the paper it came from.
prepped_records = []  # flat list of {"text", "doi", "pubdate"} dicts
record_count_by_paper = []  # number of records produced per processed paper

for row in findings.itertuples():
    doi = row.doi
    # Papers without a research record cannot be given a pubdate; skip them.
    if doi not in research_lookup:
        print(f"Warning: DOI {doi} not found in research_lookup")
        continue

    paper_record = research_lookup[doi]
    pubdate = paper_record["pubdate"]
    contributions = row.findings

    # Debug: check the contributions
    # Values without a length (e.g. a missing value loaded as NaN) cannot be
    # expanded into records, so they are reported and skipped.
    if not hasattr(contributions, "__len__"):
        print(f"Warning: contributions for {doi} is not iterable: {type(contributions)}")
        continue

    records = [{"text": contribution, "doi": doi, "pubdate": pubdate} for contribution in contributions]

    record_count_by_paper.append(len(records))
    prepped_records.extend(records)

print(f"Total prepped records: {len(prepped_records)}")
print(f"Papers processed: {len(record_count_by_paper)}")
# The per-paper statistics are undefined when nothing was processed: the mean
# would divide by zero and min()/max() raise on an empty list.
if record_count_by_paper:
    print(f"Average findings per paper: {sum(record_count_by_paper) / len(record_count_by_paper):.2f}")
    print(f"Min findings per paper: {min(record_count_by_paper)}")
    print(f"Max findings per paper: {max(record_count_by_paper)}")
else:
    print("No papers processed; skipping per-paper statistics.")

In [None]:
# Persist the prepped records for the database as JSON Lines:
# one JSON-encoded record per line.
with open("data/prepped_findings.jsonl", "w") as sink:
    for record in prepped_records:
        print(json.dumps(record), file=sink)

print(f"Saved {len(prepped_records)} records to data/prepped_findings.jsonl")