In [None]:
from sci_review.data_base import *
import jsonlines
from sci_review.agentic_rag import *
from tqdm import tqdm

Try a range of different questions. Besides extracting information into a taxonomy, I am converting the following datasets into test questions:
+ ACLSum (Summarize the Challenge, Approach, and Outcome in the paper)
+ SciREX (extract the main results of a scientific article, including Dataset, Metric, Task and Method)
+ arxivDIGESTables (given a table schema for a literature survey, extract the targeted values from scientific papers to fill in the table)

In [None]:
# Shared objects used by the experiments below.

# Vocabulary: one entry per line of words_alpha.txt, handed to DocManager as `word_vocab`.
with open('words_alpha.txt') as vocab_file:
    vocab_lines = vocab_file.read().splitlines()
words_alpha = set(vocab_lines)

# The agent is given a reference to the very same DocManager instance created here.
doc_manager = DocManager(word_vocab=words_alpha)
agentic_rag = AgenticRAG()
agentic_rag.doc_manager = doc_manager

eval_metrics = EvalMetrics()

## ACLSum

### Normalize Data Samples

In [3]:
from aclsum import ACLSum

# Load per split ("train", "val", "test")
split = "train"
train = ACLSum(split)

# Section headers kept as the document text, in outline order.
ACLSUM_SECTIONS = ['Abstract', 'Introduction', 'Conclusion']

# One question per ACLSum aspect; the dict keys double as the question types
# and their order is the order written to `question_types`.
ACLSUM_QUESTIONS = {
    'challenge': 'Summarize the challenge of the paper, which is the current situation faced by the researcher. It will normally include a Problem Statement, the Motivation, a Hypothesis and/or a Goal.',
    'approach': 'Summarize the approach of the paper: How they intend to carry out the investigation, comments on a theoretical model or framework.',
    'outcome': 'Summarize the outcome of the paper: Overall conclusion that should reject or support the research hypothesis.',
}


def clean_aclsum_text(text):
    """Apply the two DocManager clean-up passes used for every ACLSum string.

    `DocManager.remove_space_before_punct` runs first and
    `DocManager.remove_citations` is applied to its result.
    """
    return DocManager.remove_citations(DocManager.remove_space_before_punct(text))


def build_aclsum_sample(doc):
    """Convert one ACLSum document into a `Sample`, returned as a plain dict.

    Args:
        doc: one item yielded by iterating an `ACLSum` split.

    Returns:
        The result of `Sample(...).model_dump()` for this document.
    """
    # doc_strs alternates header, cleaned section text, header, ... in outline order.
    doc_strs = []
    for header in ACLSUM_SECTIONS:
        # get_all_sentences is called with the lowercase section name.
        section_text = ' '.join(doc.get_all_sentences([header.lower()]))
        doc_strs += [header, clean_aclsum_text(section_text)]

    aspects = list(ACLSUM_QUESTIONS)
    return Sample(
        doc_file=f'https://aclanthology.org/{doc.id}.pdf',
        doc_strs=doc_strs,
        outline='\n'.join(ACLSUM_SECTIONS),
        question_types=aspects,
        questions=dict(ACLSUM_QUESTIONS),
        answers={aspect: doc.summaries[aspect] for aspect in aspects},
        extractions={
            aspect: [clean_aclsum_text(sent) for sent in doc.get_all_highlighted_sentences(aspect)]
            for aspect in aspects
        },
    ).model_dump()


aclsum_dataset = [build_aclsum_sample(doc) for doc in train]

# One JSON object per line; read back by the experiment cells below.
with jsonlines.open(f'../../data/ACLSum/{split}_dataset.jsonl', 'w') as f_out:
    f_out.write_all(aclsum_dataset)

In [None]:
# Peek at the sentences highlighted for the 'challenge' aspect of the first document in the split.
train[0].get_all_highlighted_sentences('challenge')

### Test Experiments

In [2]:
# Dataset config
split = 'train'  # selects ../../data/ACLSum/{split}_dataset.jsonl
load_from_pdf = False  # True: load the downloaded PDF; False: load the stored doc_strs/outline of each sample
question_type = 'challenge'  # key into sample.questions (and sample.extractions)

# Chunk config
sent_chunk = True  # forwarded to doc_manager.build_chunks(sent_chunk=...)
max_seq_len = None  # forwarded to doc_manager.build_chunks(max_seq_length=...)
k = 10  # forwarded to RetrieveByDenseRetrieval; presumably the number of chunks retrieved — TODO confirm
# Alternative chunk settings (disabled). All of these values are part of the
# generation file name, so each setting writes to its own output file.
# Alternative 1:
# sent_chunk = False
# max_seq_len = None
# k = 10
# Alternative 2:
# sent_chunk = False
# max_seq_len = 100
# k = 10

In [None]:
# Read every normalized sample first so the file handle is released before the
# long-running generation loop starts.
with jsonlines.open(f'../../data/ACLSum/{split}_dataset.jsonl') as f_in:
    aclsum_dataset = [Sample.model_validate(line) for line in f_in]

# NOTE(review): `os` is not imported explicitly in the visible notebook; presumably it is
# re-exported by one of the star imports at the top — confirm.
generation_dir = '../../data/ACLSum/generation'
os.makedirs(generation_dir, exist_ok=True)  # make sure the output directory exists before dumping into it

for sid, sample in enumerate(tqdm(aclsum_dataset)):
    if load_from_pdf:
        # Load from full pdf
        pdf_name = sample.doc_file.split('/')[-1]
        doc_file = f"../../data/ACLSum/{pdf_name}"
        outline_file = f"../../data/ACLSum/outline_{pdf_name.replace('.pdf', '.txt')}"
        if not os.path.exists(doc_file):
            download_file(sample.doc_file, doc_file)
        # Reuse a previously saved outline when one exists next to the PDF.
        if os.path.exists(outline_file):
            with open(outline_file) as f:
                outline = f.read()
        else:
            outline = None
        doc_manager.load_doc(doc_file, outline)
        if not outline:
            # No (or empty) saved outline: store the one DocManager exposes after loading.
            with open(outline_file, 'w') as f:
                f.write(doc_manager.full_outline)
    else:
        # Load from partial text
        doc_manager.load_doc(doc_strs=sample.doc_strs, outline=sample.outline)

    # NOTE: this name is rebound on every iteration, so after the loop it only
    # describes the LAST sample that was processed.
    unique_ngram2sent = get_sent_index([sent.text for sent in doc_manager.sents])
    doc_manager.build_chunks(sent_chunk=sent_chunk, max_seq_length=max_seq_len)
    agentic_rag.load_langgraph([RetrieveByDenseRetrieval(doc_manager, k), RewriteQuestion])
    process = agentic_rag.invoke(sample.questions[question_type])
    # The file name encodes the full configuration, so different settings do not overwrite each other.
    process_file = f'{generation_dir}/{split}_{sid}_{question_type}_{load_from_pdf}_{sent_chunk}_{max_seq_len}_{k}.json'
    AgenticRAG.dump_process(process, process_file)

In [3]:
# Sample index whose saved run is loaded for inspection; it fills the same slot of the
# file name that the generation loop fills with `sid`.
inspect_sid = 3
process_file = f'../../data/ACLSum/generation/{split}_{inspect_sid}_{question_type}_{load_from_pdf}_{sent_chunk}_{max_seq_len}_{k}.json'
process = AgenticRAG.load_process(process_file)

In [None]:
# Display the object returned by AgenticRAG.load_process.
process

In [None]:
# Take the first entry of `process`, its 'agent' -> 'messages' list, and dump the
# first message of that list via model_dump() for inspection.
message = process[0]['agent']['messages'][0].model_dump()
message

In [None]:
# Compare the retrieved sentences with the gold extractions for `question_type`
# using eval_metrics.eval_precision_recall_f1.
# NOTE(review): `passages` is not defined anywhere in the visible notebook — presumably the
#   retrieved passages taken out of `process`; TODO define it before this cell.
# NOTE(review): `test_sample` is first assigned further down (SciREX section) from a raw
#   JSON record, which is used as a dict there and has no `.extractions` attribute;
#   presumably the ACLSum `Sample` behind `process` was intended — TODO confirm.
# NOTE(review): `unique_ngram2sent` holds whatever the last iteration of the generation
#   loop left behind, which may not be the sample whose process file was loaded.
# NOTE(review): `sent_tokenize` is not imported explicitly in the visible notebook;
#   presumably it arrives through a star import — confirm.
# With sentence-level chunks the passages are used as-is; otherwise every passage is
# split with sent_tokenize and the pieces are flattened into one list.
retrieved_sents = passages if sent_chunk else [sent for passage in passages for sent in sent_tokenize(passage)]
retrieved_sent_ids = get_binary_sent_ids(retrieved_sents, unique_ngram2sent)
gold_sent_ids = get_binary_sent_ids(test_sample.extractions[question_type], unique_ngram2sent)
eval_metrics.eval_precision_recall_f1(predictions=retrieved_sent_ids, references=gold_sent_ids)

## SciREX

+ Salient Entity Extraction
  + Extract the salient Dataset, Method, Task and Metric of the paper.
  + An entity is extracted if one of its mentions is returned.
+ Salient Entity Mention Extraction
  + Extract the sentences where a salient entity's mention appears.
  + An entity mention is extracted if the sentence containing the mention is extracted.
+ Salient N-ary Relation Extraction
  + Extract the Dataset, Method, Task and Metric tuples that are bound together.

### Dataset Observation and Preparation

In [2]:
# Parse every line of the SciREX training file into one record and keep them all in memory.
with jsonlines.open('../../data/SciREX/train.jsonl') as scirex_reader:
    scirex_dataset = [record for record in scirex_reader]

In [3]:
# Use the first SciREX record as the running example for the inspection cells below.
test_sample = scirex_dataset[0]

In [None]:
# List the top-level fields of the example record.
test_sample.keys()

In [None]:
# Show the record's 'n_ary_relations' field (presumably the gold Dataset/Method/Task/Metric tuples — TODO confirm).
test_sample['n_ary_relations']

In [None]:
# Show the record's 'doc_id'; the download cell below passes this value to the Semantic Scholar paper endpoint.
test_sample['doc_id']

In [None]:
# Look at the element at index 12 of the record's 'words' field (presumably a list of tokens — TODO confirm).
test_sample['words'][12]

In [18]:
import requests
from time import sleep
from tqdm import tqdm

S2_PAPER_URL = 'https://api.semanticscholar.org/graph/v1/paper/{doc_id}'
S2_MAX_ATTEMPTS = 10   # stop retrying a paper after this many requests
S2_RETRY_WAIT = 10     # seconds to wait before each retry
S2_TIMEOUT = 30        # seconds before a single request is abandoned
ARXIV_DELAY = 2        # seconds to pause after each PDF download
# NOTE(review): the loop has always started at the third record; presumably the first
# two were already handled by an earlier run — confirm before changing.
FIRST_SAMPLE_INDEX = 2


def fetch_external_ids(doc_id):
    """Look up the external identifiers Semantic Scholar reports for `doc_id`.

    The request is retried (with a pause) while the response carries no
    'externalIds' field, but only up to S2_MAX_ATTEMPTS times, so a paper that
    can never be resolved no longer blocks the whole run.

    Args:
        doc_id: the identifier placed in the paper endpoint URL.

    Returns:
        The 'externalIds' mapping (an empty dict if the API returned null),
        or None when no usable response was obtained.
    """
    for attempt in range(S2_MAX_ATTEMPTS):
        if attempt:
            sleep(S2_RETRY_WAIT)
        try:
            response = requests.get(
                S2_PAPER_URL.format(doc_id=doc_id),
                params={'fields': 'externalIds'},
                timeout=S2_TIMEOUT,
            )
            paper_meta = response.json()
        except (requests.RequestException, ValueError):
            # Network failure or a body that is not JSON: treat like any other failed attempt.
            continue
        if 'externalIds' in paper_meta:
            return paper_meta['externalIds'] or {}
        if response.status_code == 404:
            # The paper does not exist on the server; retrying cannot help.
            return None
    return None


failed_ids = []
for sample in tqdm(scirex_dataset[FIRST_SAMPLE_INDEX:]):
    external_ids = fetch_external_ids(sample['doc_id'])
    # Only papers with an arXiv identifier can be downloaded; everything else is logged.
    if not external_ids or 'ArXiv' not in external_ids:
        failed_ids.append(sample['doc_id'])
        continue
    download_file(f"https://arxiv.org/pdf/{external_ids['ArXiv']}", f"../../data/SciREX/pdfs/{sample['doc_id']}.pdf")
    sleep(ARXIV_DELAY)
# Record every doc_id for which no PDF was downloaded, one per line.
with open('../../data/SciREX/failed_ids.txt', 'w') as f_out:
    f_out.write('\n'.join(failed_ids))

In [None]:
# Rebind `doc_manager` to a fresh DocManager built without a word vocabulary.
# NOTE(review): `agentic_rag.doc_manager` was assigned the earlier instance and is not
# updated here, so the agent and this name now refer to different objects — confirm this is intended.
doc_manager = DocManager()

In [None]:
# Path where the download cell stores the PDF of the example record.
test_file = f"../../data/SciREX/pdfs/{test_sample['doc_id']}.pdf"
if os.path.exists(test_file):
    doc_manager.load_doc(test_file)
else:
    # Say so explicitly: otherwise the cells below would silently show whatever
    # doc_manager held before (or nothing at all).
    print(f"PDF not found, nothing loaded: {test_file}")

In [None]:
# Print the outline DocManager exposes for the loaded document.
print(doc_manager.outline)

In [None]:
# Show the text of the section whose header is '6. Conclusion'.
# NOTE(review): the header string is hard-coded, so this presumably only works for a document
# whose outline contains exactly that header — confirm against the printed outline.
doc_manager.get_section_by_header('6. Conclusion').text

In [None]:
# Display the PDF path built from the example record's doc_id.
test_file

In [None]:
# Show the text of the section at index 5 of doc_manager.sections.
doc_manager.sections[5].text

In [None]:
# Show the first block of the section at index 3 of doc_manager.sections.
doc_manager.sections[3].blocks[0]

In [None]:
# Disabled alternative: rebuild the document text from the SciREX record itself by
# joining the words of each section range and dropping a leading 'section :' marker.
# doc_strs = list[str]()
# for section_range in test_sample['sections']:
#     section_words = test_sample['words'][section_range[0]:section_range[1]]
#     if section_words[0] == 'section' and section_words[1] == ':':
#         section_words = section_words[2:]
#     doc_strs.append(' '.join(section_words))

# URL left over from the original cell, where it was a bare string expression with no effect:
# https://arxiv.org/pdf/2210.14427
doc_manager.load_doc(doc_file='https://arxiv.org/pdf/1611.08323')

In [None]:
# Show the example record's 'sentences' field.
test_sample['sentences']

In [None]:
# Show the example record's 'method_subrelations' field.
test_sample['method_subrelations']

## Self-RAG

## Plan and Solve

## CoT