In [1]:
# from sci_review.self_rag import SelfRAG
from sci_review.base import Sample
import jsonlines
from sci_review.paper import *

/Users/kerui2/miniconda3/envs/longdoc/lib/python3.11/site-packages


Try a range of different questions. Besides extracting information into a taxonomy, I am converting the following datasets into test questions:
+ ACLSum (Summarize the Challenge, Approach, and Outcome in the paper)
+ SciREX (extract the main results of a scientific article including Dataset, Metric, Task and Method)
+ arxivDIGESTables (given a table schema for a literature survey, extract targeted values from scientific papers to fill in the table)

## ACLSum

In [2]:
from aclsum import ACLSum

# ACLSum is loaded one split at a time: "train", "val" or "test".
train = ACLSum("train")

# The same three prompts are asked of every paper; they line up one-to-one
# with the summary aspects listed in ACLSUM_ASPECTS.
ACLSUM_QUESTIONS = (
    'Summarize the challenge of the paper, which is the current situation faced by the researcher. It will normally include a Problem Statement, the Motivation, a Hypothesis and/or a Goal.',
    'Summarize the approach of the paper: How they intend to carry out the investigation, comments on a theoretical model or framework.',
    'Summarize the outcome of the paper: Overall conclusion that should reject or support the research hypothesis.',
)
ACLSUM_ASPECTS = ('challenge', 'approach', 'outcome')


def _aclsum_record(doc):
    """Convert one ACLSum document into a dumped ``Sample`` dictionary."""
    sample = Sample(
        doc_file=f'https://aclanthology.org/{doc.id}.pdf',
        questions=list(ACLSUM_QUESTIONS),
        answers=[doc.summaries[aspect] for aspect in ACLSUM_ASPECTS],
    )
    return sample.model_dump()


aclsum_dataset = [_aclsum_record(doc) for doc in train]

# Persist one JSON object per paper.
with jsonlines.open('../../data/ACLSum/dataset.jsonl', 'w') as writer:
    writer.write_all(aclsum_dataset)

In [3]:
# Reload the ACLSum samples from disk, validating each JSON line into a Sample.
with jsonlines.open('../../data/ACLSum/dataset.jsonl') as f_in:
    aclsum_dataset = [Sample.model_validate(line) for line in f_in]

## SciREX

+ Salient Entity Extraction
  + Extract the salient Dataset, Method, Task and Metric of the paper.
  + An entity counts as extracted if at least one of its mentions is returned.
+ Salient Entity Mention Extraction
  + Extract the sentences where a salient entity's mention appears.
  + An entity mention is extracted if the sentence containing the mention is extracted.
+ Salient N-ary Relation Extraction
  + Extract the Dataset, Method, Task and Metric tuples that are bound together.

### Dataset Observation and Preparation

In [2]:
# Read every record of ../../data/SciREX/train.jsonl into an in-memory list.
with jsonlines.open('../../data/SciREX/train.jsonl') as f_in:
    scirex_dataset = list(f_in)

In [3]:
# Use the first SciREX record as the working example for the cells below.
test_sample = scirex_dataset[0]

In [None]:
# Show which fields the example record contains.
test_sample.keys()

In [None]:
# Inspect the record's 'n_ary_relations' field.
test_sample['n_ary_relations']

In [None]:
# Inspect the record's 'doc_id'; the download cell below uses this value for the
# Semantic Scholar lookup and as the PDF file name.
test_sample['doc_id']

In [None]:
# Peek at a single entry (index 12) of the record's 'words' field.
test_sample['words'][12]

In [18]:
import requests
from time import sleep
from tqdm import tqdm

def download_file(url, filename, timeout=30):
    """Download ``url`` and save the response body to ``filename``.

    The body is streamed in 8 KiB chunks into ``<filename>.part`` and only
    moved to ``filename`` once the transfer has finished, so an interrupted
    download never leaves a truncated file at the final path.

    Args:
        url: Address of the file to fetch.
        filename: Destination path of the downloaded file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        True when the file was downloaded completely, False when the request
        failed (the error is printed, not raised).
    """
    import os  # function-scope import keeps this definition self-contained

    partial_path = f"{filename}.part"
    try:
        # The context manager releases the streamed connection when done.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an exception if the request failed

            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)

        os.replace(partial_path, filename)

    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        # Drop whatever was written before the failure.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return False

    print(f"File '{filename}' downloaded successfully.")
    return True
        

# Semantic Scholar Graph API endpoint used to resolve a SciREX doc_id.
S2_PAPER_URL = "https://api.semanticscholar.org/graph/v1/paper/{doc_id}"


def _fetch_external_ids(doc_id, max_attempts=6, wait_seconds=10, timeout=30):
    """Return the 'externalIds' mapping Semantic Scholar reports for ``doc_id``.

    A response without an 'externalIds' field is retried after
    ``wait_seconds``, up to ``max_attempts`` requests in total, so a paper
    that can never be resolved no longer blocks the loop forever.

    Returns:
        The 'externalIds' mapping (possibly empty), or None when the paper is
        unknown to the API (HTTP 404) or every attempt failed.
    """
    for attempt in range(max_attempts):
        if attempt:
            sleep(wait_seconds)
        try:
            response = requests.get(
                S2_PAPER_URL.format(doc_id=doc_id),
                params={'fields': 'externalIds'},
                timeout=timeout,
            )
            if response.status_code == 404:
                # Retrying cannot make an unknown paper appear.
                return None
            paper_meta = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # Network failure or a non-JSON body: treat like any other miss.
            print(f"Metadata lookup failed for {doc_id}: {e}")
            continue
        if isinstance(paper_meta, dict) and 'externalIds' in paper_meta:
            return paper_meta['externalIds'] or {}
    return None


failed_ids = []
# NOTE(review): the [2:] slice skips the first two records; it looks like a
# leftover from resuming an interrupted run — confirm before a full re-run.
for sample in tqdm(scirex_dataset[2:]):
    external_ids = _fetch_external_ids(sample['doc_id'])
    # Only papers with an arXiv identifier can be downloaded from arxiv.org.
    if not external_ids or 'ArXiv' not in external_ids:
        failed_ids.append(sample['doc_id'])
        continue
    download_file(f"https://arxiv.org/pdf/{external_ids['ArXiv']}", f"../../data/SciREX/pdfs/{sample['doc_id']}.pdf")
    sleep(2)  # pause between consecutive downloads
# Record the doc_ids for which no arXiv PDF could be resolved.
with open('../../data/SciREX/failed_ids.txt', 'w') as f_out:
    f_out.write('\n'.join(failed_ids))

In [4]:
# Create the DocManager used below to load a paper and query its sections.
# `DocManager` is presumably provided by the `sci_review.paper` star import — confirm.
doc_manager = DocManager()

01/11/2025 18:34:32 - INFO - 	 missing_keys: []
01/11/2025 18:34:32 - INFO - 	 unexpected_keys: []
01/11/2025 18:34:32 - INFO - 	 mismatched_keys: []
01/11/2025 18:34:32 - INFO - 	 error_msgs: []
01/11/2025 18:34:32 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
01/11/2025 18:34:32 - INFO - 	 Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


In [5]:
# `os` is not imported explicitly in the visible cells of this notebook;
# import it here rather than rely on a wildcard import to provide it.
import os

# PDFs are stored under the record's doc_id.
test_file = f"../../data/SciREX/pdfs/{test_sample['doc_id']}.pdf"
if os.path.exists(test_file):
    doc_manager.load_doc(test_file)
else:
    # Make the skip visible: the following cells read from doc_manager and
    # would otherwise run against whatever it held before.
    print(f"PDF not found, nothing loaded: {test_file}")

01/11/2025 18:34:43 - INFO - 	 HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Regeneration attempt: 1

1. Introduction  
2. Related Work  
    2.1. Semantic Segmentation Approaches  
    2.2. Network Architectures  
3. Network Architectures for Segmentation  
    3.1. Feed-Forward Networks  
    3.2. Residual Networks (ResNets)  
    3.3. Full-Resolution Residual Networks (FRRNs)  
4. Training Procedure  
5. Experimental Evaluation  
    5.1. Residual Network Baseline  
    5.2. Quantitative Evaluation  
    5.3. Boundary Adherence  
6. Conclusion  
References  
Appendix  
    A. Gamma Augmentation  
    B. Baseline Evaluation  
    C. Qualitative Results  

The specified section name does not appear at the beginning of any paragraph in the paper. Section names must either appear at the start of a paragraph or stand alone as an independent paragraph. Please ensure all section names meet these requirements. If not, the section name should be removed.

B. Baseline Evaluation


01/11/2025 18:34:46 - INFO - 	 HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
01/11/2025 18:34:46 - INFO - 	 Tokenize 14 inputs...


Map:   0%|          | 0/14 [00:00<?, ? examples/s]

01/11/2025 18:34:48 - INFO - 	 ***** Running Inference on 14 texts *****


Inference:   0%|          | 0/14 [00:00<?, ?it/s]

01/11/2025 18:34:53 - INFO - 	 Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [6]:
# Print the outline that doc_manager holds for the loaded paper.
print(doc_manager.outline)

1. Introduction
2. Related Work
    Semantic Segmentation Approaches
    Network Architectures
3. Network Architectures for Segmentation
    Feed-Forward Networks
    Residual Networks (ResNets)
    Full-Resolution Residual Networks (FRRNs)
4. Training Procedure
5. Experimental Evaluation
    5.1. Residual Network Baseline
    5.2. Quantitative Evaluation
        Overview
        Subsampling Factor.
    5.3. Boundary Adherence
6. Conclusion


In [12]:
# Look up a section by its header string and show its text.
doc_manager.get_section_by_header('6. Conclusion').text

'6. Conclusion\n\nIn this paper we propose a novel network architecture for semantic segmentation in street scenes. Our architecture is clean, does not require additional post-processing, can be trained from scratch, shows superior boundary adherence, and reaches state-of-the-art results on the Cityscapes benchmark. We will provide code and all trained models. Since we do not incorporate design choices specifically tailored towards semantic segmentation, we believe that our architecture will also be applicable to other tasks such as stereo or optical flow where predictions are performed per pixel.'

In [7]:
# Show the PDF path built from the example record's doc_id.
test_file

'../../data/SciREX/pdfs/000f90380d768a85e2316225854fc377c079b5c4.pdf'

In [10]:
# Show the text of the section stored at index 5.
doc_manager.sections[5].text

'Feed-Forward Networks.\n\nUntil recently, the majority of feedforward networks, such as the VGG-variants [ 50 ], were composed of a linear sequence of layers. Each layer in such a network computes a function F and the output x n of the n -th layer is computed as where W n are the parameters of the layer (see 2 a). We refer to this class of network architectures as traditional feedforward networks.'

In [11]:
# Show the first block of the section stored at index 3.
doc_manager.sections[3].blocks[0]

DocBlock(text='Semantic Segmentation Approaches.', i=5, is_section_header=True, startswith_section_header=True)

In [None]:
# Unused alternative, kept for reference: rebuild the section texts from the
# SciREX token spans instead of parsing the PDF.
# doc_strs = list[str]()
# for section_range in test_sample['sections']:
#     section_words = test_sample['words'][section_range[0]:section_range[1]]
#     if section_words[0] == 'section' and section_words[1] == ':':
#         section_words = section_words[2:]
#     doc_strs.append(' '.join(section_words))

# This URL used to sit here as a bare string expression, which has no effect
# at runtime; kept as a comment: https://arxiv.org/pdf/2210.14427
doc_manager.load_doc(doc_file='https://arxiv.org/pdf/1611.08323')

In [None]:
# Inspect the record's 'sentences' field.
test_sample['sentences']

In [None]:
# Inspect the record's 'method_subrelations' field.
test_sample['method_subrelations']

## Self-RAG

In [2]:
# Load the ACLSum samples from disk for the Self-RAG experiment, validating
# each JSON line into a Sample.
with jsonlines.open('../../data/ACLSum/dataset.jsonl') as f_in:
    aclsum_dataset = [Sample.model_validate(line) for line in f_in]

In [3]:
# Create a SelfRAG instance for the paper of the first ACLSum sample.
# NOTE(review): the `from sci_review.self_rag import SelfRAG` line at the top
# of the notebook is commented out, so `SelfRAG` appears to be undefined on a
# fresh kernel — confirm and restore the import if this section is still used.
sample = aclsum_dataset[0]
self_rag = SelfRAG(doc_file=sample.doc_file)

In [7]:
# Ask the sample's first question; the result is turned into a list in the next cell.
# NOTE(review): the execution counts show this cell (In [7]) last ran after the
# following cells (In [5], In [6]) — verify the section works under Restart & Run All.
a = self_rag(sample.questions[0])

In [5]:
# Collect everything `a` yields into a list.
b = list(a)

In [6]:
# Point the existing SelfRAG instance at the paper of the second ACLSum sample.
self_rag.update_doc(doc_file=aclsum_dataset[1].doc_file)

In [None]:
# NOTE(review): `a` was created before update_doc was called. If it is a
# one-shot iterator already consumed by the earlier list(a), this yields an
# empty list — confirm whether a fresh self_rag(...) call was intended here.
b = list(a)

In [None]:
# Display the collected results.
b

## Plan and Solve

In [None]:
# Stream events from `app` for one sample question and print the value of
# every event entry except the one keyed "__end__".
# NOTE(review): `app` is not defined in any visible cell of this notebook, so
# this cell fails on a fresh kernel unless it is created elsewhere — confirm.
config = {"recursion_limit": 50}  # passed to app.astream as its config
inputs = {"input": "what is the hometown of the mens 2024 Australia open winner?"}
async for event in app.astream(inputs, config=config):
    for k, v in event.items():
        if k != "__end__":
            print(v)

## CoT