In [None]:
from sci_review.paper import *

Try a range of different questions. Besides extracting information into a taxonomy, I am converting the following datasets into test questions:
+ ACLSum (Summarize the Challenge, Approach, and Outcome in the paper)
+ SciREX (extract the main results of a scientific article including Dataset, Metric, Task and Method)
+ arxivDIGESTables (given a table schema for a literature survey, extract targeted values from scientific papers to fill in the table)

In [None]:
# Read the word list (one entry per line) and pass it to the document manager as its vocabulary.
with open('../../data/words_alpha.txt') as vocab_file:
    words_alpha = {word for word in vocab_file.read().splitlines()}
doc_manager = DocManager(word_vocab=words_alpha)

# ACLSum

In [None]:
from aclsum_base import *

## Experiments

In [None]:
# Dataset config
# `split` selects which `{split}_dataset.jsonl` file under ACLSUM_DIR is loaded at the end of this cell.
split = 'train'

# Forwarded as `load_from_pdf` to get_eval_file in the evaluation cell.
load_from_pdf = False
# load_from_pdf = True

# Retrieval config
# Exactly one `retrieval_method` assignment should be active; the others are kept as alternatives.
# NOTE(review): the loop in the "Observation" section reuses `retrieval_method` as its loop
# variable, so running that cell overwrites the value chosen here.
# retrieval_method = 'rag'
# retrieval_method = 'rag_base'
retrieval_method = 'gen'
# retrieval_method = 'cls'

# Chunk config
# The three values below are forwarded together to get_eval_file; the commented triples are
# alternative settings (presumably the ones used in earlier runs — TODO confirm).
sent_chunk = True
max_seq_len = None
k = 10
# sent_chunk = False
# max_seq_len = None
# k = 3
# sent_chunk = False
# max_seq_len = 100
# k = 10

# Parse every JSON line of the chosen split into a Sample.
with jsonlines.open(f'{ACLSUM_DIR}/{split}_dataset.jsonl') as f_in:
    aclsum_dataset = [Sample.model_validate(line) for line in f_in]

### NotebookLM Results

In [None]:
# Write the extracted text of the first 15 samples to `<pdf_dir(ACLSUM_DIR)>/<id>.txt`,
# where <id> is the part of `doc_file` after the first ':'.
for sample_idx in range(15):
    sample = aclsum_dataset[sample_idx]
    load_doc_manager(doc_manager, sample, True, ACLSUM_DIR, True)
    paper_id = sample.doc_file.split(":")[1]
    text_path = f'{pdf_dir(ACLSUM_DIR)}/{paper_id}.txt'
    with open(text_path, 'w') as text_file:
        text_file.write(doc_manager.doc_spacy.text)

In [None]:
print(doc_manager.doc_spacy.text)

In [None]:
for question_type, questions in aclsum_dataset[0].questions.items():
    print(question_type)
    for question in questions:
        print(question)
for sample in aclsum_dataset[0:15]:
    print(sample.doc_file)

### Evaluate Experiments

In [None]:
# Echo the active configuration so the printed metrics can be traced back to it.
config_tags = [
    f'split--{split}',
    f'load_from_pdf--{load_from_pdf}',
    f'sent_chunk--{sent_chunk}',
    f'max_seq_len--{max_seq_len}',
    f'k--{k}',
]
print(retrieval_method, *config_tags)

# Load the stored evaluation file of each question type (when it exists) and print the
# mean of every metric over its per-sample results.
eval_results = defaultdict(list)
for question_type in ['challenge', 'approach', 'outcome']:
    eval_file = get_eval_file(retrieval_method, split, question_type, load_from_pdf, sent_chunk, max_seq_len, k, is_temp=False)
    if os.path.exists(eval_file):
        with open(eval_file) as f_in:
            eval_results[question_type] = json.load(f_in)
        per_sample_results = eval_results[question_type]
        print('question_type', question_type)
        for metric_name in ('recall', 'precision', 'f1'):
            print(metric_name, np.mean([result[metric_name] for result in per_sample_results]))
        print('')

In [None]:
aclsum_dataset[2].questions

In [None]:
# Average number of gold extraction sentences per sample, reported per question type.
for question_type in ['challenge', 'approach', 'outcome']:
    gold_counts = [len(sample.extractions[question_type]) for sample in aclsum_dataset]
    print(question_type, sum(gold_counts) / len(aclsum_dataset))
# Recorded output:
# challenge 4.55
# approach 7.23
# outcome 4.59

In [None]:
# Average number of chunks per document after build_chunks(True, None).
doc_lens: list[int] = []
for sample in aclsum_dataset:
    load_doc_manager(doc_manager, sample, False)
    doc_manager.build_chunks(True, None)
    chunk_count = len(doc_manager.chunks)
    doc_lens.append(chunk_count)
mean_chunk_count = sum(doc_lens) / len(doc_lens)
print(mean_chunk_count)
# Recorded output:
# 38.6

In [None]:
# Average number of retrieved sentences per question type for the current `retrieval_method`.
question_type2sent_lens = defaultdict(list)
# Sample ids left out of the statistics (the reason is not recorded in this notebook).
skipped_sample_ids = [60, 70, 76]
for sid, sample in enumerate(aclsum_dataset):
    load_doc_manager(doc_manager, sample, False)
    doc_manager.build_chunks(True, None)
    section_sents = [sent.text for section in doc_manager.sections if section.section_nlp_local for sent in section.section_nlp_local.sents]
    unique_ngram2sent = get_sent_index(section_sents)
    largest_sent_id = max(sent_id for ngram, (sent_id, sent) in unique_ngram2sent.items())
    valid_sent_ids = set(range(largest_sent_id + 1))
    # The document is still loaded and indexed for skipped samples; only retrieval is bypassed.
    if sid in skipped_sample_ids:
        continue
    for question_type in ['challenge', 'approach', 'outcome']:
        retrieval_output = get_sents_and_process(
            doc_manager=doc_manager,
            retrieval_method=retrieval_method,
            split=split,
            sid=sid,
            question_type=question_type,
            load_from_pdf=False,
            sent_chunk=True,
            max_seq_len=None,
            k=None,
            is_temp=False,
        )
        _, retrieved_sents = retrieval_output
        question_type2sent_lens[question_type].append(len(retrieved_sents))

for question_type in ['challenge', 'approach', 'outcome']:
    sent_lens = question_type2sent_lens[question_type]
    print(question_type, sum(sent_lens) / len(sent_lens))
# Recorded output:
# challenge 14.690721649484535
# approach 19.649484536082475
# outcome 9.371134020618557

In [None]:
sorted(eval_results['challenge'], key=lambda x: x['precision'], reverse=False)

### Test

In [None]:
sample = aclsum_dataset[2]
load_doc_manager(doc_manager, sample, load_from_pdf=False)
doc_manager.build_chunks(sent_chunk=True, max_seq_length=None)

In [None]:
sample.questions['challenge']

In [None]:
keywords = [phrase.text for phrase in doc_manager.doc_spacy._.phrases if phrase.rank > 0.04]
question = sample.questions['challenge']
question_break_down_prompt = f'''Refine the given general information-seeking question about a scientific paper by breaking it down into distinct subtopics, each representing a key aspect of the required information. Ensure each subtopic is concise, addresses a single point that can be explained in one sentence, and is logically connected to others through relevant entity types. Then, for each subtopic, generate three pseudo-sentences, each offering a different way the subtopic might be expressed in the paper. Below is an example of a refined question and its subtopics, along with pseudo-sentences for each subtopic.

### Question:
What are the key contributions of the paper?

### Subtopics:
1. **Summary of proposed [Method]**
2. **Comparison with existing approaches to [Task]**
3. **Performance improvements on [Dataset/Benchmark]**

### Pseudo-sentences:

1. **Summary of proposed [Method]**
    - This paper introduces a novel [Method] for addressing [Task].
    - We present [Method], which enhances efficiency in [Task].
    - Our approach leverages [Method] to improve performance in [Task].
2. **Comparison with existing approaches to [Task]**
    - Unlike previous methods, [Method] achieves better generalization in [Task].
    - Compared to existing models, [Method] reduces computational cost significantly.
    - Our approach differs from prior work by introducing [Key Novel Feature].
3. **Performance improvements on [Dataset/Benchmark]**
    - Our method achieves state-of-the-art results on [Dataset].
    - We report a [X]% improvement in accuracy over previous methods on [Benchmark].
    - Experimental results demonstrate superior performance of [Method] on [Dataset].

You should follow the format of the example. To assist you in generating subtopics and pseudo-sentences, we provide a list of keywords from the paper as context. Use these keywords to guide your understanding and develop relevant subtopics and pseudo-sentences. Additionally, apply your own knowledge and reasoning to refine the question and enhance your responses.

Keywords:
{keywords}

### Question:
{question}'''

chat_completion = doc_manager.client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": question_break_down_prompt,
        }
    ],
    model=doc_manager.tool_llm,
)
content = chat_completion.choices[0].message.content

In [None]:
print(content)

In [None]:
def parse_question_breakdown(content:str) -> dict[str, list[str]]:
    """Parse the model's question breakdown into ``{subtopic: [pseudo-sentences]}``.

    The text is read line by line (blank lines ignored):

    * everything before a line starting with ``### Subtopics:`` is skipped;
    * each following line, up to a line starting with ``### Pseudo-sentences:``,
      is registered as a subtopic heading;
    * after that marker, a line equal to a registered heading selects the current
      subtopic and every other line is stored as one of its pseudo-sentences,
      with leading/trailing ``-`` and spaces removed.

    Headings are compared and returned with all ``*`` characters removed, so a
    heading that is bold in one section and plain in the other still matches.

    Args:
        content: Raw text of the model response.

    Returns:
        A dict mapping each subtopic heading (in order of first appearance) to its
        list of pseudo-sentences. Empty when no ``### Subtopics:`` section exists.
    """
    subtopics: dict[str, list[str]] = {}
    in_subtopics = False
    in_pseudo_sentences = False
    curr_subtopic = None
    for line in content.split('\n'):
        line = line.strip()
        if not line:
            continue
        if not in_subtopics:
            # Still in the preamble: wait for the subtopic section marker.
            in_subtopics = line.startswith('### Subtopics:')
            continue
        if line.startswith('### Pseudo-sentences:'):
            in_pseudo_sentences = True
            continue
        heading = line.replace('*', '')
        if not in_pseudo_sentences:
            subtopics.setdefault(heading, [])
        elif heading in subtopics:
            curr_subtopic = heading
        elif curr_subtopic is not None:
            subtopics[curr_subtopic].append(line.strip('- '))
        # A sentence seen before any known heading has no subtopic to belong to; it is
        # dropped instead of being filed under a `None` key.
    return subtopics

In [None]:
subtopics = parse_question_breakdown(content)
subtopics.keys()

In [None]:
subtopics['1. Current limitations of [Chinese NER] methods']

In [None]:
a = tuple(doc.page_content for doc, score in doc_manager.vectorstore.similarity_search_with_score(subtopics['2. Motivation for improving [NER] performance'][0], k=len(doc_manager.chunks)))

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3')
model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-v2-m3')
model.eval()


In [None]:
# Score one hand-picked sentence against every chunk with the reranker model loaded above.
rerank_query = 'Furthermore, they indirectly interface via a soft-attention mechanism, which makes them comparatively isolated.'
pairs = [(rerank_query, chunk.page_content) for chunk in doc_manager.chunks]
with torch.no_grad():
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
    logits = model(**inputs, return_dict=True).logits
    # Flatten to one float score per (query, chunk) pair.
    scores = logits.view(-1, ).float()
print(scores)

In [None]:
# Rank chunks by reranker score, best first. Sorting on the score alone (rather than on
# the whole (score, chunk) tuple) keeps two equal scores from falling through to a
# comparison of the chunk objects, which may not define an ordering.
sorted(zip(scores, doc_manager.chunks), key=lambda scored_chunk: scored_chunk[0].item(), reverse=True)

In [None]:
# b = tuple(doc.page_content for doc, score in doc_manager.vectorstore.similarity_search_with_relevance_scores(subtopics['1. Current limitations of [Chinese NER] methods'][0], k=len(doc_manager.chunks)))
b = tuple(doc.page_content for doc, score in doc_manager.vectorstore.similarity_search_with_relevance_scores('1. Current limitations of [Chinese NER] methods', k=len(doc_manager.chunks)))

In [None]:
a == b

In [None]:
subtopics['4. Goal of the proposed method in addressing NER challenges'][0]

In [None]:
sorted(((doc_manager.phrases[u], doc_manager.phrases[v], weight) for u, v, edge_type, weight in doc_manager.dkg.edges(data='similarity', keys=True) if edge_type == SHARED_TEXT), key=lambda x: x[2])

In [None]:
a

## Observation

In [None]:
retrieval2configs = {
    'rag': [
        # {
        #     'load_from_pdf': False, 
        #     'sent_chunk': True, 
        #     'max_seq_len': None, 
        #     'k': 10
        # },
        # {
        #     'load_from_pdf': False, 
        #     'sent_chunk': False, 
        #     'max_seq_len': 100, 
        #     'k': 10
        # }
    ],
    'gen': [
        {
            'load_from_pdf': False, 
            'sent_chunk': True, 
            'max_seq_len': None, 
            'k': None
        },
        # {
        #     'load_from_pdf': True, 
        #     'sent_chunk': True, 
        #     'max_seq_len': None, 
        #     'k': None
        # }
    ]
}
retrieval2configs['rag_base'] = retrieval2configs['rag']


# Sample under inspection. NOTE: this cell rebinds the notebook-level `split`,
# `aclsum_dataset`, `sample` and (through the loop below) `retrieval_method`.
sid = 25
split = 'train'
question_type = 'challenge'
with jsonlines.open(f'{ACLSUM_DIR}/{split}_dataset.jsonl') as f_in:
    aclsum_dataset = [Sample.model_validate(line) for line in f_in]

sample = aclsum_dataset[sid]
# Per-configuration outputs, keyed by the `test_name` built at the bottom of the loop.
test2sents = dict[str, list[str]]()
test2process = dict[str, list[dict]]()
test2chunks = dict[str, list[str]]()

# For every (retrieval method, config) pair: rebuild the chunks, fetch the retrieved
# sentences, and keep only those that map to an allowed sentence id.
for retrieval_method, retrieval_configs in retrieval2configs.items():
    for retrieval_config in retrieval_configs:
        load_doc_manager(doc_manager, sample, retrieval_config['load_from_pdf'])
        doc_manager.build_chunks(sent_chunk=retrieval_config['sent_chunk'], max_seq_length=retrieval_config['max_seq_len'])
        # Sentence index over all sections that have a `section_nlp_local`.
        unique_ngram2sent = get_sent_index([sent.text for section in doc_manager.sections if section.section_nlp_local for sent in section.section_nlp_local.sents])
        if retrieval_config['load_from_pdf']:
            # PDF text: allow only sentences that come from `sample.doc_strs` blocks (blocks
            # equal to one of the three listed header strings are left out).
            valid_sent_ids = get_sent_ids([sent for block in sample.doc_strs if block not in ['Abstract', 'Introduction', 'Conclusion'] for sent in spacy_sent_tokenize(doc_manager.nlp, block)], unique_ngram2sent)
            # get_sent_ids yields -1 for a sentence it cannot place; report and drop those.
            if -1 in valid_sent_ids:
                print(f'Invalid sent id in sample {sid}, retrieval_config {retrieval_config}, {valid_sent_ids.count(-1)}/{len(valid_sent_ids)}')
                valid_sent_ids = [sent_id for sent_id in valid_sent_ids if sent_id > -1]
            valid_sent_ids = set(valid_sent_ids)
        else:
            # Non-PDF text: every id from 0 up to the largest indexed id is allowed.
            valid_sent_ids = set(range(max(sent_id for ngram, (sent_id, sent) in unique_ngram2sent.items()) + 1))
        # The config dict supplies load_from_pdf / sent_chunk / max_seq_len / k as keyword arguments.
        process, retrieved_sents = get_sents_and_process(
            doc_manager=doc_manager,
            retrieval_method=retrieval_method,
            split=split,
            sid=sid,
            question_type=question_type,
            is_temp=True,
            **retrieval_config
        )
        valid_retrieved_sents = [sent for sent_id, sent in zip(get_sent_ids(retrieved_sents, unique_ngram2sent), retrieved_sents) if sent_id in valid_sent_ids]
        # `k` is not part of the name, so two configs that differ only in `k` share one entry.
        test_name = f"{retrieval_method}_{retrieval_config['load_from_pdf']}_{retrieval_config['sent_chunk']}_{retrieval_config['max_seq_len']}"
        test2sents[test_name] = valid_retrieved_sents
        test2process[test_name] = process
        test2chunks[test_name] = [chunk.page_content for chunk in doc_manager.chunks]
        
test2label = {
    'rag_False_True_None': 'rag_sent', 
    'rag_False_False_100': 'rag_100', 
    'gen_False_True_None': 'gen', 
    'gen_True_True_None': 'gen_full', 
    'rag_base_False_True_None': 'rag_sent_base', 
    'rag_base_False_False_100': 'rag_100_base',
    'GOLD': 'GOLD'
}

label2order = {
    'rag_sent': 6, 
    'rag_100': 5, 
    'gen': 4, 
    'gen_full': 3, 
    'rag_sent_base': 2, 
    'rag_100_base': 1, 
    'GOLD': 0, 
}

# Re-index the sentences of the non-PDF document so that the labels of every
# configuration (and of the gold extractions) refer to the same sentence ids.
load_doc_manager(doc_manager, sample, False)
source_sents = [sent.text for section in doc_manager.sections if section.section_nlp_local for sent in section.section_nlp_local.sents]
unique_ngram2sent = get_sent_index(source_sents)

# One label list per source sentence. The table is at least as long as `source_sents`
# because the report cell looks labels up by position in that list.
largest_sent_id = max((sent_id for ngram, (sent_id, sent) in unique_ngram2sent.items()), default=-1)
sent_id2labels = [[] for _ in range(max(len(source_sents), largest_sent_id + 1))]
for test_name, test_label in test2label.items():
    if test_name == 'GOLD':
        test_sents = sample.extractions[question_type]
    else:
        if test_name not in test2sents:
            continue
        test_sents = test2sents[test_name]
    for sent_id in get_sent_ids(test_sents, unique_ngram2sent):
        # get_sent_ids reports an unmatched sentence as -1; indexing with it would
        # silently attach the label to the last sentence, so skip it.
        if sent_id < 0:
            continue
        sent_id2labels[sent_id].append(test_label)
        

In [None]:
sample.extractions[question_type]

In [None]:
content = PARAGRAPH_SEP.join([f'Chunk {chunk.metadata["chunk_id"]}: {chunk.page_content}' for chunk in doc_manager.chunks])
prompt = f'Below are text chunks from a paper:\n\n\n\n{content}\n\n\n\nSelect the Chunk ids that are relevant to the following question: \n\n{sample.questions[question_type]}\n\nReturn only the selected chunk ids separated by commas, e.g. "1, 3, 5".'
print(prompt)

In [None]:
# Labels hidden from the report; the commented sets are alternative selections.
# skipped_labels = {'gen_full', 'rag_sent', 'rag_100'}
# skipped_labels = {'gen_full', 'gen'}
# skipped_labels = {'gen_full', 'gen', 'rag_sent', 'rag_100', 'rag_sent_base', 'rag_100_base'}
skipped_labels = {'gen_full', 'rag_sent', 'rag_100', 'rag_sent_base', 'rag_100_base'}

# One line per source sentence: the sentence, then its visible labels ordered by label2order.
with open(f'observations_{sid}.txt', 'w') as f_out:
    for sent_id, sent in enumerate(source_sents):
        visible_labels = [label for label in sent_id2labels[sent_id] if label not in skipped_labels]
        visible_labels.sort(key=lambda x: label2order[x])
        joined_labels = ", ".join(visible_labels)
        f_out.write(f'{sent} --- {joined_labels}\n')

In [None]:
eval_file = get_eval_file('rag', split, question_type, load_from_pdf=False, sent_chunk=False, max_seq_len=100, k=10, is_temp=False)
with open(eval_file) as f_in:
    eval_results = json.load(f_in)
eval_results[sid]

In [None]:
for eval_result in eval_results:
    if eval_result['recall']:
        print(eval_result)

In [None]:
test2chunks.keys()

In [None]:
test2chunks['rag_base_False_False_100']

In [None]:
sample.extractions['challenge']

In [None]:
sample.questions['challenge']

In [None]:
doc_manager.build_chunks(sent_chunk=True, max_seq_length=None)

In [None]:
doc_manager.vectorstore.similarity_search('Limitations of the current methods')

In [None]:
doc_manager.vectorstore.similarity_search('We propose a new method to solve this problem.')

In [None]:
content = PARAGRAPH_SEP.join([f'Chunk {chunk.metadata["chunk_id"]}: {chunk.page_content}' for chunk in doc_manager.chunks])
similar_chunk_prompt = f'Below are text chunks from a paper:\n\n\n\n{content}\n\n\n\nSelect the Chunk ids that express similar general meaning as the following statement: \n\n{"Previous [Method] has been used for [Task]."}\n\nReturn only the selected chunk ids separated by commas, e.g. "1, 3, 5".'
chat_completion = doc_manager.client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": similar_chunk_prompt,
        }
    ],
    model=doc_manager.tool_llm,
)
content = chat_completion.choices[0].message.content
content

In [None]:
doc_manager.chunks[21]

In [None]:
import random
split = 'train'
question_type = 'challenge'
with jsonlines.open(f'{ACLSUM_DIR}/{split}_dataset.jsonl') as f_in:
    aclsum_dataset = [Sample.model_validate(line) for line in f_in]
eval_metrics = EvalMetrics()
eval_results = list[dict]()
# For every sample: ask the LLM, once per topic statement, for the chunk ids that belong
# to that topic, then score the sentences of the selected chunks against the gold extractions.
for sid, sample in enumerate(tqdm(aclsum_dataset)):
    load_doc_manager(doc_manager, sample, False)
    doc_manager.build_chunks(sent_chunk=True, max_seq_length=None)
    unique_ngram2sent = get_sent_index([sent.text for section in doc_manager.sections if section.section_nlp_local for sent in section.section_nlp_local.sents])
    chunks = [chunk.page_content for chunk in doc_manager.chunks]
    # random.shuffle(chunks)
    content = PARAGRAPH_SEP.join([f'Chunk {chunk_id}: {chunk}' for chunk_id, chunk in enumerate(chunks)])
    selected_chunk_ids = set[int]()
    question2chunk_ids = dict[str, list[int]]()
    for question in [
        # "[Task] is widely studied in the research.",
        "Introduction of [Task].",
        # "Previous [Method] has been used for [Task].",
        # "Previous [Method] has drawbacks.",
        "[Method] has been used for [Task].",
        "[Method] has limitations.",
        # "We propose a new [Method] to solve this problem.",
    ]:
        similar_chunk_prompt = f'Below are text chunks from a paper:\n\n\n\n{content}\n\n\n\nRank the **TOP 5** Chunk ids that belong to the following topic: \n\n{question}\n\nReturn only the selected chunk ids separated by commas, e.g. "1, 3, 5".'
        chat_completion = doc_manager.client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": similar_chunk_prompt,
                }
            ],
            model=doc_manager.tool_llm,
        )
        response_text = chat_completion.choices[0].message.content
        # Parse the whole answer before touching the shared set, so a malformed answer
        # contributes nothing instead of a partial list. int() tolerates the spaces left
        # around each token, so both "1, 3" and "1,3" are accepted.
        try:
            chunk_ids = [int(token) for token in response_text.split(',')]
        except (AttributeError, ValueError):
            # AttributeError: the response carried no text; ValueError: not a list of ints.
            chunk_ids = []
        question2chunk_ids[question] = chunk_ids
        selected_chunk_ids.update(chunk_ids)

    # Ids outside [0, len(chunks)) are ignored; a negative id would otherwise wrap around.
    retrieved_sents = [sent for chunk_id in selected_chunk_ids if 0 <= chunk_id < len(chunks) for sent in spacy_sent_tokenize(doc_manager.nlp, chunks[chunk_id])]
    if not retrieved_sents:
        eval_result = {'f1': 0, 'precision': 0, 'recall': 0}
    else:
        retrieved_sent_ids = get_binary_sent_ids(retrieved_sents, unique_ngram2sent)
        gold_sent_ids = get_binary_sent_ids(sample.extractions[question_type], unique_ngram2sent)

        eval_result:dict[str, Any] = eval_metrics.eval_precision_recall_f1(predictions=retrieved_sent_ids, references=gold_sent_ids)
    # NOTE: despite its name, 'sent_ids' stores the retrieved sentence strings; later cells
    # compare it with the gold sentences directly.
    eval_result.update({'sid': sid, 'sent_ids': retrieved_sents, 'question2chunk_ids': question2chunk_ids})
    eval_results.append(eval_result)

In [None]:
with open('temp_eval.json', 'w') as f_out:
    json.dump(eval_results, f_out)

In [None]:
with open('temp_eval.json') as f_in:
    eval_results = json.load(f_in)

In [None]:
print('recall', np.mean([result['recall'] for result in eval_results[:]]))
print('precision', np.mean([result['precision'] for result in eval_results[:]]))
print('f1', np.mean([result['f1'] for result in eval_results[:]]))

In [None]:
for eval_result in eval_results:
    if eval_result['recall'] < 0.5:
        print(eval_result['sid'], eval_result['recall'], eval_result['precision'], eval_result['f1'])

In [None]:
aclsum_dataset[1].extractions['challenge']

In [None]:
# Compare the gold 'challenge' sentences with the retrieved ones for every sample.
# The three sets are stored on the result dict in place.
for eval_result, sample in zip(eval_results, aclsum_dataset):
    gold_set = set(sample.extractions['challenge'])
    pred_set = set(eval_result['sent_ids'])
    eval_result['missing'] = gold_set - pred_set
    eval_result['shared'] = gold_set & pred_set
    eval_result['extra'] = pred_set - gold_set

In [None]:
load_doc_manager(doc_manager, aclsum_dataset[0], False)
doc_manager.build_chunks(sent_chunk=True, max_seq_length=None)

In [None]:
eval_results[0]['missing']

In [None]:
eval_results[0]['extra']

In [None]:
eval_results[0]['shared']

In [None]:
chunks = [chunk.page_content for chunk in doc_manager.chunks]
[sent for chunk_id in eval_results[0]['question2chunk_ids']['[Method] has limitations.'] if chunk_id < len(chunks)  for sent in spacy_sent_tokenize(doc_manager.nlp, chunks[chunk_id])]

In [None]:
aclsum_dataset[0].answers['challenge']

In [None]:
aclsum_dataset[4].questions['outcome']

In [None]:
# context = '''Results show that the proposed system outperforms significantly other stateof-the-art QE systems. This study is part of a bigger, ongoing project, aiming to develop a real-time QE system for Web search, where simplicity is the key to the success. Thus, what we learned from this study is particularly encouraging.'''

# context = '''These models are trained on pairs of user queries and the titles of clicked documents using EM. Second, we present a ranker-based QE system, the heart of which is a MRF-based ranker in which the lexicon models are incorporated as features. We perform experiments on the Web search task using a real world data set.'''

# context = '''The experimental results show that when implementing the sequence modeling layer with a single-layer Bi-LSTM, our method achieves considerable improvements over the state-of-theart methods in both inference speed and sequence labeling performance.'''

context = '''Experimental studies on four benchmark Chinese NER datasets show that our method achieves an inference speed up to 6.15 times faster than those of state-ofthe-art methods, along with a better performance. The experimental results also show that the proposed method can be easily incorporated with pre-trained models like BERT.'''

question = aclsum_dataset[4].questions['outcome']

grade_doc_prompt = f"""You are a grader assessing relevance of a retrieved document to a user question.

Here is the retrieved document:

{context}

Here is the user question:

{question}

If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant.
Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question. Briefly explain your reasoning for the grade."""

chat_completion = doc_manager.client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": grade_doc_prompt,
        }
    ],
    model=doc_manager.tool_llm,
)

chat_completion.choices[0].message.content

In [None]:
context = '''In this work, we propose a simple but effective method for incorporating the word lexicon into the character representations. This method avoids designing a complicated sequence modeling architecture, and for any neural NER model, it requires only subtle adjustment of the character representation layer to introduce the lexicon information. Experimental studies on four benchmark Chinese NER datasets show that our method achieves an inference speed up to 6.15 times faster than those of state-ofthe-art methods, along with a better performance. The experimental results also show that the proposed method can be easily incorporated with pre-trained models like BERT.'''

question = aclsum_dataset[4].questions['outcome']

grade_doc_prompt = f"""You are a grader assessing relevance of a retrieved document to a user question.

Here is the retrieved document:

{context}

Here is the user question:

{question}

If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant.
Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question. Briefly explain your reasoning for the grade."""

chat_completion = doc_manager.client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": grade_doc_prompt,
        }
    ],
    model=doc_manager.tool_llm,
)

chat_completion.choices[0].message.content

# SciREX

+ Salient Entity Extraction
  + Extract the salient Dataset, Method, Task and Metric of the paper.
  + An entity counts as extracted if at least one of its mentions is returned.
+ Salient Entity Mention Extraction
  + Extract the sentences in which a mention of a salient entity appears.
  + An entity mention is extracted if the sentence containing the mention is extracted.
+ Salient N-ary Relation Extraction
  + Extract the Dataset, Method, Task and Metric tuples that are bound together.

### Dataset Observation and Preparation

In [None]:
with jsonlines.open('../../data/SciREX/train.jsonl') as f_in:
    scirex_dataset = list(f_in)

In [None]:
test_sample = scirex_dataset[0]

In [None]:
test_sample.keys()

In [None]:
test_sample['n_ary_relations']

In [None]:
test_sample['doc_id']

In [None]:
test_sample['words'][12]

In [None]:
import requests
from time import sleep
from tqdm import tqdm

S2_PAPER_API = 'https://api.semanticscholar.org/graph/v1/paper'
MAX_LOOKUP_ATTEMPTS = 6      # give up on a paper after this many metadata requests
LOOKUP_RETRY_DELAY_S = 10    # pause between two attempts for the same paper
REQUEST_TIMEOUT_S = 30       # never wait indefinitely on a stalled connection

# Look up the arXiv id of every SciREX paper on Semantic Scholar and download its PDF.
# Papers that cannot be resolved, or that have no arXiv id, are recorded in failed_ids.txt.
failed_ids = []
# NOTE(review): the first two samples are skipped — presumably they were fetched in an
# earlier run; confirm before re-running from scratch.
for sample in tqdm(scirex_dataset[2:]):
    paper_url = f"{S2_PAPER_API}/{sample['doc_id']}"
    external_ids = None
    for attempt in range(MAX_LOOKUP_ATTEMPTS):
        if attempt:
            sleep(LOOKUP_RETRY_DELAY_S)
        try:
            paper_meta = requests.get(paper_url, params={'fields': 'externalIds'}, timeout=REQUEST_TIMEOUT_S).json()
        except (requests.RequestException, ValueError):
            # Network failure or a non-JSON body: treat it like any other failed attempt.
            continue
        if 'externalIds' in paper_meta:
            external_ids = paper_meta['externalIds']
            break
    # `external_ids` stays None when every attempt failed (e.g. an unknown paper id) and
    # may be null in the API answer; both cases count as "no arXiv id available".
    if not external_ids or 'ArXiv' not in external_ids:
        failed_ids.append(sample['doc_id'])
        continue
    download_file(f"https://arxiv.org/pdf/{external_ids['ArXiv']}", f"../../data/SciREX/pdf/{sample['doc_id']}.pdf")
    sleep(2)
with open('../../data/SciREX/failed_ids.txt', 'w') as f_out:
    f_out.write('\n'.join(failed_ids))

In [None]:
208291415

In [None]:
import requests

In [None]:
'externalIds', 'url', 

In [None]:
a = requests.get(f"https://api.semanticscholar.org/graph/v1/paper/CorpusID:13530374", params={'fields': 'externalIds'}).json()

In [None]:
a

In [None]:
a = requests.get(f"https://api.semanticscholar.org/graph/v1/paper/ACL:2020.aacl-main.88", params={'fields': 'externalIds'}).json()

In [None]:
doc_manager = DocManager()

In [None]:
# The download cell in this notebook saves PDFs under `SciREX/pdf/`, while this cell
# looks in `SciREX/pdfs/`. Keep `pdfs/` as the first choice and fall back to `pdf/`,
# so the sample loads whichever directory actually holds the file.
test_file = f"../../data/SciREX/pdfs/{test_sample['doc_id']}.pdf"
if not os.path.exists(test_file):
    fallback_file = f"../../data/SciREX/pdf/{test_sample['doc_id']}.pdf"
    if os.path.exists(fallback_file):
        test_file = fallback_file
if os.path.exists(test_file):
    doc_manager.load_doc(test_file)
else:
    # Say so explicitly: the following cells would otherwise fail on an unloaded document.
    print(f"PDF for {test_sample['doc_id']} not found under ../../data/SciREX/pdfs/ or ../../data/SciREX/pdf/; nothing was loaded.")

In [None]:
print(doc_manager.outline)

In [None]:
doc_manager.get_section_by_header('6. Conclusion').text

In [None]:
test_file

In [None]:
doc_manager.sections[5].text

In [None]:
doc_manager.sections[3].blocks[0]

In [None]:
# doc_strs = list[str]()
# for section_range in test_sample['sections']:
#     section_words = test_sample['words'][section_range[0]:section_range[1]]
#     if section_words[0] == 'section' and section_words[1] == ':':
#         section_words = section_words[2:]
#     doc_strs.append(' '.join(section_words))
'https://arxiv.org/pdf/2210.14427'
doc_manager.load_doc(doc_file='https://arxiv.org/pdf/1611.08323')

In [None]:
test_sample['sentences']

In [None]:
test_sample['method_subrelations']

# ArxivDIGESTables

In [None]:
from arxivdigestables_base import *

In [None]:
table = tables_collection.find_one()

In [None]:
table.keys()

In [None]:
len(table['in_text_ref'])

In [None]:
table['caption']

In [None]:
table['in_text_ref']

In [None]:
a = [[5, 3], [1], [5, 2]]
a.sort()

In [None]:
a

In [None]:
len(table['row_bib_map'])

In [None]:
table['table']

In [None]:
print(table['in_text_ref'][0]['text'].replace('\n', ' '))

In [None]:
table['table']

In [None]:
len(table['table'])

## Normalize Data Samples

### Load data into MongoDB

In [None]:
# Read the three JSON-lines dumps fully into memory (one JSON document per line).
with open('../../data/ArxivDIGESTables/papers.jsonl') as f_in:
    papers = list(map(json.loads, f_in))
with open('../../data/ArxivDIGESTables/tables.jsonl') as f_in:
    tables = list(map(json.loads, f_in))
with open('../../data/ArxivDIGESTables/full_texts.jsonl') as f_in:
    full_texts = list(map(json.loads, f_in))

# Bulk-insert each list into its collection.
# NOTE: running this cell again calls insert_many on the same records a second time.
full_texts_collection.insert_many(full_texts)
papers_collection.insert_many(papers)
tables_collection.insert_many(tables)

### Download PDFs

In [None]:
paper = papers_collection.find_one()

In [None]:
paper

In [None]:
with open('../../reference_repo/arxivDIGESTables/predictions/predictions.jsonl') as f_in:
    predictions = [json.loads(line) for line in f_in]

In [None]:
a = tables_collection.find_one({'tabid': 'bb09b7e1-2ab7-4193-922a-1b1b93486e83'})

In [None]:
papers_collection.find_one({'corpus_id': 208291415})

# QASA

In [None]:
# The file maps string sample ids to samples; order the samples by numeric id.
with open('../../data/QASA/testset_answerable_1554_v1.1.json') as f_in:
    qasa_by_id = json.load(f_in)
qasa_ordered = sorted((int(sid), sample) for sid, sample in qasa_by_id.items())
qasa_dataset = [sample for _, sample in qasa_ordered]

In [None]:
sample = qasa_dataset[0]

In [None]:
sample.keys()

In [None]:
sample['question']

In [None]:
sample['question_type']

In [None]:
sample['evidential_info'][0]

In [None]:
sample['s2orc_url']

In [None]:
with open('../../data/QASA/testset_unanswerable_244_v1.1.json') as f_in:
    qasa_dataset = json.load(f_in)

# Test

In [None]:
from sci_review.framework import *

In [None]:
import jsonlines
with jsonlines.open('../../data/ACLSum/train_rag_10_train_dataset.json') as f_in:
    aclsum_dataset = [Sample.model_validate(line) for line in f_in]

In [None]:
aclsum_dataset[0].relevant_blocks

In [None]:
import re

In [None]:
response = 'The paper addresses the challenge of answer extraction in open-domain Question Answering (QA) systems, which has become increasingly crucial as these systems aim to find exact answers rather than mere text snippets 2.0 (Chunk 3). A significant issue is that traditional methods heavily rely on Named Entity Recognition (NER), which can lead to performance degradation due to errors in NER, as it may not effectively identify and classify a wide range of named entities (Chunk 8). The authors note that existing systems often struggle with larger candidate answer sets when a general NER is employed, resulting in more difficult answer extraction tasks (Chunk 9). Furthermore, previous approaches that operate at the surface word level, such as density-based ranking and pattern matching, are inadequate for capturing the deeper linguistic relationships necessary for effective answer extraction (Chunk 9). The paper proposes a novel method that explores the correlation of dependency relation paths to rank candidate answers, motivated by the observation that proper answers and question phrases share similar relations (Chunk 10, 19). This approach aims to overcome the limitations of existing methods by incorporating a Maximum Entropy-based ranking model that estimates path weights from training data, thereby enhancing the performance of QA systems, especially for more challenging questions where NER may not provide sufficient support (Chunk 10, 76).'

In [None]:
# Print the chunk ids of every "(Chunk a, b, ...)" citation found in the response.
chunk_citation_re = re.compile(r'\(Chunk \d+(?:, \d+)*\)')
for citation in chunk_citation_re.findall(response):
    # Drop the "(Chunk " prefix and the closing ")" to keep only the id list.
    id_list = citation.removeprefix('(Chunk ').removesuffix(')')
    print(id_list.split(', '))

In [None]:
re.findall(r'\(Chunk \d+(?:, \d+)*\)', '')

In [None]:
from nltk import sent_tokenize

In [None]:
sent_tokenize(response)