# Tools

In [None]:
from collections import defaultdict, Counter
import math
import re
import pymupdf
import pymupdf4llm
from nltk import sent_tokenize, word_tokenize
import os
from openai import OpenAI
# OpenAI client used by the chat-completion cells below. The API key is read
# from the OPENAI_AUTO_SURVEY environment variable; indexing os.environ raises
# KeyError here if that variable is not set.
client = OpenAI(api_key=os.environ['OPENAI_AUTO_SURVEY'])

import spacy
import spacy.tokens
import pytextrank
from fastcoref import spacy_component
import numpy as np

# nlp.add_pipe("fastcoref")

## Question decompose

In [2]:
# First user turn of the question-decomposition dialogue: asks the model to
# list the general terms found in a task description. The `{task}` placeholder
# is filled with str.format(task=...) by the chat-completion cells below.
extract_general_term_prompt = '''The following task involves extracting specific information from a paper. The task description may include general terms that represent types of concrete entities, which need to be identified and clarified based on the paper. Your objective is to identify and list **ALL** general terms or concepts in the task description that might be open to interpretation, require further specification using details from the paper and be critical in locating relevant information in the paper. These include:
    1.	Specific entities, models, methods, or processes referenced in the description.
    2.	Broad categories or classifications that require more detailed breakdowns.
    3.	Implicit generalizations or assumptions that could benefit from contextual examples or precise definitions.

Some general terms may refer to the same entity in the description. You should only list one general term for each entity. Make sure you cover **ALL** the entities.

Task: {task}'''


# Second user turn: sent after the model's term list has been appended as an
# assistant message, it asks for those terms to be arranged into a numbered,
# indented parent/child hierarchy. Contains no format placeholders.
organize_general_term_prompt = '''Understand the hierarchy among the general terms you listed above with respect to the "Parent-Child" relationship:

Parent Concept:
A parent concept represents a broader, overarching idea or category that serves as the foundation for related subordinate ideas. It is independent and provides the contextual framework or structure for its associated dependent concepts.

Child Concept:
A child concept is a more specific, subordinate idea that derives meaning, classification, or context from its associated parent concept. It depends on the parent concept for its definition and existence within a hierarchical structure.

Organize the general terms you listed above hierarchically based on their dependencies, ensuring that parent concepts are listed first, followed by their dependent child concepts. Use indentation to represent the hierarchy, with the format as follows:

1.	Parent concept
    1.1 Dependent child concept
        1.1.1 Dependent grandchild concept
        1.1.2 Dependent grandchild concept
    1.2 Dependent child concept
2.	Parent concept
    2.1 Dependent child concept

Only use the general terms identified in your previous response to create this hierarchical structure.'''

# Third user turn: sent after the hierarchy reply, it asks the model to break
# the task into single-step "What" / "True or False" questions, child entities
# first and parent entities afterwards. Contains no format placeholders.
generate_checkpoint_prompt = '''To find the relevant information step by step, break down the task into a series of simple, single-step questions. Each question should be narrowly focused, collecting or verifying only one attribute of one entity or entity type, or serving as a follow-up to refine the scope with one additional attribute. Each question can be either a "What" question or a "True or False" question. Always start with questions for the low level entities (child entities) and then move forward to questions for their parent entity. This structured approach ensures clarity and precision in locating relevant information from the paper.'''

In [3]:
# Task description substituted into extract_general_term_prompt's `{task}`
# placeholder. Two variants of the same extraction task are kept here: the
# "LLM Embeddings + RS" variant is commented out, the "LLM as RS" variant
# below it is the one actually assigned.
# task = '''Extract the modeling paradigms proposed in the paper that satisfy the following type:

# LLM Embeddings + RS. This modeling paradigm views the language model as a feature extractor, which feeds the features of items and users into LLMs and outputs corresponding embeddings. A traditional RS model can utilize knowledge-aware embeddings for various recommendation tasks.'''

task = '''Extract the modeling paradigms proposed in the paper that satisfy the following type:

LLM as RS. This paradigm aims to directly transfer pre-trained LLM into a powerful recommendation system. The input sequence usually consists of the profile description, behavior prompt, and task instruction. The output sequence is expected to offer a reasonable recommendation result.'''

Sub-questions are designed to find necessary information.

In [None]:
# Turn 1 of the decomposition dialogue: send the term-extraction prompt
# (with `task` substituted in) as a single user message and keep the reply
# text in `general_term_str` for the follow-up turns.
# NOTE(review): GPT_MODEL_EXPENSIVE is not assigned in any cell shown above
# this one -- presumably it comes from the `sci_review` star imports further
# down; confirm this cell runs on a fresh kernel.
chat_completion = client.chat.completions.create(
    messages=[
        dict(role="user", content=extract_general_term_prompt.format(task=task)),
    ],
    model=GPT_MODEL_EXPENSIVE,
)
general_term_str = chat_completion.choices[0].message.content
print(general_term_str)

In [None]:
# Turn 2 of the decomposition dialogue: replay turn 1 (the user prompt and
# the model's term list as an assistant message), then ask for the terms to
# be organised hierarchically. The reply text is kept in
# `general_term_hierarchy_str`.
# NOTE(review): GPT_MODEL_EXPENSIVE is not assigned in any cell shown above
# this one -- confirm where it is defined.
chat_completion = client.chat.completions.create(
    messages=[
        dict(role="user", content=extract_general_term_prompt.format(task=task)),
        dict(role="assistant", content=general_term_str),
        dict(role="user", content=organize_general_term_prompt),
    ],
    model=GPT_MODEL_EXPENSIVE,
)
general_term_hierarchy_str = chat_completion.choices[0].message.content
print(general_term_hierarchy_str)

In [None]:
# Turn 3 of the decomposition dialogue: replay both earlier exchanges (term
# extraction and hierarchy), then ask the model to generate the single-step
# checkpoint questions. The reply text is kept in `checkpoint_str`.
# NOTE(review): GPT_MODEL_EXPENSIVE is not assigned in any cell shown above
# this one -- confirm where it is defined.
chat_completion = client.chat.completions.create(
    messages=[
        dict(role="user", content=extract_general_term_prompt.format(task=task)),
        dict(role="assistant", content=general_term_str),
        dict(role="user", content=organize_general_term_prompt),
        dict(role="assistant", content=general_term_hierarchy_str),
        dict(role="user", content=generate_checkpoint_prompt),
    ],
    model=GPT_MODEL_EXPENSIVE,
)
checkpoint_str = chat_completion.choices[0].message.content
print(checkpoint_str)

Some necessary sub-questions are missing from the example above.
The generated sub-questions mostly ask for definitions of terms.
TODO: check how other papers approach question decomposition.

## Outline

In [None]:
from sci_review.paper import *
from sci_review.text import *
import spacy.displacy



/home/keruiz2/miniconda3/envs/dkg/lib/python3.11/site-packages


In [2]:
# Input paper and its hand-written section outline, both passed to
# doc.load_doc() in a later cell. The commented-out path is an alternative
# input (CALM.pdf); 'aclsum.pdf' is the file actually used.
# In `outline`, each line is one heading and deeper indentation (4 spaces per
# level) marks sub-headings.
# NOTE(review): the heading '4 ACLS UM' contains a space -- presumably it has
# to match the text as extracted from the PDF; confirm before "fixing" it.
# doc_file = '../../data/systematic_review_papers/planning/CALM.pdf'
doc_file = 'aclsum.pdf'
outline = '''1 Introduction
2 Related Work
    Unfaithful summaries.
    Noisy data.
    Legal issues.
    Missing gold extractive labels.
3 Dataset creation
    Source documents
    Summary aspects
    Annotation process
4 ACLS UM
5 Experiments and Results
    5.1 RQ1: Extract-then-abstract vs. end-to-end
        Experimental setup.
        Results and discussions.
    5.2 RQ2: CoT vs. E2E instruct-tuning
        Experimental setup.
        Results and discussions.
    5.3 RQ3: How good is the heuristic for inducing extractive summarization labels?
6 Conclusion
7 Limitations
References'''

In [3]:
# Read the word list (one entry per line) into a set and hand it to
# DocManager as its `word_vocab`.
with open('words_alpha.txt') as f:
    words_alpha = {*f.read().splitlines()}
doc = DocManager(word_vocab=words_alpha)

01/25/2025 14:08:51 - INFO - 	 missing_keys: []
01/25/2025 14:08:51 - INFO - 	 unexpected_keys: []
01/25/2025 14:08:51 - INFO - 	 mismatched_keys: []
01/25/2025 14:08:51 - INFO - 	 error_msgs: []
01/25/2025 14:08:51 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
01/25/2025 14:08:52 - INFO - 	 Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


In [4]:
# Load the paper at `doc_file` into the DocManager, supplying the hand-written
# `outline` defined above. The tokenization / inference log lines shown in
# this cell's output are emitted during this call.
doc.load_doc(doc_file, outline=outline)

01/25/2025 14:08:59 - INFO - 	 Tokenize 20 inputs...


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

01/25/2025 14:09:01 - INFO - 	 ***** Running Inference on 20 texts *****


Inference:   0%|          | 0/20 [00:00<?, ?it/s]

01/25/2025 14:09:03 - INFO - 	 Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [11]:
# Spot-check the loaded document: print the text of the entry at index 5 of
# doc.sections.
print(doc.sections[5].text)

Missing gold extractive labels.

While extractive and aspect-based summarization are both active research subjects, there are no freely available datasets with ground-truth labels for such tasks. For extractive summarization, the de facto standard approach has been relying on a heuristic-based algorithm that automatically induces labels from abstractive summarization datasets without validating its effectiveness, including recent improvements from Xu and Lapata (2022).

The work closest to ours is SQuALITY from Wang et al. (2022). While this work shares the core motivation with our work, which is to build a reliable and validated summarization dataset, our dataset has several different properties. First, besides the abstractive reference summaries, our dataset also has passage annotations (i.e., aspects) that can serve as gold labels for extractive summarization. Second, in contrast to the SQuALITY, which provides question-focused summaries, our dataset has multi-aspect summaries more 

## Plot DKG

In [23]:
import dash
import dash_cytoscape as cyto
from dash import html

# Lay the document text out as label-only Cytoscape nodes, like text on a page.
# Each sentence is cut into consecutive chunks of tokens that share the same
# entry in doc.tid2phrase_id; every chunk becomes one node whose x position is
# the chunk's character offset on the current line and whose y position is the
# running `y_start`. Chunks with a phrase id >= 0 get the 'noun_phrase' class,
# all others the 'text' class.
y_start = 0
width = 80          # wrap once a chunk starts more than this many characters into the line
char_px = 6.1       # horizontal pixels per character
wrap_gap = 20       # vertical step for a wrapped line within a sentence
sentence_gap = 50   # vertical step after each sentence
section_gap = 100   # extra vertical step after each section
nodes = []
for section in doc.sections:
    if not section.section_nlp_local:
        continue
    for sent in section.section_nlp_local.sents:
        # Character offset (within the section) where the current line begins.
        line_char0 = sent[0].idx
        # Token indices from the section-local parse are shifted by the
        # section's start to index doc.tid2phrase_id / doc.doc_spacy.
        token_offset = section.section_nlp_global.start
        section_char0 = section.section_nlp_global[0].idx

        # Pass 1: split the sentence into (first_tid, end_tid, phrase_id)
        # chunks, starting a new chunk whenever the phrase id changes.
        chunk_start = sent.start + token_offset
        chunk_pid = doc.tid2phrase_id[chunk_start]
        chunks = []
        for token in sent[1:]:
            tid = token.i + token_offset
            pid = doc.tid2phrase_id[tid]
            if pid != chunk_pid:
                chunks.append((chunk_start, tid, chunk_pid))
                chunk_start, chunk_pid = tid, pid
        # The last chunk runs to the end of the sentence.
        chunks.append((chunk_start, sent.end + token_offset, chunk_pid))

        # Pass 2: place each chunk, wrapping to a new line when it would
        # start beyond `width` characters.
        for first_tid, end_tid, pid in chunks:
            x_chars = doc.doc_spacy[first_tid].idx - section_char0 - line_char0
            if x_chars > width:
                y_start += wrap_gap
                line_char0 = doc.doc_spacy[first_tid].idx - section_char0
                x_chars = 0
            nodes.append({
                'data': {'id': first_tid, 'label': doc.doc_spacy[first_tid:end_tid].text},
                'position': {'x': x_chars * char_px, 'y': y_start},
                'classes': 'noun_phrase' if pid >= 0 else 'text',
                'locked': True,
            })
        y_start += sentence_gap
    y_start += section_gap
        
# One Cytoscape edge per (u, v, edge_type) triple in dkg.edges. Source and
# target are the `.start` of the two phrases, i.e. the same kind of value the
# node ids above are built from; `type` is what the stylesheet selects on.
# NOTE(review): `dkg` is not created in any cell visible in this notebook --
# confirm where it is defined before running on a fresh kernel.
edges = [
    dict(data=dict(source=doc.phrases[u].start, target=doc.phrases[v].start, type=edge_type))
    for u, v, edge_type in dkg.edges
]

def _edge_type_rule(edge_type, color, dashed=False):
    """Return the stylesheet rule colouring edges whose `type` equals `edge_type`.

    The line and the target arrow both use `color`; `dashed=True` additionally
    sets a dashed line style.
    """
    rule_style = {'line-color': color}
    if dashed:
        rule_style['line-style'] = 'dashed'
    rule_style['target-arrow-color'] = color
    return {'selector': f'edge[type = "{edge_type}"]', 'style': rule_style}


# Dash app with a single Cytoscape canvas showing the text nodes and DKG edges
# at their precomputed ('preset') positions on a dark background.
app = dash.Dash(__name__)
app.layout = html.Div([
    cyto.Cytoscape(
        id='cytoscape',
        elements=nodes + edges,
        # NOTE(review): 'zoomingEnabled' is passed inside the layout options
        # here; dash-cytoscape also has a component-level `zoomingEnabled`
        # prop -- confirm which of the two was intended.
        layout={'name': 'preset', 'zoomingEnabled': False},
        style={'height': '2000px', 'backgroundColor': "#1E1E1E"},
        stylesheet=[
            # Nodes are drawn as 1x1 shapes with the label to their right,
            # so effectively only the monospaced label is visible.
            {
                'selector': 'node',
                'style': {
                    'label': 'data(label)',
                    'font-family': 'Courier',
                    'font-size': '10',
                    'text-halign': 'right',
                    'width': '1',
                    'height': '1',
                    'color': "#ce9178",
                },
            },
            # Noun-phrase chunks: different colour, bold.
            {
                'selector': '.noun_phrase',
                'style': {'color': "#9CDCFE", 'font-weight': 'bold'},
            },
            # Defaults for every edge; per-type colours follow.
            {
                'selector': 'edge',
                'style': {
                    'curve-style': 'unbundled-bezier',
                    'target-arrow-shape': 'vee',
                    'width': 1,
                    'opacity': 0.3,
                },
            },
            _edge_type_rule(SUBJ_OBJ, "#C586C0"),
            _edge_type_rule(COREF, "#4EC9B0", dashed=True),
            _edge_type_rule(SHARED_TEXT, "#4FC1FF", dashed=True),
        ],
    )
])

# Start the Dash server and show the app outside the notebook
# (jupyter_mode="external"). `app.run` is used instead of the deprecated
# `app.run_server` alias, which newer Dash releases no longer provide; the
# arguments are unchanged.
# NOTE(review): host and port are hard-coded to one specific machine --
# adjust them when running elsewhere.
if __name__ == '__main__':
    app.run(jupyter_mode="external", port=8051, host='128.174.136.27')

Dash app running on http://128.174.136.27:8051/
