# Tools

In [None]:
from collections import defaultdict, Counter
import math
import re
import pymupdf
import pymupdf4llm
from nltk import sent_tokenize, word_tokenize
import os
from openai import OpenAI
# Shared OpenAI client used by the chat-completion cells below. The API key is read from the
# OPENAI_AUTO_SURVEY environment variable; a KeyError is raised here if it is not set.
client = OpenAI(api_key=os.environ['OPENAI_AUTO_SURVEY'])

import spacy
import spacy.tokens
import pytextrank
from fastcoref import spacy_component
import numpy as np

# nlp.add_pipe("fastcoref")

## Question decomposition

In [2]:
# Prompt for the first user turn: asks the model to list every general term in the task description
# that has to be made concrete from the paper. The `{task}` placeholder is filled in with str.format.
extract_general_term_prompt = '''The following task involves extracting specific information from a paper. The task description may include general terms that represent types of concrete entities, which need to be identified and clarified based on the paper. Your objective is to identify and list **ALL** general terms or concepts in the task description that might be open to interpretation, require further specification using details from the paper and be critical in locating relevant information in the paper. These include:
    1.	Specific entities, models, methods, or processes referenced in the description.
    2.	Broad categories or classifications that require more detailed breakdowns.
    3.	Implicit generalizations or assumptions that could benefit from contextual examples or precise definitions.

Some general terms may refer to the same entity in the description. You should only list one general term for each entity. Make sure you cover **ALL** the entities.

Task: {task}'''


# Prompt for the second user turn: asks the model to arrange the terms from its previous answer into a
# numbered parent/child hierarchy. It has no placeholders and is sent as-is.
organize_general_term_prompt = '''Understand the hierarchy among the general terms you listed above with respect to the "Parent-Child" relationship:

Parent Concept:
A parent concept represents a broader, overarching idea or category that serves as the foundation for related subordinate ideas. It is independent and provides the contextual framework or structure for its associated dependent concepts.

Child Concept:
A child concept is a more specific, subordinate idea that derives meaning, classification, or context from its associated parent concept. It depends on the parent concept for its definition and existence within a hierarchical structure.

Organize the general terms you listed above hierarchically based on their dependencies, ensuring that parent concepts are listed first, followed by their dependent child concepts. Use indentation to represent the hierarchy, with the format as follows:

1.	Parent concept
    1.1 Dependent child concept
        1.1.1 Dependent grandchild concept
        1.1.2 Dependent grandchild concept
    1.2 Dependent child concept
2.	Parent concept
    2.1 Dependent child concept

Only use the general terms identified in your previous response to create this hierarchical structure.'''

# Prompt for the third user turn: asks the model to break the task into single-step "What" or
# "True or False" questions, starting from child entities and moving up to their parents.
generate_checkpoint_prompt = '''To find the relevant information step by step, break down the task into a series of simple, single-step questions. Each question should be narrowly focused, collecting or verifying only one attribute of one entity or entity type, or serving as a follow-up to refine the scope with one additional attribute. Each question can be either a "What" question or a "True or False" question. Always start with questions for the low level entities (child entities) and then move forward to questions for their parent entity. This structured approach ensures clarity and precision in locating relevant information from the paper.'''

In [3]:
# Task description that is substituted into `extract_general_term_prompt` via `{task}`.
# Alternative task, currently disabled ("LLM Embeddings + RS" paradigm):
# task = '''Extract the modeling paradigms proposed in the paper that satisfy the following type:

# LLM Embeddings + RS. This modeling paradigm views the language model as a feature extractor, which feeds the features of items and users into LLMs and outputs corresponding embeddings. A traditional RS model can utilize knowledge-aware embeddings for various recommendation tasks.'''

# Active task ("LLM as RS" paradigm):
task = '''Extract the modeling paradigms proposed in the paper that satisfy the following type:

LLM as RS. This paradigm aims to directly transfer pre-trained LLM into a powerful recommendation system. The input sequence usually consists of the profile description, behavior prompt, and task instruction. The output sequence is expected to offer a reasonable recommendation result.'''

Sub-questions are designed to find necessary information.

In [None]:
# Step 1 of the decomposition dialogue: send the task and ask for the list of general terms.
# NOTE(review): GPT_MODEL_EXPENSIVE is not assigned in any of the cells shown above this one;
# presumably it is provided by the `sci_review` star imports further down - confirm, otherwise
# this cell raises NameError on a fresh kernel.
chat_completion = client.chat.completions.create(
    messages=[
        dict(role="user", content=extract_general_term_prompt.format(task=task)),
    ],
    model=GPT_MODEL_EXPENSIVE,
)
# Keep the raw answer: it is replayed as the assistant turn in the follow-up requests.
general_term_str = chat_completion.choices[0].message.content
print(general_term_str)

In [None]:
# Step 2 of the decomposition dialogue: replay step 1 (question + stored answer) and ask the model
# to organise the listed terms into a parent/child hierarchy.
# NOTE(review): GPT_MODEL_EXPENSIVE is not assigned in any of the cells shown above this one;
# presumably it comes from the `sci_review` star imports further down - confirm.
chat_completion = client.chat.completions.create(
    messages=[
        dict(role="user", content=extract_general_term_prompt.format(task=task)),
        dict(role="assistant", content=general_term_str),
        dict(role="user", content=organize_general_term_prompt),
    ],
    model=GPT_MODEL_EXPENSIVE,
)
# Keep the hierarchy text: it is replayed as an assistant turn in the next request.
general_term_hierarchy_str = chat_completion.choices[0].message.content
print(general_term_hierarchy_str)

In [None]:
# Step 3 of the decomposition dialogue: replay both earlier exchanges and ask for the
# single-step checkpoint questions.
# NOTE(review): GPT_MODEL_EXPENSIVE is not assigned in any of the cells shown above this one;
# presumably it comes from the `sci_review` star imports further down - confirm.
chat_completion = client.chat.completions.create(
    messages=[
        dict(role="user", content=extract_general_term_prompt.format(task=task)),
        dict(role="assistant", content=general_term_str),
        dict(role="user", content=organize_general_term_prompt),
        dict(role="assistant", content=general_term_hierarchy_str),
        dict(role="user", content=generate_checkpoint_prompt),
    ],
    model=GPT_MODEL_EXPENSIVE,
)
checkpoint_str = chat_completion.choices[0].message.content
print(checkpoint_str)

Some necessary sub-questions are missing from the example above.
The generated sub-questions mostly ask for definitions of terms rather than for the information the task needs.
**TODO:** check how other papers approach question decomposition.

## Outline

In [None]:
from sci_review.paper import *
from sci_review.text import *

In [2]:
# Path of the PDF to analyse; it is passed to doc.load_doc() in a later cell.
# Alternative input, currently disabled:
# doc_file = '../../data/systematic_review_papers/planning/CALM.pdf'
doc_file = 'aclsum.pdf'
# Disabled snippet that dumped the plain text of every page to output.txt.
# NOTE(review): it iterates over `doc`, which is not opened anywhere in this cell.
 # open a document
# out = open("output.txt", "wb") # create a text output
# for page in doc: # iterate the document pages
#     text = page.get_text().encode("utf8") # get plain text (is in UTF-8)
#     out.write(text) # write text of page
#     out.write(bytes((12,))) # write page delimiter (form feed 0x0C)
# out.close()

In [None]:
# Create the document manager that the rest of the notebook works on.
# DocManager is not defined in this notebook; presumably it comes from the `sci_review` star imports - confirm.
doc = DocManager()

In [None]:
# Load the PDF selected in `doc_file`. Later cells read doc.dkg, doc.phrases, doc.sections,
# doc.doc_spacy and doc.tid2phrase_id, so presumably this call populates them - confirm.
doc.load_doc(doc_file)

In [None]:
# Print every (key, value) pair stored under doc.dkg[0][1].
# Presumably these are the edges between node 0 and node 1, keyed by edge type - confirm.
for k in doc.dkg[0][1].items():
    print(k)

In [24]:
# Group the edges between `test_node` and each of its neighbours in doc.dkg by edge type.
# Each stored entry is (phrase of test_node, phrase of the neighbour, edge data).
test_node = 0
edge_type_test = defaultdict(list)
for nbr in doc.dkg[test_node].keys():
    for edge_type, edge_data in doc.dkg[test_node][nbr].items():
        edge_type_test[edge_type].append((doc.phrases[test_node], doc.phrases[nbr], edge_data))
# Show which edge types were found for this node.
edge_type_test.keys()

In [None]:
# Inspect the first 'subj_obj' entry collected above (IndexError if none was collected).
edge_type_test['subj_obj'][0]

In [None]:
# Keep every (subject, object, path) whose path starts in a different sentence than the subject.
# NOTE(review): `path[0]` assumes each stored path is non-empty and holds objects with a `.sent`
# attribute - confirm against how DocManager builds the 'paths' lists.
cross_sent_edges = [
    (subj, obj, path)
    for subj, obj, edge_data in edge_type_test['subj_obj']
    for path in edge_data['paths']
    if subj.sent.start != path[0].sent.start
]
print(len(cross_sent_edges))

In [32]:
# Pair each pronoun with its coreference list, keeping only pronouns that have at least one
# coreference located in a different sentence than the pronoun itself.
prons2corefs = [
    (pron, section.pron_root2corefs[pron.root.i])
    for section in doc.sections
    for pron in section.prons
    if any(
        pron.sent.start != coref.sent.start
        for coref in section.pron_root2corefs[pron.root.i]
    )
]

In [5]:
# Planned subject/object pairing rules per clause-level dependency label:
# advcl, ccomp: 1. subj in the clause and the objs in the clause; 2. subj in the head clause and the objs are all the nouns in the clause
# acl: clausal modifier of noun (adjectival clause), 1. subj in the clause and the objs in the clause; 2. subj be the head of the clause and objs should be all the nouns in the clause
# xcomp: open clausal complement, 1. subjs are subj/obj in the head clause and the objs are all the nouns in the clause
# relcl: if the clause has a noun phrase as the subj, then the head of the clause is the object, otherwise the head of the clause is the subj
# pcomp: 1. subj in the clause and the objs in the clause; 2. subj in the head clause and the objs are all the nouns in the clause; 3. noun phrase head of the clause as the subj

# Bucket every VERB token of the whole document by its dependency label.
dep2verbs = defaultdict(list[Token])
for token in doc.doc_spacy:
    if token.pos_ != 'VERB':
        continue
    dep2verbs[token.dep_].append(token)

# Disabled variant that walked the per-section parses instead of the document-level one:
# for section in doc.sections:
#     if section.section_nlp_local is None:
#         continue
#     for token in section.section_nlp_local:
#         if token.pos_ == 'VERB':
#             dep2verbs[token.dep_].append(token)

In [None]:
# (dependency label, number of verbs) pairs, most frequent label first.
# Ties keep the insertion order of dep2verbs because sorted() is stable.
sorted(
    ((dep, len(verbs)) for dep, verbs in dep2verbs.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Verbs labelled 'acomp' whose token maps to a negative phrase id in doc.tid2phrase_id.
check_verbs = list(filter(lambda verb: doc.tid2phrase_id[verb.i] < 0, dep2verbs['acomp']))
print(len(check_verbs))

In [None]:
# Take the first candidate verb (IndexError if check_verbs is empty) and show it with its sentence.
test_verb = check_verbs[0]
print(test_verb)
print(test_verb.sent)

In [None]:
# Render the dependency parse of the sentence that contains `test_verb`.
import spacy.displacy

# Other sentences that were inspected earlier (disabled):
# spacy.displacy.render(subjs[1].sent)
# sent = list(doc.sections[0].section_nlp_local.sents)[0]
# sent = doc.phrases[98].sent
sent = test_verb.sent
spacy.displacy.render(sent)

In [59]:
# Prototype: build the subject -> object edges of the document knowledge graph (DKG) for ONE sentence.
# The sentence is split into clauses at clause-level dependency labels. Noun phrases are linked inside
# each clause first and then across clause boundaries, using these rules per label of the clause root:
# advcl, ccomp: 1. subj in the clause and the objs in the clause; 2. subj in the head clause and the objs are all the nouns in the clause
# acl: clausal modifier of noun (adjectival clause), 1. subj in the clause and the objs in the clause; 2. subj be the head of the clause and objs should be all the nouns in the clause
# xcomp: open clausal complement, 1. subjs are subj/obj in the head clause and the objs are all the nouns in the clause
# relcl: if the clause has a noun phrase as the subj, then the head of the clause is the object, otherwise the head of the clause is the subj
# pcomp: 1. subj in the clause and the objs in the clause; 2. subj in the head clause and the objs are all the nouns in the clause; 3. noun phrase head of the clause as the subj
#
# NOTE(review): nx, itertools, Span and SUBJ_OBJ are not imported or defined in this notebook's visible
# cells; presumably they are provided by the `sci_review` star imports - confirm.

ADVCL, CCOMP, ACL, XCOMP, RELCL, PCOMP = 'advcl', 'ccomp', 'acl', 'xcomp', 'relcl', 'pcomp'
NOMINAL_POS = {'NOUN', 'PRON', 'PROPN'}


def _add_subj_obj_path(graph, subj_id, obj_id, path):
    """Append `path` to the SUBJ_OBJ edge `subj_id -> obj_id`, creating the edge on first use."""
    if not graph.has_edge(subj_id, obj_id, key=SUBJ_OBJ):
        graph.add_edge(subj_id, obj_id, key=SUBJ_OBJ, paths=[], weight=1)
    graph[subj_id][obj_id][SUBJ_OBJ]['paths'].append(path)


def _tokens_between(tree, source_tid, target_tid, offset):
    """Return the interior nodes of the shortest path between two tokens of `tree`, each shifted by `offset`."""
    return tuple(tid + offset for tid in nx.shortest_path(tree, source=source_tid, target=target_tid)[1:-1])


def _resolve_phrase_id(local_tid, section, doc_manager, offset):
    """Return the phrase id for a section-local token index, or a negative id if none is found.

    When the token itself has no phrase, its entries in `section.pron_root2corefs` are scanned in
    reverse order and the first one that maps to a phrase is used.
    """
    phrase_id = doc_manager.tid2phrase_id[local_tid + offset]
    if phrase_id < 0 and local_tid in section.pron_root2corefs:
        for coref in section.pron_root2corefs[local_tid][::-1]:
            coref_phrase_id = doc_manager.tid2phrase_id[offset + coref.root.i]
            if coref_phrase_id >= 0:
                return coref_phrase_id
    return phrase_id


section = doc.sections[0]
# Offset between indices into section.section_nlp_local and the indices used by doc.tid2phrase_id / doc.phrases.
sec_start = section.section_nlp_global.start
clause_deps = {ADVCL, CCOMP, ACL, XCOMP, RELCL, PCOMP}
dkg = nx.MultiDiGraph()

last_subjs, last_sent_start = set[int](), -1
# Hard-coded test sentence: the one containing phrase 98, re-sliced from the section-local parse.
# NOTE(review): this assumes phrase 98 lies inside doc.sections[0] - confirm.
sent = doc.phrases[98].sent
sent = section.section_nlp_local[sent.start - sec_start: sent.end - sec_start]

dep_trees = list[nx.DiGraph]()          # one dependency tree per clause, indexed by clause id
sent_dep_tree = nx.Graph()              # undirected dependency tree of the whole sentence
roots = [sent.root]                     # clause roots still to be expanded
tid2tree_id = dict[int, int]()          # token index -> id of the clause tree that contains it
root2clause_roots = defaultdict(list[int])  # clause root -> roots of the sub-clauses hanging below it
clause_id2subjs = defaultdict(list[tuple[int, Span]])
clause_id2objs = defaultdict(list[tuple[int, Span]])

# Split the sentence into clause trees. A child starts a new clause when it carries a clause-level
# dependency label and is not part of a noun phrase.
while roots:
    dep_tree = nx.DiGraph()
    root = roots.pop()
    dep_tree.add_node(root.i, dep=root.dep_ if root.dep_ not in clause_deps else f'{root.dep_}_ROOT')
    dep_tree.graph['root'] = root.i
    tokens = [root]
    tid2tree_id[root.i] = len(dep_trees)
    while tokens:
        token = tokens.pop()
        for child in token.children:
            sent_dep_tree.add_edge(token.i, child.i)
            if child.dep_ in clause_deps and doc.tid2phrase_id[child.i + sec_start] < 0:
                roots.append(child)
                # Record the sub-clause ROOT (the child), not its head: the cross-clause pass below
                # looks up tid2tree_id, .dep_ and .head of exactly this token.
                root2clause_roots[root.i].append(child.i)
            else:
                tid2tree_id[child.i] = len(dep_trees)
                dep_tree.add_node(child.i, dep=child.dep_)
                dep_tree.add_edge(token.i, child.i)
                tokens.append(child)
    dep_trees.append(dep_tree)

new_last_subjs = set[int]()

# Collect subjs and objs in each clause and add in-clause subj-obj edges (coreference outside the clause is considered)
for clause_id, dep_tree in enumerate(dep_trees):
    # Find the noun phrases that have at least one token in this clause.
    noun_phrases = list[Span]()
    phrase_ids = {doc.tid2phrase_id[sec_start + node] for node in dep_tree.nodes}
    for phrase_id in phrase_ids:
        if phrase_id >= 0:
            noun_phrase = doc.phrases[phrase_id]
            noun_phrases.append(section.section_nlp_local[noun_phrase.start - sec_start: noun_phrase.end - sec_start])

    labelled_phrases = [('np', phrase) for phrase in noun_phrases] + [('p', pron) for pron in section.prons if dep_tree.has_node(pron.root.i)]
    for np_label, noun_phrase in labelled_phrases:
        # Follow conj/appos links upwards so a coordinated phrase inherits the role of its head phrase.
        root_phrase = noun_phrase
        while root_phrase.root.dep_ in {'conj', 'appos'}:
            root_phrase_id: int = doc.tid2phrase_id[sec_start + root_phrase.root.head.i]
            if root_phrase_id < 0:
                break
            root_phrase_global = doc.phrases[root_phrase_id]
            root_phrase = section.section_nlp_local[root_phrase_global.start - sec_start: root_phrase_global.end - sec_start]

        # Node id in the DKG: the phrase itself, or for pronouns the first coreference (scanned in
        # reverse order) that maps to a phrase.
        global_phrase_id = -1
        if np_label == 'np':
            global_phrase_id = doc.tid2phrase_id[sec_start + noun_phrase.root.i]
        else:
            for coref in section.pron_root2corefs[noun_phrase.root.i][::-1]:
                coref_phrase_id = doc.tid2phrase_id[sec_start + coref.root.i]
                if coref_phrase_id >= 0:
                    global_phrase_id = coref_phrase_id
                    break
        if global_phrase_id < 0:
            continue

        if 'subj' in root_phrase.root.dep_:
            clause_id2subjs[clause_id].append((global_phrase_id, root_phrase))
            # Only subjects of the main clause are remembered for the next sentence.
            if dep_tree.nodes[dep_tree.graph['root']]['dep'] == 'ROOT':
                new_last_subjs.add(global_phrase_id)
        else:
            clause_id2objs[clause_id].append((global_phrase_id, root_phrase))

    undirected_dep_tree = dep_tree.to_undirected()
    for (subj, subj_root), (obj, obj_root) in itertools.product(clause_id2subjs[clause_id], clause_id2objs[clause_id]):
        _add_subj_obj_path(dkg, subj, obj, _tokens_between(undirected_dep_tree, subj_root.root.i, obj_root.root.i, sec_start))
        # for last_subj in last_subjs:
        #     if dkg.has_edge(last_subj, obj, key=ADJACENT):
        #         dkg[last_subj][obj][ADJACENT]['sent_range'].append((last_sent_start + section.section_nlp_global.start, sent.end + section.section_nlp_global.start))
        #     else:
        #         dkg.add_edge(last_subj, obj, key=ADJACENT, sent_range=[(last_sent_start + section.section_nlp_global.start, sent.end + section.section_nlp_global.start)], weight=2)

# Collect cross-clause edges
for clause_id, dep_tree in enumerate(dep_trees):
    for sub_clause_root in root2clause_roots[dep_tree.graph['root']]:
        sub_clause_id = tid2tree_id[sub_clause_root]
        sub_clause_token = section.section_nlp_local[sub_clause_root]
        sub_clause_dep = sub_clause_token.dep_
        sub_clause_phrases = clause_id2subjs[sub_clause_id] + clause_id2objs[sub_clause_id]

        if sub_clause_dep in {ADVCL, CCOMP}:
            # Subjects of the head clause -> every noun phrase of the sub-clause.
            for (subj, subj_root), (obj, obj_root) in itertools.product(clause_id2subjs[clause_id], sub_clause_phrases):
                _add_subj_obj_path(dkg, subj, obj, _tokens_between(sent_dep_tree, subj_root.root.i, obj_root.root.i, sec_start))
        elif sub_clause_dep == ACL:
            # The noun modified by the clause -> every noun phrase of the sub-clause.
            clause_head_tid = sub_clause_token.head.i
            clause_head_noun_phrase_id = _resolve_phrase_id(clause_head_tid, section, doc, sec_start)
            if clause_head_noun_phrase_id < 0:
                continue
            for obj, obj_root in sub_clause_phrases:
                _add_subj_obj_path(dkg, clause_head_noun_phrase_id, obj, _tokens_between(sent_dep_tree, clause_head_tid, obj_root.root.i, sec_start))
        elif sub_clause_dep == XCOMP:
            # Subjects of the head clause -> every noun phrase of the sub-clause (dependency path).
            for (subj, subj_root), (obj, obj_root) in itertools.product(clause_id2subjs[clause_id], sub_clause_phrases):
                _add_subj_obj_path(dkg, subj, obj, _tokens_between(sent_dep_tree, subj_root.root.i, obj_root.root.i, sec_start))
            # Objects of the head clause -> every noun phrase of the sub-clause. Here the path is the
            # linear run of tokens between the two phrase roots, not a dependency path.
            for (head_obj, head_obj_root), (obj, obj_root) in itertools.product(clause_id2objs[clause_id], sub_clause_phrases):
                path = tuple(tid + sec_start for tid in range(head_obj_root.root.i + 1, obj_root.root.i))
                _add_subj_obj_path(dkg, head_obj, obj, path)
        elif sub_clause_dep == RELCL:
            clause_head_tid = sub_clause_token.head.i
            clause_head_noun_phrase_id = _resolve_phrase_id(clause_head_tid, section, doc, sec_start)
            if clause_head_noun_phrase_id < 0:
                continue
            if clause_id2subjs[sub_clause_id]:
                # The relative clause has its own subject: the modified noun becomes the object.
                for subj, subj_root in clause_id2subjs[sub_clause_id]:
                    _add_subj_obj_path(dkg, subj, clause_head_noun_phrase_id, _tokens_between(sent_dep_tree, subj_root.root.i, clause_head_tid, sec_start))
            else:
                # No subject inside the relative clause: the modified noun acts as the subject.
                for obj, obj_root in clause_id2objs[sub_clause_id]:
                    _add_subj_obj_path(dkg, clause_head_noun_phrase_id, obj, _tokens_between(sent_dep_tree, clause_head_tid, obj_root.root.i, sec_start))
        elif sub_clause_dep == PCOMP:
            # Subjects of the head clause -> every noun phrase of the sub-clause.
            for (subj, subj_root), (obj, obj_root) in itertools.product(clause_id2subjs[clause_id], sub_clause_phrases):
                _add_subj_obj_path(dkg, subj, obj, _tokens_between(sent_dep_tree, subj_root.root.i, obj_root.root.i, sec_start))
            # Climb from the clause root to the nearest nominal ancestor; stop at the sentence root,
            # whose head is itself.
            clause_head = sub_clause_token.head
            while clause_head.pos_ not in NOMINAL_POS:
                new_clause_head = clause_head.head
                if new_clause_head == clause_head:
                    break
                clause_head = new_clause_head
            if clause_head.pos_ not in NOMINAL_POS:
                continue
            clause_head_tid = clause_head.i
            clause_head_noun_phrase_id = _resolve_phrase_id(clause_head_tid, section, doc, sec_start)
            if clause_head_noun_phrase_id < 0:
                continue
            for obj, obj_root in sub_clause_phrases:
                _add_subj_obj_path(dkg, clause_head_noun_phrase_id, obj, _tokens_between(sent_dep_tree, clause_head_tid, obj_root.root.i, sec_start))


# State that a multi-sentence version would carry over to the next sentence.
last_subjs = new_last_subjs
last_sent_start = sent.start

In [None]:
# Render the dependency parse of the sentence processed in the previous cell.
spacy.displacy.render(sent)

In [None]:
# Show every edge of the prototype graph together with its attribute dict.
dkg.edges(data=True)

In [None]:
# Show the pronoun-root -> coreferences mapping of the section used above.
section.pron_root2corefs

In [None]:
# Print each edge with the text of its two phrases; node ids index into doc.phrases.
for subj, obj, edge_data in dkg.edges(data=True):
    print(f'{doc.phrases[subj].text}, {doc.phrases[obj].text}, {edge_data}')

In [None]:
# Inspect the SUBJ_OBJ edge between the hard-coded nodes 101 and 102 (KeyError if it does not exist).
dkg[101][102][SUBJ_OBJ]

## Plot DKG

In [None]:
# Show the sentence that is plotted in the next cell.
sent

In [None]:
# Plot the tokens of `sent` on one horizontal line with Dash Cytoscape. No edges are drawn yet.
import dash
import dash_cytoscape as cyto
from dash import html

# Horizontal pixels per character offset; chosen together with the 10pt Courier labels below.
X_SCALE = 6

app = dash.Dash(__name__)
app.layout = html.Div([
    cyto.Cytoscape(
        id='cytoscape',
        elements=[
            {
                # Use the token index as id: element ids must be unique, and a word can occur
                # more than once in a sentence. The visible label is still the token text.
                'data': {'id': str(token.i), 'label': token.text},
                # x follows the character offset of the token inside the sentence.
                'position': {'x': (token.idx - sent[0].idx) * X_SCALE, 'y': 0},
            }
            for token in sent
        ],
        # 'preset' keeps the positions given above instead of running a layout algorithm.
        layout={'name': 'preset'},
        style={'width': '100%', 'height': '100px', 'backgroundColor': 'white'},
        stylesheet=[
            {
                'selector': 'node',
                'style': {
                    'label': 'data(label)',
                    'font-family': 'Courier',
                    'font-size': '10',
                    'font-weight': 'bold',
                    'text-halign': 'right',
                    'width': '1',
                    'height': '1',
                }
            },
            # Edge style to enable once edges are added to `elements`:
            # {
            #     'selector': 'edge',
            #     'style': {
            #         'label': 'data(label)',
            #         'font-family': 'Courier New',
            #         'font-size': '12px'
            #     }
            # }
        ]
    )
])

if __name__ == '__main__':
    # NOTE(review): newer Dash releases document `app.run(...)` in place of `run_server` - confirm
    # against the installed Dash version.
    app.run_server(debug=True)