# Tools

In [7]:
from collections import defaultdict, Counter
import math
import re
import pymupdf
import pymupdf4llm
from nltk import sent_tokenize, word_tokenize
import os
from openai import OpenAI
# Shared OpenAI client for every chat-completion cell below. The API key is read
# from the OPENAI_AUTO_SURVEY environment variable (KeyError if it is not set).
client = OpenAI(
    api_key=os.environ['OPENAI_AUTO_SURVEY'],
)

import spacy
import spacy.tokens
import pytextrank
from fastcoref import spacy_component
import numpy as np

# nlp.add_pipe("fastcoref")

## Question decompose

In [8]:
# First prompt of the decomposition conversation: asks the model to list the
# general terms found in a task description. The `{task}` placeholder is filled
# with str.format(task=...) before the prompt is sent.
extract_general_term_prompt = '''The following task involves extracting specific information from a paper. The task description may include general terms that represent types of concrete entities, which need to be identified and clarified based on the paper. Your objective is to identify and list **ALL** general terms or concepts in the task description that might be open to interpretation, require further specification using details from the paper and be critical in locating relevant information in the paper. These include:
    1.	Specific entities, models, methods, or processes referenced in the description.
    2.	Broad categories or classifications that require more detailed breakdowns.
    3.	Implicit generalizations or assumptions that could benefit from contextual examples or precise definitions.

Some general terms may refer to the same entity in the description. You should only list one general term for each entity. Make sure you cover **ALL** the entities.

Task: {task}'''


# Second user turn of the conversation (no placeholders, sent verbatim): asks the
# model to arrange the terms it listed in its previous answer into a numbered,
# indented parent/child hierarchy.
organize_general_term_prompt = '''Understand the hierarchy among the general terms you listed above with respect to the "Parent-Child" relationship:

Parent Concept:
A parent concept represents a broader, overarching idea or category that serves as the foundation for related subordinate ideas. It is independent and provides the contextual framework or structure for its associated dependent concepts.

Child Concept:
A child concept is a more specific, subordinate idea that derives meaning, classification, or context from its associated parent concept. It depends on the parent concept for its definition and existence within a hierarchical structure.

Organize the general terms you listed above hierarchically based on their dependencies, ensuring that parent concepts are listed first, followed by their dependent child concepts. Use indentation to represent the hierarchy, with the format as follows:

1.	Parent concept
    1.1 Dependent child concept
        1.1.1 Dependent grandchild concept
        1.1.2 Dependent grandchild concept
    1.2 Dependent child concept
2.	Parent concept
    2.1 Dependent child concept

Only use the general terms identified in your previous response to create this hierarchical structure.'''

# Third user turn of the conversation (no placeholders): asks the model to break
# the task into single-step "What" / "True or False" sub-questions, starting with
# child entities and then moving to their parents.
generate_checkpoint_prompt = '''To find the relevant information step by step, break down the task into a series of simple, single-step questions. Each question should be narrowly focused, collecting or verifying only one attribute of one entity or entity type, or serving as a follow-up to refine the scope with one additional attribute. Each question can be either a "What" question or a "True or False" question. Always start with questions for the low level entities (child entities) and then move forward to questions for their parent entity. This structured approach ensures clarity and precision in locating relevant information from the paper.'''

In [9]:
# Alternative task description ("LLM Embeddings + RS"), currently disabled:
# task = '''Extract the modeling paradigms proposed in the paper that satisfy the following type:

# LLM Embeddings + RS. This modeling paradigm views the language model as a feature extractor, which feeds the features of items and users into LLMs and outputs corresponding embeddings. A traditional RS model can utilize knowledge-aware embeddings for various recommendation tasks.'''

# Active task description ("LLM as RS"); this is the text substituted into the
# `{task}` placeholder of extract_general_term_prompt in the cells below.
task = '''Extract the modeling paradigms proposed in the paper that satisfy the following type:

LLM as RS. This paradigm aims to directly transfer pre-trained LLM into a powerful recommendation system. The input sequence usually consists of the profile description, behavior prompt, and task instruction. The output sequence is expected to offer a reasonable recommendation result.'''

Sub-questions are designed to find necessary information.

In [None]:
# Step 1: send the term-extraction prompt (with the task filled in) and keep the
# model's reply as text.
# NOTE(review): GPT_MODEL_EXPENSIVE is not defined in any cell above this one;
# presumably it comes from one of the star imports further down. Confirm it is
# in scope on a fresh kernel.
chat_completion = client.chat.completions.create(
    model=GPT_MODEL_EXPENSIVE,
    messages=[
        {"role": "user", "content": extract_general_term_prompt.format(task=task)},
    ],
)
general_term_str = chat_completion.choices[0].message.content
print(general_term_str)

In [None]:
# Step 2: replay the step-1 exchange (prompt + the model's term list) and add the
# hierarchy prompt as a new user turn; keep the reply as text.
chat_completion = client.chat.completions.create(
    model=GPT_MODEL_EXPENSIVE,
    messages=[
        {"role": "user", "content": extract_general_term_prompt.format(task=task)},
        {"role": "assistant", "content": general_term_str},
        {"role": "user", "content": organize_general_term_prompt},
    ],
)
general_term_hierarchy_str = chat_completion.choices[0].message.content
print(general_term_hierarchy_str)

In [None]:
# Step 3: replay both earlier exchanges (term list, then hierarchy) and add the
# checkpoint prompt as the final user turn; keep the generated sub-questions as text.
chat_completion = client.chat.completions.create(
    model=GPT_MODEL_EXPENSIVE,
    messages=[
        {"role": "user", "content": extract_general_term_prompt.format(task=task)},
        {"role": "assistant", "content": general_term_str},
        {"role": "user", "content": organize_general_term_prompt},
        {"role": "assistant", "content": general_term_hierarchy_str},
        {"role": "user", "content": generate_checkpoint_prompt},
    ],
)
checkpoint_str = chat_completion.choices[0].message.content
print(checkpoint_str)

Some necessary sub-questions are missing from the example above.
The generated sub-questions mostly ask for definitions of terms.
TODO: check how other papers approach question decomposition.

## Outline

In [13]:
from sci_review.paper import *
import spacy.displacy

In [None]:
from run_aclsum import *

In [2]:
# Path of the PDF to load; the commented-out lines are alternative inputs.
# NOTE(review): ACLSUM_PDF_DIR is not defined in this notebook's visible cells;
# presumably it is provided by `from run_aclsum import *`. Confirm.
# doc_file = '../../data/systematic_review_papers/planning/CALM.pdf'
# doc_file = 'aclsum.pdf'
doc_file = f'{ACLSUM_PDF_DIR}/P19-1352.pdf' # 41
# Section outline for the paper, passed to doc.load_doc(..., outline=outline).
# The ``` fence lines and the 4-space indentation are part of the string itself.
outline = '''```
1 Abstract
2 Introduction
3 Approach
    3.1 Shared-Private Bilingual Word Embeddings
        3.1.1 Words with Similar Lexical Meaning
        3.1.2 Words with Same Word Form
        3.1.3 Unrelated Words
    3.2 Implementation
4 Experiments
    4.1 Setup
    4.2 Main Results
    4.3 Effect on Sharing Coefficients
    4.4 Effect on Alignment Quality
    4.5 Analysis of the Translation Results
    4.6 Analysis of the Learned Embeddings
5 Related Work
6 Conclusion
Acknowledgements
References
```
'''

In [16]:
# TODO: The following section name is placed in the wrong position in the table of contents. Please ensure that all sections are listed in the correct order as they appear in the paper.

# 2.1 Tree Substitution Grammar

In [None]:
# Read words_alpha.txt, turn its lines into a set, and hand that set to
# DocManager as its word vocabulary.
with open('words_alpha.txt') as f:
    words_alpha = {*f.read().splitlines()}
doc = DocManager(word_vocab=words_alpha)

In [None]:
# Load the selected PDF into the DocManager, supplying the outline string defined above in this notebook.
doc.load_doc(doc_file, outline=outline)

In [None]:
# Print the DocManager's full_outline attribute as plain text.
print(doc.full_outline)

In [10]:
# Save the full outline into ACLSUM_PDF_DIR. The file name is derived from the
# PDF that is actually loaded (doc_file) instead of a hard-coded paper id, so the
# outline of one paper cannot be written under another paper's name.
paper_id = os.path.splitext(os.path.basename(doc_file))[0]  # e.g. '.../P19-1352.pdf' -> 'P19-1352'
with open(f"{ACLSUM_PDF_DIR}/outline_{paper_id}.txt", 'w') as f:
    f.write(doc.full_outline)

In [None]:
# Spot-check: print the text of the section at index 1.
print(doc.sections[1].text)

In [None]:
# Print the DocManager's outline attribute (distinct from full_outline, which is printed in another cell).
print(doc.outline)

In [None]:
# Spot-check: show the header of the section at index 4 (bare expression, so it is rendered as the cell output).
doc.sections[4].header

In [None]:
# Show full_outline as the cell's output value (its repr, rather than the printed form).
doc.full_outline

In [None]:
# Spot-check: print the text of the section at index 20.
print(doc.sections[20].text)

## Plot DKG

In [None]:
# Call the DocManager's plot_dkg() helper on the loaded document.
# NOTE(review): what "DKG" stands for is not shown in this notebook; see sci_review.paper.
doc.plot_dkg()