In [1]:
import asyncio
# import getpass
import os
from datetime import datetime
from hashlib import md5
from typing import Dict, List, Optional

import pandas as pd
# import seaborn as sns
# import tiktoken
from langchain_community.graphs import Neo4jGraph
# from langchain_community.tools import WikipediaQueryRun
# from langchain_community.utilities import WikipediaAPIWrapper
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_text_splitters import TokenTextSplitter
from pydantic import BaseModel, Field, conlist
import openai
from neo4j import GraphDatabase
from langchain.callbacks.openai_info import OpenAICallbackHandler
from langchain_core.rate_limiters import InMemoryRateLimiter
from langchain_core.output_parsers import StrOutputParser

from graphdatascience import GraphDataScience
from langchain.chains import RetrievalQA

  from .autonotebook import tqdm as notebook_tqdm


In [42]:
# Credentials and endpoints are read from the environment (never hard-coded).
# Assigning None into os.environ raises an opaque TypeError, so check for
# unset variables explicitly and report all of them at once.
_REQUIRED_ENV_VARS = ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD", "OPENAI_API_KEY")
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if os.getenv(name) is None]
if _missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )

NEO4j_URI = os.environ["NEO4J_URI"]
NEO4j_USERNAME = os.environ["NEO4J_USERNAME"]
NEO4j_PASSWORD = os.environ["NEO4J_PASSWORD"]

# refresh_schema=False is passed through unchanged from the original setup.
graph = Neo4jGraph(
    url=NEO4j_URI,
    username=NEO4j_USERNAME,
    password=NEO4j_PASSWORD,
    refresh_schema=False,
)

# One uniqueness constraint on `id` per node label; IF NOT EXISTS makes the
# cell safe to re-run.
for _constraint_query in (
    "CREATE CONSTRAINT IF NOT EXISTS FOR (c:Chunk) REQUIRE c.id IS UNIQUE",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (c:AtomicFact) REQUIRE c.id IS UNIQUE",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (c:KeyElement) REQUIRE c.id IS UNIQUE",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (d:Document) REQUIRE d.id IS UNIQUE",
):
    graph.query(_constraint_query)

[]

In [3]:

# Folder holding one extracted-text file per paper.
folder_path = "data/arxiv_stat_text"
# Parallel lists: documents_full[i] is the text of document_names[i].
documents_full = []
document_names = []

# Sort the listing so the document order is the same on every run
# (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(folder_path)):
    paper_path = os.path.join(folder_path, filename)
    # Skip sub-directories and anything else that is not a regular file.
    if not os.path.isfile(paper_path):
        continue
    # Read inside a context manager so the handle is closed promptly, and
    # decode explicitly instead of relying on the platform default encoding.
    with open(paper_path, encoding="utf-8") as paper_file:
        paper_text = paper_file.read()
    # Append only after a successful read so both lists stay aligned.
    document_names.append(filename)
    documents_full.append(paper_text)

## Construct system

Fine-tuning the prompt

In [4]:
# System prompt for the extraction step. It asks the model for atomic facts,
# each paired with a list of key elements, and describes the entity and
# relationship categories to look for. The text is passed verbatim as the
# "system" message of `construction_prompt`.
# NOTE(review): this is a regular (non-raw) string, so the `\n` in the
# "Avoid including vague or redundant entries" bullet is interpreted by Python
# as a real newline character rather than a backslash followed by "n" —
# confirm that is what the prompt is meant to show.
construction_system = """
You are an intelligent assistant tasked with extracting key elements and atomic facts from scientific articles to construct a knowledge graph. 
Your main goal is to identify the same concepts under different names.
Your output must be structured to identify entities, relationships, and hierarchical connections. 


### JSON Output Requirements:
- Ensure all output is valid JSON.
- Each atomic fact must include:
  1. `key_elements`: A **list of unique items** (equations, authors, concepts, methods, etc.). **Limit: 1 to 10 items.** Avoid duplicate or malformed items.
  2. `atomic_fact`: A **single, clear sentence** following the subject-predicate-object structure.
- Ensure that `key_elements` contains no invalid characters (e.g., newlines, excessive commas, or special characters). Remove redundant or irrelevant entries.


### KEY Extraction Guidelines:
1. **Entities**: Extract and classify essential nouns and phrases into the following categories:
   - **Authors**: Names of paper authors.
   - **Concepts**: Theories, definitions, or models introduced or discussed.
   - **Methods**: Experimental techniques, algorithms, or procedures.
   - **Findings**: Results, discoveries, or key conclusions.
   - **References**: Other papers, datasets, or sources cited. Full titles of other papers, datasets, or sources cited. Avoid vague references like "reference [4]" and replace them with the actual paper title or dataset name whenever possible. If the title is unavailable, include other identifying information (e.g., authors or publication year).
   - **Theorems**: Mathematical or logical propositions.
   - Cross-Document Entities: Highlight entities (authors, concepts, methods, etc.) that are shared with or similar to those in other documents.
   
2. **Relationships**: Identify and label verbs or phrases that connect the entities. Examples include:
   - **"Cites"**: Links an author or paper to a referenced work.
   - **"Proposes"**: Connects an author or paper to a method or concept.
   - **"Builds Upon"**: Indicates that a concept/method is an extension of prior work.
   - **"Validates"**: Links findings to methods or experiments.
   - Cross-Document Relationships: Identify relationships that connect entities across multiple documents (e.g., the same concept being expanded upon by different papers).

3. **Hierarchical Relationships**: Identify nested structures or dependencies, such as:
   - A theorem being part of a model.
   - A method comprising multiple steps or components.
   - Cross-document hierarchies, such as a concept from one paper being part of a larger framework in another.

4. **Atomic Facts**: Extract concise, indivisible facts with clear subject-predicate-object structure. Ensure each atomic fact aligns with the entities and relationships identified.

5. **Key Elements**: 
    - Ensure that there are no redundant or repeated key elements. 
    - If the same key element appears more than once, only include it once
    - If a key element is part of an equation, include the equation.

6. **Relevance and Commonality**:
   - Prioritize facts and relationships that are repeated across multiple papers.
   - Highlight connections between entities that are pivotal or query-worthy.
   - Emphasize shared or query-worthy entities and relationships between documents.

7. **Connecting Documents**:
When identifying entities, relationships, or references, always check for overlaps or connections with other documents (e.g., shared concepts, methods, authors, or references).
Explicitly note when:
    - An entity or relationship in the current document appears in or is similar to one from another document.
    - A finding validates or contrasts a finding from another paper.
    - A concept or method builds upon prior work from another document.

8. **Additional Guidelines**:
   - Replace pronouns with specific nouns (e.g., "it" becomes the actual method or concept).
   - Include any implicit causal or temporal relationships.
   - Limit each `key_elements` list to **10 items maximum**. If there are more than 10 entities, select the most relevant ones.
   - Present the key elements and atomic facts in the same language as the original text (e.g., English or Chinese).
   - Avoid including vague or redundant entries in `key_elements` such as `,`or `\n` or `n` or `k`.
   - Ensure that atomic facts are distinct and not repeated.

Example Output:
---
**Key Elements**:
- Authors: John Doe, Jane Smith
- Concepts: Quantum Entanglement, Bell's Theorem
- Methods: Double-slit experiment
- Findings: Violation of Bell's inequality
- References: Paper A (Einstein, 1935)
- Equations: ax=b
- Cross-Document Entities: Bell's Theorem (also discussed in Paper B), Quantum Entanglement (validated by Paper C)

**Atomic Facts**:
1. John Doe and Jane Smith authored the paper.
2. The paper proposes the concept of Quantum Entanglement.
3. Bell's Theorem builds upon Einstein's 1935 work (Paper A).
4. The Double-slit experiment validates Quantum Entanglement.
5. The findings show a violation of Bell's inequality.
6. Cross-Document Fact: Bell's Theorem is cited in Paper B and extended by Paper C.
7. Cross-Document Fact: Quantum Entanglement was experimentally validated in Paper C, supporting findings in this paper.

---
Your output should maintain this structure and be as detailed and accurate as possible.
"""

# Stand-alone human-turn template with a single `{input}` placeholder.
# Written as two adjacent literals; the explicit "\n" keeps the line break
# that sits between "from the " and "following input".
construction_human = (
    "Use the given format to extract information from the \n"
    "following input: {input}"
)

# Two-message chat prompt: the extraction instructions as the system turn,
# followed by a human turn that carries the chunk text in `{input}`.
_human_turn_template = (
    "Use the given format to extract information from the "
    "following input: {input}"
)
_prompt_messages = [
    ("system", construction_system),
    ("human", _human_turn_template),
]
construction_prompt = ChatPromptTemplate.from_messages(_prompt_messages)

In [5]:
# Field descriptions live in module-level constants so the model body below
# stays short. The text (including its internal line breaks and indentation)
# is passed to `Field(description=...)` unchanged.
_KEY_ELEMENTS_DESCRIPTION = """A list of essential entities (e.g., authors, theories, methods, findings) that are pivotal to the atomic fact.
        These entities should align with the ones described in the prompt such as Authors, Concepts, Methods, Findings, References, Theorems, 
        and Cross-Document Entities. Ensure that the key elements are relevant and comprehensive. """
_ATOMIC_FACT_DESCRIPTION = """The smallest, indivisible facts, presented as concise sentences. These include
        propositions, theories, existences, concepts, and implicit elements like logic, causality, event
        sequences, interpersonal relationships, timelines, etc."""


class AtomicFact(BaseModel):
    # One extracted fact: a sentence plus the key elements it mentions.
    # NOTE(review): documented with `#` comments rather than a class docstring
    # because pydantic may copy a docstring into the generated schema that is
    # sent to the model — confirm before adding one.
    key_elements: List[str] = Field(description=_KEY_ELEMENTS_DESCRIPTION)
    atomic_fact: str = Field(description=_ATOMIC_FACT_DESCRIPTION)

class Extraction(BaseModel):
    # Top-level structured-output schema: all atomic facts found in one chunk.
    # NOTE(review): kept as a `#` comment rather than a docstring, since a
    # docstring may be surfaced in the schema given to the model — confirm.
    atomic_facts: List[AtomicFact] = Field(
        description="List of atomic facts",
    )


## Model


In [6]:
# Client-side throttle for the OpenAI calls. 0.1 requests per second works out
# to one request every ten seconds.
_rate_limiter_settings = {
    "requests_per_second": 0.1,
    "check_every_n_seconds": 0.1,
    "max_bucket_size": 10,
}
rate_limiter = InMemoryRateLimiter(**_rate_limiter_settings)

# Chat model used for extraction, with responses parsed into `Extraction`.
_chat_model_settings = {
    "model": "gpt-4o",
    "temperature": 0.1,
    "rate_limiter": rate_limiter,
}
model = ChatOpenAI(**_chat_model_settings)
structured_llm = model.with_structured_output(Extraction)

# Prompt -> structured model; invoked with {"input": <chunk text>}.
construction_chain = construction_prompt | structured_llm

  rate_limiter = InMemoryRateLimiter(


In [7]:
# Cypher that writes one document's extraction results into the graph.
# Parameters:
#   $document_name - id of the Document node.
#   $data          - list of rows, one per chunk, each with chunk_id,
#                    chunk_text, index and atomic_facts (a list of maps with
#                    id, atomic_fact and key_elements).
# Every node and relationship is written with MERGE, so re-importing the same
# rows does not create duplicates.
# c.document_name falls back to $document_name when a row does not carry its
# own document_name; without the fallback the property was set from a missing
# row field and therefore ended up null.
import_query = """
MERGE (d:Document {id:$document_name})
WITH d
UNWIND $data AS row
MERGE (c:Chunk {id: row.chunk_id})
SET c.text = row.chunk_text,
    c.index = row.index,
    c.document_name = coalesce(row.document_name, $document_name)
MERGE (d)-[:HAS_CHUNK]->(c)
WITH c, row
UNWIND row.atomic_facts AS af
MERGE (a:AtomicFact {id: af.id})
SET a.text = af.atomic_fact
MERGE (c)-[:HAS_ATOMIC_FACT]->(a)
WITH c, a, af
UNWIND af.key_elements AS ke
MERGE (k:KeyElement {id: ke})
MERGE (a)-[:HAS_KEY_ELEMENT]->(k)
"""

def encode_md5(text):
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8."""
    raw_bytes = text.encode("utf-8")
    digest = md5(raw_bytes)
    return digest.hexdigest()

In [8]:
def _to_plain_dict(result):
    """Convert one extraction result to a plain dict.

    Uses ``model_dump()`` when the object provides it and falls back to
    ``dict()`` otherwise, so the code works whichever of the two the
    installed pydantic version offers.
    """
    dump = getattr(result, "model_dump", None)
    return dump() if callable(dump) else result.dict()


# Paper used 2k token size
async def process_document(text, document_name, chunk_size=2000, chunk_overlap=200):
    """Split a document into chunks, extract atomic facts, and store them in the graph.

    Args:
        text: Full text of the document.
        document_name: Identifier used as the Document node id.
        chunk_size: Chunk size handed to ``TokenTextSplitter``.
        chunk_overlap: Chunk overlap handed to ``TokenTextSplitter``.

    Returns:
        None. Results are written through ``graph.query`` and progress is
        printed.

    Raises:
        Whatever the extraction chain or the graph queries raise; nothing is
        caught here.
    """
    start = datetime.now()
    print(f"Started extraction at: {start}")
    text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_text(text)
    print(f"Total text chunks: {len(texts)}")
    # Run the extraction for every chunk concurrently; gather returns the
    # results in the same order as `texts`.
    results = await asyncio.gather(
        *(construction_chain.ainvoke({"input": chunk_text}) for chunk_text in texts)
    )
    print(f"Finished LLM extraction after: {datetime.now() - start}")
    docs = [_to_plain_dict(el) for el in results]
    for index, (chunk_text, doc) in enumerate(zip(texts, docs)):
        # Ids are content hashes, so identical text maps to the same node.
        doc['chunk_id'] = encode_md5(chunk_text)
        doc['chunk_text'] = chunk_text
        doc['index'] = index
        # The import query reads row.document_name; it was never populated.
        doc['document_name'] = document_name
        for af in doc["atomic_facts"]:
            af["id"] = encode_md5(af["atomic_fact"])
    # Import chunks/atomic facts/key elements
    graph.query(import_query, 
            params={"data": docs, "document_name": document_name})
    # Create next relationships between chunks
    graph.query("""MATCH (c:Chunk)<-[:HAS_CHUNK]-(d:Document)
WHERE d.id = $document_name
WITH c ORDER BY c.index WITH collect(c) AS nodes
UNWIND range(0, size(nodes) -2) AS index
WITH nodes[index] AS start, nodes[index + 1] AS end
MERGE (start)-[:NEXT]->(end)
""",
           params={"document_name":document_name})
    print(f"Finished import at: {datetime.now() - start}")

In [9]:
def clean_graph():
    """Delete every node in the graph, together with its relationships.

    Destructive: the query matches all nodes without any filter.
    """
    graph.query(
        "\n"
        "    MATCH (n)\n"
        "    DETACH DELETE n\n"
        "    "
    )
# clean_graph()

# List of document file names.
# NOTE(review): the assignment target previously read `on line`, which is not
# a valid Python identifier, and one entry was followed by a stray `!`.
# `online` is assumed to be the intended name (compare the `local` list) —
# confirm.
online = [
    "1812.00492v1.pdf.json",
    "2201.01879v3.pdf.json",
    "2209.01679v3.pdf.json",
    "1502.02355v2.pdf.json",
    "2204.10909v2.pdf.json",
    "2301.04439v1.pdf.json",
    "1611.04701v2.pdf.json",
    "1108.1098v1.pdf.json",
    "1605.04055v1.pdf.json",
    "1508.02925v1.pdf.json",
    "1505.04215v1.pdf.json",
]


In [16]:
# File names the driver loop is allowed to process (it requires
# `name in local`).
local = [
    "1505.04215v1.pdf.json",
    "2204.10909v2.pdf.json",
    "2209.01679v3.pdf.json",
    "1502.02355v2.pdf.json",
    "1812.00492v1.pdf.json",
    "1508.02925v1.pdf.json",
    "1605.04055v1.pdf.json",
    "1108.1098v1.pdf.json",
    "1611.04701v2.pdf.json",
    "2201.01879v3.pdf.json",
    "1504.05781v2.pdf.json",
]


In [19]:
# File names the driver loop skips (it requires `name not in done`).
done = [
    "1505.04215v1.pdf.json",
    "2204.10909v2.pdf.json",
    "2209.01679v3.pdf.json",
    "1812.00492v1.pdf.json",
    "1508.02925v1.pdf.json",
    "1605.04055v1.pdf.json",
    "1108.1098v1.pdf.json",
    "2201.01879v3.pdf.json",
    "1504.05781v2.pdf.json",
    "1502.02355v2.pdf.json",
    "1611.04701v2.pdf.json",
]
# Referenced only by the commented-out filter in the driver loop.
trouble = [
    "1502.02355v2.pdf.json",
    "1611.04701v2.pdf.json",
]

In [20]:
for text, name in zip(documents_full, document_names):
    if name in local and name not in done:
    #if name not in done and name not in trouble:
        print(name)
        await process_document(text, name, chunk_size=500, chunk_overlap=100)

1502.02355v2.pdf.json
Started extraction at: 2024-12-08 23:13:08.109558
Total text chunks: 194
Finished LLM extraction after: 0:30:46.482860
Finished import at: 0:30:46.697878
1611.04701v2.pdf.json
Started extraction at: 2024-12-08 23:43:54.808008
Total text chunks: 309
Finished LLM extraction after: 0:51:39.273226
Finished import at: 0:51:39.919016
