In [1]:
from dotenv import load_dotenv
import os

# Work from the parent of the directory the kernel started in.
# The starting directory is remembered on first execution so that re-running this
# cell always lands in the same place; a bare relative chdir("../") would climb one
# level higher on every re-run.
NOTEBOOK_DIR = globals().get("NOTEBOOK_DIR", os.getcwd())
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

# Load variables from a .env file into the environment; later cells read them
# with os.getenv.
load_dotenv()

from knowledge_graph_creator.llm.llm_inference import LLMInference, LLMConfig, GroqModel
from knowledge_graph_creator.extractors.paper_relation_extractor import PaperRelationExtractor


  from pydantic.v1.fields import FieldInfo as FieldInfoV1


In [2]:
# Initialize LLM client
# NOTE(review): "GROP_API_KEY_GRAPH" looks like it may be a typo for "GROQ_..." -
# confirm against the .env file before renaming; the name is kept unchanged here.
llm_config = LLMConfig(temperature=0.3, model=GroqModel.LLAMA_8B)
api_key = os.getenv("GROP_API_KEY_GRAPH")
llm_client = LLMInference(config=llm_config, api_key=api_key)

# Initialize extractor
# Connection settings are read from the environment and passed as keyword arguments.
neo4j_settings = {
    "uri": os.getenv("NEO4J_URI"),
    "user": os.getenv("NEO4J_USER"),
    "password": os.getenv("NEO4J_PASSWORD"),
}
extractor = PaperRelationExtractor(llm_client=llm_client, min_delay=1, **neo4j_settings)


In [3]:
# Test 1: Fetch every triplet from the database (no citation floor, 2022 onwards)
triplets = extractor.get_all_triplets(min_year=2022, min_citation_count=0)
triplet_count = len(triplets)
print(f"Found {triplet_count} triplets")

# Show one example so the record layout is visible.
if triplet_count > 0:
    print("Sample triplet:", triplets[0])


Found 569 triplets
Sample triplet: {'tail_id': '85064a4b1b96863af4fccff9ad34ce484945ad7b', 'tail_title': 'Knowledge Graph Embedding: A Survey from the Perspective of Representation Spaces', 'tail_abstract': 'Knowledge graph embedding (KGE) is an increasingly popular technique that aims to represent entities and relations of knowledge graphs into low-dimensional semantic spaces for a wide spectrum of applications such as link prediction, knowledge reasoning and knowledge completion. In this article, we provide a systematic review of existing KGE techniques based on representation spaces. Particularly, we build a fine-grained classification to categorise the models based on three mathematical perspectives of the representation spaces: (1) algebraic perspective, (2) geometric perspective and (3) analytical perspective. We introduce the rigorous definitions of fundamental mathematical spaces before diving into KGE models and their mathematical properties. We further discuss different KGE m

In [4]:
# Test 2: Run the structured extraction on two hand-written papers (mock data)
sample_citing_paper = dict(
    title="Attention Is All You Need",
    abstract="We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.",
)
sample_cited_paper = dict(
    title="Neural Machine Translation by Jointly Learning to Align and Translate",
    abstract="We conjecture that the use of a fixed-length vector is a bottleneck in improving the performance of this basic encoder-decoder architecture, and propose to extend this by allowing a model to automatically search for parts of a source sentence that are relevant to predicting a target word.",
)

result = extractor.extract_relation_with_structured_llm(
    cited_paper=sample_cited_paper,
    citing_paper=sample_citing_paper,
)

# Print the dumped model when something came back, otherwise a placeholder message.
print("Extraction result:")
print(result.model_dump() if result else "No result returned")


Extraction result:
{'relationships': [{'type': 'Extends', 'confidence': 'high', 'evidence': 'Paper 1 proposes a new simple network architecture, the Transformer, which is based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Paper 2 introduces the transformer model and conjectures that a fixed-length vector is a bottleneck in improving performance.', 'explanation': 'Paper 1 builds upon and extends the methodology introduced in Paper 2 by proposing a new network architecture based on attention mechanisms.'}, {'type': 'Requires', 'confidence': 'high', 'evidence': 'Paper 2 introduces the transformer model and conjectures that a fixed-length vector is a bottleneck in improving performance. Paper 1 builds upon this model and proposes a new simple network architecture.', 'explanation': 'Paper 1 depends on and builds directly upon the concepts/methods from Paper 2 as a necessary foundation.'}], 'no_relationship_reason': None}


In [7]:
# Test 3: Process a single triplet from the database.
# The triplet's "tail_*" fields are passed as the citing paper and its "head_*"
# fields as the cited paper. Every outcome is reported explicitly so that an empty
# database, a failed extraction, or an empty relationship list is never silent.
if not triplets:
    print("No triplets available - skipping single-triplet test")
else:
    single_triplet = triplets[0]
    citing = {
        "title": single_triplet["tail_title"],
        "abstract": single_triplet["tail_abstract"],
    }
    cited = {
        "title": single_triplet["head_title"],
        "abstract": single_triplet["head_abstract"],
    }

    analysis = extractor.extract_relation_with_structured_llm(citing, cited)
    if not analysis:
        # Same wording as Test 2 for a missing result.
        print("No result returned")
    elif not analysis.relationships:
        # The result carries a reason field for the no-relationship case.
        print(f"No relationships found (reason: {analysis.no_relationship_reason})")
    else:
        print("Relationships found:")
        for rel in analysis.relationships:
            print(f"  - {rel.type} (confidence: {rel.confidence})")
            print(f"    Evidence: {rel.evidence}")


Relationships found:
  - Requires (confidence: high)
    Evidence: Paper 1 discusses knowledge graph embedding (KGE) and its applications, which is a concept that Paper 2 touches upon in its abstract.
  - Adapts-from (confidence: medium)
    Evidence: Paper 1 discusses different mathematical perspectives of representation spaces, which is a concept that Paper 2 mentions in its title.


In [6]:
# Display the current `analysis` object. Its value is whatever the most recently
# executed extraction cell assigned, so the result depends on execution order.
analysis

In [8]:
# Test 4: Process a limited batch (the first BATCH_SIZE triplets).
# Collects one plain-dict record per triplet whose extraction returned a result.
import time

BATCH_SIZE = 3  # number of triplets to process in this smoke test
REQUEST_DELAY_SECONDS = 2  # pause between consecutive LLM calls (rate limiting)

# Slicing already copes with lists shorter than BATCH_SIZE, so no length check is needed.
limited_triplets = triplets[:BATCH_SIZE]

results = []
for idx, triplet in enumerate(limited_triplets, start=1):
    # Pause between requests only; there is nothing to wait for after the last one.
    if idx > 1:
        time.sleep(REQUEST_DELAY_SECONDS)

    citing = {"title": triplet["tail_title"], "abstract": triplet["tail_abstract"]}
    cited = {"title": triplet["head_title"], "abstract": triplet["head_abstract"]}

    print(f"Processing {idx}/{len(limited_triplets)}")
    analysis = extractor.extract_relation_with_structured_llm(citing, cited)

    if analysis:
        results.append({
            "citing_id": triplet["tail_id"],
            "cited_id": triplet["head_id"],
            "relationships": [r.model_dump() for r in analysis.relationships],
        })

print(f"\nProcessed {len(results)} triplets successfully")


Processing 1/3
Processing 2/3
Processing 3/3

Processed 3 triplets successfully


In [11]:
# Inspect the first triplet of the limited batch (raises IndexError if the batch is empty).
limited_triplets[0]

{'tail_id': '85064a4b1b96863af4fccff9ad34ce484945ad7b',
 'tail_title': 'Knowledge Graph Embedding: A Survey from the Perspective of Representation Spaces',
 'tail_abstract': 'Knowledge graph embedding (KGE) is an increasingly popular technique that aims to represent entities and relations of knowledge graphs into low-dimensional semantic spaces for a wide spectrum of applications such as link prediction, knowledge reasoning and knowledge completion. In this article, we provide a systematic review of existing KGE techniques based on representation spaces. Particularly, we build a fine-grained classification to categorise the models based on three mathematical perspectives of the representation spaces: (1) algebraic perspective, (2) geometric perspective and (3) analytical perspective. We introduce the rigorous definitions of fundamental mathematical spaces before diving into KGE models and their mathematical properties. We further discuss different KGE methods over the three categories,

In [13]:
# Test save_relationships using the first triplet of the limited batch.
# Runs only when the batch is non-empty and the batch run produced at least one result.
if limited_triplets and results:
    first_triplet = limited_triplets[0]

    # Build both paper payloads from the triplet's tail_* (citing) and head_* (cited) fields.
    paper_fields = ("title", "abstract")
    citing = {field: first_triplet[f"tail_{field}"] for field in paper_fields}
    cited = {field: first_triplet[f"head_{field}"] for field in paper_fields}

    # A fresh extraction is performed here; the analysis object itself is what gets saved.
    analysis = extractor.extract_relation_with_structured_llm(citing, cited)

    if not analysis:
        print("No analysis to save")
    else:
        extractor.save_relationships(
            analysis=analysis,
            citing_id=first_triplet["tail_id"],
            cited_id=first_triplet["head_id"],
        )
        print(f"Saved relationships for {first_triplet['tail_id']} -> {first_triplet['head_id']}")
        relationship_types = [r.type for r in analysis.relationships]
        print(f"Relationships: {relationship_types}")

Saved relationships for 85064a4b1b96863af4fccff9ad34ce484945ad7b -> 6e17c6d0491342e040da8f9c7c6aa7ce0b9cd696
Relationships: ['Requires', 'Adapts-from']


In [None]:
# Test 5: Full pipeline with save (use with caution - writes to DB)
# Uncomment to run
# full_results = extractor.process_all_triplets(min_citation_count=5, min_year=2023)
# print(f"Processed {len(full_results)} triplets")


In [14]:
# Cleanup: close the extractor once all tests are done.
# Presumably this releases the database connection created during setup - confirm
# against PaperRelationExtractor.close. Cells that use `extractor` cannot be re-run
# after this without re-creating it.
extractor.close()