In [None]:
from dotenv import load_dotenv
import os
import time

# Switch the working directory to the parent of the kernel's starting directory so the
# relative paths used later (e.g. "data/...") resolve, then load variables from a .env file.
# The target directory is computed only once per kernel session, which keeps this cell
# idempotent: re-running it no longer climbs one directory higher on every execution.
if "_PROJECT_ROOT" not in globals():
    _PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
os.chdir(_PROJECT_ROOT)
load_dotenv()

In [None]:
from knowledge_graph_creator.doc_extractor.pdf_extractor import PyMuPDFReader
from knowledge_graph_creator.patterns import ReferencePattern
from knowledge_graph_creator.extractors.reference_extractor import ReferenceExtractor
from knowledge_graph_creator.extractors.reference_details import (
    ReferenceDetailsExtractor,
)
import requests

In [None]:
# PDF to process. The path is relative, so it is resolved against the current working directory.
DOC_PATH = "data/3643806.pdf"

In [None]:
# Read the selected pages of the PDF into `pages` (iterated page by page in a later cell).
# range(32, 42) selects ten page indices, 32..41 — presumably the bibliography section;
# TODO confirm, including whether PyMuPDFReader treats these indices as 0- or 1-based.
pdf_reader = PyMuPDFReader()
pages = pdf_reader.to_list(path=DOC_PATH, select_pages=list(range(32, 42)))

In [None]:
# Extractor configured with the BRACKETED_NUMBER pattern — presumably matches "[12]"-style
# reference entries; confirm against ReferencePattern.
reference_extractor = ReferenceExtractor(ReferencePattern.BRACKETED_NUMBER)

In [None]:
# Merge the references extracted from each page into one mapping.
# dict.update overwrites existing keys, so a later page wins if a key repeats.
references = {}
for text in pages:
    found_on_page = reference_extractor.extract(text=text)
    references.update(found_on_page)

In [None]:
references_details_extractor = ReferenceDetailsExtractor()

# Parse every raw reference string with the regex-based parser, keyed by reference id.
references_details = {
    ref_id: references_details_extractor.parse_with_regex(
        ref_id=ref_id, ref_text=ref_text
    )
    for ref_id, ref_text in references.items()
}

In [None]:
# Spot-check: show the parsed title of reference 25 (assumes reference ids are ints — confirm).
# dict.get returns None for a missing key, so this raises AttributeError if 25 was not extracted.
references_details.get(25).title

In [None]:
# Split each reference's author string and create one ReferenceDetails record per author
from knowledge_graph_creator.extractors.reference_details import ReferenceDetails

# Expand every parsed reference into one ReferenceDetails record per author.
# The author string is split on ", "; each name is stripped, and blank names are skipped
# because "".split(", ") returns [""] and would otherwise yield a record with an empty author.
all_authors = []
for ref_id, reference in references_details.items():
    for raw_name in reference.authors.split(", "):
        author = raw_name.strip()
        if not author:
            continue
        all_authors.append(
            ReferenceDetails(
                id_=ref_id,
                authors=author,
                title=reference.title,
                publish=reference.publish,
                year=reference.year,
                page_or_volume=reference.page_or_volume.strip(),
            )
        )

In [None]:
# Number of per-author records held in `all_authors`.
print(len(all_authors))

In [None]:
# Hand-written record (id 0) for the paper whose references are being processed; a later cell
# looks it up by title and uses it as the citing paper for every reference that is found.
# Only a single author is listed here.
parent_paper = ReferenceDetails(
    id_=0,
    title="Knowledge Graph Embedding: A Survey from the Perspective of Representation Spaces",
    authors="Jiahang Cao",
    publish="ACM Computing Surveys",
    year="2022",
    page_or_volume="",
)

In [None]:
# Number of references with parsed details (one entry per extracted reference id).
len(references_details)

In [None]:
from tqdm import tqdm
from knowledge_graph_creator.db_neo4j.academic_graph import AcademicKnowledgeGraph
from neo4j import GraphDatabase


def get_api_key():
    """Return the Semantic Scholar API key stored in the ``SS_API_KEY`` environment variable.

    Returns:
        str: The non-empty value of ``SS_API_KEY``.

    Raises:
        ValueError: If ``SS_API_KEY`` is unset or empty.
    """
    api_key = os.getenv("SS_API_KEY")
    if not api_key:
        # Name the variable that is actually read, so the message tells the user what to set.
        raise ValueError(
            "API key not found. Please set the SS_API_KEY environment variable."
        )
    return api_key


def get_paper_json(query, api_key, timeout=30):
    """Fetch the best title match for ``query`` from the Semantic Scholar Graph API.

    Sends one GET request to the ``paper/search/match`` endpoint and returns the first
    entry of the ``data`` list in the JSON response.

    Args:
        query: Paper title (or other search text) to match.
        api_key: Semantic Scholar API key, sent in the ``x-api-key`` header.
        timeout: Seconds to wait for the server before giving up. Without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        dict | None: The first matched paper record, or ``None`` when the query is blank,
        the response contains an ``error`` key, no match is returned, or the request
        fails for any other reason. The reason is printed in each case.
    """
    # A blank title cannot match anything; skip the network round trip.
    if query is None or not str(query).strip():
        print("Skipping Semantic Scholar lookup: empty query")
        return None

    url = "https://api.semanticscholar.org/graph/v1/paper/search/match"
    fields = ",".join(
        (
            "paperId",
            "corpusId",
            "url",
            "title",
            "abstract",
            "venue",
            "publicationVenue",
            "year",
            "referenceCount",
            "citationCount",
            "influentialCitationCount",
            "isOpenAccess",
            "openAccessPdf",
            "fieldsOfStudy",
            "s2FieldsOfStudy",
            "publicationTypes",
            "publicationDate",
            "journal",
            "authors",
        )
    )
    try:
        response = requests.get(
            url,
            params={"query": query, "fields": fields},
            headers={"x-api-key": api_key},
            timeout=timeout,
        ).json()
        if "error" in response:
            print(f"API Error for query '{query}': {response['error']}")
            return None
        matches = response.get("data")
        if not matches:
            # Covers both an empty result list and payloads without a "data" key.
            reason = response.get("message", "no data in response")
            print(f"No match returned for query '{query}': {reason}")
            return None
        return matches[0]
    except Exception as e:
        # Deliberately broad: one failed lookup must not abort a long batch run.
        print(f"Error fetching paper JSON for query '{query}': {e}")
        return None


# Build the citation graph: add the parent paper, then every reference that can be matched
# on Semantic Scholar, linking parent -> reference with a citation relationship.

# Neo4j connection settings. Environment variables (NEO4J_URI / NEO4J_USERNAME /
# NEO4J_PASSWORD) take precedence so real credentials need not be written into the notebook;
# the fallbacks are the original local defaults.
uri = os.getenv("NEO4J_URI", "bolt://localhost:7687")
user = os.getenv("NEO4J_USERNAME", "neo4j")
password = os.getenv("NEO4J_PASSWORD", "your_password")
api_key = get_api_key()

# Stop once this many references have been added successfully.
MAX_SUCCESSFUL_ADDITIONS = 100

# Both result lists are created before the try block so they always exist for later cells,
# even when adding the parent paper fails.
successful_additions = []
unsuccessful_additions = []

# Initialize knowledge graph
kg = AcademicKnowledgeGraph(uri=uri, user=user, password=password)
try:
    # Add parent paper
    print(f"Adding parent paper: {parent_paper.title} ---")
    parent_paper_details = get_paper_json(parent_paper.title, api_key)
    if not parent_paper_details:
        # Without the parent node there is nothing to attach the citations to.
        raise RuntimeError(
            f"Parent paper not found on Semantic Scholar: {parent_paper.title}"
        )
    parent_paper_id = kg.add_paper_from_json(parent_paper_details, return_paper_id=True)

    for reference_detail in tqdm(
        references_details.values(),
        total=len(references_details),
        mininterval=0.1,
        dynamic_ncols=True,
    ):
        paper_details = get_paper_json(reference_detail.title, api_key)
        if paper_details:
            paper_id = kg.add_paper_from_json(paper_details, return_paper_id=True)
            kg.add_citation_relationship(
                citing_paper_id=parent_paper_id, cited_paper_id=paper_id
            )
            successful_additions.append(reference_detail)
            if len(successful_additions) == MAX_SUCCESSFUL_ADDITIONS:
                break
        else:
            print(f"Paper not found for title: {reference_detail.title}")
            unsuccessful_additions.append(reference_detail)
        # Pause between lookups — presumably to stay within the API rate limit.
        time.sleep(1)
finally:
    # Always release the graph connection, including when a lookup or write raises.
    kg.close()

In [None]:
# Display the references whose title lookup returned no paper in the previous cell.
unsuccessful_additions

In [None]:
from neo4j import GraphDatabase

# Connect to Neo4j.
# Settings are read from the environment (NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD) when
# present, so real credentials need not be written into the notebook; otherwise the original
# local defaults are used.
uri = os.getenv("NEO4J_URI", "bolt://localhost:7687")
user = os.getenv("NEO4J_USERNAME", "neo4j")
password = os.getenv("NEO4J_PASSWORD", "your_password")
driver = GraphDatabase.driver(uri, auth=(user, password))


def write_references_to_neo4j(references_details, limit=1000):
    """Write reference records to Neo4j as Reference, Year and Author nodes.

    Uses the module-level ``driver``. One query is run per record; it merges a
    ``Reference`` node keyed by ``id``, sets its ``title`` and ``year``, links it to a
    shared ``Year`` node via ``YOP``, and links each author to it via ``AUTHORED``.

    Args:
        references_details: Sliceable sequence of records exposing ``id_``, ``title``,
            ``year`` and ``authors``. ``authors`` is passed to ``UNWIND`` unchanged; the
            caller in this notebook passes a single author name per record.
        limit: Maximum number of leading records to write (``None`` writes all of them).
            Defaults to 1000.
    """
    # The Year node is merged on its own before the relationship. Merging the whole
    # pattern `(r)-[:YOP]->(y:Year {...})` with `y` unbound creates a fresh Year node for
    # every reference that is not yet linked, duplicating years across the graph.
    # It also runs before UNWIND, so the year link is written once per record.
    query = """
        MERGE (r:Reference {id: $ref_id})
        SET r.title = $title, r.year = $year
        MERGE (y:Year {value: $year})
        MERGE (r)-[:YOP]->(y)
        WITH r
        UNWIND $authors AS author
        MERGE (a:Author {name: author})
        MERGE (a)-[:AUTHORED]->(r)
        """
    with driver.session() as session:
        for details in references_details[:limit]:
            session.run(
                query,
                ref_id=details.id_,
                title=details.title,
                year=details.year,
                authors=details.authors,
            )


# Close the driver even when the write fails, so no connection is left open in the kernel.
try:
    write_references_to_neo4j(all_authors)
finally:
    driver.close()

In [None]:
# DESTRUCTIVE: removes every node and relationship from the connected Neo4j database.
# Relies on `uri`, `user` and `password` defined in an earlier cell.
driver = GraphDatabase.driver(uri, auth=(user, password))
try:
    with driver.session() as session:
        # Delete all nodes and relationships
        session.run("MATCH (n) DETACH DELETE n")
finally:
    # The driver was previously left open after this cell; release it explicitly.
    driver.close()

In [None]:
# NOTE(review): bare string expression; the URL appears unrelated to the rest of this
# notebook — confirm whether this cell is still needed.
"https://www.mdclarity.com/denial-code/"