In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, time, json
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm

import sys
sys.path.append("../")
import os

import logging
from src.utils import logging_utils
from src.utils import env_utils
from src import functional

logger = logging.getLogger(__name__)

logging.basicConfig(
    level=logging.DEBUG,
    format=logging_utils.DEFAULT_FORMAT,
    datefmt=logging_utils.DEFAULT_DATEFMT,
    stream=sys.stdout,
)

import torch
import transformers

logger.info(f"{torch.__version__=}, {torch.version.cuda=}")
logger.info(f"{torch.cuda.is_available()=}, {torch.cuda.device_count()=}, {torch.cuda.get_device_name()=}")
logger.info(f"{transformers.__version__=}")

  from .autonotebook import tqdm as notebook_tqdm


2025-04-15 15:44:37 __main__ INFO     torch.__version__='2.6.0+cu124', torch.version.cuda='12.4'
2025-04-15 15:44:37 __main__ INFO     torch.cuda.is_available()=True, torch.cuda.device_count()=1, torch.cuda.get_device_name()='NVIDIA RTX A6000'
2025-04-15 15:44:37 __main__ INFO     transformers.__version__='4.51.2'


In [5]:
import wikipedia
import yake

def extract_keywords_from_wiki(entity_name, language="en"):
    # Get Wikipedia content
    try:
        page = wikipedia.page(entity_name)
        content = page.content
        
        # Extract keywords with YAKE - adjust parameters for better results
        # Using bigrams and trigrams captures more meaningful entities
        kw_extractor = yake.KeywordExtractor(lan=language, n=3, dedupLim=0.9, top=50)
        keywords = kw_extractor.extract_keywords(content)
        
        return {
            "title": page.title,
            "keywords": [kw for kw, score in keywords],
            "url": page.url
        }
    except Exception as e:
        print(f"Error extracting keywords for {entity_name}: {e}")
        return None
    

entity = "Benjamin Franklin"
keywords = extract_keywords_from_wiki(entity)

keywords

2025-04-15 16:45:31 urllib3.connectionpool DEBUG    Starting new HTTP connection (1): en.wikipedia.org:80
2025-04-15 16:45:36 urllib3.connectionpool DEBUG    http://en.wikipedia.org:80 "GET /w/api.php?list=search&srprop=&srlimit=1&limit=1&srsearch=Benjamin+Franklin&srinfo=suggestion&format=json&action=query HTTP/1.1" 301 0
2025-04-15 16:45:36 urllib3.connectionpool DEBUG    Starting new HTTPS connection (1): en.wikipedia.org:443
2025-04-15 16:45:36 urllib3.connectionpool DEBUG    https://en.wikipedia.org:443 "GET /w/api.php?list=search&srprop=&srlimit=1&limit=1&srsearch=Benjamin+Franklin&srinfo=suggestion&format=json&action=query HTTP/1.1" 200 171
2025-04-15 16:45:36 urllib3.connectionpool DEBUG    Starting new HTTP connection (1): en.wikipedia.org:80
2025-04-15 16:45:36 urllib3.connectionpool DEBUG    http://en.wikipedia.org:80 "GET /w/api.php?prop=info%7Cpageprops&inprop=url&ppprop=disambiguation&redirects=&titles=Benjamin+Franklin&format=json&action=query HTTP/1.1" 301 0
2025-04-15 

{'title': 'Benjamin Franklin',
 'keywords': ['Franklin',
  'Benjamin Franklin',
  'Benjamin Franklin House',
  'Benjamin Franklin Papers',
  'Franklin wrote',
  'William Franklin',
  'American',
  'Benjamin Franklin father',
  'William Temple Franklin',
  'Pennsylvania',
  'Benjamin Franklin National',
  'United States',
  'Benjamin',
  'Benjamin Franklin Archived',
  'Philadelphia',
  'Philadelphia Franklin Institute',
  'Benjamin Franklin Medal',
  'Electric Benjamin Franklin',
  'Benjamin Franklin Influence',
  'Benjamin Franklin thought',
  'Benjamin Franklin Tercentenary',
  'Franklin House',
  'Franklin Connecticut Gazette',
  'Franklin Philadelphia trust',
  'Josiah Franklin',
  'Pennsylvania Gazette',
  'Franklin Poor Richard',
  'Franklin papers',
  'Franklin Philadelphia',
  'Philadelphia Franklin',
  'England',
  'Franklin father',
  'Benjamin Franklin string',
  'Benjamin Franklin LibriVox',
  'American Philosophical Society',
  'Benjamin Franklin insisted',
  'Benjamin Fra

In [11]:
import spacy
# spacy.cli.download("en_core_web_lg")

def extract_entities_with_spacy(entity_name):
    # Load SpaCy model
    nlp = spacy.load("en_core_web_lg")
    
    # Get Wikipedia content
    page = wikipedia.page(entity_name)
    content = page.content
    
    # Process with SpaCy
    doc = nlp(content)
    
    # Extract entities by type
    entities = {
        "PERSON": [],
        "ORG": [],
        "GPE": [],  # Countries, cities
        "DATE": [],
        "MISC": []
    }
    
    for ent in doc.ents:
        if ent.label_ in entities:
            entities[ent.label_].append(ent.text)
        else:
            entities["MISC"].append(ent.text)
    
    # Deduplicate
    for category in entities:
        entities[category] = list(set(entities[category]))
    
    return entities

keywords = extract_entities_with_spacy(entity)
keywords

2025-04-15 16:56:23 urllib3.connectionpool DEBUG    Starting new HTTPS connection (1): raw.githubusercontent.com:443
2025-04-15 16:56:23 urllib3.connectionpool DEBUG    https://raw.githubusercontent.com:443 "GET /explosion/spacy-models/master/compatibility.json HTTP/1.1" 200 4351
Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selec

{'PERSON': ['Thomas Hutchinson',
  "Thomas Young's",
  'Cretico',
  'Elizabeth Downes',
  'John Hadley',
  'Thomas Tryon',
  'John Adams',
  'John Pringle',
  'Tim Folger',
  'Polly',
  'Hewson',
  'Mary Morrell Folger',
  'Charles I of',
  'Samuel Johnson',
  'Kammen',
  'Michael Faraday',
  'Christ',
  'John Rogers',
  'Kidd',
  'Samuel Keimer',
  'Amelia',
  "Josiah Franklin's",
  'Joseph-Ignace Guillotin',
  'Richard Penn',
  'Freemason',
  'Mather',
  'Jesus',
  'John Woolman',
  'Child',
  'Gaetano Donizetti',
  'William Watson',
  'Paxton',
  'John Paul Jones',
  'Richard Saunders',
  'Nicolas-Louis Robert',
  'Anne Child',
  'Jacques-Donatien Le Ray de Chaumont',
  'Thomas Percival',
  'Le Ray',
  'Voltaire',
  'Smith',
  'George Washington',
  "Benjamin Franklin's",
  'George III',
  'Benjamin',
  'John',
  'Kames',
  'Francis',
  'Tun Tavern',
  'Benjamin Leigh',
  'Chisholm',
  'Penn',
  'Richard Strauss',
  'Joseph Haydn',
  'Peter Timothy',
  'William Keith',
  'Peter Folg

In [29]:
from src.functional import ASK_ORACLE_MODEL
from typing import Literal

def extract_entities_with_oracle_LM(
    entity: str,
    oracle: Literal["gpt4o", "claude"] = "claude"
):
    
    # system_prompt = f"""
    #     Extract key facts, relationships and attributes about {entity}.
    #     Format as a JSON with these categories:
    #     - biography: key biographical facts
    #     - achievements: major accomplishments
    #     - relationships: key people connected to the entity
    #     - organizations: affiliated organizations
    #     - places: significant locations
    #     - dates: important dates
    #     - misc: other noteworthy information
    # """
    system_prompt = f"""
    Extrace key facts, entities, relationsships and attributes about {entity}.
    Format as a JSON array, where each element is a tuple with two elements: "name of the other entity/fact" and "description of the relationship".
    For example, if the entity is "Paris" the output should look like
    ```json
    [
        ["France", "Paris is the capital of France"],
        ["Eiffel Tower", "The Eiffel Tower is located in Paris"],
        ["Louvre Museum", "The Louvre Museum is a famous museum in Paris"],
        ["City of Light", "Paris is often referred to as the City of Light"],

        ....
    ]
    ```
    Make sure to include the most important and relevant facts about the entities. Give as many facts as possible.
    """

    response = ASK_ORACLE_MODEL[oracle](system_prompt)
    
    # Parse the response
    try:
        lines = response.splitlines()[1:-1]
        response = "\n".join(lines)
        response_json = json.loads(response)
    except json.JSONDecodeError:
        print("Failed to parse JSON response.")
        return response
    
    return response_json

In [32]:
# entity = "Leonardo da Vinci"
entity = "Benjamin Franklin"

keywords_gpt = extract_entities_with_oracle_LM(entity, oracle="gpt4o")
keywords_gpt

2025-04-15 17:32:53 httpcore.connection DEBUG    close.started
2025-04-15 17:32:53 httpcore.connection DEBUG    close.complete
2025-04-15 17:32:53 openai._base_client DEBUG    Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': '\n    Extrace key facts, entities, relationsships and attributes about Benjamin Franklin.\n    Format as a JSON array, where each element is a tuple with two elements: "name of the other entity/fact" and "description of the relationship".\n    For example, if the entity is "Paris" the output should look like\n    ```json\n    [\n        ["France", "Paris is the capital of France"],\n        ["Eiffel Tower", "The Eiffel Tower is located in Paris"],\n        ["Louvre Museum", "The Louvre Museum is a famous museum in Paris"],\n        ["City of Light", "Paris is often referred to as the City of Light"],\n\n        ....\n 

[['Founding Father',
  'Benjamin Franklin is one of the Founding Fathers of the United States'],
 ['Declaration of Independence',
  'Benjamin Franklin was a drafter and signer of the Declaration of Independence'],
 ['Constitutional Convention',
  'Benjamin Franklin was a delegate at the Constitutional Convention'],
 ['Electricity',
  'Benjamin Franklin is famous for his experiments with electricity, including the kite experiment'],
 ['Bifocals', 'Benjamin Franklin invented bifocal glasses'],
 ['Franklin Stove', 'Benjamin Franklin invented the Franklin stove'],
 ["Poor Richard's Almanack",
  "Benjamin Franklin published Poor Richard's Almanack under the pseudonym Richard Saunders"],
 ['Diplomat',
  'Benjamin Franklin served as a diplomat to France during the American Revolution'],
 ['Treaty of Paris 1783',
  'Benjamin Franklin was a negotiator and signer of the Treaty of Paris 1783, which ended the American Revolutionary War'],
 ['Pennsylvania Gazette',
  'Benjamin Franklin was the publ

In [33]:
keywords_claude = extract_entities_with_oracle_LM(entity, oracle="claude")
keywords_claude

2025-04-15 17:33:02 anthropic._base_client DEBUG    Request options: {'method': 'post', 'url': '/v1/messages', 'timeout': Timeout(connect=5.0, read=600, write=600, pool=600), 'files': None, 'json_data': {'max_tokens': 4000, 'messages': [{'role': 'user', 'content': [{'type': 'text', 'text': '\n    Extrace key facts, entities, relationsships and attributes about Benjamin Franklin.\n    Format as a JSON array, where each element is a tuple with two elements: "name of the other entity/fact" and "description of the relationship".\n    For example, if the entity is "Paris" the output should look like\n    ```json\n    [\n        ["France", "Paris is the capital of France"],\n        ["Eiffel Tower", "The Eiffel Tower is located in Paris"],\n        ["Louvre Museum", "The Louvre Museum is a famous museum in Paris"],\n        ["City of Light", "Paris is often referred to as the City of Light"],\n\n        ....\n    ]\n    ```\n    Make sure to include the most important and relevant facts abou

[['Birth',
  'Benjamin Franklin was born on January 17, 1706, in Boston, Massachusetts'],
 ['Death',
  'Benjamin Franklin died on April 17, 1790, in Philadelphia, Pennsylvania'],
 ['Founding Father',
  'Benjamin Franklin was one of the Founding Fathers of the United States'],
 ['Declaration of Independence',
  'Franklin was one of the signers of the Declaration of Independence in 1776'],
 ['Constitution',
  'Franklin was a delegate to the Constitutional Convention and signed the U.S. Constitution in 1787'],
 ['Printing',
  'Franklin was a successful printer and publisher who established the Pennsylvania Gazette'],
 ["Poor Richard's Almanack",
  "Franklin published Poor Richard's Almanack, a yearly publication containing weather forecasts, puzzles, and aphorisms"],
 ['Scientist',
  'Franklin conducted extensive research on electricity and other scientific topics'],
 ['Kite Experiment',
  'Franklin performed his famous kite experiment to demonstrate that lightning is electricity'],
 ['Li