In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, time, json
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm

import sys
sys.path.append("../")
import os

import logging
from src.utils import logging_utils
from src.utils import env_utils
from src import functional

logger = logging.getLogger(__name__)

logging.basicConfig(
    level=logging.DEBUG,
    format=logging_utils.DEFAULT_FORMAT,
    datefmt=logging_utils.DEFAULT_DATEFMT,
    stream=sys.stdout,
)

import torch
import transformers

logger.info(f"{torch.__version__=}, {torch.version.cuda=}")
logger.info(f"{torch.cuda.is_available()=}, {torch.cuda.device_count()=}, {torch.cuda.get_device_name()=}")
logger.info(f"{transformers.__version__=}")

  from .autonotebook import tqdm as notebook_tqdm


2025-04-15 15:44:37 __main__ INFO     torch.__version__='2.6.0+cu124', torch.version.cuda='12.4'
2025-04-15 15:44:37 __main__ INFO     torch.cuda.is_available()=True, torch.cuda.device_count()=1, torch.cuda.get_device_name()='NVIDIA RTX A6000'
2025-04-15 15:44:37 __main__ INFO     transformers.__version__='4.51.2'


In [5]:
import wikipedia
import yake

def extract_keywords_from_wiki(entity_name, language="en"):
    # Get Wikipedia content
    try:
        page = wikipedia.page(entity_name)
        content = page.content
        
        # Extract keywords with YAKE - adjust parameters for better results
        # Using bigrams and trigrams captures more meaningful entities
        kw_extractor = yake.KeywordExtractor(lan=language, n=3, dedupLim=0.9, top=50)
        keywords = kw_extractor.extract_keywords(content)
        
        return {
            "title": page.title,
            "keywords": [kw for kw, score in keywords],
            "url": page.url
        }
    except Exception as e:
        print(f"Error extracting keywords for {entity_name}: {e}")
        return None
    

entity = "Benjamin Franklin"
keywords = extract_keywords_from_wiki(entity)

keywords

2025-04-15 16:45:31 urllib3.connectionpool DEBUG    Starting new HTTP connection (1): en.wikipedia.org:80
2025-04-15 16:45:36 urllib3.connectionpool DEBUG    http://en.wikipedia.org:80 "GET /w/api.php?list=search&srprop=&srlimit=1&limit=1&srsearch=Benjamin+Franklin&srinfo=suggestion&format=json&action=query HTTP/1.1" 301 0
2025-04-15 16:45:36 urllib3.connectionpool DEBUG    Starting new HTTPS connection (1): en.wikipedia.org:443
2025-04-15 16:45:36 urllib3.connectionpool DEBUG    https://en.wikipedia.org:443 "GET /w/api.php?list=search&srprop=&srlimit=1&limit=1&srsearch=Benjamin+Franklin&srinfo=suggestion&format=json&action=query HTTP/1.1" 200 171
2025-04-15 16:45:36 urllib3.connectionpool DEBUG    Starting new HTTP connection (1): en.wikipedia.org:80
2025-04-15 16:45:36 urllib3.connectionpool DEBUG    http://en.wikipedia.org:80 "GET /w/api.php?prop=info%7Cpageprops&inprop=url&ppprop=disambiguation&redirects=&titles=Benjamin+Franklin&format=json&action=query HTTP/1.1" 301 0
2025-04-15 

{'title': 'Benjamin Franklin',
 'keywords': ['Franklin',
  'Benjamin Franklin',
  'Benjamin Franklin House',
  'Benjamin Franklin Papers',
  'Franklin wrote',
  'William Franklin',
  'American',
  'Benjamin Franklin father',
  'William Temple Franklin',
  'Pennsylvania',
  'Benjamin Franklin National',
  'United States',
  'Benjamin',
  'Benjamin Franklin Archived',
  'Philadelphia',
  'Philadelphia Franklin Institute',
  'Benjamin Franklin Medal',
  'Electric Benjamin Franklin',
  'Benjamin Franklin Influence',
  'Benjamin Franklin thought',
  'Benjamin Franklin Tercentenary',
  'Franklin House',
  'Franklin Connecticut Gazette',
  'Franklin Philadelphia trust',
  'Josiah Franklin',
  'Pennsylvania Gazette',
  'Franklin Poor Richard',
  'Franklin papers',
  'Franklin Philadelphia',
  'Philadelphia Franklin',
  'England',
  'Franklin father',
  'Benjamin Franklin string',
  'Benjamin Franklin LibriVox',
  'American Philosophical Society',
  'Benjamin Franklin insisted',
  'Benjamin Fra

In [11]:
import spacy
# spacy.cli.download("en_core_web_lg")

def extract_entities_with_spacy(entity_name):
    # Load SpaCy model
    nlp = spacy.load("en_core_web_lg")
    
    # Get Wikipedia content
    page = wikipedia.page(entity_name)
    content = page.content
    
    # Process with SpaCy
    doc = nlp(content)
    
    # Extract entities by type
    entities = {
        "PERSON": [],
        "ORG": [],
        "GPE": [],  # Countries, cities
        "DATE": [],
        "MISC": []
    }
    
    for ent in doc.ents:
        if ent.label_ in entities:
            entities[ent.label_].append(ent.text)
        else:
            entities["MISC"].append(ent.text)
    
    # Deduplicate
    for category in entities:
        entities[category] = list(set(entities[category]))
    
    return entities

keywords = extract_entities_with_spacy(entity)
keywords

2025-04-15 16:56:23 urllib3.connectionpool DEBUG    Starting new HTTPS connection (1): raw.githubusercontent.com:443
2025-04-15 16:56:23 urllib3.connectionpool DEBUG    https://raw.githubusercontent.com:443 "GET /explosion/spacy-models/master/compatibility.json HTTP/1.1" 200 4351
Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selec

{'PERSON': ['Thomas Hutchinson',
  "Thomas Young's",
  'Cretico',
  'Elizabeth Downes',
  'John Hadley',
  'Thomas Tryon',
  'John Adams',
  'John Pringle',
  'Tim Folger',
  'Polly',
  'Hewson',
  'Mary Morrell Folger',
  'Charles I of',
  'Samuel Johnson',
  'Kammen',
  'Michael Faraday',
  'Christ',
  'John Rogers',
  'Kidd',
  'Samuel Keimer',
  'Amelia',
  "Josiah Franklin's",
  'Joseph-Ignace Guillotin',
  'Richard Penn',
  'Freemason',
  'Mather',
  'Jesus',
  'John Woolman',
  'Child',
  'Gaetano Donizetti',
  'William Watson',
  'Paxton',
  'John Paul Jones',
  'Richard Saunders',
  'Nicolas-Louis Robert',
  'Anne Child',
  'Jacques-Donatien Le Ray de Chaumont',
  'Thomas Percival',
  'Le Ray',
  'Voltaire',
  'Smith',
  'George Washington',
  "Benjamin Franklin's",
  'George III',
  'Benjamin',
  'John',
  'Kames',
  'Francis',
  'Tun Tavern',
  'Benjamin Leigh',
  'Chisholm',
  'Penn',
  'Richard Strauss',
  'Joseph Haydn',
  'Peter Timothy',
  'William Keith',
  'Peter Folg

In [36]:
from src.functional import ASK_ORACLE_MODEL
from typing import Literal

def extract_entities_with_oracle_LM(
    entity: str,
    oracle: Literal["gpt4o", "claude"] = "claude",
    other_entity: str = None,
):
    
    # system_prompt = f"""
    #     Extract key facts, relationships and attributes about {entity}.
    #     Format as a JSON with these categories:
    #     - biography: key biographical facts
    #     - achievements: major accomplishments
    #     - relationships: key people connected to the entity
    #     - organizations: affiliated organizations
    #     - places: significant locations
    #     - dates: important dates
    #     - misc: other noteworthy information
    # """
    if other_entity is None:
        system_prompt = f"""
Extrace key facts, entities, relationsships and attributes about {entity}.
Format as a JSON array, where each element is a tuple with two elements: "name of the other entity/fact" and "description of the relationship".
For example, if the entity is "Paris" the output should look like
```json
[
    ["France", "Paris is the capital of France"],
    ["Eiffel Tower", "The Eiffel Tower is located in Paris"],
    ["Louvre Museum", "The Louvre Museum is a famous museum in Paris"],
    ["City of Light", "Paris is often referred to as the City of Light"],

    ....
]
```
Make sure to include the most important and relevant facts about the entities. Give as many facts as possible.
"""
    
    else:
        system_prompt = f"""
Given two entities, \"{entity}\" and \"{other_entity}\", find a common link or relation between them.
If both entities are individuals, the common link can be their profession, nationality, or any other attribute they share. Their relation can be if someone is the student/teacher of the other etc.
Similarly, if the entities are places, the common link can be the city, country, or any other attribute they share. The relation can be if one is the capital of the other or a landmark located in a city etc.

Format your answer as a JSON array, where each element is a tuple with two elements: "name of the connection" and "brief explanation of how this connection is relevant to both of the entities".
For example, if the entities are "Batman" and "Ironman", the output should look like

```json
[
    ["Superheroes", "Both Batman and Ironman are iconic superheroes in the comic book world."],
    ["Gadgets", "Both characters use advanced technology and gadgets to fight crime."],
    ["Billionaires", "Both characters are wealthy individuals who use their resources to become superheroes."],
    ....
]
```
Make sure to give as many connections as possible. If you can't find any connection, just return an empty JSON array.
"""

    response = ASK_ORACLE_MODEL[oracle](system_prompt)
    
    # Parse the response
    try:
        lines = response.splitlines()[1:-1]
        response = "\n".join(lines)
        response_json = json.loads(response)
    except json.JSONDecodeError:
        print("Failed to parse JSON response.")
        return response
    
    return response_json

In [37]:
# entity = "Leonardo da Vinci"
# entity = "Benjamin Franklin"
entity = "Japan"

keywords_gpt = extract_entities_with_oracle_LM(entity, oracle="gpt4o")
keywords_gpt

2025-04-15 17:50:33 openai._base_client DEBUG    Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': '\nExtrace key facts, entities, relationsships and attributes about Japan.\nFormat as a JSON array, where each element is a tuple with two elements: "name of the other entity/fact" and "description of the relationship".\nFor example, if the entity is "Paris" the output should look like\n```json\n[\n    ["France", "Paris is the capital of France"],\n    ["Eiffel Tower", "The Eiffel Tower is located in Paris"],\n    ["Louvre Museum", "The Louvre Museum is a famous museum in Paris"],\n    ["City of Light", "Paris is often referred to as the City of Light"],\n\n    ....\n]\n```\nMake sure to include the most important and relevant facts about the entities. Give as many facts as possible.\n'}], 'model': 'gpt-4o', 'max_tokens': 4000, 'temperature': 0

2025-04-15 17:50:33 httpcore.connection DEBUG    connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x7f82ec447190>
2025-04-15 17:50:33 httpcore.connection DEBUG    start_tls.started ssl_context=<ssl.SSLContext object at 0x7f82c64f0b90> server_hostname='api.openai.com' timeout=5.0
2025-04-15 17:50:33 httpcore.connection DEBUG    start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x7f82cf845f10>
2025-04-15 17:50:33 httpcore.http11 DEBUG    send_request_headers.started request=<Request [b'POST']>
2025-04-15 17:50:33 httpcore.http11 DEBUG    send_request_headers.complete
2025-04-15 17:50:33 httpcore.http11 DEBUG    send_request_body.started request=<Request [b'POST']>
2025-04-15 17:50:33 httpcore.http11 DEBUG    send_request_body.complete
2025-04-15 17:50:33 httpcore.http11 DEBUG    receive_response_headers.started request=<Request [b'POST']>
2025-04-15 17:50:42 httpcore.http11 DEBUG    receive_response_headers.complete return_value=

[['Tokyo', 'Tokyo is the capital city of Japan'],
 ['Honshu', 'Honshu is the largest island of Japan'],
 ['Mount Fuji', 'Mount Fuji is the highest mountain in Japan'],
 ['Shinto', 'Shinto is one of the major religions in Japan'],
 ['Buddhism', 'Buddhism is one of the major religions in Japan'],
 ['Yen', 'The yen is the currency of Japan'],
 ['Emperor', 'The Emperor is the ceremonial head of state in Japan'],
 ['Prime Minister', 'The Prime Minister is the head of government in Japan'],
 ['Hiroshima',
  'Hiroshima is a city in Japan that was bombed during World War II'],
 ['Nagasaki',
  'Nagasaki is a city in Japan that was bombed during World War II'],
 ['Shinkansen', 'The Shinkansen is a high-speed train network in Japan'],
 ['Sumo', 'Sumo is a traditional sport in Japan'],
 ['Anime', 'Anime is a popular form of entertainment originating from Japan'],
 ['Sushi', 'Sushi is a traditional Japanese dish'],
 ['Cherry Blossoms',
  'Cherry blossoms are a symbol of Japan and bloom in spring'],

In [38]:
keywords_claude = extract_entities_with_oracle_LM(entity, oracle="claude")
keywords_claude

2025-04-15 17:51:05 anthropic._base_client DEBUG    Request options: {'method': 'post', 'url': '/v1/messages', 'timeout': Timeout(connect=5.0, read=600, write=600, pool=600), 'files': None, 'json_data': {'max_tokens': 4000, 'messages': [{'role': 'user', 'content': [{'type': 'text', 'text': '\nExtrace key facts, entities, relationsships and attributes about Japan.\nFormat as a JSON array, where each element is a tuple with two elements: "name of the other entity/fact" and "description of the relationship".\nFor example, if the entity is "Paris" the output should look like\n```json\n[\n    ["France", "Paris is the capital of France"],\n    ["Eiffel Tower", "The Eiffel Tower is located in Paris"],\n    ["Louvre Museum", "The Louvre Museum is a famous museum in Paris"],\n    ["City of Light", "Paris is often referred to as the City of Light"],\n\n    ....\n]\n```\nMake sure to include the most important and relevant facts about the entities. Give as many facts as possible.\n'}]}], 'model':

[['Geography', 'Japan is an island country located in East Asia'],
 ['Official Name',
  "The official name is 'Nihon-koku' or 'Nippon-koku' in Japanese, meaning 'State of Japan'"],
 ['Capital', 'Tokyo is the capital and largest city of Japan'],
 ['Government',
  'Japan is a constitutional monarchy with a parliamentary government'],
 ['Emperor',
  'The Emperor of Japan is the head of state but has limited powers'],
 ['Prime Minister', 'The Prime Minister is the head of government in Japan'],
 ['Population',
  'Japan has approximately 125 million people, making it the eleventh most populous country'],
 ['Area', 'Japan has a total area of approximately 377,975 square kilometers'],
 ['Islands',
  'Japan consists of 6,852 islands, with the four largest being Honshu, Hokkaido, Kyushu, and Shikoku'],
 ['Mount Fuji',
  "Mount Fuji is Japan's highest mountain at 3,776 meters and is an active volcano"],
 ['Language', 'Japanese is the official language of Japan'],
 ['Writing System',
  'Japanese 

In [44]:
####################################
# entity = "Japan"
# other_entity = "Germany"

entity = "Benjamin Franklin"
other_entity = "Leonardo da Vinci"
####################################

In [45]:
connections_gpt = extract_entities_with_oracle_LM(entity, oracle="gpt4o", other_entity=other_entity)
connections_gpt

2025-04-15 17:54:52 openai._base_client DEBUG    Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': '\nGiven two entities, "Benjamin Franklin" and "Leonardo da Vinci", find a common link or relation between them.\nIf both entities are individuals, the common link can be their profession, nationality, or any other attribute they share. Their relation can be if someone is the student/teacher of the other etc.\nSimilarly, if the entities are places, the common link can be the city, country, or any other attribute they share. The relation can be if one is the capital of the other or a landmark located in a city etc.\n\nFormat your answer as a JSON array, where each element is a tuple with two elements: "name of the connection" and "brief explanation of how this connection is relevant to both of the entities".\nFor example, if the entities are "Ba

[['Polymaths',
  'Both Benjamin Franklin and Leonardo da Vinci were polymaths, excelling in multiple fields such as science, art, and engineering.'],
 ['Inventors',
  'Both individuals were renowned inventors, with Franklin known for inventions like the lightning rod and bifocals, and da Vinci for conceptualizing inventions like the helicopter and parachute.'],
 ['Artists',
  'Leonardo da Vinci is famous for his paintings such as the Mona Lisa, while Benjamin Franklin was also known for his interest in art and printing.'],
 ['Renaissance Men',
  "Both are considered quintessential 'Renaissance men' due to their wide-ranging interests and contributions to various fields."],
 ['Writers',
  'Both Franklin and da Vinci were prolific writers, with Franklin known for his essays and almanacs, and da Vinci for his extensive notebooks.'],
 ['Scientific Contributions',
  "Both made significant contributions to science, with Franklin's work in electricity and da Vinci's studies in anatomy and phy

In [46]:
connections_claude = extract_entities_with_oracle_LM(entity, oracle="claude", other_entity=other_entity)
connections_claude

2025-04-15 17:54:58 anthropic._base_client DEBUG    Request options: {'method': 'post', 'url': '/v1/messages', 'timeout': Timeout(connect=5.0, read=600, write=600, pool=600), 'files': None, 'json_data': {'max_tokens': 4000, 'messages': [{'role': 'user', 'content': [{'type': 'text', 'text': '\nGiven two entities, "Benjamin Franklin" and "Leonardo da Vinci", find a common link or relation between them.\nIf both entities are individuals, the common link can be their profession, nationality, or any other attribute they share. Their relation can be if someone is the student/teacher of the other etc.\nSimilarly, if the entities are places, the common link can be the city, country, or any other attribute they share. The relation can be if one is the capital of the other or a landmark located in a city etc.\n\nFormat your answer as a JSON array, where each element is a tuple with two elements: "name of the connection" and "brief explanation of how this connection is relevant to both of the ent

2025-04-15 17:54:58 httpcore.connection DEBUG    connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x7f82cf4bf250>
2025-04-15 17:54:58 httpcore.connection DEBUG    start_tls.started ssl_context=<ssl.SSLContext object at 0x7f82c64f31d0> server_hostname='api.anthropic.com' timeout=5.0
2025-04-15 17:54:58 httpcore.connection DEBUG    start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x7f82cff46650>
2025-04-15 17:54:58 httpcore.http11 DEBUG    send_request_headers.started request=<Request [b'POST']>
2025-04-15 17:54:58 httpcore.http11 DEBUG    send_request_headers.complete
2025-04-15 17:54:58 httpcore.http11 DEBUG    send_request_body.started request=<Request [b'POST']>
2025-04-15 17:54:58 httpcore.http11 DEBUG    send_request_body.complete
2025-04-15 17:54:58 httpcore.http11 DEBUG    receive_response_headers.started request=<Request [b'POST']>
2025-04-15 17:55:07 httpcore.http11 DEBUG    receive_response_headers.complete return_val

[['Polymaths',
  'Both Benjamin Franklin and Leonardo da Vinci were renowned polymaths who excelled in multiple disciplines including science, art, and invention.'],
 ['Inventors',
  'Both men were prolific inventors. Franklin invented the lightning rod, bifocals, and the Franklin stove, while da Vinci conceptualized flying machines, an armored car, and many other devices.'],
 ['Scientists',
  'Both made significant contributions to scientific knowledge. Franklin conducted experiments with electricity, while da Vinci studied anatomy, geology, and other natural sciences.'],
 ['Artists',
  "Though Franklin was less known for this, he played music and wrote, while da Vinci was one of history's greatest painters, creating masterpieces like the Mona Lisa and The Last Supper."],
 ['Writers',
  "Both men were accomplished writers. Franklin wrote Poor Richard's Almanack and his autobiography, while da Vinci left thousands of pages of notebooks with writings and sketches."],
 ['Diplomats',
  'F