In [1]:
import yaml
import os
import re 
import random
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate

with open('config.yml', 'r') as file:
    config = yaml.safe_load(file)
anthropic_key = config["anthropic_key"]
os.environ["ANTHROPIC_API_KEY"] = anthropic_key
chat = ChatAnthropic(temperature=0, model_name="claude-3-haiku-20240307")

In [2]:
from wikipedia_functions import get_random_page, get_page_links, get_page_content, check_wikipedia_pages_existence

start_page = get_random_page(1)[0]
final_page = "United States"
final_page_content = get_page_content(final_page)
print(start_page)

Rye Brook Open


In [3]:
from prompt_functions import get_reasoned_crawler_template, get_summarize_template, get_broad_links_template, get_explain_links_template

crawler_system, crawler_template = get_reasoned_crawler_template()
summarize_system, summarize_template = get_summarize_template() 
broad_links_system, broad_links_template = get_broad_links_template()
explain_links_system, explain_links_template = get_explain_links_template()

In [4]:
human = summarize_template
prompt = ChatPromptTemplate.from_messages([("system", crawler_system), ("human", human)])
chain = prompt | chat
end_summary = chain.invoke(
        {
            "page_content": final_page_content
        }
    )
model_output = end_summary.content
pattern = r'<summary>(.*?)</summary>'
matches = re.findall(pattern, model_output)
end_content = matches[0]

In [5]:
start_links = get_page_links(start_page)
print(start_links)
checked_links = check_wikipedia_pages_existence(start_links)
valid_links = [link for link in start_links if checked_links[link]]

['tennis', 'Rye Brook, New York', 'Tennis Court#Outdoor court', 'John Ross (tennis)', '1988', 'Lloyd Bourne', '1987 Rye Brook Open – Doubles', 'Andrew Castle', 'outdoor hard courts', '1988 Rye Brook Open – Singles', 'John Ross', 'Grand Prix', 'Ramesh Krishnan', 'Peter Lundgren', 'Jeremy Bates (tennis player)', 'Tim Wilkison', 'Jeff Klaparda', 'Jeremy Bates', '1988 Rye Brook Open – Doubles', 'Hard', 'Mark Woodforde', 'Michael Mortensen', 'Grand Prix tennis circuit', ' ', '1987', 'Grand Prix circuit', '1987 Rye Brook Open – Singles', 'Carl Limberger', 'Hard court', 'Milan Šrejber']


In [6]:
from matching_functions import find_closest_documents
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

<All keys matched successfully>


In [7]:
crawler_human = crawler_template
crawler_prompt = ChatPromptTemplate.from_messages([("system", crawler_system), ("human", crawler_human)])
crawler_chain = crawler_prompt | chat
broad_links_human = broad_links_template
broad_links_prompt = ChatPromptTemplate.from_messages([("system", broad_links_system), ("human", broad_links_human)])
broad_links_chain = broad_links_prompt | chat
explain_links_human = explain_links_template
explain_links_prompt = ChatPromptTemplate.from_messages([("system", explain_links_system), ("human", explain_links_human)])
explain_links_chain = explain_links_prompt | chat

current_page = start_page
current_links = valid_links
forbidden_links = [start_page]
for i in range(15):
    closest_links = find_closest_documents(model, final_page, current_links)
    broad_links = broad_links_chain.invoke({
        "link_list": current_links,
        "end_page": final_page,
        "end_page_summary": end_content
    })
    model_output = broad_links.content
    pattern = r'<output>(.*?)</output>'
    matches = re.findall(pattern, model_output)
    links = [link.strip() for link in matches[0].split(",")]
    current_links = closest_links + links
    explain_links = explain_links_chain.invoke({
        "link_list": current_links,
        "end_page": final_page,
        "end_page_summary": end_content
    })
    reasonings = explain_links.content
    crawler_text = crawler_chain.invoke(
        {
            "current_page": current_page,
            "current_links": current_links,
            "reasonings": reasonings,
            "end_page": final_page,
            "end_page_content": end_content
        }
    )
    model_output = crawler_text.content
    print(model_output)
    pattern = r'<output>(.*?)</output>'
    matches = re.findall(pattern, model_output)
    current_page = matches[0]
    if current_page == "Random":
        current_page = random.choice(current_links)
    if current_page != final_page:
        forbidden_links.append(current_page)
        found_links = get_page_links(current_page)
        checked_links = check_wikipedia_pages_existence(start_links)
        current_links = [link for link in start_links if checked_links[link] and link not in forbidden_links]
    else:
        print(f"Page reached in {i} iterations!")
        break

<reasoning>
Based on the provided information and reasoning, the most relevant link to click on to get closer to the goal page of "United States" appears to be "Grand Prix". This link could provide information about major sporting events and activities that have taken place in the United States, which could help build a broader understanding of the country's culture and history. While some of the other links, such as the specific years or individuals, could also be relevant, the "Grand Prix" link seems to have the most potential to lead to information directly related to the United States.
</reasoning>

<output>Grand Prix</output>
<reasoning>
Based on the provided information, the most relevant link to click on to reach the goal page of "United States" appears to be '1988'. This year could be significant in the history or timeline of the United States, and exploring events or developments from that time period may provide useful context and connections to the broader history and cultur

KeyboardInterrupt: 