In [None]:
import os

# Set the LANGCHAIN_TRACING_V2 flag for this kernel process.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# Optional: prompt for the API key instead of relying on the shell environment.
# NOTE(review): `getpass` is not imported in this cell; add `import getpass`
# before uncommenting the line below.
# os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()

## Tools

In [None]:
# Import things that are needed generically
from langchain.pydantic_v1 import BaseModel, Field
from langchain.tools import BaseTool, StructuredTool, tool

In [None]:
class SearchNextStep(BaseModel):
    # Input arguments for one step of the summary-tree search.
    # Documented with `#` comments (not a class docstring) so that the schema
    # pydantic derives from this model stays exactly as before.

    # Level of the summary tree whose sub-tree should be explored.
    summary_level: str = Field(
        default='summary_0',
        description='the summary level under which the sub-tree will be explored.',
    )
    # What the caller expects to find inside that sub-tree; no default is given.
    query: str = Field(
        description='a query for the information you expect to find in the sub-tree',
    )

In [None]:
from typing import Any, Optional, Type


    
class SummaryTree(BaseTool):
    """LangChain tool skeleton for branch retrieval over a summary tree.

    Only the tool metadata (name, description, argument schema) is declared
    here. The retrieval logic has not been written yet, so ``_run`` raises
    ``NotImplementedError``.
    """

    # Annotated explicitly so the overrides are declared as typed fields of the
    # pydantic-based BaseTool instead of bare class attributes.
    name: str = 'branch retrieval'
    # split()/join() collapses the indented triple-quoted text onto one line.
    # TODO(review): the wording is visibly unfinished ("In the initial call," /
    # "if  and"); it is left untouched here because it is runtime text.
    description: str = ' '.join('''
        This tool organizes the document in a summary tree. 
        The leaf nodes are the chunks from the document and the non-leaf nodes are the summaries of their children. 
        Higher-level nodes contain more general but less reliable information. 
        In the initial call, 
        Given a query, if  and a summary level, the tool will return the relevant chunk and all its ancestors as a branch in the summary tree. provide the multi-granularity context. 
        This context is useful in connecting the current relevant node with the remaining parts in the document.
    '''.split())
    args_schema: Type[BaseModel] = SearchNextStep
    return_direct: bool = False

    def __init__(self, **kwargs: Any) -> None:
        """Forward every keyword argument to ``BaseTool`` so its fields are initialised."""
        # The original stub (`def __init__()`) was a syntax error; a pydantic
        # model's __init__ must accept the field kwargs and call super().
        super().__init__(**kwargs)

    def _run(self, query: str, summary_level: str = 'summary_0', run_manager: Optional[Any] = None) -> str:
        """Entry point invoked by LangChain when the tool is called.

        Args:
            query: information expected in the explored sub-tree (see ``SearchNextStep``).
            summary_level: summary level to explore; same default as ``SearchNextStep``.
            run_manager: optional callback manager supplied by LangChain.

        Raises:
            NotImplementedError: always, until the retrieval logic is written.
        """
        raise NotImplementedError('SummaryTree retrieval has not been implemented yet.')

In [None]:
from enum import Enum

In [None]:
from llama_index.core import TreeIndex

In [None]:
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser

In [None]:
SemanticSplitterNodeParser()

# NavigateAgent

In [None]:
import sys
sys.path.append('../..')

from src.summary_tree import *
from tqdm.notebook import tqdm

from langsmith import Client
from langsmith.schemas import Run
from uuid import UUID
import pickle
from wikipediaapi import Wikipedia

client = Client()

## Doc retrieval

In [None]:
f = Factory(llm_name='microsoft/Phi-3-mini-128k-instruct')
wiki_wiki = Wikipedia('MyProjectName (merlin@example.com)', 'en')

In [None]:
page_py = wiki_wiki.page('Python_(programming_language)')

In [None]:
# Chunk the full text of each section of the page and gather all chunks,
# in section order, into one flat list.
pages:List[str] = [
    chunk
    for section in page_py.sections
    for chunk in f.split_text(section.full_text())
]

In [None]:
len(pages)

In [None]:
def _collapse_whitespace(text):
    """Replace every run of whitespace in ``text`` with a single space."""
    return " ".join(text.split())


# One "Passage <index>: <text>" entry per chunk, entries separated by a blank line.
labeled_pages_text = '\n\n'.join(
    f'Passage {idx}: {_collapse_whitespace(passage)}'
    for idx, passage in enumerate(pages)
)

In [None]:
test_prompt = '{context}\n\n\n\nAbove are the passages from a document in their original sequential order. Please suggest passages for the task below. To suggest passages, only return the passage ids like "Passage 1" or "Passage 2 and 3" and a very brief description about the task-relevant information in each passage as your reason for suggestion.\n\nTask: {task}'

In [None]:
response = f.llm.invoke([HumanMessage(content=test_prompt.format(context=labeled_pages_text, task='Introduce the python syntax and semantics.'))])

In [None]:
print(labeled_pages_text)

In [None]:
print(response.content)

In [None]:
pages[17]

In [None]:
dpr_corpus = MyDPR.build_dpr(pages)
dpr_corpus.create_vector_retriever(f.embeder)

In [None]:
dpr_corpus.vectorstore.similarity_search("In what ways does Python aim for simplicity in its design?", 10)

In [None]:
dpr_corpus.vectorstore.similarity_search("Introduce the python design philosophy.", 10)

In [None]:
# Generate queries for the following task to retrieve relevant information from a document.

# Task: Introduce the python design philosophy.

In [None]:
# Several phrasings of the same information need; their hits are pooled below.
queries = [
    "What are the key principles of Python's design philosophy?", 
    "How does Python's design philosophy influence its syntax and grammar?", 
    "What is meant by Python's 'one obvious way to do it' philosophy?", 
    "In what ways does Python aim for simplicity in its design?"
]

# Pool the (document, relevance score) pairs returned for every query.
retrieval = []
for q in queries:
    retrieval.extend(dpr_corpus.vectorstore.similarity_search_with_relevance_scores(q))

# Rank the pooled hits by score, highest first.
retrieval.sort(key=lambda x: x[1], reverse=True)

# Keep each passage text once, at the position of its best-scoring hit.
# dict.fromkeys de-duplicates in insertion order with constant-time membership
# checks, replacing the earlier `not in list` scan (quadratic in the pool size).
uni_retrieval = list(dict.fromkeys(doc.page_content for doc, _ in retrieval))

In [None]:
uni_retrieval[:10]

In [None]:
dpr_corpus.vectorstore.similarity_search('What languages are influenced by Python?', 10) # Incomplete retrieval and noisy information

In [None]:
f.llm.invoke([HumanMessage(content='Hello')])

## Data

In [None]:
from datasets import load_dataset

fineweb = load_dataset('HuggingFaceFW/fineweb', 'sample-10BT')

In [None]:
a = fineweb['train'][0]

In [None]:
a

In [None]:
import requests
import json
from urllib.parse import quote_plus
import io
import gzip

# Please note: f-strings require Python 3.6+

# The URL of the Common Crawl Index server
CC_INDEX_SERVER = 'http://index.commoncrawl.org/'

# The Common Crawl index you want to query
INDEX_NAME = 'CC-MAIN-2013-20'      # Replace with the latest index name

# The URL you want to look up in the Common Crawl index
target_url = 'http://daytimeroyaltyonline.com/single/?p=8906650&t=8780053'  # Replace with your target URL

# Function to search the Common Crawl Index
def search_cc_index(url, timeout=30):
    """Query the Common Crawl index server for captures of ``url``.

    The request goes to ``CC_INDEX_SERVER`` for the index named ``INDEX_NAME``
    and asks for JSON output (one JSON object per line).

    Args:
        url: the page URL to look up; it is URL-encoded before being sent.
        timeout: seconds to wait for the server before giving up, so an
            unresponsive server cannot hang the notebook.

    Returns:
        A list with one dict per non-empty response line when the server
        answers 200 (possibly an empty list), otherwise ``None``.
    """
    encoded_url = quote_plus(url)
    index_url = f'{CC_INDEX_SERVER}{INDEX_NAME}-index?url={encoded_url}&output=json'
    response = requests.get(index_url, timeout=timeout)
    print("Response from CCI:", response.text)  # Output the response from the server
    if response.status_code != 200:
        return None
    # Skip blank lines: an empty body or a stray blank line would otherwise make
    # json.loads('') raise instead of yielding "no records".
    lines = response.text.strip().split('\n')
    return [json.loads(line) for line in lines if line.strip()]

# Function to fetch the content from Common Crawl
def fetch_page_from_cc(records, timeout=30):
    """Download the first retrievable capture among ``records`` and return it as text.

    Each record must provide ``'filename'``, ``'offset'`` and ``'length'``.
    The record's byte range is requested from data.commoncrawl.org,
    gunzipped and decoded.

    Args:
        records: iterable of index records (dicts) as returned by the index lookup.
        timeout: seconds to wait for each download before giving up.

    Returns:
        The decompressed record as ``str`` for the first record that downloads
        successfully, or ``None`` if ``records`` is empty or every download failed.
    """
    for record in records:
        offset, length = int(record['offset']), int(record['length'])
        s3_url = f'https://data.commoncrawl.org/{record["filename"]}'
        # Request only this record's byte range within the archive file.
        byte_range = f'bytes={offset}-{offset+length-1}'
        response = requests.get(s3_url, headers={'Range': byte_range}, timeout=timeout)
        if response.status_code != 206:
            # Not a partial-content reply: report it and try the next record
            # (previously the function gave up after the first failure).
            print(f"Failed to fetch data: {response.status_code}")
            continue
        # Process the response content if necessary
        # For example, you can use warcio to parse the WARC record
        with gzip.GzipFile(fileobj=io.BytesIO(response.content)) as unzipped_file:
            raw_data: bytes = unzipped_file.read()
        # errors='replace' keeps pages that are not valid UTF-8 readable instead
        # of aborting with UnicodeDecodeError.
        return raw_data.decode('utf-8', errors='replace')
    return None

# Search the index for the target URL
records = search_cc_index(target_url)
# Bind `content` up front so later cells see None, not a NameError, when the
# lookup or the download produced nothing.
content = None
if records:
    print(f"Found {len(records)} records for {target_url}")

    # Fetch the page content for the records found in the index
    content = fetch_page_from_cc(records)
    if content:
        print(f"Successfully fetched content for {target_url}")
        # You can now process the 'content' variable as needed
    else:
        # Make a failed download visible instead of finishing silently.
        print(f"Could not fetch content for {target_url}")
else:
    print(f"No records found for {target_url}")

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Everything after the second blank line is parsed as the HTML document
# (presumably the two leading parts are the WARC and HTTP header blocks — TODO confirm).
# The parser is named explicitly so the result does not depend on which optional
# parsers happen to be installed, and no "guessed parser" warning is emitted.
soup = BeautifulSoup(content.strip().split('\r\n\r\n', 2)[2], 'html.parser')

In [None]:
print(soup.prettify())

In [None]:
import chardet

In [None]:
detector = chardet.universaldetector.UniversalDetector()

In [None]:
# The detector consumes bytes, but `content` is text here (fetch_page_from_cc
# decodes before returning), so encode it first unless raw bytes were stored.
# NOTE(review): detecting on re-encoded text can only report the encoding used
# here; to learn a page's original encoding, feed the raw bytes before decoding.
detector.feed(content if isinstance(content, (bytes, bytearray)) else content.encode('utf-8'))

In [None]:
detector.done

In [None]:
detector.close()

In [None]:
from comcrawl import IndexClient

# NOTE(review): this rebinds `client`, which an earlier cell set to a LangSmith
# `Client()`. Any later cell that still expects the LangSmith client under this
# name will get this IndexClient instead on a top-to-bottom run.
client = IndexClient()

# Search the index for the URL, then download the results.
client.search('http://daytimeroyaltyonline.com/single/?p=8906650&t=8780053')
client.download()

# "html" entry of the first result; the [0] lookup fails if there are no results.
first_page_html = client.results[0]["html"]

In [None]:
client.results

In [None]:
# `content` is already text when it comes from fetch_page_from_cc, and str has
# no .decode(); only decode when raw bytes were stored instead.
content.decode(encoding='utf-8') if isinstance(content, (bytes, bytearray)) else content

## LangSmith

In [None]:
# Build this cell's own LangSmith client. The name `client` is rebound to a
# comcrawl IndexClient in an earlier cell, so reusing it here would fail on a
# top-to-bottom run.
langsmith_client = Client()

# method label ('tree' / 'dpr') -> trace ids of the root runs of matching projects
project_map = defaultdict(list)
# trace id -> langgraph step -> list of {'metadata', 'inputs', 'outputs'} dicts
# (plain dicts are stored, not Run objects).
trace2runs: Dict[UUID, Dict[int, List[Dict[str, Any]]]] = {}

# Materialise the project list so tqdm sizes the bar from the real project
# count instead of a hard-coded total.
projects = list(langsmith_client.list_projects())
for project in tqdm(projects):
    traces = list(langsmith_client.list_runs(project_name=project.name, is_root=True))
    trace_ids = [t.trace_id for t in traces]
    # A project is assigned to the first label found in its name; others are ignored.
    if 'tree' in project.name:
        project_map['tree'].extend(trace_ids)
    elif 'dpr' in project.name:
        project_map['dpr'].extend(trace_ids)
    for trace in traces:
        # Keep only child runs whose metadata carries a langgraph node, in the
        # reverse of the order returned by list_runs.
        runs = [d for d in langsmith_client.list_runs(run_ids=trace.child_run_ids) if 'langgraph_node' in d.extra['metadata']][::-1]
        step2runs = defaultdict(list)
        for run in runs:
            step2runs[run.extra['metadata']['langgraph_step']].append({'metadata': run.extra['metadata'], 'inputs': run.inputs, 'outputs': run.outputs})
        trace2runs[trace.trace_id] = step2runs

# Cache both mappings on disk so the analysis cells can reload them.
with open('result.pickle', 'wb') as f_out:
    pickle.dump(trace2runs, f_out)

with open('project_map.pickle', 'wb') as f_out:
    pickle.dump(project_map, f_out)

In [None]:
def _load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``."""
    # NOTE: pickle can execute code while loading; only use on files this notebook wrote.
    with open(path, 'rb') as f_in:
        return pickle.load(f_in)


# Reload the mappings cached by the export cell.
trace2runs = _load_pickle('result.pickle')
project_map = _load_pickle('project_map.pickle')

In [None]:
def get_steps(step2runs:Dict[int, Any], node:str):
    """Return the step keys whose first recorded run belongs to ``node``.

    Args:
        step2runs: mapping from step key to a list of run dicts; each run dict
            is read via ``run['metadata']['langgraph_node']``.
        node: node identifier to compare against (with ``==``).

    Returns:
        A list of matching step keys, in the mapping's iteration order.
        Steps with an empty run list are skipped; only the first run of each
        step is inspected.
    """
    matching_steps = []
    for step_key, step_runs in step2runs.items():
        if not step_runs:
            continue
        first_run = step_runs[0]
        if first_run['metadata']['langgraph_node'] == node:
            matching_steps.append(step_key)
    return matching_steps

In [None]:
# Keep the traces that contain at least one REFORM_QUERY step and at least one
# GENERATE_ANSWER step.
trace_keys = [
    key
    for key, runs_by_step in trace2runs.items()
    if get_steps(runs_by_step, NavigateAgent.Nodes.REFORM_QUERY)
    and get_steps(runs_by_step, NavigateAgent.Nodes.GENERATE_ANSWER)
]

In [None]:
len(trace_keys)

In [None]:
# Per retrieval method ('dpr' / 'tree'):
#   scores   -> one acceptance ratio (accepted / proposed document ids) per trace
#   proposes -> the non-zero proposal counts of the individual reform rounds
scores = defaultdict(list)
proposes = defaultdict(list)

# Sets give constant-time "does this trace belong to the method" checks
# instead of scanning the id lists once per trace.
method_trace_ids = {method: set(project_map.get(method, ())) for method in ('dpr', 'tree')}

for trace_key in trace_keys:
    step2runs = trace2runs[trace_key]
    # Not used below; kept so the variable stays available for inspection.
    answer_steps = get_steps(step2runs, NavigateAgent.Nodes.GENERATE_ANSWER)
    reform_steps = get_steps(step2runs, NavigateAgent.Nodes.REFORM_QUERY)
    propose_num = 0
    accept_num = 0
    temp_proposes = []
    for s in reform_steps:
        # The step right after a reform step supplies the proposed ids, the one
        # after that the accepted ids (presumably retrieval then grading — TODO confirm).
        proposed = len(step2runs[s+1][0]['outputs']['output']['new_document_ids'])
        propose_num += proposed
        if proposed:
            temp_proposes.append(proposed)
        accepted = len(step2runs[s+2][0]['outputs']['output']['new_document_ids'])
        accept_num += accepted
        # Stop at the first round in which nothing was accepted.
        if accepted == 0:
            break

    # Traces without any proposal have no defined ratio and are skipped.
    if propose_num > 0:
        acceptance_ratio = accept_num / propose_num
        for method, member_ids in method_trace_ids.items():
            if trace_key in member_ids:
                scores[method].append(acceptance_ratio)
                proposes[method].extend(temp_proposes)

In [None]:
np.mean(scores['dpr'])

In [None]:
np.mean(proposes['dpr'])

In [None]:
len(scores['dpr'])

In [None]:
np.mean(scores['tree'])

In [None]:
np.mean(proposes['tree'])

In [None]:
len(scores['tree'])

In [None]:
propose_num

In [None]:
accept_num

In [None]:
# Collect the 'score' of every run in the steps listed in `grade_steps`, skipping
# runs whose outputs have no 'output' entry or whose 'output' has no 'score'.
# NOTE(review): `grade_steps` is not assigned in any cell visible in this notebook;
# it has to be defined (e.g. via get_steps) before this cell can run.
grades = [run['outputs']['output']['score'] for grade_step in grade_steps for run in step2runs[grade_step] if 'output' in run['outputs'] and 'score' in run['outputs']['output']]

In [None]:
grades

In [None]:
grade_steps

In [None]:
[run['outputs'] for run in step2runs[grade_steps[1]]]

In [None]:
[run['outputs'] for run in step2runs[retrieve_steps[1]]]

In [None]:
get_steps(step2runs, NavigateAgent.Nodes.REFORM_QUERY)

In [None]:
step2runs[4][0]

In [None]:
get_steps(step2runs, NavigateAgent.Nodes.GENERATE_ANSWER)

In [None]:
# Runs are stored as plain dicts with 'metadata' / 'inputs' / 'outputs' keys
# (see the cell that builds trace2runs), so use key access, not attributes.
[run['inputs'] for run in step2runs[5]]

In [None]:
# Stored runs are dicts whose 'metadata' entry holds what used to be
# run.extra['metadata'], so read the node name from there.
step2runs[6][0]['metadata']['langgraph_node']

In [None]:
step = 4
# Stored runs are plain dicts ('metadata' / 'inputs' / 'outputs'), not Run
# objects, so attribute access such as `.extra` would raise AttributeError.
first_run = step2runs[step][0]
print(first_run['metadata']['langgraph_node'])
print(first_run['inputs']['input'])
print(first_run['outputs']['output'])

In [None]:
import os
os.environ["OPENAI_API_KEY"] = "EMPTY"
f = Factory()

In [None]:
dataset = QualityDataset(None, split='dev')

In [None]:
# Index into dataset.data of the entry examined in the following cells.
test_id = 19
# Both calls receive the same dataset entry: one extracts the article, the
# other the questions and their answers.
article = dataset.get_article(dataset.data[test_id])
questions, answers = dataset.get_questions_and_answers(dataset.data[test_id])

In [None]:
# Per-article file paths inside the dataset directory, keyed by test_id.
dpr_path = os.path.join(dataset.data_dir, f'dpr_{test_id}.json')
tree_path = os.path.join(dataset.data_dir, f'tree_{test_id}.json')
# Build the corpus for the article; returns the DPR retriever, the tree retriever and the documents.
dpr_retriever, tree_retriever, documents = f.build_corpus(
    article,
    dpr_file=dpr_path,
    tree_file=tree_path,
)


In [None]:
tree_retriever.retrieve_children(tree_retriever.docs[14])