https://gpt-index.readthedocs.io/en/latest/examples/index_structs/doc_summary/DocSummary.html

In [1]:
import phoenix as px

In [2]:
session = px.launch_app()

🌍 To view the Phoenix app in your browser, visit http://127.0.0.1:6060/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [3]:
import json
import os
from getpass import getpass
from urllib.request import urlopen

import openai
import pandas as pd
import phoenix as px
from gcsfs import GCSFileSystem
from llama_index import ServiceContext, StorageContext, load_index_from_storage, set_global_handler
from llama_index.embeddings import OpenAIEmbedding
from llama_index.graph_stores.simple import SimpleGraphStore
from llama_index.llms import OpenAI
from phoenix.experimental.evals import (
    OpenAIModel,
    compute_precisions_at_k,
    run_relevance_eval,
)
from tqdm import tqdm

pd.set_option("display.max_colwidth", 1000)

In [4]:
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())


In [5]:
set_global_handler("arize_phoenix")

In [6]:
import os
import openai

In [7]:
import logging
import sys
import json

logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [8]:
import nest_asyncio

nest_asyncio.apply()

In [9]:
from llama_index import (
    SimpleDirectoryReader,
    ServiceContext,
    get_response_synthesizer,
)
from llama_index.indices.document_summary import DocumentSummaryIndex
from llama_index.llms import OpenAI
from llama_index.schema import Document

In [10]:

def get_documents_from_transcription(json_file, chunk_sizes):
    """
    Concatenates words from a given JSON data structure with maximum chunk sizes.

    Parameters:
        json_file (str): The path to the JSON file.
        chunk_sizes (list of int): The list of maximum number of words allowed in each concatenated chunk.

    Returns:
        list: A list of nodes. Each node contains concatenated text and metadata (start_time, end_time).

    """
    def add_document_to_chunks(word_list, start_time, end_time, chunks,chunk_size):
        """Helper function to add a new Document to chunks."""
        text = " ".join(word_list)
        doc = Document(page_content=text, metadata={'start': start_time, 
                                              "end": end_time,
                                             "chunk_size":chunk_size})
        chunks.append(doc)

    concatenated_chunks = []
    current_chunk = []
    current_word_count = 0
    current_start_time = None
    current_end_time = None

    # Load JSON data
    with open(json_file, "r") as f:
        json_data = json.load(f)

    # Initialize service context
    llm = OpenAI(model="gpt-4")
    service_context = ServiceContext.from_defaults(llm=llm)
    
    if not isinstance(chunk_sizes, list):
        chunk_sizes = [chunk_sizes]
    for chunk_size in chunk_sizes:
        for segment in json_data["segments"]:
            for word_info in segment["words"]:
                if current_word_count + 1 > chunk_size:
                    add_document_to_chunks(current_chunk, current_start_time, current_end_time, concatenated_chunks, chunk_size)
                    current_chunk = []
                    current_word_count = 0
                    current_start_time = None

                current_chunk.append(word_info["word"].strip())
                current_word_count += 1

                if current_start_time is None:
                    current_start_time = word_info["start"]
                current_end_time = word_info["end"]

        if current_chunk:
            add_document_to_chunks(current_chunk, current_start_time, current_end_time, concatenated_chunks, chunk_size)

    return concatenated_chunks


# Example usage
chunk_sizes = [128, 256, 512, 1024]
docs = get_documents_from_transcription(json_file="transcriptions/youtube_UyoXmHS-KGc.json", chunk_sizes=chunk_sizes)
len(docs)

13

In [11]:
import json
from llama_index import Document

def load_json_file_and_extract_text(json_file_path):
    """Loads a JSON file and extracts the content of the "text" key.

    Args:
    json_file_path: The path to the JSON file.

    Returns:
    A list of Document objects, containing the extracted content of the "text" key.
    """

    documents = []

    with open(json_file_path, "r") as json_file:
        json_data = json.load(json_file)
        # return json_data
        documents.append(Document(doc_id=json_file_path , text=json_data["text"]))

    return documents


In [12]:
docs = load_json_file_and_extract_text("transcriptions/youtube_5p248yoa3oE.json")
docs


[Document(id_='transcriptions/youtube_5p248yoa3oE.json', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='1dd6c2d04a5947bc4f30e51c9b0b71a2c7a4c7ec35a706ddd2ae404b324b54e0', text=" It is my pleasure to welcome Dr. Andrew Wu tonight. Andrew is the managing general partner of AI Fund, founder of Deep Learning AI, and learning and lending AI, chairman and co-founder of Coursera, and an adjunct professor of computer science here at Stanford. Previously he had started and led the Google Brain team, which had helped Google adopt modern AI, and he was also director of the Stanford AI lab. About 8 million people, one in 1,000 persons on the planet, have taken an AI class from him, and through both his education and his AI work, he has changed humor's lives. Please welcome Dr. Andrew Wu. Thank you Lisa, it's good to see everyone. So what I want to do today is chat to you about some opportunities in AI. So I've been saying AI is 

# Build Document Summary Index

In [13]:
# LLM (gpt-3.5-turbo)
chatgpt = OpenAI(temperature=0, model="gpt-3.5-turbo")
service_context = ServiceContext.from_defaults(llm=chatgpt, chunk_size=1024)

In [14]:
# default mode of building the index
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize", use_async=True
)
doc_summary_index = DocumentSummaryIndex.from_documents(
    docs,
    service_context=service_context,
    response_synthesizer=response_synthesizer,
    show_progress=True,
)


Parsing documents into nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Summarizing documents:   0%|          | 0/1 [00:00<?, ?it/s]

current doc id: transcriptions/youtube_5p248yoa3oE.json


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

In [15]:
retreiver = doc_summary_index.as_retriever()

In [16]:
print(doc_summary_index.get_document_summary(doc_id='transcriptions/youtube_5p248yoa3oE.json'))

The provided text is about various aspects of artificial intelligence (AI) and its impact on society. It discusses the current state and future potential of AI technology in various industries, the challenges of applying AI outside of the tech and consumer software internet sectors, and the importance of addressing bias and ethical considerations in AI systems. The text also touches upon the potential disruption to jobs caused by AI automation and the need to ensure that people affected by job displacement are well taken care of. It mentions the hype surrounding artificial general intelligence (AGI) and the challenges in achieving human-like intelligence in AI. The text also highlights the potential benefits of AI in addressing real extinction risks to humanity, such as pandemics or climate change.

Some questions that this text can answer include:
- What are the challenges of applying AI outside of the tech and consumer software internet sectors?
- What progress has been made in reduc

In [17]:
doc_summary_index.storage_context.persist("index")

In [18]:
from llama_index.indices.loading import load_index_from_storage
from llama_index import StorageContext

# rebuild storage context
storage_context = StorageContext.from_defaults(persist_dir="index")
doc_summary_index = load_index_from_storage(storage_context)

In [77]:
query_engine = doc_summary_index.as_query_engine(
    response_mode="tree_summarize", use_async=True
)
response = query_engine.query("What are the author's thoughts on the risks and benefits of AI for humanity")
response

Response(response='The author acknowledges the risks associated with AI, particularly in terms of job disruption. They express a strong obligation to ensure that people who are affected by AI automation are well taken care of. However, the author does not see AI as a meaningful extinction risk for humanity and believes that with proper oversight, AI can be managed to be safe. They also highlight the potential benefits of AI, stating that it creates new opportunities for everyone and can be a key part of the solution to real extinction risks such as pandemics or climate change. Overall, the author sees AI as a general-purpose technology that has the potential to bring value and opportunities, but also recognizes the importance of addressing the challenges it presents.', source_nodes=[NodeWithScore(node=TextNode(id_='4c2ae659-5a8d-42fa-a6bd-5a7f46174978', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: 

# Perform Retrieval from Document Summary Index

In [81]:
query_engine = doc_summary_index.as_query_engine(
    response_mode="tree_summarize", use_async=True
)

In [82]:
# response = query_engine.query("bias and fairness in AI systems ")


# Index Chunks

In [83]:
from llama_index import (
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    VectorStoreIndex,
)

In [84]:
from llama_index.schema import Node, Document
from llama_index.indices.document_summary import DocumentSummaryIndex

def index_json_file(json_file_path):
    with open(json_file_path, "r") as f:
        json_data = json.load(f)
    
    nodes = []
    for segment in json_data["segments"].values():
        text = segment["text"]
        start = segment["start"]
        metadata = {"start": start}
        node = Node(text=text, metadata=metadata)
        nodes.append(node)
    
    docs = [Document(nodes=nodes)]
    index = DocumentSummaryIndex.from_documents(docs)
    index.storage_context.persist("index")

def json_file_document(json_file_path):
    with open(json_file_path, "r") as f:
        json_data = json.load(f)
    
    documents = []
    for segment in json_data["segments"]:
        text = segment["text"]
        start = segment["start"]
        metadata = {"start": start}
        documents.append(Document(text=text,
                                 metadata=metadata)
                        )
    return documents

In [85]:
docs = json_file_document("transcriptions/youtube_5p248yoa3oE.json")

In [86]:
from llama_index.embeddings import HuggingFaceEmbedding
import chromadb


In [87]:
# create client and a new collection
chroma_client = chromadb.EphemeralClient()
try:
    chroma_collection = chroma_client.create_collection("quickstart")
except:
    pass
# define embedding function
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

In [88]:
docs

[Document(id_='7f47d3d5-4409-4f45-be37-c363fb941920', embedding=None, metadata={'start': 4.999999999999998}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='7b0d5cbaea75ebe4cd88b681e44129bd0181221b1f90613cb5bc7251bcd1268c', text=' It is my pleasure to welcome Dr. Andrew Wu tonight.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='775b4db5-fd43-438d-863c-145c0c82610e', embedding=None, metadata={'start': 11.28}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='75ff31414aef3d654c85fff4f31f4bedadc847b3d1a37445c3c71ab30457e991', text=' Andrew is the managing general partner of AI Fund, founder of Deep Learning AI, and', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='2058d552-24a9-4a47-84

In [89]:
from llama_index.vector_stores import ChromaVectorStore


In [90]:

# set up ChromaVectorStore and load in data
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
service_context = ServiceContext.from_defaults(embed_model=embed_model)
index = VectorStoreIndex.from_documents(
    docs, storage_context=storage_context, service_context=service_context,
show_progress=True,)


Parsing documents into nodes:   0%|          | 0/370 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/370 [00:00<?, ?it/s]

In [91]:
retreiver = index.as_retriever(similarity_top_k=5)

In [92]:
retreiver.retrieve("What are the author's thoughts on the risks and benefits of AI for humanity")

[NodeWithScore(node=TextNode(id_='1d632109-971a-48eb-a04e-50e19ec0a5a1', embedding=None, metadata={'start': 2088.7}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='e8b06bf6-4bbc-48f4-8a07-59dbe67f6808', node_type=None, metadata={'start': 2088.7}, hash='e4872cfaa3054be9387a77c40752021dd325b30cc3d87caeca52f3707a11245c')}, hash='472f3b5f3877567d9843336ec4c83ec8c2408a404f9115a600eeddd297f27ea1', text="meaningful extinction risk for humanity. I think that people worry we can't control AI,", start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=7.585418697260143e-64),
 NodeWithScore(node=TextNode(id_='da3984c2-e333-4f16-b4fa-f7cb46cf1e2b', embedding=None, metadata={'start': 2088.7}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo

In [93]:
query_engine = index.as_query_engine(similarity_top_k=5)

In [94]:
from IPython.display import Markdown, display

In [95]:
response = query_engine.query("What are the author's thoughts on the risks and benefits of AI for humanity")
display(Markdown(f"<b>{response}</b>"))

<b>The author expresses concern about the risks associated with AI, specifically the potential for it to pose a meaningful extinction risk to humanity. They mention that people worry about our ability to control AI. However, the author also suggests that AI, with its greater intelligence, may have benefits for humanity.</b>

In [68]:
from llama_index import download_loader, GPTSimpleVectorIndex


ImportError: cannot import name 'GPTSimpleVectorIndex' from 'llama_index' (/Users/saazizi/miniconda3/lib/python3.10/site-packages/llama_index/__init__.py)