In [40]:
pip install nb_black nbdime

Note: you may need to restart the kernel to use updated packages.


In [1]:
%load_ext lab_black

In [100]:
from urllib.parse import urlparse, parse_qs
import whisper_timestamped as whisper
from pydub import AudioSegment
from logger import logger
from pytube import YouTube

import whisper

import json
import os

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



In [101]:
# https://huggingface.co/openai/whisper-large-v2#long-form-transcription

In [102]:
output_path_youtube = "YoutubeAudios"
output_path_transcription = "transcriptions"

In [103]:
from pytube import YouTube
import json
import whisper  # Assuming you have a package or module named whisper
from logger import logger

class YouTubeTranscriber:

    def __init__(self, url, output_path_youtube, output_path_transcription):
        self.output_path_youtube = output_path_youtube
        self.yt = YouTube(url)
        self.transcription = None
        self.url = url
        self.filename_path = None 
        self.output_path_transcription = output_path_transcription

    def extract_main_domain_and_video_id(self):
        parsed_url = urlparse(self.url)
        domain_parts = parsed_url.netloc.split('.')
        main_domain = domain_parts[-2] if len(domain_parts) >= 2 else None
        query_params = parse_qs(parsed_url.query)
        video_id = query_params.get('v', [None])[0]
        self.video_id = f"{main_domain}_{video_id}"

    def download_youtube(self):
        self.filename = f"{self.video_id}.mp3"
        
        audio_stream = self.yt.streams.filter(only_audio=True).first()
        
        audio_stream.download(output_path=self.output_path_youtube, filename=self.filename)
        logger.info(f"Audio downloaded to {self.output_path_youtube}/{self.filename}")


    def transcribe_audio(self, model_name, device):
        audio = whisper.load_audio(f"{self.output_path_youtube}/{self.filename}")
        model = whisper.load_model(model_name, device=device)
        self.transcription = whisper.transcribe(model, audio)

    def write_to_json(self):
        with open(f"{self.output_path_transcription}/{self.video_id}.json", 'w') as f:
            json.dump(self.transcription, f)
        logger.info(f"Transcription downloaded to {self.output_path_transcription}/{self.video_id}.json")

    def merge_segments(self, num_to_merge):
        merged_segments = []
        segments = self.transcription["segments"]
        for i in range(0, len(segments), num_to_merge):
            merged_dict = {}
            slice_ = segments[i:i + num_to_merge]

            # Merging the 'text' fields
            merged_dict['text'] = " ".join(item['text'] for item in slice_)

            # Get the 'start' time from the first dictionary and the 'end' time from the last dictionary
            merged_dict['start'] = int(slice_[0]['start'])
            merged_dict['end'] = int(slice_[-1]['end'])

  

            merged_segments.append(merged_dict)

        self.transcription["merged_segments"] = merged_segments



    
    def run(self, num_to_merge=4, model_name="base", device="cpu"):
        
        logger.info("extract_main_domain_and_video_id")
        self.extract_main_domain_and_video_id()
        
        logger.info("download_youtube")
        self.download_youtube()
        
        logger.info("transcribe_audio")
        self.transcribe_audio(model_name=model_name,
                             device=device)
        
        logger.info("merge_segments")
        self.merge_segments(num_to_merge)
        
        logger.info("write_to_json")
        self.write_to_json()
        


# Usage
output_path = output_path_youtube
url = 'https://www.youtube.com/watch?v=5p248yoa3oE'
# url = "https://www.youtube.com/watch?v=UyoXmHS-KGc"
yt_transcriber = YouTubeTranscriber(url=url, 
                                    output_path_youtube=output_path_youtube,
                                   output_path_transcription=output_path_transcription)

yt_transcriber.run()

2023-10-02 16:21:35,757 ./logs/auto-labeler INFO extract_main_domain_and_video_id [4038077557.py]
2023-10-02 16:21:35,758 ./logs/auto-labeler INFO download_youtube [4038077557.py]
2023-10-02 16:21:39,820 ./logs/auto-labeler INFO Audio downloaded to YoutubeAudios/youtube_5p248yoa3oE.mp3 [4038077557.py]
2023-10-02 16:21:39,821 ./logs/auto-labeler INFO transcribe_audio [4038077557.py]
2023-10-02 16:24:00,239 ./logs/auto-labeler INFO merge_segments [4038077557.py]
2023-10-02 16:24:00,241 ./logs/auto-labeler INFO write_to_json [4038077557.py]
2023-10-02 16:24:00,249 ./logs/auto-labeler INFO Transcription downloaded to transcriptions/youtube_5p248yoa3oE.json [4038077557.py]


# LamaIndex

In [104]:
from llama_index import ServiceContext
import json
from logger import logger
from llama_index.llms import OpenAI
# from service_context.node_parser import NodeParser
from llama_index import (
    VectorStoreIndex,
    SummaryIndex,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
)


In [17]:
chunk_size = 128
# service_context = ServiceContext.from_defaults(chunk_size=chunk_size)


In [18]:
from llama_index import Document
from llama_index.llms import OpenAI


In [19]:


def get_nodes_from_transcription(json_file, chunk_sizes):
    """
    Concatenates words from a given JSON data structure with maximum chunk sizes.

    Parameters:
        json_file (str): The path to the JSON file.
        chunk_sizes (list of int): The list of maximum number of words allowed in each concatenated chunk.

    Returns:
        list: A list of nodes. Each node contains concatenated text and metadata (start_time, end_time).

    """
    def add_document_to_chunks(word_list, start_time, end_time, chunks,chunk_size):
        """Helper function to add a new Document to chunks."""
        text = " ".join(word_list)
        doc = Document(text=text, extra_info={'start': start_time, 
                                              "end": end_time,
                                             "chunk_size":chunk_size})
        chunks.append(doc)

    concatenated_chunks = []
    current_chunk = []
    current_word_count = 0
    current_start_time = None
    current_end_time = None

    # Load JSON data
    with open(json_file, "r") as f:
        json_data = json.load(f)

    # Initialize service context
    llm = OpenAI(model="gpt-4")
    service_context = ServiceContext.from_defaults(llm=llm)
    
    if not isinstance(chunk_sizes, list):
        chunk_sizes = [chunk_sizes]
    for chunk_size in chunk_sizes:
        for segment in json_data["segments"]:
            for word_info in segment["words"]:
                if current_word_count + 1 > chunk_size:
                    add_document_to_chunks(current_chunk, current_start_time, current_end_time, concatenated_chunks, chunk_size)
                    current_chunk = []
                    current_word_count = 0
                    current_start_time = None

                current_chunk.append(word_info["word"].strip())
                current_word_count += 1

                if current_start_time is None:
                    current_start_time = word_info["start"]
                current_end_time = word_info["end"]

        if current_chunk:
            add_document_to_chunks(current_chunk, current_start_time, current_end_time, concatenated_chunks, chunk_size)

        nodes = service_context.node_parser.get_nodes_from_documents(concatenated_chunks)
        
        # for node in nodes:
        #     node.metadata["chunk_size"] = chunk_size

    return nodes


# Example usage
chunk_sizes = [128, 256, 512, 1024]
nodes = get_nodes_from_transcription(json_file="transcriptions/youtube_UyoXmHS-KGc.json", chunk_sizes=chunk_sizes)
len(nodes)


14

In [25]:
import openai

openai.api_key = "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"
nodes_list = []
vector_indices = []
query_engines = []
chunk_sizes = [128, 256, 512, 1024]
for chunk_size in chunk_sizes:
    print(f"Chunk Size: {chunk_size}")
    nodes = get_nodes_from_transcription(json_file="transcriptions/youtube_UyoXmHS-KGc.json", chunk_sizes=chunk_size)
    

    # add chunk size to nodes to track later
    logger.info("add chunk size to nodes to track later")
    for node in nodes:
        node.metadata["chunk_size"] = chunk_size
        node.excluded_embed_metadata_keys = ["chunk_size"]
        node.excluded_llm_metadata_keys = ["chunk_size"]

    nodes_list.append(nodes)

    # build vector index
    logger.info("build vector index")
    vector_index = VectorStoreIndex(nodes, show_progress=True)
    vector_indices.append(vector_index)

    # query engines
    logger.info("query engines")
    
    query_engines.append(vector_index.as_query_engine())

2023-10-02 15:36:41,421 ./logs/auto-labeler INFO add chunk size to nodes to track later [1093469001.py]
2023-10-02 15:36:41,422 ./logs/auto-labeler INFO build vector index [1093469001.py]


Chunk Size: 128


Generating embeddings:   0%|          | 0/6 [00:00<?, ?it/s]

2023-10-02 15:36:42,403 ./logs/auto-labeler INFO query engines [1093469001.py]
2023-10-02 15:36:42,442 ./logs/auto-labeler INFO add chunk size to nodes to track later [1093469001.py]
2023-10-02 15:36:42,443 ./logs/auto-labeler INFO build vector index [1093469001.py]


Chunk Size: 256


Generating embeddings:   0%|          | 0/3 [00:00<?, ?it/s]

2023-10-02 15:36:43,025 ./logs/auto-labeler INFO query engines [1093469001.py]
2023-10-02 15:36:43,038 ./logs/auto-labeler INFO add chunk size to nodes to track later [1093469001.py]
2023-10-02 15:36:43,038 ./logs/auto-labeler INFO build vector index [1093469001.py]


Chunk Size: 512


Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

2023-10-02 15:36:43,581 ./logs/auto-labeler INFO query engines [1093469001.py]
2023-10-02 15:36:43,591 ./logs/auto-labeler INFO add chunk size to nodes to track later [1093469001.py]
2023-10-02 15:36:43,592 ./logs/auto-labeler INFO build vector index [1093469001.py]


Chunk Size: 1024


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

2023-10-02 15:36:44,167 ./logs/auto-labeler INFO query engines [1093469001.py]


In [26]:
r = query_engines[0]

In [27]:
r.query('advice')

Response(response='You can find advice in the forum mentioned in the context.', source_nodes=[NodeWithScore(node=TextNode(id_='9d7b4452-0201-48c1-92ef-4e8890141e0a', embedding=None, metadata={'start': 258.08, 'end': 293.92, 'chunk_size': 128}, excluded_embed_metadata_keys=['chunk_size'], excluded_llm_metadata_keys=['chunk_size'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='4a50f545-8f52-458a-a040-20f482cd9f77', node_type=None, metadata={'start': 258.08, 'end': 293.92, 'chunk_size': 128}, hash='20cc92ce5293b915b7c6f133ececfa84401db5f8cafd638b3b669b98277d0732')}, hash='20cc92ce5293b915b7c6f133ececfa84401db5f8cafd638b3b669b98277d0732', text="from you fellow scholars telling me that you have been inspired by the series, but don't really know where to start. And here it is. In this forum, you can share your projects, ask for advice, look for collaborators, and more. Make sure to visit www .me -slash -paper -forum and say hi or just click the link in the video des

In [28]:
# try ensemble retrieval

from llama_index.tools import RetrieverTool
from llama_index.schema import IndexNode

# retriever_tools = []
retriever_dict = {}
retriever_nodes = []
for chunk_size, vector_index in zip(chunk_sizes, vector_indices):
    node_id = f"chunk_{chunk_size}"
    print(node_id)
    node = IndexNode(
        text=f"Retrieves relevant advice (chunk size {chunk_size})",
        index_id=node_id,
    )
    retriever_nodes.append(node)
    retriever_dict[node_id] = vector_index.as_retriever()

chunk_128
chunk_256
chunk_512
chunk_1024


In [29]:
r = retriever_dict['chunk_128']


In [30]:
r.retrieve("Gigagen-based method")

[NodeWithScore(node=TextNode(id_='7fc0010f-3c17-4278-a13e-a5486df563dc', embedding=None, metadata={'start': 49.02, 'end': 102.68, 'chunk_size': 128}, excluded_embed_metadata_keys=['chunk_size'], excluded_llm_metadata_keys=['chunk_size'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='35b68b88-fe29-42d6-916c-689b651c724d', node_type=None, metadata={'start': 49.02, 'end': 102.68, 'chunk_size': 128}, hash='f771b1137f9784b6d21604e6698d65ab116b20050f687025185e3238423d4248')}, hash='f771b1137f9784b6d21604e6698d65ab116b20050f687025185e3238423d4248', text='That is extremely quick. For instance, the previous StarGain -based method that could be roughly as fast we needed to make significant concessions in terms of quality. Not anymore. Loving it. Two, it is not only fast, but it is so fast that it can create several images per second and thus it offers the controllable latent space. This is a hallmark for Gigagen -based methods and leads to incredible artistic controllab

In [31]:
from llama_index.selectors.pydantic_selectors import PydanticMultiSelector

# from llama_index.retrievers import RouterRetriever
from llama_index.retrievers import RecursiveRetriever
from llama_index import SummaryIndex

# the derived retriever will just retrieve all nodes
summary_index = SummaryIndex(retriever_nodes)

retriever = RecursiveRetriever(
    root_id="root",
    retriever_dict={"root": summary_index.as_retriever(), **retriever_dict},
)

In [32]:
result = await retriever.aretrieve(
    "Gigagen"
)
result

[]

In [40]:
pip install service-context


[31mERROR: Could not find a version that satisfies the requirement service-context (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for service-context[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [44]:
retriever.retrieve_all()

TypeError: RecursiveRetriever.retrieve_all() missing 1 required positional argument: 'query_bundle'

In [33]:
# define reranker
from llama_index.indices.postprocessor import (
    LLMRerank,
    SentenceTransformerRerank,
    CohereRerank,
)

reranker = LLMRerank()
# reranker = SentenceTransformerRerank(top_n=10)

from llama_index.query_engine import RetrieverQueryEngine

query_engine = RetrieverQueryEngine(retriever, node_postprocessors=[reranker])

In [34]:
query_engine

<llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine at 0x28c6274f0>

In [35]:
query_engine.query("Gigagen methods")

Response(response='Empty Response', source_nodes=[], metadata=None)

In [154]:
retriever.get_service_context()

# LangChain

## retreival

In [105]:
from langchain.schema import Document 


In [106]:


def get_documents_from_transcription(json_file, chunk_sizes):
    """
    Concatenates words from a given JSON data structure with maximum chunk sizes.

    Parameters:
        json_file (str): The path to the JSON file.
        chunk_sizes (list of int): The list of maximum number of words allowed in each concatenated chunk.

    Returns:
        list: A list of nodes. Each node contains concatenated text and metadata (start_time, end_time).

    """
    def add_document_to_chunks(word_list, start_time, end_time, chunks,chunk_size):
        """Helper function to add a new Document to chunks."""
        text = " ".join(word_list)
        doc = Document(page_content=text, metadata={'start': start_time, 
                                              "end": end_time,
                                             "chunk_size":chunk_size})
        chunks.append(doc)

    concatenated_chunks = []
    current_chunk = []
    current_word_count = 0
    current_start_time = None
    current_end_time = None

    # Load JSON data
    with open(json_file, "r") as f:
        json_data = json.load(f)

    # Initialize service context
    llm = OpenAI(model="gpt-4")
    service_context = ServiceContext.from_defaults(llm=llm)
    
    if not isinstance(chunk_sizes, list):
        chunk_sizes = [chunk_sizes]
    for chunk_size in chunk_sizes:
        for segment in json_data["segments"]:
            for word_info in segment["words"]:
                if current_word_count + 1 > chunk_size:
                    add_document_to_chunks(current_chunk, current_start_time, current_end_time, concatenated_chunks, chunk_size)
                    current_chunk = []
                    current_word_count = 0
                    current_start_time = None

                current_chunk.append(word_info["word"].strip())
                current_word_count += 1

                if current_start_time is None:
                    current_start_time = word_info["start"]
                current_end_time = word_info["end"]

        if current_chunk:
            add_document_to_chunks(current_chunk, current_start_time, current_end_time, concatenated_chunks, chunk_size)

    return concatenated_chunks


# Example usage
chunk_sizes = [128, 256, 512, 1024]
docs = get_documents_from_transcription(json_file="transcriptions/youtube_UyoXmHS-KGc.json", chunk_sizes=chunk_sizes)
len(docs)


13

In [107]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.vectorstores import FAISS

In [108]:
# Build a sample vectorDB
from langchain.vectorstores import Chroma
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load blog post
# loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
# data = loader.load()

# # Split
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
# splits = text_splitter.split_documents(data)

# VectorDB
embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(documents=docs, embedding=embedding)

In [109]:
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever

question = "advice"
llm = ChatOpenAI(temperature=0)
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectordb.as_retriever(), llm=llm
)
retriever_from_llm

MultiQueryRetriever(tags=None, metadata=None, retriever=VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], metadata=None, vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x17f62ac20>, search_type='similarity', search_kwargs={}), llm_chain=LLMChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, prompt=PromptTemplate(input_variables=['question'], output_parser=None, partial_variables={}, template='You are an AI language model assistant. Your task is \n    to generate 3 different versions of the given user \n    question to retrieve relevant documents from a vector  database. \n    By generating multiple perspectives on the user question, \n    your goal is to help the user overcome some of the limitations \n    of distance-based similarity search. Provide these alternative \n    questions separated by newlines. Original question: {question}', template_format='f-string', validate_template=True), llm=ChatOpenAI(cache=None, ve

In [65]:
retriever_from_llm.get_relevant_documents(query="advice")

[Document(page_content="from you fellow scholars telling me that you have been inspired by the series, but don't really know where to start. And here it is. In this forum, you can share your projects, ask for advice, look for collaborators, and more. Make sure to visit www .me -slash -paper -forum and say hi or just click the link in the video description. Our thanks to weights and biases for their long -standing support and for helping us make better videos for you. Thanks for watching and for your generous support, and I'll see you next time.", metadata={'chunk_size': 128, 'end': 293.92, 'start': 258.08}),
 Document(page_content="projects, ask for advice, look for collaborators, and more. Make sure to visit www .me -slash -paper -forum and say hi or just click the link in the video description. Our thanks to weights and biases for their long -standing support and for helping us make better videos for you. Thanks for watching and for your generous support, and I'll see you next time."

# LLM

In [142]:
template_summary = '''
Article: {ARTICLE}
You will generate increasingly concise, entity-dense summaries of the above article.

Repeat the following 2 steps 5 times.

Step 1. Identify 1-3 informative entities (";" delimited) from the article which are missing from the previously generated summary.
Step 2. Write a new, denser summary of identical length which covers every entity and detail from the previous summary plus the missing entities.

A missing entity is:
- relevant to the main story,
- specific yet concise (5 words or fewer),
- novel (not in the previous summary),
- faithful (present in the article),
- anywhere (can be located anywhere in the article).

Guidelines:

- The first summary should be long (4-5 sentences, ~80 words) yet highly non-specific, containing little information beyond the entities marked as missing. Use overly verbose language and fillers (e.g., "this article discusses") to reach ~80 words.
- Make every word count: rewrite the previous summary to improve flow and make space for additional entities.
- Make space with fusion, compression, and removal of uninformative phrases like "the article discusses".
- The summaries should become highly dense and concise yet self-contained, i.e., easily understood without the article.
- Missing entities can appear anywhere in the new summary.
- Never drop entities from the previous summary. If space cannot be made, add fewer new entities.

Remember, use the exact same number of words for each summary.
Answer in JSON. The JSON should be a list (length 5) of dictionaries whose keys are "Missing_Entities" and "Denser_Summary".
'''

temple_transcription =  '''
Transcription: {TRANSCRIPTION}
You will generate increasingly concise, entity-dense summaries of the above webinar or conference transcription.

Repeat the following 2 steps 5 times.

Step 1. Identify 1-3 informative entities (";" delimited) from the webinar or conference transcription which are missing from the previously generated summary. These entities could include key quotes, pivotal moments, technical terms, etc.
Step 2. Write a new, denser summary of identical length that covers every entity and detail from the previous summary, plus the missing entities.

A missing entity is:
- relevant to the main ideas or experience,
- specific yet concise (5 words or fewer),
- novel (not in the previous summary),
- faithful (present in the webinar or conference transcription),
- anywhere (can appear at any point during the transcription).

Guidelines:

- The first summary should be long (4-5 sentences, ~80 words) and may focus on the overarching themes discussed. Use verbose language and fillers (e.g., "the transcription reveals discussions about") to reach ~80 words.
- Make every word count: rewrite the previous summary to improve flow and make space for additional entities.
- Make space with fusion, compression, and the removal of uninformative phrases like "it was mentioned that".
- The summaries should become highly dense and concise yet self-contained, i.e., easily understood without having to read the full transcription.
- Missing entities can appear anywhere in the new summary.
- Never drop entities from the previous summary. If space cannot be made, add fewer new entities.

Remember, use the exact same number of words for each summary.
Answer in JSON. The JSON should be a list (length 5) of dictionaries whose keys are "Missing_Entities" and "Denser_Summary".
'''



In [111]:
# 

In [112]:
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())


In [113]:
from langchain.llms import HuggingFaceHub

In [114]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)

In [158]:
map_prompt_template = """
                      Write a summary of this chunk of text that includes the main points and any important details.
                      {text}
                      """

map_prompt = PromptTemplate(template=map_prompt_template, input_variables=["text"])

combine_prompt_template = """
                      Write a concise summary of the following text delimited by triple backquotes.
                      Return your response in bullet points which covers the key points of the text.
                      ```{text}```
                      BULLET POINT SUMMARY:
                      """

combine_prompt = PromptTemplate(
    template=combine_prompt_template, input_variables=["text"]
)

In [161]:
from langchain.chains.summarize import load_summarize_chain
map_reduce_chain = load_summarize_chain(
    llm,
    chain_type="map_reduce",
    map_prompt=map_prompt,
    combine_prompt=combine_prompt,
    return_intermediate_steps=True,
)

In [186]:
# This is a long document we can split up.
with open('state_of_the_union.txt') as f:
    state_of_the_union = f.read()

In [202]:
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    # length_function = len,
    chunk_size=100, chunk_overlap=50, 
)
split_docs = text_splitter.create_documents([state_of_the_union])
len(split_docs)

184

In [207]:
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 1000,
    chunk_overlap  = 20,
    # length_function = len,
    # is_separator_regex = False,
)
texts = text_splitter.create_documents([data["text"]])
len(texts)

36

In [208]:
map_reduce_outputs = map_reduce_chain({"input_documents": texts})
map_reduce_outputs

{'input_documents': [Document(page_content="It is my pleasure to welcome Dr. Andrew Wu tonight. Andrew is the managing general partner of AI Fund, founder of Deep Learning AI, and lending AI, Chairman and co-founder of Coursera, and an unjunct professor of computer science here at Stanford. Previously he had started and led the Google Brain Team, which had helped Google adopt modern AI, and he was also director of the Stanford AI lab. From 8 million people, one in 1,000 persons on the planet have taken an AI class from him, and through both his education and his AI work, he has changed humor's lives. Please welcome Dr. Andrew Wu. Thank you Lisa, it's good to see everyone. So what I want to do today is chat to you about some opportunities in AI. So I've been saying AI is a new electricity. One of the difficult things to understand about AI is that it is a general purpose technology, meaning that it's not useful only for one thing, but it's useful for lots of different applications, kind

In [209]:
map_reduce_outputs.keys()

dict_keys(['input_documents', 'intermediate_steps', 'output_text'])

In [211]:
print(map_reduce_outputs["output_text"])

- AI has the potential to impact various industries and fields
- Supervised learning and generative AI are currently important tools in AI
- Large-scale supervised learning has been the focus of the last decade in AI
- The use of large neural networks and data has driven AI progress
- AI applications can now be built in a matter of weeks instead of months or years
- The number of large corporations exploring AI applications is increasing
- There is a need to expand the use cases of AI and develop user-friendly tools
- Customization in AI is estimated to be worth billions of dollars
- Opportunities for AI exist in various industries for both startups and incumbent companies
- Collaboration between AI experts and subject matter experts is important
- AI has challenges with bias, fairness, and accuracy, but technology is improving
- Automation affects both lower-wage and higher-wage jobs
- Support is needed for those affected by automation
- Achieving human-level artificial general intell

In [213]:
print(map_reduce_outputs["intermediate_steps"])

['to think about all the things that electricity is used for, you would probably come up with a long list. Similarly, AI can be applied to a wide range of industries and fields. Dr. Andrew Wu, who has a background in AI and has taught AI classes to millions of people, believes that AI has the potential to change many aspects of our lives. He compares AI to electricity, stating that it is a general purpose technology that can be used in various applications. In his talk, he will discuss some of the opportunities that AI presents.', 'classify it as spam or not spam. This tool is widely used in various applications such as image recognition, speech recognition, and recommendation systems. On the other hand, generative AI is a newer development that focuses on creating new content or generating new data based on patterns learned from existing data. This tool has been used in applications like image generation, text generation, and music composition. The speaker emphasizes that these two to