Unlocking the Power of RAG Applications! 🚀

Dive into the RAG framework, where identifying the perfect documents is key!
Query Magic: ✨

Your query embarks on a journey through retrievers, armed with cool similarity and ranking metrics like cosine, MRR, and Jaccard.
The result? A stellar lineup of the top N documents ready for action!
Elevate the RAG Pipeline: 🌟

Supercharge your strategy by blending different indexes – keyword, vector, graph – creating a powerhouse of results.
Each database gets its moment in the spotlight, retrieving the dazzling top 10 documents.
Rerank for Brilliance: 🔄

Unleash the power of reranking! By using multiple indexes, we fine-tune our results to select the crème de la crème, the top 10 (or let's make it 7! 🎉).
LlamaIndex Magic Touch: 🌈

LlamaIndex swoops in with fantastic strategies, like the node postprocessor for epic reranking.
Hold onto your hat as we demonstrate reranking using the one and only GPT-3 – a paper-reranker extraordinaire! It's like magic for finding the most relevant documents.
Answer Synthesis Extravaganza: 🎊

GPT-3 joins the party, bringing an exciting and friendly tone to answer synthesis.
The result? An answer synthesis with the perfect blend of friendliness and flair! 🌟


In [2]:
# Apply nest_asyncio once, up front, before any llama_index code runs.
# NOTE(review): presumably needed so nested event loops work inside the
# notebook kernel — confirm against the nest_asyncio documentation.
import nest_asyncio

nest_asyncio.apply()

In [3]:
# Notebook-wide setup: event-loop patching, llama_index global handler,
# OpenAI credentials and logging. Statement order matters because every line
# below is a module-level side effect.

# Repeats the nest_asyncio setup performed in the previous cell.
import nest_asyncio
nest_asyncio.apply()

import llama_index
# Select llama_index's "simple" global handler.
llama_index.set_global_handler("simple")

import os

# NOTE(review): "sk-..." is a placeholder, and this assignment overwrites any
# OPENAI_API_KEY already present in the environment. Replace it with a real
# key (or remove the line) before running.
os.environ["OPENAI_API_KEY"] = "sk-..."

import logging
import sys

# Send log records at INFO level and above to stdout.
logging.basicConfig(
    stream=sys.stdout, level=logging.INFO
)  # logging.DEBUG for more verbose output

from llama_index import (
    KnowledgeGraphIndex,
    ServiceContext,
    SimpleDirectoryReader,
    SimpleKeywordTableIndex
)
from llama_index.storage.storage_context import StorageContext
from llama_index.graph_stores import NebulaGraphStore
from llama_index.llms import OpenAI

from IPython.display import Markdown, display
from llama_index.llms.palm import PaLM
from llama_index.embeddings import GooglePaLMEmbedding


from llama_index.callbacks import (
    CallbackManager,
    LlamaDebugHandler
)


from llama_index.retrievers import (
    KeywordTableSimpleRetriever
)

from llama_index import Document, SummaryIndex
from llama_index.query_engine import PandasQueryEngine, RetrieverQueryEngine
from llama_index.retrievers import RecursiveRetriever
from llama_index.schema import IndexNode
from llama_hub.file.pymu_pdf.base import PyMuPDFReader
from pathlib import Path
from typing import List
from llama_index.readers import WikipediaReader

from llama_index import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    SQLDatabase,
)

from llama_index.node_parser import SentenceSplitter
from llama_index.schema import IndexNode
from llama_index.response.notebook_utils import display_source_node


from llama_index.node_parser import SentenceSplitter
from llama_index.schema import IndexNode
from llama_index.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
)


In [4]:
# Debug handler created with print_trace_on_end=True, wrapped in the callback
# manager that is later passed to ServiceContext.from_defaults.
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])

In [5]:
import os

# Read the PaLM credential from the environment instead of embedding it in the
# notebook: a key committed to source is readable by anyone with access to the
# file.
# NOTE: the key that used to be hard-coded here should be treated as
# compromised and revoked.
palm_api_key = os.environ.get("PALM_API_KEY")
if not palm_api_key:
    # Fail fast with an actionable message rather than an opaque auth error
    # from the first PaLM call.
    raise RuntimeError(
        "PALM_API_KEY is not set; export it before running this cell."
    )

# PaLM LLM used by the service context.
llm = PaLM(api_key=palm_api_key)

# PaLM embedding model used by the service context; shares the same key.
model_name = "models/embedding-gecko-001"
embed_model = GooglePaLMEmbedding(model_name=model_name, api_key=palm_api_key)

# Bundle LLM, embedding model, chunk size and the debug callback manager so
# later cells can pass them around as a single object.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    chunk_size=512,
    callback_manager=callback_manager,
)

In [7]:
from llama_index.postprocessor import LLMRerank
from llama_index.llms import OpenAI
from IPython.display import Markdown, display


In [None]:
import os

# NOTE(review): "sk-" is a placeholder, not a usable key. The value is also
# written to the OPENAI_API_KEY environment variable, overwriting whatever was
# set there before. get_retrieved_nodes reads OPENAI_API_TOKEN when reranking.
OPENAI_API_TOKEN = "sk-"
os.environ["OPENAI_API_KEY"] = OPENAI_API_TOKEN

In [9]:
from pathlib import Path
import requests

# Titles of the Wikipedia articles to download as the demo corpus.
wiki_titles = [
    "Vincent van Gogh",
]

# Directory that receives one plain-text file per article.
data_path = Path("data_wiki")
# Create the directory once, up front; exist_ok makes re-running the cell safe
# and avoids the check-then-create race of a separate exists() test.
data_path.mkdir(exist_ok=True)

for title in wiki_titles:
    # Ask the MediaWiki API for the plain-text extract of the page.
    http_response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            "explaintext": True,
        },
        # Without a timeout a stalled connection would hang the cell forever.
        timeout=30,
    )
    # Surface HTTP errors here instead of failing later while decoding an
    # error page as JSON.
    http_response.raise_for_status()
    response = http_response.json()

    # The "pages" mapping is keyed by page id; only one title was requested,
    # so take its first (only) value.
    page = next(iter(response["query"]["pages"].values()))
    if "extract" not in page:
        # The API returned no text for this title; report which one instead
        # of raising a bare KeyError.
        raise ValueError(f"No text extract returned for Wikipedia page {title!r}")
    wiki_text = page["extract"]

    with open(data_path / f"{title}.txt", "w", encoding="utf-8") as fp:
        fp.write(wiki_text)

In [10]:
# Read the contents of ./data_wiki/ (the directory the download cell writes
# to) into llama_index documents.
documents = SimpleDirectoryReader("./data_wiki/").load_data()


In [11]:
# Build the vector index over the loaded documents, using the service context
# configured earlier (PaLM LLM, PaLM embedding model, chunk_size=512, debug
# callback manager). get_retrieved_nodes reads this module-level `index`.
index = VectorStoreIndex.from_documents(
    documents, service_context=service_context
)

**********
Trace: index_construction
    |_node_parsing ->  0.13596 seconds
      |_chunking ->  0.111031 seconds
    |_embedding ->  3.256311 seconds
    |_embedding ->  1.059069 seconds
    |_embedding ->  1.058989 seconds
    |_embedding ->  1.238246 seconds
    |_embedding ->  0.665584 seconds
**********


In [None]:
from llama_index.retrievers import VectorIndexRetriever
from llama_index.schema import QueryBundle
import pandas as pd
from IPython.display import display, HTML

# from llama_index.postprocessor import RankGPTRerank


def get_retrieved_nodes(
    query_str, vector_top_k=10, reranker_top_n=3, with_reranker=False
):
    """Retrieve nodes for a query from the module-level ``index``.

    Args:
        query_str: The query text; wrapped in a ``QueryBundle``.
        vector_top_k: Passed to the retriever as ``similarity_top_k``.
        reranker_top_n: Passed to the reranker as ``top_n``; only used when
            ``with_reranker`` is true.
        with_reranker: When true, the retrieved nodes are post-processed by
            ``RankGPTRerank`` backed by an OpenAI ``gpt-3.5-turbo-16k`` model.

    Returns:
        The nodes returned by the retriever, or by the reranker when
        ``with_reranker`` is true.
    """
    query_bundle = QueryBundle(query_str)
    # configure retriever
    retriever = VectorIndexRetriever(
        index=index,
        similarity_top_k=vector_top_k,
    )
    retrieved_nodes = retriever.retrieve(query_bundle)

    if not with_reranker:
        return retrieved_nodes

    # Imported here because the file-level import of RankGPTRerank is
    # commented out; without this the reranking path raises NameError.
    # Keeping it local also leaves the plain retrieval path usable on
    # installations that lack the reranker.
    from llama_index.postprocessor import RankGPTRerank

    # configure reranker
    reranker = RankGPTRerank(
        llm=OpenAI(
            model="gpt-3.5-turbo-16k",
            temperature=0.0,
            api_key=OPENAI_API_TOKEN,
        ),
        top_n=reranker_top_n,
        verbose=True,
    )
    return reranker.postprocess_nodes(retrieved_nodes, query_bundle)


def pretty_print(df):
    """Display ``df`` as an HTML table.

    Every literal backslash-n sequence in the generated markup is replaced
    with ``<br>`` before it is handed to ``display``. Returns whatever
    ``display`` returns.
    """
    markup = df.to_html()
    markup = markup.replace("\\n", "<br>")
    return display(HTML(markup))


def visualize_retrieved_nodes(nodes) -> None:
    """Show the score and text of each retrieved node as a table.

    Builds one row per entry in ``nodes`` with the columns ``Score`` (from
    ``.score``) and ``Text`` (from ``.node.get_text()``), then renders the
    resulting DataFrame with ``pretty_print``.
    """
    rows = [
        {"Score": item.score, "Text": item.node.get_text()}
        for item in nodes
    ]
    pretty_print(pd.DataFrame(rows))