In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import os

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext, load_index_from_storage, Settings
from llama_index.core.node_parser import TokenTextSplitter, SentenceSplitter, SemanticSplitterNodeParser, HierarchicalNodeParser, SentenceWindowNodeParser, get_leaf_nodes
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor, SentenceTransformerRerank
from llama_index.core.extractors import QuestionsAnsweredExtractor, TitleExtractor
from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.query_engine import TransformQueryEngine, RetrieverQueryEngine
from llama_index.core.tools import QueryEngineTool
from llama_index.core.query_engine import SubQuestionQueryEngine
from langchain_openai import ChatOpenAI

import phoenix as px
from openinference.instrumentation.langchain import LangChainInstrumentor
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from phoenix.trace import using_project

import functions

import nest_asyncio 
# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably required because the notebook kernel owns a loop
# while the helpers used below run async code — confirm.
nest_asyncio.apply()

In [None]:
# Credentials: the key must already be present in the environment. os.environ
# only accepts strings, so a missing key is reported here with a readable
# message instead of surfacing as "str expected, not NoneType".
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before starting the notebook kernel."
    )
os.environ["TOKENIZERS_PARALLELISM"] = 'true'

# Locations of the source documents, the persisted indexes and the test set,
# all relative to the notebook's working directory.
data_path = '../content/data'
storage_path = "../content/storage"
testset_path = '../content/testset/testset.csv'

# OTLP/HTTP endpoint the span exporter posts traces to.
ENDPOINT = 'http://127.0.0.1:6006/v1/traces'

In [None]:
# Read everything under data_path into the document list used by every index below.
reader = SimpleDirectoryReader(data_path)
documents = reader.load_data()

# Global LlamaIndex defaults: generation LLM and embedding model.
Settings.llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo-0125")
Settings.embed_model = OpenAIEmbedding()

# Separate LangChain chat model, passed to the evaluation helper as eval_model.
eval_model = ChatOpenAI(model_name="gpt-4o-mini")

In [None]:
# Obtain the evaluation test set from the project helper.
# NOTE(review): presumably loads the CSV at testset_path when present and
# otherwise generates it from the documents in data_path — confirm in functions.py.
testset = functions.generate_testset(testset_path, data_path)
# Preview the first two rows.
testset.head(2)

In [None]:
# Start the local Phoenix app that collects and displays traces.
session = px.launch_app()

# Tracer provider that forwards each finished span to the Phoenix endpoint.
tracer_provider = TracerProvider()
span_exporter = OTLPSpanExporter(ENDPOINT)
tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter))

# Instrument LlamaIndex first, then LangChain, with the same provider.
for instrumentor_cls in (LlamaIndexInstrumentor, LangChainInstrumentor):
    instrumentor_cls().instrument(tracer_provider=tracer_provider)

# Client handle passed to functions.add_evaluation further down.
client = px.Client()

In [None]:
# Token-based chunking: chunk_size=512, chunk_overlap=128, split on spaces.
with using_project("indexing-token_splitter"):
    token_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=128, separator=" ")
    token_splitter_index = functions.build_index(
        storage_path, 'token_splitter', documents, [token_splitter]
    )
    token_splitter_engine = token_splitter_index.as_query_engine(similarity_top_k=5)

    # The same project name is handed to both helpers.
    token_query_project = 'querying-token_splitter'
    result_token_splitter, eval_token_splitter = functions.evaluation(
        token_query_project, token_splitter_engine, testset, eval_model
    )
    # eval_token_splitter is rebound to the first value add_evaluation returns.
    eval_token_splitter, eval_token_splitter_scores = functions.add_evaluation(
        client, token_query_project, eval_token_splitter
    )

# Sentence-based chunking with chunk_size=512; sentence_splitter is reused by
# later cells as the first step of their transformation lists.
with using_project("indexing-sentence_splitter"):
    sentence_splitter = SentenceSplitter(chunk_size=512)
    sentence_splitter_index = functions.build_index(
        storage_path, 'sentence_splitter', documents, [sentence_splitter]
    )
    sentence_splitter_engine = sentence_splitter_index.as_query_engine(similarity_top_k=5)

    sentence_query_project = 'querying-sentence_splitter'
    result_sentence_splitter, eval_sentence_splitter = functions.evaluation(
        sentence_query_project, sentence_splitter_engine, testset, eval_model
    )
    # eval_sentence_splitter is rebound to the first value add_evaluation returns.
    eval_sentence_splitter, eval_sentence_splitter_scores = functions.add_evaluation(
        client, sentence_query_project, eval_sentence_splitter
    )

# Embedding-driven chunking using the globally configured embedding model.
with using_project("indexing-semantic_splitter"):
    semantic_splitter = SemanticSplitterNodeParser(
        embed_model=Settings.embed_model, buffer_size=1
    )
    semantic_splitter_index = functions.build_index(
        storage_path, 'semantic_splitter', documents, [semantic_splitter]
    )
    semantic_splitter_engine = semantic_splitter_index.as_query_engine(similarity_top_k=5)

    semantic_query_project = 'querying-semantic_splitter'
    result_semantic_splitter, eval_semantic_splitter = functions.evaluation(
        semantic_query_project, semantic_splitter_engine, testset, eval_model
    )
    # eval_semantic_splitter is rebound to the first value add_evaluation returns.
    eval_semantic_splitter, eval_semantic_splitter_scores = functions.add_evaluation(
        client, semantic_query_project, eval_semantic_splitter
    )

In [None]:
# Label/result pairs for the three chunking strategies. `types` and `scores`
# are the running lists that later cells extend with further strategies.
splitter_results = [
    ('Token', eval_token_splitter),
    ('Sentence', eval_sentence_splitter),
    ('Semantic', eval_semantic_splitter),
]
types = [label for label, _result in splitter_results]
scores = [result for _label, result in splitter_results]

evals = functions.create_results(types, scores)
functions.plot_aggregate_evaluation(evals)

In [None]:
# Auto-merging retrieval over a three-level node hierarchy (2048 / 512 / 128).
with using_project("indexing-hierarchical_node_parser"):
    storage_name = 'hierarchical_node_parser'
    persist_dir = f'{storage_path}/{storage_name}'

    if not os.path.exists(persist_dir):
        node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=[2048, 512, 128])
        nodes = node_parser.get_nodes_from_documents(documents)
        leaf_nodes = get_leaf_nodes(nodes)

        # AutoMergingRetriever resolves parent nodes through the docstore, so
        # every hierarchy level is stored there; only the leaves are embedded.
        storage_context = StorageContext.from_defaults()
        storage_context.docstore.add_documents(nodes)
        automerging_index = VectorStoreIndex(leaf_nodes, storage_context=storage_context)
        automerging_index.storage_context.persist(persist_dir=persist_dir)
    else:
        # NOTE: a persist_dir written without the parent nodes in its docstore
        # cannot be merged against; delete it so the branch above rebuilds it.
        automerging_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=persist_dir)
        )

    # Fetch 12 leaf candidates, then let the auto-merging wrapper work on them.
    automerging_retriever = automerging_index.as_retriever(similarity_top_k=12)
    retriever = AutoMergingRetriever(automerging_retriever, automerging_index.storage_context)
    # The engine must be built on the merging wrapper, not on the plain leaf
    # retriever, otherwise no merging ever takes place.
    auto_merging_engine = RetrieverQueryEngine.from_args(retriever)

    result_auto_merging, eval_auto_merging = functions.evaluation(
        'querying-auto_merging', auto_merging_engine, testset, eval_model
    )
    # eval_auto_merging is rebound to the first value add_evaluation returns.
    eval_auto_merging, eval_auto_merging_scores = functions.add_evaluation(
        client, 'querying-auto_merging', eval_auto_merging
    )

In [None]:
# Sentence-window retrieval: the parser stores a window under the "window"
# metadata key, and the post-processor targets that same key at query time.
with using_project("indexing-sentence_window"):
    sentence_window = SentenceWindowNodeParser.from_defaults(
        window_size=3,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    postproc = MetadataReplacementPostProcessor(target_metadata_key="window")

    sentence_window_index = functions.build_index(
        storage_path, 'sentence_window', documents, [sentence_window]
    )
    sentence_window_engine = sentence_window_index.as_query_engine(
        node_postprocessors=[postproc], similarity_top_k=5
    )

    window_query_project = 'querying-sentence_window'
    result_sentence_window, eval_sentence_window = functions.evaluation(
        window_query_project, sentence_window_engine, testset, eval_model
    )
    # eval_sentence_window is rebound to the first value add_evaluation returns.
    eval_sentence_window, eval_sentence_window_scores = functions.add_evaluation(
        client, window_query_project, eval_sentence_window
    )

In [None]:
# Add the two retrieval strategies to the running comparison. Looking the
# label up first keeps the cell idempotent: re-running it refreshes the
# existing entry instead of appending a duplicate label/score pair.
for label, score in (
    ('AutoMerging', eval_auto_merging),
    ('Sentence window', eval_sentence_window),
):
    if label in types:
        scores[types.index(label)] = score
    else:
        types.append(label)
        scores.append(score)

evals = functions.create_results(types, scores)
functions.plot_aggregate_evaluation(evals)

In [None]:
# Per-strategy score objects returned by add_evaluation, in comparison order.
dfs = [
    eval_token_splitter_scores,
    eval_sentence_splitter_scores,
    eval_semantic_splitter_scores,
    eval_auto_merging_scores,
    eval_sentence_window_scores,
]


def _plot_metric(metric):
    """Plot a single metric for every entry of ``dfs`` via the project helper."""
    return functions.plot_individual_evaluation(dfs, metric)


_plot_metric('context_precision')
_plot_metric('faithfulness')
_plot_metric('answer_relevancy')

In [None]:
# Sentence chunks enriched by two extractors: titles (nodes=3) and
# questions-answered (questions=2).
with using_project("extracting-metadata"):
    title_extractor = TitleExtractor(nodes=3)
    qa_extractor = QuestionsAnsweredExtractor(questions=2)

    # Transformations are passed in this order: split, then title, then Q&A.
    metadata_transformations = [sentence_splitter, title_extractor, qa_extractor]
    metadata_index = functions.build_index(
        storage_path, 'metadata_token', documents, metadata_transformations
    )
    metadata_engine = metadata_index.as_query_engine(similarity_top_k=5)

    metadata_query_project = 'querying-metadata'
    result_metadata, eval_metadata = functions.evaluation(
        metadata_query_project, metadata_engine, testset, eval_model
    )
    # eval_metadata is rebound to the first value add_evaluation returns.
    eval_metadata, eval_metadata_scores = functions.add_evaluation(
        client, metadata_query_project, eval_metadata
    )

In [None]:
# Add the metadata-extraction run to the running comparison. The membership
# check keeps the cell idempotent: a re-run refreshes the entry rather than
# appending a second 'Metadata' label/score pair.
if 'Metadata' in types:
    scores[types.index('Metadata')] = eval_metadata
else:
    types.append('Metadata')
    scores.append(eval_metadata)

evals = functions.create_results(types, scores)
functions.plot_aggregate_evaluation(evals)

In [None]:
# HyDE query transform (original query kept via include_original=True)
# wrapped around the token-splitter engine, evaluated on the whole test set.
hyde = HyDEQueryTransform(include_original=True)
hyde_engine = TransformQueryEngine(token_splitter_engine, hyde)

hyde_query_project = 'querying-hyde'
result_hyde, eval_hyde = functions.evaluation(
    hyde_query_project, hyde_engine, testset, eval_model
)
# eval_hyde is rebound to the first value add_evaluation returns.
eval_hyde, eval_hyde_scores = functions.add_evaluation(
    client, hyde_query_project, eval_hyde
)

In [None]:
# HyDE on the 'simple' questions only. Results are bound to *_simple names so
# the full-testset eval_hyde, which the aggregate comparison appends later,
# is not overwritten by this slice.
testset_simple = testset[testset.evolution_type == 'simple']
result_hyde_simple, eval_hyde_simple = functions.evaluation('querying-hyde-simple', hyde_engine, testset_simple, eval_model)
eval_hyde_simple, eval_hyde_simple_scores = functions.add_evaluation(client, 'querying-hyde-simple', eval_hyde_simple)

In [None]:
# HyDE on the 'reasoning' questions only. Results are bound to *_reasoning
# names so the full-testset eval_hyde, which the aggregate comparison appends
# later, is not overwritten by this slice.
testset_reasoning = testset[testset.evolution_type == 'reasoning']
result_hyde_reasoning, eval_hyde_reasoning = functions.evaluation('querying-hyde_reasoning', hyde_engine, testset_reasoning, eval_model)
eval_hyde_reasoning, eval_hyde_reasoning_scores = functions.add_evaluation(client, 'querying-hyde_reasoning', eval_hyde_reasoning)

In [None]:
# HyDE on the 'multi_context' questions only. Results are bound to
# *_multi_context names so the full-testset eval_hyde, which the aggregate
# comparison appends later, is not overwritten by this slice.
testset_multi_context = testset[testset.evolution_type == 'multi_context']
result_hyde_multi_context, eval_hyde_multi_context = functions.evaluation('querying-hyde_multi_context', hyde_engine, testset_multi_context, eval_model)
eval_hyde_multi_context, eval_hyde_multi_context_scores = functions.add_evaluation(client, 'querying-hyde_multi_context', eval_hyde_multi_context)

In [None]:
# Sub-question engine with the token-splitter engine as its single tool,
# evaluated on the whole test set.
tool = QueryEngineTool.from_defaults(query_engine=token_splitter_engine)
subquestion_engine = SubQuestionQueryEngine.from_defaults(query_engine_tools=[tool])

subquestion_query_project = 'querying-subquestion'
result_subquestion, eval_subquestion = functions.evaluation(
    subquestion_query_project, subquestion_engine, testset, eval_model
)
# eval_subquestion is rebound to the first value add_evaluation returns.
eval_subquestion, eval_subquestion_scores = functions.add_evaluation(
    client, subquestion_query_project, eval_subquestion
)

In [None]:
# Sub-question engine on the 'simple' questions only. The engine created for
# the full-testset run is reused rather than rebuilt, and results go into
# *_simple names so the full-testset eval_subquestion, which the aggregate
# comparison appends later, is not overwritten.
testset_simple = testset[testset.evolution_type == 'simple']
result_subquestion_simple, eval_subquestion_simple = functions.evaluation('querying-subquestion-simple', subquestion_engine, testset_simple, eval_model)
eval_subquestion_simple, eval_subquestion_simple_scores = functions.add_evaluation(client, 'querying-subquestion-simple', eval_subquestion_simple)

In [None]:
# Sub-question engine on the 'reasoning' questions only. The engine created
# for the full-testset run is reused rather than rebuilt, and results go into
# *_reasoning names so the full-testset eval_subquestion, which the aggregate
# comparison appends later, is not overwritten.
testset_reasoning = testset[testset.evolution_type == 'reasoning']
result_subquestion_reasoning, eval_subquestion_reasoning = functions.evaluation('querying-subquestion-reasoning', subquestion_engine, testset_reasoning, eval_model)
eval_subquestion_reasoning, eval_subquestion_reasoning_scores = functions.add_evaluation(client, 'querying-subquestion-reasoning', eval_subquestion_reasoning)

In [None]:
# Sub-question engine on the 'multi_context' questions only. The engine
# created for the full-testset run is reused rather than rebuilt, and results
# go into *_multi_context names so the full-testset eval_subquestion, which
# the aggregate comparison appends later, is not overwritten.
testset_multi_context = testset[testset.evolution_type == 'multi_context']
result_subquestion_multi_context, eval_subquestion_multi_context = functions.evaluation('querying-subquestion-multi_context', subquestion_engine, testset_multi_context, eval_model)
eval_subquestion_multi_context, eval_subquestion_multi_context_scores = functions.add_evaluation(client, 'querying-subquestion-multi_context', eval_subquestion_multi_context)

In [None]:
# Add the two query-side strategies to the running comparison. Looking the
# label up first keeps the cell idempotent: re-running it refreshes the
# existing entry instead of appending a duplicate label/score pair.
# NOTE(review): for a like-for-like plot, eval_hyde and eval_subquestion must
# hold the full-testset results at this point, not a per-evolution-type slice.
for label, score in (
    ('HyDE', eval_hyde),
    ('SubQuest', eval_subquestion),
):
    if label in types:
        scores[types.index(label)] = score
    else:
        types.append(label)
        scores.append(score)

evals = functions.create_results(types, scores)
functions.plot_aggregate_evaluation(evals)

In [None]:
# Two cross-encoder rerankers; each keeps the top 2 nodes after reranking.
# (The name `bge_rereanker_base` is misspelt but is referenced by other cells.)
rerank_top_n = 2
bge_rereanker_base = SentenceTransformerRerank(
    top_n=rerank_top_n, model="BAAI/bge-reranker-base"
)
bge_reranker_large = SentenceTransformerRerank(
    top_n=rerank_top_n, model="BAAI/bge-reranker-large"
)

In [None]:
# Base reranker on the whole test set: retrieve 5 nodes, rerank with the base model.
# NOTE(review): the storage name is 'token_splitter' while the transformations
# are sentence splitter + extractors; if build_index loads an existing store
# by name, these transformations are presumably ignored — confirm intent.
rerank_transformations = [sentence_splitter, title_extractor, qa_extractor]
base_index = functions.build_index(
    storage_path, 'token_splitter', documents, rerank_transformations
)
base_engine = base_index.as_query_engine(
    node_postprocessors=[bge_rereanker_base], similarity_top_k=5
)

base_query_project = 'querying-base'
result_base, eval_base = functions.evaluation(
    base_query_project, base_engine, testset, eval_model
)
# eval_base is rebound to the first value add_evaluation returns.
eval_base, eval_base_scores = functions.add_evaluation(
    client, base_query_project, eval_base
)

In [None]:
# Base reranker on the 'simple' questions only. The engine built for the
# full-testset run is reused instead of rebuilding an identical index and
# engine, and results go into *_simple names so the full-testset eval_base,
# which the aggregate comparison appends later, is not overwritten.
testset_simple = testset[testset.evolution_type == 'simple']
result_base_simple, eval_base_simple = functions.evaluation('querying-base-simple', base_engine, testset_simple, eval_model)
eval_base_simple, eval_base_simple_scores = functions.add_evaluation(client, 'querying-base-simple', eval_base_simple)

In [None]:
# Base reranker on the 'reasoning' questions only. The engine built for the
# full-testset run is reused instead of rebuilding an identical index and
# engine, and results go into *_reasoning names so the full-testset eval_base,
# which the aggregate comparison appends later, is not overwritten.
testset_reasoning = testset[testset.evolution_type == 'reasoning']
result_base_reasoning, eval_base_reasoning = functions.evaluation('querying-base-reasoning', base_engine, testset_reasoning, eval_model)
eval_base_reasoning, eval_base_reasoning_scores = functions.add_evaluation(client, 'querying-base-reasoning', eval_base_reasoning)

In [None]:
# Base reranker on the 'multi_context' questions only. The engine built for
# the full-testset run is reused instead of rebuilding an identical index and
# engine, and results go into *_multi_context names so the full-testset
# eval_base, which the aggregate comparison appends later, is not overwritten.
testset_multi_context = testset[testset.evolution_type == 'multi_context']
result_base_multi_context, eval_base_multi_context = functions.evaluation('querying-base-multi_context', base_engine, testset_multi_context, eval_model)
eval_base_multi_context, eval_base_multi_context_scores = functions.add_evaluation(client, 'querying-base-multi_context', eval_base_multi_context)

In [None]:
# Large reranker on the whole test set: retrieve 5 nodes, rerank with the large model.
# NOTE(review): the storage name is 'token_splitter' while the transformations
# are sentence splitter + extractors; if build_index loads an existing store
# by name, these transformations are presumably ignored — confirm intent.
large_transformations = [sentence_splitter, title_extractor, qa_extractor]
large_index = functions.build_index(
    storage_path, 'token_splitter', documents, large_transformations
)
large_engine = large_index.as_query_engine(
    node_postprocessors=[bge_reranker_large], similarity_top_k=5
)

large_query_project = 'querying-large'
result_large, eval_large = functions.evaluation(
    large_query_project, large_engine, testset, eval_model
)
# eval_large is rebound to the first value add_evaluation returns.
eval_large, eval_large_scores = functions.add_evaluation(
    client, large_query_project, eval_large
)

In [None]:
# Large reranker on the 'simple' questions only. The engine built for the
# full-testset run is reused instead of rebuilding an identical index and
# engine, and results go into *_simple names so the full-testset eval_large,
# which the aggregate comparison appends later, is not overwritten.
testset_simple = testset[testset.evolution_type == 'simple']
result_large_simple, eval_large_simple = functions.evaluation('querying-large-simple', large_engine, testset_simple, eval_model)
eval_large_simple, eval_large_simple_scores = functions.add_evaluation(client, 'querying-large-simple', eval_large_simple)

In [None]:
# Large reranker on the 'reasoning' questions only. The engine built for the
# full-testset run is reused instead of rebuilding an identical index and
# engine, and results go into *_reasoning names so the full-testset
# eval_large, which the aggregate comparison appends later, is not overwritten.
testset_reasoning = testset[testset.evolution_type == 'reasoning']
result_large_reasoning, eval_large_reasoning = functions.evaluation('querying-large-reasoning', large_engine, testset_reasoning, eval_model)
eval_large_reasoning, eval_large_reasoning_scores = functions.add_evaluation(client, 'querying-large-reasoning', eval_large_reasoning)

In [None]:
# Large reranker on the 'multi_context' questions only. The engine built for
# the full-testset run is reused instead of rebuilding an identical index and
# engine, and results go into *_multi_context names so the full-testset
# eval_large, which the aggregate comparison appends later, is not overwritten.
testset_multi_context = testset[testset.evolution_type == 'multi_context']
result_large_multi_context, eval_large_multi_context = functions.evaluation('querying-large-multi_context', large_engine, testset_multi_context, eval_model)
eval_large_multi_context, eval_large_multi_context_scores = functions.add_evaluation(client, 'querying-large-multi_context', eval_large_multi_context)

In [None]:
# Add the two reranker runs to the running comparison, with the plot labels
# spelled 'Reranker'. Looking the label up first keeps the cell idempotent:
# re-running it refreshes the existing entry instead of appending a duplicate.
# NOTE(review): for a like-for-like plot, eval_base and eval_large must hold
# the full-testset results at this point, not a per-evolution-type slice.
for label, score in (
    ('Reranker base', eval_base),
    ('Reranker large', eval_large),
):
    if label in types:
        scores[types.index(label)] = score
    else:
        types.append(label)
        scores.append(score)

evals = functions.create_results(types, scores)
functions.plot_aggregate_evaluation(evals)