[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/castillosebastian/genai0/blob/main/related_works/RAG_base/L1-Advanced_RAG_Pipeline.ipynb)


# Lesson 1: Advanced RAG Pipeline

## Librerías para la tarea

El primer punto a estudiar es el modelo fundacional (*foundation model*) a usar. Aquí propondremos una demostración usando Llama 2, pero existen otros modelos atractivos para el dominio de finanzas. Por ejemplo:

1. [FinMA](https://huggingface.co/ChanceFocus/finma-7b-full)
2. [FinGPT](https://huggingface.co/FinGPT)    

In [None]:
# Build llama-cpp-python from source with cuBLAS enabled (GPU build).
# CMAKE_ARGS / FORCE_CMAKE are environment variables consumed by the package's build,
# which is why this stays a shell (`!`) line.
# NOTE(review): the 0.1.78 pin is presumably required because the model file used
# later is a GGML `.bin` checkpoint — confirm before bumping the version.
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.78 --force-reinstall --upgrade --no-cache-dir --verbose

In [None]:
!pip install huggingface_hub

## Setup del ambiente

In [None]:
import utils
import os
import llama_cpp
from torch import cuda

In [23]:
# Backend switch read by the model-loading cell:
# True -> llama.cpp branch, False -> transformers (FinMA) branch.
llama = True

# Whether PyTorch can see a CUDA device on this machine.
use_cuda = cuda.is_available()

In [12]:
from llama_index import SimpleDirectoryReader

# Read the financial-statements PDF from the working directory.
# The captured output of the next cell shows 112 entries in the resulting list.
pdf_files = ["ey-nl-financial-statements-2023-en.pdf"]
reader = SimpleDirectoryReader(input_files=pdf_files)
documents = reader.load_data()

In [13]:
# Sanity-check what the reader returned: container type, number of entries,
# then the type and preview of the first entry.
for summary in (type(documents), len(documents)):
    print(summary, "\n")

first_document = documents[0]
print(type(first_document))
print(first_document)

<class 'list'> 

112 

<class 'llama_index.schema.Document'>
Doc ID: b7e5c2fc-23f8-499e-a2c1-42220d063625
Text: Financial  Statements For the year ended 30 June 2023  Ernst &
Young Nederland LLPConcept


## Basic RAG pipeline

In [14]:
from llama_index import Document

# Collapse every loaded entry into one Document, separating the individual
# texts with a blank line.
merged_text = "\n\n".join(doc.text for doc in documents)
document = Document(text=merged_text)

In [19]:
# Number of characters in the merged document's content.
len(document.get_content())

258211

## Modelo

In [None]:
# Hugging Face Hub repository and the specific quantized weight file to fetch from it.
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGML"
model_basename = "llama-2-13b-chat.ggmlv3.q5_1.bin" # the model is in bin format

In [None]:
from huggingface_hub import hf_hub_download
# Fetch the weight file from the Hub; model_path is the value handed to the loader below.
# The captured progress bar reports a 9.76G download.
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

Downloading (…)chat.ggmlv3.q5_1.bin:   0%|          | 0.00/9.76G [00:00<?, ?B/s]

In [None]:

# Build the `llm` object consumed by ServiceContext.from_defaults(llm=...).
# ServiceContext expects a LlamaIndex LLM, so each backend is loaded through the
# matching LlamaIndex wrapper instead of handing over the raw backend object
# (a bare llama_cpp.Llama / transformers model does not implement that interface).
if llama:
    # llama.cpp backend (layers are offloaded to the GPU when the wheel was built with cuBLAS).
    from llama_index.llms import LlamaCPP

    # Drop any previously loaded model first so re-running the cell does not
    # keep two copies alive at once.
    llm = None
    llm = LlamaCPP(
        model_path=model_path,
        context_window=4096,  # context window, forwarded to llama.cpp as n_ctx
        model_kwargs={
            "n_threads": 2,  # CPU cores
            "n_batch": 512,  # should be between 1 and the context size; bounded by GPU VRAM
            "n_gpu_layers": 43,  # change based on your model and your GPU VRAM pool
        },
    )
else:
    # transformers backend: FinMA, a finance-tuned LLaMA checkpoint.
    from llama_index.llms import HuggingFaceLLM
    from transformers import LlamaTokenizer, LlamaForCausalLM

    tokenizer = LlamaTokenizer.from_pretrained('ChanceFocus/finma-7b-full')
    finma_model = LlamaForCausalLM.from_pretrained('ChanceFocus/finma-7b-full', device_map='auto')
    llm = HuggingFaceLLM(
        model=finma_model,
        tokenizer=tokenizer,
        model_name='ChanceFocus/finma-7b-full',
        tokenizer_name='ChanceFocus/finma-7b-full',
    )

In [36]:
from llama_index import ServiceContext, VectorStoreIndex

# Pair the selected LLM with a locally run BGE-small embedding model,
# then index the merged document with that configuration.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
)
index = VectorStoreIndex.from_documents(
    [document],
    service_context=service_context,
)

In [37]:
# Baseline ("direct") query engine built from the vector index with default settings.
query_engine = index.as_query_engine()

In [38]:
# Smoke-test the baseline engine with a single question.
# NOTE(review): this question is about AI careers while the indexed PDF is a
# financial statement — presumably left over from a template; consider replacing it.
response = query_engine.query("What are steps to take when finding projects to build your experience?")
print(response)

## Evaluation setup using TruLens

In [39]:
# Load the evaluation questions, one question per line of eval_questions.txt.
eval_questions = []
with open('eval_questions.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Strip the trailing newline and surrounding whitespace.
        item = line.strip()
        # Skip blank lines so no empty question is sent to the query engines.
        if not item:
            continue
        print(item)
        eval_questions.append(item)

In [40]:
# You can try your own question:
new_question = "What is the right AI job for me?"
# Guarded so that re-running this cell does not add the same question again.
if new_question not in eval_questions:
    eval_questions.append(new_question)

In [41]:
# Show the final question list (file contents plus the custom question).
print(eval_questions)

In [42]:
from trulens_eval import Tru
# Handle to the TruLens workspace used for all evaluation runs in this notebook.
tru = Tru()

# Start from a clean slate.
# NOTE(review): presumably this erases all previously stored records — confirm before
# running against a database you want to keep.
tru.reset_database()

In [43]:
from utils import get_prebuilt_trulens_recorder

# Wrap the baseline engine in a recorder registered under the app id "Direct Query Engine".
# NOTE(review): which feedback functions (and which provider/API key) the helper
# configures is defined in the local utils module — verify there.
tru_recorder = get_prebuilt_trulens_recorder(query_engine,
                                             app_id="Direct Query Engine")

In [None]:
# Run every evaluation question through the baseline engine inside the recorder context.
# `response` only keeps the answer to the last question.
with tru_recorder as recording:
    for question in eval_questions:
        response = query_engine.query(question)

In [None]:
# Pull the recorded calls and the feedback results back from TruLens.
# NOTE(review): an empty app_ids list presumably means "all apps" — confirm in the TruLens docs.
records, feedback = tru.get_records_and_feedback(app_ids=[])

In [None]:
# Preview the first few recorded rows.
records.head()

In [None]:
# Start the TruLens dashboard to browse the records interactively.
# launches on http://localhost:8501/
tru.run_dashboard()

## Advanced RAG pipeline

### 1. Sentence Window retrieval

In [None]:
from llama_index.llms import OpenAI

# From here on `llm` is rebound to an OpenAI chat model, replacing the locally
# loaded model used by the basic pipeline above.
# NOTE(review): this presumably requires an OpenAI API key in the environment — confirm setup.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

In [None]:
from utils import build_sentence_window_index

# Build the sentence-window index over the merged document using the same
# local embedding model as the baseline; "sentence_index" is passed as save_dir.
sentence_index_options = {
    "embed_model": "local:BAAI/bge-small-en-v1.5",
    "save_dir": "sentence_index",
}
sentence_index = build_sentence_window_index(document, llm, **sentence_index_options)

In [None]:
from utils import get_sentence_window_query_engine

# Query engine on top of the sentence-window index, using the helper's default settings.
sentence_window_engine = get_sentence_window_query_engine(sentence_index)

In [None]:
# Single-question smoke test of the sentence-window engine.
window_response = sentence_window_engine.query("how do I get started on a personal project in AI?")
print(window_response)

In [None]:
# Reset TruLens before evaluating the sentence-window engine.
# NOTE(review): this presumably discards the "Direct Query Engine" records, so the
# leaderboard below will not include the baseline — confirm that is intended.
tru.reset_database()

# Recorder for the sentence-window engine, registered under its own app id.
tru_recorder_sentence_window = get_prebuilt_trulens_recorder(
    sentence_window_engine,
    app_id = "Sentence Window Query Engine"
)

In [None]:
# Evaluate the sentence-window engine: one recorder context per question,
# echoing each question followed by its answer.
for question in eval_questions:
    with tru_recorder_sentence_window as recording:
        response = sentence_window_engine.query(question)
        print(question, str(response), sep="\n")

In [None]:
# Aggregated feedback scores per app.
# NOTE(review): an empty app_ids list presumably selects all apps — confirm in the TruLens docs.
tru.get_leaderboard(app_ids=[])

In [None]:
# Open the TruLens dashboard to inspect the sentence-window run.
# launches on http://localhost:8501/
tru.run_dashboard()

### 2. Auto-merging retrieval

In [None]:
from utils import build_automerging_index

# Build the auto-merging index. Unlike the sentence-window index, this helper is
# given the full `documents` list rather than the single merged document;
# "merging_index" is passed as save_dir.
automerging_index_options = {
    "embed_model": "local:BAAI/bge-small-en-v1.5",
    "save_dir": "merging_index",
}
automerging_index = build_automerging_index(documents, llm, **automerging_index_options)

In [None]:
from utils import get_automerging_query_engine

# Query engine on top of the auto-merging index, using the helper's default settings.
automerging_query_engine = get_automerging_query_engine(
    automerging_index,
)

In [None]:
# Single-question smoke test of the auto-merging engine.
auto_merging_response = automerging_query_engine.query("How do I build a portfolio of AI projects?")
print(auto_merging_response)

In [None]:
# Reset TruLens before evaluating the auto-merging engine.
# NOTE(review): this presumably discards the records of the previous runs — confirm
# that a side-by-side comparison is not needed.
tru.reset_database()

# Recorder for the auto-merging engine, registered under its own app id.
tru_recorder_automerging = get_prebuilt_trulens_recorder(automerging_query_engine,
                                                         app_id="Automerging Query Engine")

In [None]:
# Evaluate the auto-merging engine: one recorder context per question,
# echoing each question followed by its answer.
for question in eval_questions:
    with tru_recorder_automerging as recording:
        response = automerging_query_engine.query(question)
        print(question, response, sep="\n")

In [None]:
# Aggregated feedback scores per app.
# NOTE(review): an empty app_ids list presumably selects all apps — confirm in the TruLens docs.
tru.get_leaderboard(app_ids=[])

In [None]:
# Open the TruLens dashboard to inspect the auto-merging run.
# launches on http://localhost:8501/
tru.run_dashboard()