# **Local RAG (Walk-through)**
<center> 
<img src="../readme_images/my_rag_chart.png">
</center>

### **1. Create the Embeddings of the documents:**

In [19]:
# import the necessary modules & setup global variables
import sys
import os
import glob
import fitz # for pdf reading
os.environ["TQDM_DISABLE"] = "1"
from tqdm import tqdm
from tqdm import tqdm
import spacy # For Text preprocessing 
from spacy.lang.en import English
from spacy_cleaner import processing, Cleaner
from spacy_cleaner.processing import removers
from sentence_transformers import util, SentenceTransformer # For Loading LLMs
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import TextIteratorStreamer # For streaming response
import pandas as pd
from time import perf_counter as timer
import re
import torch 
import numpy as np
from threading import Thread
sys.path.append(os.path.abspath(os.path.join('..')))
import utils 
import gradio as gr

# set the path to your data directory
DATA_DIR = '../data/'
# Define split size to turn groups of sentences into chunks
CHUNK_SIZE_IN_SENTENCES = 6
# define the min number of tokens in a chunk (the rest will be filtered)
MIN_TOKEN_LENGTH_PER_CHUNK = 30
# embedding model
EMBEDDING_MODEL = "all-mpnet-base-v2"
# device
DEVICE = "cuda"
# Embedding output path
EMBEDDING_OUTPUT_PATH = "../vector_store/embeddings.csv"

#### **load & preprocess documents:**
We first load the PDF files then do some text cleaning (ex.removing URLs), and then we extract sentences. Finally, we get a list of dictionaries with the processed text and the metadata of each page of our PDFs. 

In [4]:
# Add a sentencizer pipeline & cleaner
NLP = English()
NLP.add_pipe("sentencizer")
model = spacy.load("en_core_web_sm")
cleaner_pipeline = Cleaner(
    model,
    removers.remove_url_token,
    removers.remove_email_token)

def clean_text(text):
    cleaned_text = cleaner_pipeline.clean(text)
    return cleaned_text

def get_sentences(txt):
    sentences = list(NLP(txt).sents)
    sentences = [str(sentence) for sentence in sentences]
    return sentences

def read_files(data_dir):
    # loop over your files
    print(f"Started processing files in directory: {data_dir}   ...")
    t1 = timer()
    extracted_data = []
    for file in glob.glob(os.path.join(data_dir, "*.pdf")):
        # open the doc
        document = fitz.open(file)
        # process
        for page_num, page in enumerate(document):
            # get the raw text of each page
            txt = page.get_text()
            # do some cleaning
            cleaned_text = clean_text([txt])[0]
            sentences = get_sentences(cleaned_text)
            entry = {"file_path": file,
                     "page_number": page_num,
                     "page_char_count": len(cleaned_text),
                     "page_word_count": len(cleaned_text.split(" ")),
                     "page_sentence_count": len(sentences),
                     "page_token_count": len(cleaned_text) / 4,
                     "text": cleaned_text,
                     "sentences": sentences}
            extracted_data.append(entry)
    t2 = timer()
    print(f"processing is finished, time needed: {t2 - t1:.5f} seconds")
    return extracted_data

# load & process documents
extracted_data = read_files(DATA_DIR)
extracted_data[10]

Started processing files in directory: ../data/   ...
processing is finished, time needed: 5.60754 seconds


{'file_path': '../data\\attention is all you need.pdf',
 'page_number': 10,
 'page_char_count': 3410,
 'page_word_count': 624,
 'page_sentence_count': 62,
 'page_token_count': 852.5,
 'text': '[ 5 ] Kyunghyun Cho , Bart van Merrienboer , Caglar Gulcehre , Fethi Bougares , Holger Schwenk , and Yoshua Bengio . Learning phrase representations using rnn encoder - decoder for statistical machine translation . CoRR , abs/1406.1078 , 2014 . [ 6 ] Francois Chollet . Xception : Deep learning with depthwise separable convolutions . arXiv preprint arXiv:1610.02357 , 2016 . [ 7 ] Junyoung Chung , Çaglar Gülçehre , Kyunghyun Cho , and Yoshua Bengio . Empirical evaluation of gated recurrent neural networks on sequence modeling . CoRR , abs/1412.3555 , 2014 . [ 8 ] Chris Dyer , Adhiguna Kuncoro , Miguel Ballesteros , and Noah A. Smith . Recurrent neural network grammars . In Proc . of NAACL , 2016 . [ 9 ] Jonas Gehring , Michael Auli , David Grangier , Denis Yarats , and Yann N. Dauphin . Convolu- ti

In [8]:
# lets veiw some stats
df = pd.DataFrame(extracted_data)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,num_chunks
count,79.0,79.0,79.0,79.0,79.0,79.0
mean,11.29,3382.61,635.19,28.14,845.65,5.09
std,8.75,1042.17,193.59,15.02,260.54,2.5
min,0.0,845.0,176.0,8.0,211.25,2.0
25%,4.5,2651.0,490.0,18.0,662.75,3.0
50%,9.0,3607.0,677.0,24.0,901.75,4.0
75%,16.0,3958.0,759.0,35.0,989.5,6.0
max,33.0,5503.0,961.0,64.0,1375.75,11.0


From the stats, we can see that the average num of chunks per page is 5, and the average token count is 845. we can conclude that each chunk has 845/5 ~ 169 tokens. meaning we need to choose an embedding model that has a context length >= 169. for example all-mpnet-base-v2 model (it has a capacity of 384 tokens)

#### **Chunking The text into a group of sentences**
We are grouping each couple of sentences into one chunk. We are also keeping their metadata like (file path, page number, etc) to be able to return citations when we generate the RAG answer. The need to split pages into smaller chunks is because the embedding model has a limited context length, in our model **all-mpnet-base-v2** is 384 tokens. 

In [5]:
def convert_to_chunk_dict(text_dict):
    extracted_chunks = []
    for item in text_dict:
        for sentence_chunk in item["sentence_chunks"]:
            chunk_dict = {}
            chunk_dict["file_path"] = item["file_path"]
            chunk_dict["page_number"] = item["page_number"]
            # Join the sentences together into a paragraph-like structure
            joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
            # add a space after a period if it's followed by an uppercase letter
            joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
            chunk_dict["sentence_chunk"] = joined_sentence_chunk
            # Get stats about the chunk
            chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
            chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
            chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4  # 1 token = ~4 characters
            extracted_chunks.append(chunk_dict)
    return extracted_chunks

def chunking(list_of_sentences, chunk_size):
    # We group sentences based on the chunk size (estimated in sentences)
    sentence_chunks = [list_of_sentences[i:i + chunk_size] for i in range(0, len(list_of_sentences), chunk_size)]
    return sentence_chunks

def chunk_text(data, chunk_size_in_sentences):
    print("Chunking text ..")
    t1 = timer()
    for entry in data:
        entry["sentence_chunks"] = chunking(entry["sentences"], chunk_size_in_sentences)
        entry["num_chunks"] = len(entry["sentence_chunks"])
    # here we create new dict with chunks as entries & we keep their metadata
    extracted_chunks = convert_to_chunk_dict(data)
    t2 = timer()
    print(f"Chunking is finished, time needed: {t2 - t1:.5f} seconds")
    return extracted_chunks


# chunking into groups of sentences
extracted_chunks = chunk_text(extracted_data, CHUNK_SIZE_IN_SENTENCES)
extracted_chunks[10]

Chunking text ..
Chunking is finished, time needed: 0.00767 seconds


{'file_path': '../data\\attention is all you need.pdf',
 'page_number': 2,
 'sentence_chunk': 'This masking , combined with fact that the output embeddings are offset by one position , ensures that the predictions for position i can depend only on the known outputs at positions less than i. 3.2 Attention An attention function can be described as mapping a query and a set of key - value pairs to an output , where the query , keys , values , and output are all vectors . The output is computed as a weighted sum 3',
 'chunk_char_count': 420,
 'chunk_word_count': 82,
 'chunk_token_count': 105.0}

In [10]:
# lets veiw some stats
df = pd.DataFrame(extracted_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,402.0,402.0,402.0,402.0
mean,11.16,662.79,123.68,165.7
std,7.65,384.15,71.0,96.04
min,0.0,2.0,1.0,0.5
25%,6.0,370.75,68.25,92.69
50%,10.0,611.5,112.5,152.88
75%,14.0,903.5,168.0,225.88
max,33.0,2328.0,393.0,582.0


Then we filter out chunks with token count less that **min_token_length_per_chunk**: 

In [11]:
def filter_chunks(chunks, min_token_length_per_chunk):
    chunks_df = pd.DataFrame(chunks)
    filtered_chunks = chunks_df[chunks_df["chunk_token_count"]
                                > min_token_length_per_chunk].to_dict(orient="records")
    return filtered_chunks


# filter short chunks
extracted_chunks_df_filtered = filter_chunks(extracted_chunks, MIN_TOKEN_LENGTH_PER_CHUNK)


#### **Create the Embeddings** 
Here we load the embedding model and then we embedd each chunk of the docuemnts. the output vectors has dimension of  **768**. Then we save these embeddings to the disk. 

In [15]:
def create_embeddings(chunks_df, embedding_model_name, device, embedding_output_path):
    # load embedding model
    print(f"Loading embedding model \"{embedding_model_name}\" on {'GPU' if device == 'cuda' else device}  ...")
    embedding_model = SentenceTransformer(model_name_or_path=embedding_model_name, device=device)

    # start creating the embeddings
    print("Started creating the embeddings ...")
    t1 = timer()
    for item in tqdm(chunks_df):
        item["embedding"] = embedding_model.encode(item["sentence_chunk"])
    t2 = timer()
    print(f"Creating the embeddings is finished, time needed: {t2 - t1:.5f} seconds")

    # save the embedding on disk
    embeddings_df = pd.DataFrame(chunks_df)
    os.makedirs(os.path.dirname(embedding_output_path), exist_ok=True)
    embeddings_df.to_csv(embedding_output_path, index=False, escapechar="\\")
    print(f"Embeddings have been saved on disk at:  {embedding_output_path}")


# load embedding model & create embedding & save on disk
create_embeddings(extracted_chunks_df_filtered, EMBEDDING_MODEL,
                 DEVICE, EMBEDDING_OUTPUT_PATH)


Loading embedding model "all-mpnet-base-v2" on GPU  ...
Started creating the embeddings ...
Creating the embeddings is finished, time needed: 6.94670 seconds
Embeddings have been saved on disk at:  ../vector_store/embeddings.csv


### **2. Build the RAG Pipeline:**
We have already created a vector store that holds the locally created embeddings, now we need to have Q & A pipeline with augmented-prompts by our data files and **LLM** that is hosted **locally (on my GPU)**.

In [27]:
VECTOR_STORE_PATH = "../vector_store/embeddings.csv"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EMBEDDING_MODEL = "all-mpnet-base-v2"
NUM_OF_RELEVANT_CHUNKS = 5
SIMILARITY_THRESHOLD = 0.3
LLM_MODEL_ID = "google/gemma-2b-it"
TEMPERATURE = 0.1
MAX_NEW_TOKENS = 512

#### **Load the vector store**

In [35]:
def load_vector_store(vector_store_path, device):
    loaded_df = pd.read_csv(vector_store_path)
    # Convert embedding column back to np.array if they were string
    if isinstance(loaded_df["embedding"][0], str):
        loaded_df["embedding"] = loaded_df["embedding"].apply(
            lambda x: np.fromstring(x.strip("[]"), sep=" "))

    # Convert texts and embedding df to list of dicts (data index)
    data_index = loaded_df.to_dict(orient="records")

    # Convert embeddings to torch tensor and send to device
    embeddings = torch.tensor(np.array(loaded_df["embedding"].tolist()), dtype=torch.float32).to(device)
    return embeddings, data_index

# load the vector-store
embeddings, data_index = load_vector_store(VECTOR_STORE_PATH, DEVICE)
# Load the embedding model
embedding_model = SentenceTransformer(model_name_or_path=EMBEDDING_MODEL, device=DEVICE)

# show samples
print(f"vector dim: {len(embeddings[10])}, device: {DEVICE}")
print(embeddings[10])



vector dim: 768, device: cuda
tensor([-5.4377e-05, -7.6402e-02, -1.7080e-02,  2.7743e-02, -4.9856e-02,
         2.4300e-02,  5.5249e-02,  2.0908e-03, -1.6833e-02, -2.3803e-03,
         3.8728e-03, -4.2185e-02,  3.8551e-02,  3.8432e-02,  8.0141e-02,
        -7.0942e-02,  1.7040e-02, -2.2024e-02, -7.9899e-02, -7.3238e-03,
        -2.3346e-02, -5.0946e-02, -1.8225e-02,  4.4284e-02,  1.2815e-02,
        -1.7583e-02,  5.1758e-03, -1.5506e-02,  4.4298e-03, -2.7047e-02,
        -4.8557e-03,  1.8168e-02,  2.2040e-02, -8.7429e-03,  1.8417e-06,
        -1.2308e-03, -7.2533e-03, -5.0766e-04, -1.9028e-02, -3.4150e-02,
        -4.7926e-02, -3.0096e-03,  2.4131e-02,  8.9170e-03, -2.9060e-03,
         4.4912e-02,  4.0294e-02,  2.7502e-02,  1.7774e-02,  3.4721e-02,
        -2.3266e-03, -2.1491e-02, -2.2072e-02, -4.3160e-02,  8.3654e-02,
        -9.8258e-03,  3.9857e-02, -6.5048e-02, -6.2145e-02, -2.5086e-03,
        -1.9408e-02,  2.5230e-02,  1.3058e-02, -1.4173e-02,  9.8689e-02,
         4.8417e-02, 

#### **Load LLM in 4bit Precision**
Here we load **"google/gemma-2b-it"** model which has 2 billion parameters. My GPU is **Nvidia RTX 3060** with **6GB** memory. Loading 2 Billion  parameters model in full precision needs **2b * 4 ~ 8GB** of GPU memory. I need to do quantization to **int-8** or **int-4**

In [36]:
def load_llm(model_id):
    # load in 4bit precision (boost the inference time significantly)
    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)
    llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                     torch_dtype=torch.bfloat16,
                                                     quantization_config=quantization_config,
                                                     low_cpu_mem_usage=False)
    return tokenizer, llm_model

# Load LLM locally "google/gemma-2b-it"
tokenizer, llm_model = load_llm(model_id=LLM_MODEL_ID)
llm_model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
     

from this, we can notice:
- Vocab size is **256k**
- The hidden size is **2048**
- Context length from the model card **8192**

#### **Build the Retriever**
We calculate then **dot product** between the **embedded query** and the **embeddings** in our vector store. The **dot product** is the same as the **cosine similarity** when the vectors are **normalised**, which is the case of the output of our embedding model. 

In [41]:
def rag_retrieve(query, embedding_model, vectore_store, top_k):
    # embedd the query
    embedded_query = embedding_model.encode(query, convert_to_tensor=True)
    # dot product (cosine similarity because vectors are normalized)
    scores = util.dot_score(a=embedded_query, b=vectore_store)[0]
    # get the top k results
    scores, indices = torch.topk(input=scores, k=top_k)
    return scores, indices

def show_retrieval_results(data_dict, query, scores, indices):
    print(f"Query: {query}\n")
    print("Results:\n")
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print file path the page number
        print(f"File path: {data_dict[index]['file_path']}")
        print(f"Page number: {data_dict[index]['page_number']}")
        # Print relevant sentence chunk
        print("Text:")
        print(data_dict[index]["sentence_chunk"])
        print("\n")

# test
query = "VPT video pre-training"
scores, indices = rag_retrieve(query, embedding_model, vectore_store=embeddings, top_k=2)
show_retrieval_results(data_index, query, scores, indices)

Query: VPT video pre-training

Results:

Score: 0.4666
File path: ../data\\video pretraining VPT.pdf
Page number: 3
Text:
Collecting “ Clean ” Data Training the VPT Foundation Model via Behavioral Cloning Training the Inverse Dynamics Model ( IDM ) ~270k hours unlabeled video ~70k hours unlabeled video ~2k hours video labeled with actions Filter for “ clean ” video segments Search for relevant Minecraft videos via keywords Contractors collect data Label videos with IDM ~70k hours video IDM - labeled with actions Train non - causal IDM Train causal VPT Foundation Model a d space w a d space w Figure 2 : Video Pretraining ( VPT ) Method Overview .3 Methods Inverse Dynamics Models ( IDM ) VPT , illustrated in Figure 2 , requires we first collect a small amount of labeled contractor data with which to train an inverse dynamics model pIDM(at|o1 ... T ) , which seeks to minimize the negative log - likelihood of an action at timestep t given a trajectory of T observations ot : t ∈ [ 1 ... T ]

#### **Augmented Generation** 
Here we build a pipeline that starts with a prompt from the user, then it gets merged with another base prompt that is supported with **few-shot prompting** (in context learning) to better responses. Then a **similarity search** will happen between the embedded user query and the embeddings in the vector store to retrieve the **relevant chunks**. These chunks are added to the formatted prompt as a **context**. Then we prompt our **local LLM** to generate an answer. The generation is **streamed** and the relevant chunks are returned. Finally, we make sure to **empty the cache** of the GPU after each generation to not face memory issues (if any). 

In [46]:
def prepare_augmented_prompt(query, relevant_chunks, tokenizer):
    """
    function to better format the prompt:
    - use few-shot prompting (in context learning)
    - add context from relevant chunks (augmentation)
    """

    # join relevant chunks in one context string
    chunks = [chunk["sentence_chunk"] for chunk in relevant_chunks]
    chunks = " -" + "\n -".join(chunks)

    # few-shot prompting
    base_prompt = """Based on the following context items, please answer the query.
     Don't return the thinking, only return the answer.
     Make sure your answers are as explanatory as possible.
     Use the following example as a reference for the ideal answer style.
     \nExample 1:
     Query: What is the role of backpropagation in neural networks?
     Answer: Backpropagation is a key algorithm used for training neural networks by minimizing the error between predicted and actual outputs. It involves a forward pass where the input data is propagated through the network to generate an output, and a backward pass where the error is propagated back through the network to update the weights. This is done using the gradient descent optimization method, which calculates the gradient of the loss function with respect to each weight and adjusts the weights to reduce the error. Backpropagation allows neural networks to learn complex patterns in data by iteratively improving the model's accuracy.
     \nNow use the following context items to answer the user query:
     {context}
     \nRelevant passages: <extract relevant passages from the context here>
     \nUser query: {query}
     Answer:"""

    # Add relevant chunks
    base_prompt = base_prompt.format(context=chunks, query=query)
    # final prompt, suited for instruction-tuned models
    template = [{"role": "user", "content": base_prompt}]
    # add_generation_prompt argument tells the template to add tokens that indicate the start of a bot response
    prompt = tokenizer.apply_chat_template(conversation=template, tokenize=False, add_generation_prompt=True)

    return prompt


def augmented_generation(query, embedding_model, vector_store, data_index,
                         top_k, similarity_threshold, llm_model, tokenizer, temperature, max_new_tokens, device):
    # query your RAG to get relevant text
    scores, indices = rag_retrieve(query=query, embedding_model=embedding_model, vectore_store=vector_store,
                                   top_k=top_k)

    # only keep chunks with scores higher than the similarity threshold
    filtered_indices = [index for score, index in zip(scores, indices) if score > similarity_threshold]
    relevant_chunks = [data_index[i] for i in filtered_indices]

    # prepare the prompt
    prompt = prepare_augmented_prompt(query=query, relevant_chunks=relevant_chunks, tokenizer=tokenizer)

    # prompt the LLM
    input_ids = tokenizer(prompt, return_tensors="pt").to(device)

    # for streaming the response
    response_streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Generate an output of tokens
    generation_kwargs = dict(**input_ids, streamer=response_streamer,
                             temperature=temperature,
                             do_sample=True,
                             max_new_tokens=max_new_tokens)

    # for streaming we run the generation in a different thread
    thread = Thread(target=llm_model.generate, kwargs=generation_kwargs)
    thread.start()

    # join retrieved resources in one text
    retrieved_resources = ""
    for i, source in enumerate(relevant_chunks):
        retrieved_resources += (f"Resource {i + 1}: \n"
                                f"File path: {source['file_path']} \n"
                                f"Page: {source['page_number']} \n\n")
    if not retrieved_resources:
        retrieved_resources = "No resources found for your query!"
    return response_streamer, retrieved_resources


query = "Explain attention in detail"
# Clear GPU cache before generation
torch.cuda.empty_cache()
streamer, retrieved_resources = augmented_generation(query=query, embedding_model=embedding_model,
                                                     vector_store=embeddings, data_index=data_index,
                                                     top_k=NUM_OF_RELEVANT_CHUNKS,
                                                     similarity_threshold=SIMILARITY_THRESHOLD,
                                                     llm_model=llm_model,
                                                     tokenizer=tokenizer,
                                                     temperature=TEMPERATURE, max_new_tokens=MAX_NEW_TOKENS,
                                                     device=DEVICE)

for new_text in streamer:
    print(new_text, end="") 
print("\n\n", retrieved_resources)

Sure, here's an explanation of attention:

Attention is a mechanism in artificial intelligence (AI) that allows a neural network to focus on specific parts of the input data that are most relevant to a particular task. It involves calculating the weighted sum of the product of the query and each key, where the weights are determined by the attention score. The attention score is a measure of how much weight to assign to each key.

In the context, attention is used in the encoder self-attention layer of a neural network. The encoder self-attention layer is a type of self-attention that is used to learn long-range dependencies in the input data.

The output of the attention layer is a weighted sum of the dot products between the query and each key, where the weights are determined by the attention score. The attention scores are calculated by comparing the query to each key and then selecting the key that is most similar to the query.

The attention mechanism allows the model to focus on

#### **Run Gradio Web GUI**

In [47]:
def rag_answer(query):
    # Clear GPU cache before generation
    torch.cuda.empty_cache()
    streamer, retrieved_resources = augmented_generation(query=query, embedding_model=embedding_model,
                                                         vector_store=embeddings, data_index=data_index,
                                                         top_k=NUM_OF_RELEVANT_CHUNKS,
                                                         similarity_threshold=SIMILARITY_THRESHOLD,
                                                         llm_model=llm_model,
                                                         tokenizer=tokenizer,
                                                         temperature=TEMPERATURE, max_new_tokens=MAX_NEW_TOKENS,
                                                         device=DEVICE)
    generated_text = ""
    for new_text in streamer:
        generated_text += new_text
        yield generated_text, retrieved_resources

# Launch the app
theme = gr.themes.Default()
demo = utils.gradio_rag_blocks(title="Chat With Your Data! (Local GPU)",
                                      description="Ask your documents using my local " \
                                                  "Retrieval-Augmented Generation (RAG) pipeline.",
                                      submit_fun=rag_answer,
                                      theme=theme)
free_port = utils.get_free_port()
demo.launch(server_port=free_port)

Running on local URL:  http://127.0.0.1:51031

To create a public link, set `share=True` in `launch()`.


