<a href="https://colab.research.google.com/github/akashmathur-2212/LLMs-playground/blob/main/LlamaIndex-applications/Advanced-RAG/document_tracking_debugging/document_tracking_debugging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

1. This notebook demonstrates efficient document tracing and debugging across multiple text files. The files are stored in Google Drive.
2. We will use Open Source LLM [`zephyr-7b-alpha`](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha) and embedding [`BAAI/bge-large-en-v1.5`](https://huggingface.co/BAAI/bge-large-en-v1.5)

In [1]:
!pip install -qqq llama-index llama-hub langchain accelerate==0.21.0 bitsandbytes==0.40.2 transformers sentence-transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.7/806.7 kB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.1/225.1 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━

In [2]:
# nest_asyncio is applied before any llama_index import.
# NOTE(review): presumably needed so async code can run inside the notebook's
# already-running event loop — confirm against the nest_asyncio docs.
import nest_asyncio
nest_asyncio.apply()

import logging
import sys

# Route log records to stdout at DEBUG level so they show up in the cell output.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# Attach an explicit stdout handler to the root logger as well.
# NOTE(review): if basicConfig above already installed its own stdout handler,
# every record will be emitted twice — confirm whether both calls are wanted.
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

import json
import torch
from pathlib import Path
import pandas as pd
# Never truncate cell contents when displaying DataFrames. `None` is the
# supported way to say "no limit"; the legacy sentinel -1 has been deprecated
# since pandas 1.0 (it triggers a FutureWarning) and is rejected by later releases.
pd.set_option("display.max_colwidth", None)

from copy import deepcopy

# transformers
from transformers import BitsAndBytesConfig

# llama_index
from llama_index.prompts import PromptTemplate
from llama_index.llms import HuggingFaceLLM
from llama_index import SimpleDirectoryReader, download_loader, Document, VectorStoreIndex, ServiceContext
from llama_index.node_parser import SentenceSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index import StorageContext, load_index_from_storage
from llama_index.callbacks import CallbackManager, LlamaDebugHandler

from llama_index.response.notebook_utils import display_source_node
from IPython.display import Markdown, display, HTML
from llama_index.retrievers import VectorIndexRetriever

# Device string: the first CUDA GPU when torch can see one, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"

  pd.set_option("display.max_colwidth", -1)


# Mount Google Drive to Access Data

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Models

## LLM (`zephyr-7b-alpha`)

In [4]:
from google.colab import userdata

# huggingface api token
# NOTE(review): `hf_token` is not referenced again in the visible notebook, and
# this cell's output reports that the `HF_TOKEN` secret (upper-case) does not
# exist — confirm which secret name is intended and where the token should be used.
hf_token = userdata.get('hf_token')

# 4-bit quantization settings (nf4 quant type, float16 compute dtype, double
# quantization enabled); handed to the model loader below through `model_kwargs`.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)


def messages_to_prompt(messages):
  """Flatten a sequence of chat messages into one Zephyr-formatted prompt string.

  Each message must expose `.role` and `.content`. Messages whose role equals
  'system', 'user' or 'assistant' are rendered as
  "<|role|>\\n{content}</s>\\n" in their original order; messages with any other
  role are skipped. If the rendered text does not begin with a system turn, an
  empty one is prepended. The result always ends with an open "<|assistant|>\\n"
  turn for the model to complete.

  Args:
    messages: iterable of message objects with `role` and `content` attributes.

  Returns:
    The assembled prompt as a single string.
  """
  # (role value, opening tag) pairs, checked in this order with `==` so that
  # string-like role objects compare the same way a plain string would.
  role_tags = (
      ('system', "<|system|>"),
      ('user', "<|user|>"),
      ('assistant', "<|assistant|>"),
  )

  turns = []
  for msg in messages:
    for role_name, tag in role_tags:
      if msg.role == role_name:
        turns.append(f"{tag}\n{msg.content}</s>\n")
        break

  rendered = "".join(turns)

  # guarantee a leading system turn, inserting a blank one when absent
  if not rendered.startswith("<|system|>\n"):
    rendered = "<|system|>\n</s>\n" + rendered

  # leave the assistant turn open for generation
  return rendered + "<|assistant|>\n"


# Build the LLM wrapper around `HuggingFaceH4/zephyr-7b-alpha` (model and tokenizer
# share the same hub ID). Chat messages are rendered by `messages_to_prompt`, and
# plain queries are wrapped in the same Zephyr turn markers with an empty system turn.
llm = HuggingFaceLLM(
    model_name="HuggingFaceH4/zephyr-7b-alpha",
    tokenizer_name="HuggingFaceH4/zephyr-7b-alpha",
    query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
    context_window=3900,
    max_new_tokens=256,
    # load the weights with the 4-bit quantization settings defined above
    model_kwargs={"quantization_config": quantization_config},
    # tokenizer_kwargs={},
    # sampling is enabled, so answers can differ between runs of the same query
    generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95, "do_sample":True},
    messages_to_prompt=messages_to_prompt,
    device_map="auto",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/628 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

## Embedding (`BAAI/bge-large-en-v1.5`)

In [5]:
embed_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/92.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [14]:
# LlamaDebugHandler records callback events; with print_trace_on_end=True it prints
# a timing trace after each traced operation (the "Trace: index_construction",
# "Trace: query" and "Trace: refresh" blocks in the outputs below).
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])

# ServiceContext: bundles the local LLM, the embedding model and the callback
# manager so that indexes built with it use these instead of library defaults.
service_context = ServiceContext.from_defaults(llm=llm,
                                               embed_model=embed_model,
                                               callback_manager=callback_manager
                                               )

# Load Data

In [16]:
!ls '/content/gdrive/MyDrive/document_management/'

evaluation_metrics.txt	imbalanced_data.txt  neural_network.txt  statistics.txt


In [17]:
files_path = '/content/gdrive/MyDrive/document_management/'
index_persist_dir = './storage/document_management/'

# Always read the source files: later cells compare `len(documents)` with the
# index contents, so `documents` must exist even when the index comes from disk.
documents = SimpleDirectoryReader(files_path, filename_as_id=True).load_data()

# Reuse a previously persisted index only when its docstore file is present.
# An explicit existence check replaces the former bare `except:`, which also hid
# unrelated failures (bad paths, interrupted runs) behind a silent rebuild; a
# damaged store now raises instead of being silently overwritten.
if (Path(index_persist_dir) / 'docstore.json').exists():
    storage_context = StorageContext.from_defaults(persist_dir=index_persist_dir)
    # Pass the service context so the restored index uses the local LLM and
    # embedding model configured above, matching the freshly built index.
    index = load_index_from_storage(storage_context, service_context=service_context)
    print('loading from disk')
else:
    index = VectorStoreIndex.from_documents(documents, service_context=service_context)
    index.storage_context.persist(persist_dir=index_persist_dir)
    print('persisting to disk')

**********
Trace: index_construction
    |_node_parsing ->  0.092604 seconds
      |_chunking ->  0.032556 seconds
      |_chunking ->  0.003054 seconds
      |_chunking ->  0.035871 seconds
      |_chunking ->  0.015034 seconds
    |_embedding ->  2.645123 seconds
    |_embedding ->  0.132147 seconds
**********
persisting to disk


Let's do a sanity check to verify which documents the index has actually ingested.

Each index has a `ref_doc_info` property which is kind of a mapping to the original doc IDs that we input.

In [18]:
print('ref_docs ingested: ', len(index.ref_doc_info))
print('number of input documents: ', len(documents))

ref_docs ingested:  4
number of input documents:  4


So, all of our documents have been properly inserted into the index.

Let's load the index which we persisted/stored. You can also store it to any of the Vector Database and load it from there.

Then, double-check that the index reloaded from storage (or from a vector database) still holds the original number of documents.

In [19]:
index = load_index_from_storage(StorageContext.from_defaults(persist_dir=index_persist_dir),
                                service_context=service_context)

print('ref_docs ingested: ', len(index.ref_doc_info))
print('number of input documents: ', len(documents))

**********
Trace: index_construction
**********
ref_docs ingested:  4
number of input documents:  4


Let's ask a question related to Andrew Huberman's sleep podcast, for which there is NO information present in the vector store yet.

In [20]:
query_input = "How music can help in sleep?"
response = index.as_query_engine().query(query_input)



**********
Trace: query
    |_query ->  13.783097 seconds
      |_retrieve ->  0.045693 seconds
        |_embedding ->  0.040198 seconds
      |_synthesize ->  13.737183 seconds
        |_templating ->  3.7e-05 seconds
        |_llm ->  13.722639 seconds
**********


In [21]:
from llama_index.response.pprint_utils import pprint_response
pprint_response(response, show_source=True)

Final Response: According to studies, listening to slow, calming music
can help in sleep by reducing anxiety and promoting relaxation. This
can lower heart rate and decrease muscle tension, making it easier to
fall asleep and stay asleep. Some people prefer classical music or
nature sounds, while others prefer instrumental or ambient music
without lyrics. The music should be chosen based on personal
preference and can be played softly through headphones or speakers.
However, it's essential to note that individual preferences and
sensitivities may vary, and listening to music that might cause
anxiety or discomfort should be avoided.
______________________________________________________________________
Source Node 1/2
Node ID: ac070d99-9d27-4235-aa11-576aff922145
Similarity: 0.36091599439297894
Text: Best thing about Relu bcoz derivate is zero when z is negative,
and for z>0 the activation function gives very different value, and
faster to learn for NN coz the slope is not zero like in 

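Clearly a case of hallucination: look at the low similarity score, and note that the retrieved source node is about neural-network activation functions, not sleep or music. Let's debug further.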

In [22]:
# Print info on llm inputs/outputs - returns start/end events for each LLM call
event_pairs = llama_debug.get_llm_inputs_outputs()

# print(event_pairs[0][0]) # Shows what was sent to LLM
# print(event_pairs[0][1].payload.keys()) # What other things you can debug by simply passing the key
# print(event_pairs[0][1].payload["completion"]) # Shows the LLM response it generated.

In [23]:
print(event_pairs[0][0]) # Shows what was sent to LLM

CBEvent(event_type=<CBEventType.LLM: 'llm'>, payload={<EventPayload.PROMPT: 'formatted_prompt'>: "<|system|>\n</s>\n<|user|>\nContext information is below.\n---------------------\nfile_path: /content/gdrive/MyDrive/document_management/neural_network.txt\n\nBest thing about Relu bcoz derivate is zero when z is negative, and for z>0 the activation function gives very different value, and faster to learn for NN coz the slope is not zero like in sigmoid/tanh\nif you let the function g of z be equal to tanh(z) (hyperbolic tangent function), this almost always works better than the sigmoid function because the values between plus 1 and minus 1, the mean of the activations that come out of your head, and they are closer to having a 0 mean. And so just as sometimes when you train a learning algorithm, you might center the data and have your data have 0 mean using a tanh instead of a sigmoid function. It kind of has the effect of centering your data so that the mean of your data is closer to 0 

In [24]:
print(event_pairs[0][1].payload.keys()) # What other things you can debug by simply passing the key

dict_keys([<EventPayload.PROMPT: 'formatted_prompt'>, <EventPayload.COMPLETION: 'completion'>])


In [25]:
# Look at the Payload prompt sent to LLM
# It has no mention of Sleep/Music etc.
event_pairs[0][0].payload['formatted_prompt']

"<|system|>\n</s>\n<|user|>\nContext information is below.\n---------------------\nfile_path: /content/gdrive/MyDrive/document_management/neural_network.txt\n\nBest thing about Relu bcoz derivate is zero when z is negative, and for z>0 the activation function gives very different value, and faster to learn for NN coz the slope is not zero like in sigmoid/tanh\nif you let the function g of z be equal to tanh(z) (hyperbolic tangent function), this almost always works better than the sigmoid function because the values between plus 1 and minus 1, the mean of the activations that come out of your head, and they are closer to having a 0 mean. And so just as sometimes when you train a learning algorithm, you might center the data and have your data have 0 mean using a tanh instead of a sigmoid function. It kind of has the effect of centering your data so that the mean of your data is closer to 0 rather than, maybe 0.5. And this actually makes learning for the next layer a little bit easier. 

In [27]:
# clearly a case of hallucination: the completion is not drawn from the prompt context
print(event_pairs[0][1].payload["completion"]) # Shows the LLM response it generated.

According to studies, listening to slow, calming music can help in sleep by reducing anxiety and promoting relaxation. This can lower heart rate and decrease muscle tension, making it easier to fall asleep and stay asleep. Some people prefer classical music or nature sounds, while others prefer instrumental or ambient music without lyrics. The music should be chosen based on personal preference and can be played softly through headphones or speakers. However, it's essential to note that individual preferences and sensitivities may vary, and listening to music that might cause anxiety or discomfort should be avoided.


**Bonus Tip** 💡

The Llama Debug Handler holds more than just the LLM's inputs and outputs; in fact, there's a lot more you can do with it. For example, you can debug the following event types the same way we did for the LLM events:

- CBEventType.LLM
- CBEventType.EMBEDDING
- CBEventType.CHUNKING
- CBEventType.NODE_PARSING
- CBEventType.RETRIEVE
- CBEventType.SYNTHESIZE
- CBEventType.TREE
- CBEventType.QUERY

Refer [LlamaIndex](https://docs.llamaindex.ai/en/latest/index.html) for more information

# 1. Insert a New Document

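Now, let's insert a new document: the Andrew Huberman sleep podcast file (`andrew_sleep.txt`). Once it has been added to the Drive folder, reload the directory so the new file is picked up.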

In [28]:
documents = SimpleDirectoryReader(files_path, filename_as_id=True).load_data()
print(f"Loaded {len(documents)} docs")

Loaded 5 docs


In [29]:
# Run refresh_ref_docs method to check for document updates
refreshed_docs = index.refresh_ref_docs(documents)

print(refreshed_docs, "\n")
print('Number of newly inserted/refreshed docs: ', sum(refreshed_docs))

**********
Trace: refresh
    |_node_parsing ->  0.029424 seconds
      |_chunking ->  0.02686 seconds
    |_embedding ->  0.867388 seconds
**********
[True, False, False, False, False] 

Number of newly inserted/refreshed docs:  1


In [30]:
query_input = "How music can help in sleep?"
response = index.as_query_engine().query(query_input)

**********
Trace: query
    |_query ->  12.421759 seconds
      |_retrieve ->  0.031556 seconds
        |_embedding ->  0.028932 seconds
      |_synthesize ->  12.390024 seconds
        |_templating ->  2.6e-05 seconds
        |_llm ->  12.380728 seconds
**********


In [None]:
from llama_index.response.pprint_utils import pprint_response
pprint_response(response, show_source=True)

Final Response: Music can enhance motivation, especially if it is the
kind of music that really puts you in the groove for the particular
type of work you're going to do. However, in terms of sleep, music can
be used as a sleep aid for some people. According to the given
context, music's great, and it can stimulate dopamine release, we know
this. But to incorporate music into a sleep routine, it's essential to
choose soothing and calming music that can help in inducing sleep.
Listening to music before bedtime can also help in reducing stress and
anxiety, which can interfere with sleep. The context also suggests
that people should be wary of consuming too much caffeine, music, and
other dopamine-stimulating substances and activities in the long run,
as it can lead to depletion of dopamine reserves and postpartum low.
So, while music can be helpful in sleep, it should be used in
moderation and with caution, especially if someone is dealing with
sleep issues or addiction-related issues.
_

In [32]:
# Print each loaded document's ID next to its position in `documents`.
# With filename_as_id=True the printed IDs are the file paths, which are the
# same strings the index uses as keys in `ref_doc_info`.
for doc_num, document in enumerate(documents):
  print(f"document-{doc_num} --> {document.id_}")

document-0 --> /content/gdrive/MyDrive/document_management/andrew_sleep.txt
document-1 --> /content/gdrive/MyDrive/document_management/evaluation_metrics.txt
document-2 --> /content/gdrive/MyDrive/document_management/imbalanced_data.txt
document-3 --> /content/gdrive/MyDrive/document_management/neural_network.txt
document-4 --> /content/gdrive/MyDrive/document_management/statistics.txt


In [33]:
info_dict = index.ref_doc_info
info_dict

{'/content/gdrive/MyDrive/document_management/evaluation_metrics.txt': RefDocInfo(node_ids=['bc8b5471-da2a-4879-b113-d0821ec64110', '2b5e6e0b-c13f-4072-9cb5-90d91be99bba', 'f7cf0dc4-e084-48df-a5c6-1497ba13e12f', '504c5b3e-9efc-44fc-8f47-05b68706e0d6', 'b983adb4-8f6e-4684-8982-b083143cb880'], metadata={'file_path': '/content/gdrive/MyDrive/document_management/evaluation_metrics.txt', 'file_name': 'evaluation_metrics.txt', 'file_type': 'text/plain', 'file_size': 15621, 'creation_date': '2024-02-02', 'last_modified_date': '2024-02-02', 'last_accessed_date': '2024-02-02'}),
 '/content/gdrive/MyDrive/document_management/imbalanced_data.txt': RefDocInfo(node_ids=['8e340d20-ebc3-40b3-8ac9-4007f9edb8b1'], metadata={'file_path': '/content/gdrive/MyDrive/document_management/imbalanced_data.txt', 'file_name': 'imbalanced_data.txt', 'file_type': 'text/plain', 'file_size': 3218, 'creation_date': '2024-02-02', 'last_modified_date': '2024-02-02', 'last_accessed_date': '2024-02-02'}),
 '/content/gdriv

In [None]:
info_dict.keys()

dict_keys(['/content/gdrive/MyDrive/document_management/evaluation_metrics.txt', '/content/gdrive/MyDrive/document_management/imbalanced_data.txt', '/content/gdrive/MyDrive/document_management/neural_network.txt', '/content/gdrive/MyDrive/document_management/statistics.txt', '/content/gdrive/MyDrive/document_management/andrew_sleep.txt'])

In [42]:
info_dict['/content/gdrive/MyDrive/document_management/evaluation_metrics.txt'].node_ids

['bc8b5471-da2a-4879-b113-d0821ec64110',
 '2b5e6e0b-c13f-4072-9cb5-90d91be99bba',
 'f7cf0dc4-e084-48df-a5c6-1497ba13e12f',
 '504c5b3e-9efc-44fc-8f47-05b68706e0d6',
 'b983adb4-8f6e-4684-8982-b083143cb880']

## Dissecting the `docstore.json`

In [44]:
# Opening JSON file
f = open('./storage/document_management/docstore.json')

# returns JSON object as a dictionary
data = json.load(f)

In [46]:
data.keys()

dict_keys(['docstore/metadata', 'docstore/data', 'docstore/ref_doc_info'])

In [49]:
data['docstore/ref_doc_info']

{'/content/gdrive/MyDrive/document_management/evaluation_metrics.txt': {'node_ids': ['bc8b5471-da2a-4879-b113-d0821ec64110',
   '2b5e6e0b-c13f-4072-9cb5-90d91be99bba',
   'f7cf0dc4-e084-48df-a5c6-1497ba13e12f',
   '504c5b3e-9efc-44fc-8f47-05b68706e0d6',
   'b983adb4-8f6e-4684-8982-b083143cb880'],
  'metadata': {'file_path': '/content/gdrive/MyDrive/document_management/evaluation_metrics.txt',
   'file_name': 'evaluation_metrics.txt',
   'file_type': 'text/plain',
   'file_size': 15621,
   'creation_date': '2024-02-02',
   'last_modified_date': '2024-02-02',
   'last_accessed_date': '2024-02-02'}},
 '/content/gdrive/MyDrive/document_management/imbalanced_data.txt': {'node_ids': ['8e340d20-ebc3-40b3-8ac9-4007f9edb8b1'],
  'metadata': {'file_path': '/content/gdrive/MyDrive/document_management/imbalanced_data.txt',
   'file_name': 'imbalanced_data.txt',
   'file_type': 'text/plain',
   'file_size': 3218,
   'creation_date': '2024-02-02',
   'last_modified_date': '2024-02-02',
   'last_acce

In [47]:
data['docstore/metadata']

{'/content/gdrive/MyDrive/document_management/evaluation_metrics.txt': {'doc_hash': '8c5422e73bb851e326777bb8847122c77366120256b4798505af5bc665c3d337'},
 '/content/gdrive/MyDrive/document_management/imbalanced_data.txt': {'doc_hash': '3c4310e8342f9a2aa79fc2a86eb5d8c6d0ee490c5aaf5fd92a3f6ddc2d27d3df'},
 '/content/gdrive/MyDrive/document_management/neural_network.txt': {'doc_hash': '320777ffbaa9200cdef3f45f51f1da0b2f75d31bc33f7bace2fe0119e4e9936d'},
 '/content/gdrive/MyDrive/document_management/statistics.txt': {'doc_hash': '40e2bbf4f2e2a7af90dd1fa587420461568859a677b25ae909b393b4f51500d2'},
 'bc8b5471-da2a-4879-b113-d0821ec64110': {'doc_hash': '646a327006c74e23000863546633d57555de2e49276ffcce3ad6f4d8d7d3e6ca',
  'ref_doc_id': '/content/gdrive/MyDrive/document_management/evaluation_metrics.txt'},
 '2b5e6e0b-c13f-4072-9cb5-90d91be99bba': {'doc_hash': 'f37f7d0a8a3413c205ba2976c05cb1ccccedd6d7282e7e3a6118acd763fc1c2c',
  'ref_doc_id': '/content/gdrive/MyDrive/document_management/evaluation_

In [50]:
# data['docstore/data']

# 2. Delete a Document

In [58]:
# Remove the document registered under this ref-doc ID (its file path) from the
# index; delete_from_docstore=True requests removal from the docstore as well.
# NOTE(review): this call does not touch the file on Drive. The next cell loads
# only 4 docs, so the file was presumably removed from the folder separately.
index.delete_ref_doc("/content/gdrive/MyDrive/document_management/imbalanced_data.txt", delete_from_docstore=True)

In [60]:
documents = SimpleDirectoryReader(files_path, filename_as_id=True).load_data()
print(f"Loaded {len(documents)} docs")

Loaded 4 docs


In [61]:
# Run refresh_ref_docs method to check for document updates
refreshed_docs = index.refresh_ref_docs(documents)

print(refreshed_docs, "\n")
print('Number of newly inserted/refreshed docs: ', sum(refreshed_docs))

**********
Trace: refresh
**********
[False, False, False, False] 

Number of newly inserted/refreshed docs:  0


In [62]:
# Print each remaining document's ID next to its position in `documents`,
# confirming that the deleted file is no longer among the loaded documents.
for doc_num, document in enumerate(documents):
  print(f"document-{doc_num} --> {document.id_}")

document-0 --> /content/gdrive/MyDrive/document_management/andrew_sleep.txt
document-1 --> /content/gdrive/MyDrive/document_management/evaluation_metrics.txt
document-2 --> /content/gdrive/MyDrive/document_management/neural_network.txt
document-3 --> /content/gdrive/MyDrive/document_management/statistics.txt


In [63]:
query_input = "What are the various techniques mentioned in the document to handle imbalanced data?"
response = index.as_query_engine().query(query_input)

**********
Trace: query
    |_query ->  22.3107 seconds
      |_retrieve ->  0.030881 seconds
        |_embedding ->  0.027437 seconds
      |_synthesize ->  22.279649 seconds
        |_templating ->  2.5e-05 seconds
        |_llm ->  22.272297 seconds
**********


In [None]:
from llama_index.response.pprint_utils import pprint_response
pprint_response(response, show_source=True)

Final Response: The document mentions two techniques to handle
imbalanced data:  1. Undersampling: This technique involves reducing
the number of samples in the majority class to make the class
distributions more balanced. The most common technique is random
undersampling, which randomly selects a subset of the majority class
to reduce the class imbalance.  2. Oversampling: This technique
involves increasing the number of samples in the minority class to
make the class distributions more balanced. The most common technique
is synthetic oversampling, which generates synthetic samples for the
minority class using techniques like SMOTE (Synthetic Minority Over-
sampling Technique) or ADASYN (Adaptive Synthetic Sampling Approach).
However, the document does not explicitly mention these techniques as
a solution to handle imbalanced data. Instead, it provides an
explanation of the imbalance issue and how it can affect the
evaluation metrics commonly used in Kaggle competitions. The document


# 3. Update an Existing Document

In [None]:
documents = SimpleDirectoryReader(files_path, filename_as_id=True).load_data()
print(f"Loaded {len(documents)} docs")

Loaded 4 docs


In [None]:
# Run refresh_ref_docs method to check for document updates
refreshed_docs = index.refresh_ref_docs(documents)

print(refreshed_docs, "\n")
print('Number of newly inserted/refreshed docs: ', sum(refreshed_docs))

**********
Trace: refresh
    |_node_parsing ->  0.027337 seconds
      |_chunking ->  0.02558 seconds
    |_embedding ->  0.442353 seconds
**********
[False, False, True, False] 

Number of newly inserted/refreshed docs:  1


In [None]:
query_input = "What is SMOTE technique mentioned in the document to handle imbalanced data?"
response = index.as_query_engine().query(query_input)

**********
Trace: query
    |_query ->  7.718252 seconds
      |_retrieve ->  0.031824 seconds
        |_embedding ->  0.028906 seconds
      |_synthesize ->  7.686214 seconds
        |_templating ->  2.6e-05 seconds
        |_llm ->  7.67561 seconds
**********


In [None]:
from llama_index.response.pprint_utils import pprint_response
pprint_response(response, show_source=True)

Final Response: The SMOTE (Synthetic Minority Over-sampling Technique)
technique is not explicitly mentioned in the given context
information. The text only mentions other techniques such as Random
Under-Sampling, Random Over-Sampling, and Cluster-Based Over Sampling.
______________________________________________________________________
Source Node 1/2
Node ID: 8bbf7968-6599-4a0a-a231-95a80795d5fd
Similarity: 0.6323434092866764
Text: In this way, each class will count as much the others, no matter
how frequent its positive cases are or how important they are for your
problem, resulting therefore in equal penalizations when the model
doesn’t perform well with any class:    • Micro averaging: This
approach will sum all the contributions from each class to compute an
aggregated...
______________________________________________________________________
Source Node 2/2
Node ID: 89c8d5c1-e268-49fc-945f-22b750510d63
Similarity: 0.6199370792784977
Text: - The class of Gated Linear Unit (GLU) h

# END