In [None]:
! pip install llama-index==0.10.18
! pip install langchain==0.1.11
! pip install faiss-gpu
! pip install sentence-transformers
! pip install torch==2.2.1
! pip install accelerate
! pip install pypdf
! pip install llama-index-vector-stores-faiss
! pip install llama-index-embeddings-langchain
! pip install llama-index-embeddings-huggingface
! pip install llama-index-llms-huggingface

In [1]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import SimpleDirectoryReader, load_index_from_storage, VectorStoreIndex, StorageContext, Settings
import faiss
from llama_index.vector_stores.faiss import FaissVectorStore
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from pathlib import Path
import glob
import pprint
from transformers import AutoTokenizer, Pipeline, AutoModelForCausalLM
import os
from llama_index.core import load_index_from_storage

cache_dir = "/home/ubuntu/RAG/CACHE"


  from .autonotebook import tqdm as notebook_tqdm

2024-05-21 05:29:39.460367: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-21 05:29:39.519564: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# !wget -O quantum.pdf  https://www.dst.defence.gov.au/sites/default/files/events/documents/Quantum%20Computing%20Insights%20Paper.pdf

In [2]:

dataset_path = "/home/ubuntu/RAG/datasets/chemistry/*.pdf"
input_files = glob.glob(dataset_path)
reader = SimpleDirectoryReader(input_files=input_files)
documents = reader.load_data()

print('Number of pages:', len(documents))
print(documents)


Number of pages: 297
[Document(id_='91eae4b1-8be8-484a-af18-f74f5925d886', embedding=None, metadata={'page_label': '1', 'file_name': 'Chemistry103.pdf', 'file_path': '/home/ubuntu/RAG/datasets/chemistry/Chemistry103.pdf', 'file_type': 'application/pdf', 'file_size': 3373825, 'creation_date': '2024-05-15', 'last_modified_date': '2024-05-15'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Chemistry, by its very nature, is concerned with change.\nSubstances with well defined properties are converted\nby chemical reactions into other substances with\ndifferent properties. For any chemical reaction, chemists\ntry to find out\n(a)the feasibility of a chemical reaction which can be\npredicted by thermodynamics ( as you know that a\nreaction with D G < 0, at c

In [3]:
parser = SentenceSplitter.from_defaults(chunk_size=1024, chunk_overlap=30) # starting to increase chunk_overlap from 20 to 30% to see if it helps with the issue
nodes = parser.get_nodes_from_documents(documents)
print(f"Number of nodes created: {len(nodes)}")

pprint.pprint([nodes[i] for i in range(3)])
output_file = "output.txt"
file_path = os.path.join(cache_dir, output_file)

# Use pprint to format the list of nodes
formatted_output = pprint.pformat([nodes[i] for i in range(3)])

# Write the formatted output to the text file
with open(file_path, "w", encoding="utf-8") as file:
    file.write(formatted_output)

print(f"Output saved successfully to: {file_path}")


Number of nodes created: 309
[TextNode(id_='d7135f4e-2224-4b18-979c-05299437fab4', embedding=None, metadata={'page_label': '1', 'file_name': 'Chemistry103.pdf', 'file_path': '/home/ubuntu/RAG/datasets/chemistry/Chemistry103.pdf', 'file_type': 'application/pdf', 'file_size': 3373825, 'creation_date': '2024-05-15', 'last_modified_date': '2024-05-15'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='91eae4b1-8be8-484a-af18-f74f5925d886', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'Chemistry103.pdf', 'file_path': '/home/ubuntu/RAG/datasets/chemistry/Chemistry103.pdf', 'file_type': 'application/pdf', 'file_size': 3373825, 'creation_date': '2024-05-15', 'last_modified_date': '

In [4]:
faiss_index = faiss.IndexFlatL2(768)
Settings.embed_model = LangchainEmbedding(
  HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)
subject_dir = "/home/ubuntu/RAG/storage/chemistry"
# service_context = ServiceContext.from_defaults(llm=None, embed_model=embed_model)

vector_store = FaissVectorStore(faiss_index=faiss_index)
# Check for environment variable first
storage_dir = os.getenv("STORAGE_DIR", "./storage/chemistry")

# Use storage_dir in your code
storage = StorageContext.from_defaults( vector_store=vector_store)

index = VectorStoreIndex(
    nodes, storage_context=storage
)

index.storage_context.persist(persist_dir=subject_dir)




In [None]:
# Settings.llm = HuggingFaceLLM(
#     context_window=2048,
#     max_new_tokens=512,
#     generate_kwargs={"temperature": 0.1, "do_sample": False},
#     tokenizer_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
#     model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
#     tokenizer_kwargs={"max_length": 2048},
#     model_kwargs={"torch_dtype": torch.float16}
# )

In [5]:
Settings.llm = HuggingFaceLLM(
    context_window=2048,
    max_new_tokens=512,
    generate_kwargs={"temperature": 0.1, "do_sample": True},
    tokenizer_name="/home/ubuntu/RAG/models",
    model_name="/home/ubuntu/RAG/models",
    tokenizer_kwargs={"max_length": 10000},
    model_kwargs={"torch_dtype": torch.float16}
)
   

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.48s/it]


In [7]:
storage_context = StorageContext.from_defaults(persist_dir=subject_dir, vector_store=vector_store)

stored_index = load_index_from_storage(storage_context)

query_engine = stored_index.as_query_engine()
prompt="what is amines"

import time
t0=time.time()
response = query_engine.query(prompt)
print(f"Time: {time.time()-t0}")
print(response)

Time: 1.0364813804626465

Amines are organic compounds derived by replacing one or more hydrogen atoms of ammonia molecule by alkyl/aryl group(s).


In [None]:
import pandas as pd
from IPython.display import display, HTML


pd.set_option("display.max_colwidth", -1)


def pretty_print(df):
    return display(HTML(df.to_html().replace("\n", "")))


def visualize_retrieved_nodes(nodes) -> None:
    result_dicts = []
    for node in nodes:
        result_dict = {"Score": node.score, "Text": node.node.get_text()}
        result_dicts.append(result_dict)

    pretty_print(pd.DataFrame(result_dicts))


print(response.response)

nodes= response.source_nodes
visualize_retrieved_nodes(nodes)

In [11]:

cache_dir = "/home/ubuntu/RAG/CACHE"
model_dir = "/home/ubuntu/RAG/models"
datasets_dir = "/home/ubuntu/RAG/datasets"
data_dir=datasets_dir
model_name = "gemma"  
tokenizer = AutoTokenizer.from_pretrained(model_dir, cache_dir=cache_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir, cache_dir=cache_dir)
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': False}
embedder_gemma = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs=model_kwargs,
                encode_kwargs=encode_kwargs)
embedder_sentence = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
        model_kwargs={'device': 'cuda'},
        encode_kwargs={'normalize_embeddings': False}
    )

def gemma_lm(prompt, max_length):
  input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
  sequences = model.generate(input_ids,
                              max_length=max_length,
                              num_return_sequences=1,
                              no_repeat_ngram_size=2,
                              eos_token_id=tokenizer.eos_token_id,
                              top_k=50,
                              do_sample=False, 
                              
                              )
  for seq in sequences:
      generated_text = tokenizer.decode(seq, skip_special_tokens=True)
      # print(f"Result: {generated_text}")
      return generated_text
      

Loading checkpoint shards: 100%|██████████| 2/2 [02:48<00:00, 84.11s/it] 


In [None]:

prompt = "what do you know about googler linux commands"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
Settings.llm = model.generate(input_ids,
                              max_length=2048,
                              num_return_sequences=1,
                              no_repeat_ngram_size=2,
                              eos_token_id=tokenizer.eos_token_id,
                              top_k=50, do_sample=True,
                              )


In [None]:
decoded_text = tokenizer.decode(Settings.llm[0], skip_special_tokens=True)
print(decoded_text)

In [12]:
sample_query = "what do you know about googler linux commands"
gemma_lm(sample_query, max_length=512)
print(f"Result: {gemma_lm(sample_query, max_length=2048)}")

Result: what do you know about googler linux commands?

Googler Linux commands are a set of commands that are used to manage and control Google Cloud Platform (GCP) resources. They are similar to the standard Linux `sudo` command, but they are specifically designed for use with GCP.

Here are some of the key features of Google Linux Commands:

* They allow you to run commands as a different user, such as `root` or a specific service account.
* You can use them to create, modify, and delete resources such a VMs, networks, disks, storage, SQL databases, Cloud Storage, Pub/Sub topics, etc. 
  
Here is a list of some commonly used Google commands:
 

- `gcloud compute create-vm`
-  `gsutil` for managing Google Storage
 - `cloud-sql` to interact with Cloud SQL
 – `compute`  to manage VMs
– `storage`   to interact  with Google  Storage
`-`cloudrun`    to run containerized applications

**Benefits of using Google Commands:**

Using Google Command can be beneficial for the following reasons: 


In [None]:
# Settings.llm = HuggingFaceLLM(
#     context_window=2048,
#     max_new_tokens=512,
#     generate_kwargs={"temperature": 0.1, "do_sample": False},
#     tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
#     model_name="HuggingFaceH4/zephyr-7b-beta",
#     tokenizer_kwargs={"max_length": 2048},
#     model_kwargs={"torch_dtype": torch.float16}
# )

In [None]:
stored_index = load_index_from_storage(storage_context)
query_engine = stored_index.as_query_engine()
prompt="how is preparation of financial statements for a sole proprietary is done?"
import time
t0=time.time()
response = query_engine.query(prompt)
print(f"Time: {time.time()-t0}")

In [None]:
stored_index = load_index_from_storage(storage_context)

print("Shape of response tensor:", stored_index)

retriever = index.as_retriever()

prompt = "how much did Himanshu withdraw?"
response =retriever.retrieve(prompt)
print("Response:", response)
print("Type of response:", type(response))


encoded_prompt = tokenizer.encode(prompt, return_tensors="pt")
print("Prompt Tensor Shape:", encoded_prompt.shape)

assert len(encoded_prompt.shape) == 2, "Prompt tensor should be 2-dimensional"

print("Response Tensor:", response)

for node_with_score in response:
    print("Node Text:", node_with_score.node.text)
    print("Score:", node_with_score.score)
    print("Metadata:", node_with_score.node.metadata)
    print("--------------------")

for node_with_score in response:
    print("Node Text:", node_with_score.node.text)
    print("Score:", node_with_score.score)
    print("Metadata:", node_with_score.node.metadata)
    print("--------------------")

first_node_with_score = response[0]  
print("First Node Text:", first_node_with_score.node.text)
print("First Node Metadata:", first_node_with_score.node.metadata)

encoded_prompt = tokenizer.encode(prompt, return_tensors="pt")
print("Prompt Tensor Shape:", encoded_prompt.shape)

