<a href="https://colab.research.google.com/github/almutareb/rag-based-llm-app/blob/main/Ray_Code_Docs_AI_Q%26A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install needed packages
!pip install -qU transformers accelerate einops langchain xformers bitsandbytes faiss-gpu sentence_transformers typing-extensions==4.8.0

In [None]:
from torch import cuda, bfloat16
import transformers

model_id = 'mistralai/Mistral-7B-Instruct-v0.1'

# Label for the status message at the end of this cell: the current CUDA
# device when one is available, otherwise the CPU. The weights themselves are
# placed through `device_map='auto'` below, not through this variable.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# 4-bit NF4 quantization with double quantization and bfloat16 compute, so the
# large model needs less GPU memory. This requires the 'bitsandbytes' library.
bnb_config = transformers.BitsAndBytesConfig(
    bnb_4bit_compute_dtype=bfloat16,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the checkpoint with the quantization settings above.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id, device_map='auto', quantization_config=bnb_config
)

# The model is only used for inference, so switch it to evaluation mode.
model.eval()

print(f"Model loaded on {device}")


In [None]:
# Load the tokenizer that belongs to the model checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Generation parameters forwarded to the model by the pipeline.
# No stopping criteria are configured.
generation_kwargs = {
    # max number of tokens to generate in the output
    "max_new_tokens": 2048,
    # limit repetition in the output
    "repetition_penalty": 1.2,
}

# Text-generation pipeline built from the quantized model and its tokenizer.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    **generation_kwargs,
)

In [None]:
# Wrap the Hugging Face pipeline so that LangChain can use it as an LLM.
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

# Smoke test: one question sent straight to the model, without retrieval.
llm(
    prompt="Explain to me the difference between Data Lakehouse and Data Warehouse"
)

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): no cell visible in this notebook reads from or writes to the
# mount — confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader

from bs4 import BeautifulSoup as Soup


def html_to_text(raw_html):
    """Return the text content of an HTML page with the markup stripped."""
    return Soup(raw_html, "html.parser").text


# Crawl the Ray documentation starting at its root page (max_depth=4) and
# keep only the extracted text of every page that is fetched.
url = "https://docs.ray.io/en/master/"
loader = RecursiveUrlLoader(url=url, extractor=html_to_text, max_depth=4)
documents = loader.load()

In [None]:
# Sanity check: number of documents the loader returned.
len(documents)

In [None]:
# Cut the crawled documents into chunks with a small overlap between
# neighbouring chunks. The splitter's default separators are used.
from langchain.text_splitter import RecursiveCharacterTextSplitter

CHUNK_SIZE = 1200
CHUNK_OVERLAP = 150

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)
all_splits = text_splitter.split_documents(documents)

In [None]:
# Sanity check: number of chunks produced by the splitter.
len(all_splits)

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Directory the FAISS index is saved to.
FAISS_INDEX_PATH = "faiss_index"

# The all-mpnet-base-v2 sentence transformer turns each chunk of text into a
# vector; it is loaded on the CUDA device.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device="cuda")
embeddings = HuggingFaceEmbeddings(model_kwargs=model_kwargs, model_name=model_name)

# Embed all chunks into a FAISS vector store and save the index locally.
vectorstores = FAISS.from_documents(all_splits, embeddings)
vectorstores.save_local(FAISS_INDEX_PATH)

In [None]:
# Build a conversational retrieval chain on top of the vector store.
# No memory object is attached to the chain here; the cells that call it pass
# the chat history in explicitly.
from langchain.chains import ConversationalRetrievalChain

# Maximum marginal relevance (mmr) strives to achieve both relevance to the
# query and diversity among the results.
retriever = vectorstores.as_retriever(search_type="mmr")

chain = ConversationalRetrievalChain.from_llm(
    llm, retriever, return_source_documents=True
)

In [None]:
def colab_print(text, max_width = 120):
  """Print ``text`` word-wrapped so that no line is longer than ``max_width``.

  Every run of whitespace in ``text`` (newlines included) is collapsed to a
  single space before wrapping, so line breaks in the input are not kept.
  Words are never split: a word longer than ``max_width`` is printed on a
  line of its own. Printed lines carry no trailing whitespace. Empty or
  whitespace-only text prints one empty line.

  Args:
    text: The string to print.
    max_width: Maximum number of characters per line; values below 1 are
      treated as 1.
  """
  import textwrap  # local import keeps this cell self-contained

  normalized = " ".join(text.split())
  wrapped = textwrap.fill(
      normalized,
      width=max(max_width, 1),
      # keep long tokens such as URLs intact instead of cutting them
      break_long_words=False,
      break_on_hyphens=False,
  )
  print(wrapped)

In [None]:
# add a 'chat history' for testing
# should use langchain's ChatMessageHistory instead
chat_history = []

query = "What are placement groups?"
result = chain({"question": query, "chat_history": chat_history})

# Unique source URLs in retrieval order; documents without a "source" entry
# are skipped so that the join below cannot fail on None.
sources = list(dict.fromkeys(
    doc.metadata["source"]
    for doc in result['source_documents']
    if doc.metadata.get("source")
))
src_list = '\n'.join(sources)

# Print the sources separately from the answer: colab_print re-wraps on
# whitespace, which would glue the first URL onto the last word of the answer
# and discard the line breaks between URLs.
colab_print(result['answer'])
if src_list:
    print(src_list)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Placement groups allow users to atomically reserve groups of resources across multiple nodes (i.e., gang scheduling). 
They can be then used to schedule Ray tasks and actors packed as close as possible for locality (PACK), or spread apart 
(SPREAD).https://docs.ray.io/en/master/_sources/ray-core/scheduling/placement-group.rst.txt 
https://docs.ray.io/en/master/_sources/ray-core/scheduling/placement-group.rst.txt 
https://docs.ray.io/en/master/ray-core/scheduling/placement-group.html 
https://docs.ray.io/en/master/ray-core/scheduling/placement-group.html 


In [None]:
# Source URLs of the retrieved documents, one per line. Several chunks can
# come from the same page, so duplicates are removed while the retrieval order
# is kept.
sources = list(dict.fromkeys(
    doc.metadata.get("source") for doc in result['source_documents']
))
print(*sources, sep='\n')

https://docs.ray.io/en/master/_sources/ray-core/scheduling/placement-group.rst.txt
https://docs.ray.io/en/master/_sources/ray-core/scheduling/placement-group.rst.txt
https://docs.ray.io/en/master/ray-core/scheduling/placement-group.html
https://docs.ray.io/en/master/ray-core/scheduling/placement-group.html


In [None]:
# Print the full source documents that were returned with the last answer,
# one per line.
# TODO: remove duplicates; a LangChain tutorial shows how to return only the
# unique sources — look it up.
print(*result['source_documents'], sep='\n')

In [None]:
# Add the previous exchange to the history so the follow-up question can
# refer back to it.
chat_history.append((query, result["answer"]))

query = "how can I verify if the new placement group is pending creation?"
result = chain({"question": query, "chat_history": chat_history})

# Unique source URLs in retrieval order; documents without a "source" entry
# are skipped so that the join below cannot fail on None.
sources = list(dict.fromkeys(
    doc.metadata["source"]
    for doc in result['source_documents']
    if doc.metadata.get("source")
))
src_list = '\n'.join(sources)

# Print the sources separately from the answer: colab_print re-wraps on
# whitespace, which would glue the first URL onto the last word of the answer
# and discard the line breaks between URLs.
colab_print(result['answer'])
if src_list:
    print(src_list)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


You can use the `ray list placement-groups` command to check the state of your placement groups. The output will show 
the name of each placement group along with its ID, creator job ID, and state. Look for the placement group you want to 
check and see if it has a state of 
`PENDING`.https://docs.ray.io/en/master/_sources/ray-core/scheduling/placement-group.rst.txt 
https://docs.ray.io/en/master/ray-core/scheduling/placement-group.html 
https://docs.ray.io/en/master/ray-core/scheduling/placement-group.html 
https://docs.ray.io/en/master/ray-core/scheduling/placement-group.html 


In [None]:
# Extend the history with the previous exchange instead of replacing it, so
# earlier turns of the conversation stay available to the chain.
chat_history.append((query, result["answer"]))

query = "how do I do that in java?"
result = chain({"question": query, "chat_history": chat_history})

# Unique source URLs in retrieval order; documents without a "source" entry
# are skipped so that the join below cannot fail on None.
sources = list(dict.fromkeys(
    doc.metadata["source"]
    for doc in result['source_documents']
    if doc.metadata.get("source")
))
src_list = '\n'.join(sources)

# Print the sources separately from the answer: colab_print re-wraps on
# whitespace, which would glue the first URL onto the last word of the answer
# and discard the line breaks between URLs.
colab_print(result['answer'])
if src_list:
    print(src_list)