## Hello, Here's How to use RAG w HF Models

Install some dependencies

In [None]:
!pip install -q -U bitsandbytes==0.42.0
!pip install -q -U peft==0.8.2
!pip install -q -U trl==0.7.10
!pip install -q -U accelerate==0.27.1
!pip install -q -U datasets==2.17.0
!pip install -q -U transformers==4.41.0
!pip install langchain sentence-transformers chromadb langchainhub
!pip install tensorflow
!pip install tf-keras
!pip install chromadb
!pip install langchain-community langchain-core


Get the Model You Want

In [46]:
from langchain_community.llms import HuggingFaceEndpoint
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# get the repository ID for the Gemma 2b model which I am testing with
repo_id = "google/gemma-2-2b-it"

Define Variables

In [47]:
import os

# set your own hf token then fetch it here
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# obv params, max_length is max token len for generated text, temp=0.1 means give more predictable and less random results
llm = HuggingFaceEndpoint(
    task='text-generation',
    repo_id=repo_id,
    model="google/gemma-2-2b-it",
    max_length=1024,
    temperature=0.1,
    huggingfacehub_api_token=hf_token
)

                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.


Define Data Sources

In [56]:
import pandas as pd

# load ur data
health_data = pd.read_csv('sample_data/data-with-sources.csv')
work_data = pd.read_csv('sample_data/work-and-education-data.csv')
# transit_data = pd.read_csv('../Transit-Data-Ques-Ans/vancouver_transit_qa_pairs.csv')

health_data_sample = health_data
work_data_sample = work_data
# transit_data_sample = transit_data

health_data_sample['text'] = health_data_sample['Question'].fillna('') + ' ' + health_data_sample['Answer'].fillna('')
work_data_sample['text'] = work_data_sample['Theme'].fillna('') + ' ' + work_data_sample['Content'].fillna('')
# transit_data_sample['text'] = transit_data_sample['question'].fillna('') + ' ' + transit_data_sample['answer'].fillna('')

To Delete Collections

In [101]:
# # Print the permissions of your database directory
# db_path = "./chroma_db"
# print(f"Directory permissions: {oct(os.stat(db_path).st_mode)[-3:]}")

# # Try to make it writable
# try:
#     os.chmod(db_path, 0o755)  # rwxr-xr-x
#     # Also make the files inside writable
#     for root, dirs, files in os.walk(db_path):
#         for d in dirs:
#             os.chmod(os.path.join(root, d), 0o755)
#         for f in files:
#             os.chmod(os.path.join(root, f), 0o644)  # rw-r--r--
#     print("Permissions updated")
# except Exception as e:
#     print(f"Error changing permissions: {e}")

# existing_collections = client.list_collections()
# print(f"Existing collections: {existing_collections}")

# client = chromadb.PersistentClient(path="./chroma_db")

# # Delete collections if they exist
# try:
#     client.delete_collection("health_docs")
#     print("Deleted health_docs collection")
# except Exception as e:
#     print(f"Error deleting health_docs: {e}")

# try:
#     client.delete_collection("work_docs")
#     print("Deleted work_docs collection")
# except Exception as e:
#     print(f"Error deleting work_docs: {e}")
    
# try:
#     client.delete_collection("transit_docs")
#     print("Deleted transit_docs collection")
# except Exception as e:
#     print(f"Error deleting transit_docs: {e}")

Directory permissions: 755
Permissions updated
Existing collections: ['health_docs', 'work_docs', 'study_docs']
Deleted health_docs collection
Deleted work_docs collection
Error deleting transit_docs: Collection transit_docs does not exist.


Set Embedding Model, and Chroma Client to Interact w Vector Database and Create Collections

In [102]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import chromadb

# pt model for generating embeddings used pretty often
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-MiniLM-L6-v2" ## currently the best one found 
)

## embeddings and time it takes 
# sentence-transformers/all-MiniLM-L6-v2 = 32.7 seconds
# sentence-transformers/paraphrase-MiniLM-L6-v2 = 30.5 seconds !!!!!!!!
# sentence-transformers/all-roberta-large-v1 = 1m 55.8 seconds 
# sentence-transformers/all-MiniLM-L12-v2 = 54.4 seconds 
# sentence-transformers/multi-qa-MiniLM-L6-cos-v1 = 49.5 seconds 
# sentence-transformers/paraphrase-mpnet-base-v2 = over 3 minutes 
# sentence-transformers/multi-qa-mpnet-base-dot-v1 = 2m 18.7 seconds
# "neuml/pubmedbert-base-embeddings" = 2m 27.4 seconds 

# persistent client to interact w chroma vector store
client = chromadb.PersistentClient(path="./chroma_db")

# create collections for each data (for testing rn)
health_collection = client.get_or_create_collection(name="health_docs")
work_collection = client.get_or_create_collection(name="work_docs")
# transit_collection = client.get_or_create_collection(name="transit_docs")

Function to add data to collection by embedding them

In [103]:
def add_data_to_collection(collection, data):
    for idx, row in data.iterrows():
        try:
            # get the embeddings using the embedding model for the documents
            embeddings = embedding_model.embed_documents([row['text']])[0]
            collection.add(
                ids=[str(idx)],
                embeddings=[embeddings],
                documents=[row['text']]
            )
        except Exception as e:
            print(f"Error on index {idx}: {e}")

# add data to collections
add_data_to_collection(health_collection, health_data_sample)
add_data_to_collection(work_collection, work_data_sample)
# add_data_to_collection(transit_collection, transit_data_sample)

Function to now match for releveant document

In [104]:
def get_relevant_document(query, category):
    try:
        # get the embedding for the user query using same embedding model
        query_embeddings = embedding_model.embed_documents([query])[0]

        # choose the correct collection based on the category
        if category == "health":
            collection = health_collection
        elif category == "work":
            collection = work_collection
        # elif category == "transit":
            # collection = transit_collection
        # collection = health_collection if category == "health" else work_collection

        # query the collection
        results = collection.query(query_embeddings=[query_embeddings], n_results=1)

        print(f"Query Results: {results}")

        return results['documents'][0][0] if results['documents'] else None
    except Exception as e:
        print(f"Error querying: {e}")
        return None

Generate Answer

In [105]:
def generate_answer(query, category):
    # b4 rag
    output_before_rag = llm.predict(f"Respond to this question: {query}")
    response_before_rag = output_before_rag

    # get the relevant document
    relevant_document = get_relevant_document(query, category)
    if relevant_document is None:
        return f"Sorry, no relevant document found. Model's response before RAG: {response_before_rag}"

    relevant_document = " ".join(relevant_document.split())
    MAX_DOC_LENGTH = 500
    relevant_document = relevant_document[:MAX_DOC_LENGTH]

    # rag_prompt = f"""
    # You are a helpful assistant for international students new to B.C. Here is a relevant document:

    # {relevant_document}

    # Please respond to the following question based on the document above:

    # Question: {query}

    # Answer:
    # """
    rag_prompt = f"""
    You are a helpful assistant for international students new to B.C. Here is a relevant document:

    {relevant_document}

    Please respond to the following question based on the document above, if you can't answer anything or it requires the international student to ask a query again, direct them to additional resources like the vancouver transit website or the transit mobile app for transit related queries:

    Question: {query}

    Answer:
    """

    # print("Prompt being sent to model:")
    # print(rag_prompt)

    # now generate using RAG
    output_after_rag = llm.predict(rag_prompt)
    # print("Output from model:", output_after_rag)

    response_after_rag = output_after_rag

    # return both responses to compare
    return {
        "Before RAG Response": response_before_rag,
        "After RAG Response": response_after_rag
    }

Example Usage

In [107]:
# user_query = "How do I commute in vancouver and how can I get to SFU?"
user_query = "What do I need to do to apply for MSP coverage in B.C.?"
# category = "transit"
category = "health"
responses = generate_answer(user_query, category)

print("User Query:", user_query)
print("Response Before RAG:", responses["Before RAG Response"])
print("Response After RAG:", responses["After RAG Response"])



Query Results: {'ids': [['1']], 'embeddings': None, 'documents': [['Who qualifies for MSP coverage? A B.C. resident who is a Canadian citizen or lawfully admitted to Canada for permanent residence, makes their home in B.C., and is physically present in B.C. for at least six months in a calendar year.']], 'uris': None, 'data': None, 'metadatas': [[None]], 'distances': [[16.354888726841512]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}




User Query: What do I need to do to apply for MSP coverage in B.C.?
Response Before RAG: 

Here's a breakdown of the process:

**1. Determine Eligibility:**
   * **Work:** Are you employed in a qualifying job?
   * **Income:** Do you meet the income requirements?
   * **Health:** Do you have a pre-existing condition?

**2. Apply for Coverage:**
   * **Online:** Visit the WorkSafeBC website and complete the online application.
   * **By Phone:** Call WorkSafeBC at 1-888-328-2222.
   * **In Person:** Visit a WorkSafeBC office.

**3. Provide Documentation:**
   * **Proof of Employment:** Pay stubs, employment contract, etc.
   * **Proof of Income:** Tax returns, bank statements, etc.
   * **Proof of Health:** Medical records, doctor's notes, etc.

**4. Pay Premiums:**
   * **Monthly:** Premiums are typically paid monthly.
   * **Payment Options:** WorkSafeBC offers various payment options.

**5. Receive Coverage:**
   * **Confirmation:** You will receive confirmation of your coverage.
   

In [108]:
# verify
health_docs = health_collection.get()
print("Number of documents in health collection:", len(health_docs['documents']))

work_docs = work_collection.get()
print("Number of documents in work collection:", len(work_docs['documents']))

Number of documents in health collection: 76
Number of documents in work collection: 878
