## Hello, Here's How to use RAG w HF Models

Install some dependencies

In [None]:
!pip install -q -U peft==0.8.2
!pip install -q -U trl==0.7.10
!pip install -q -U accelerate==0.27.1
!pip install -q -U datasets==2.17.0
!pip install -q -U transformers==4.38.1
!pip install langchain sentence-transformers chromadb langchainhub

!pip install langchain-community langchain-core

Get the Model You Want

In [None]:
#!pip install --upgrade --upgrade-strategy eager "optimum[neural-compressor]"


In [1]:
#!pip install fuzzywuzzy



In [None]:
#!pip install optimum


In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import chromadb

import time
import uuid
#from fuzzywuzzy import fuzz
#from optimum.quantization import QuantizeruenceClassification


In [None]:
#pip install numpy --upgrade

Define Variables

In [None]:
import os

# set your own hf token then fetch it here
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token)

model.half()

Define Data Sources

In [17]:
import pandas as pd

file_names = [
    "study_permit_general",
    "work_permit_student_general",
    "work-and-education-data",
    "vancouver_transit_qa_pairs",
    "permanent_residence_student_general",
    "data-with-sources"
]

all_texts = []

for file in file_names:
    path = f'./sample_data/{file}.csv'
    try:
        df = pd.read_csv(path)
        df.columns = df.columns.str.lower()

        if 'question' in df.columns and 'answer' in df.columns:
            df = df.drop_duplicates(subset=['question'])
            df['text'] = df['question'].fillna('') + ' ' + df['answer'].fillna('')
        elif 'theme' in df.columns and 'content' in df.columns:
            df = df.drop_duplicates(subset=['content'])
            df['text'] = df['theme'].fillna('') + ' ' + df['content'].fillna('')
        else:
            print(f"no text columns in {file}")
            continue
        all_texts.extend(df['text'].tolist())
    except Exception as e:
        print(f"Error loading {file}: {e}")


In [None]:
# forgot one dependency
!pip install chromadb

Set Embedding Model, and Chroma Client to Interact w Vector Database and Create Collections

In [16]:
#!pip install -U numpy



In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import chromadb
import uuid
import pandas as pd
#from fuzzywuzzy import fuzz

# pt model for geenrating embeddings used pretty often
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# persistent client to interact w chroma vector store
client = chromadb.PersistentClient(path="./chroma_db")

# create collections for each data (for testing rn)
collection = client.get_or_create_collection(name="combined_docs")

# seems like better results if we remove duplicates and very similar data
data = pd.DataFrame({"text": all_texts})
data = data.drop_duplicates()
all_texts = data["text"].tolist()

'''
def remove_fuzzy_duplicates(texts, threshold=90):
    unique_texts = []
    for text in texts:
        if not any(fuzz.ratio(text, existing_text) > threshold for existing_text in unique_texts):
            unique_texts.append(text)
    return unique_texts

all_texts = remove_fuzzy_duplicates(all_texts, threshold=90)
'''

print(f"successfully added {len(all_texts)} documents to Chroma DB.")

successfully added 1053 documents to Chroma DB.


Function to add data to collection by embedding them

In [41]:
def add_data_to_collection_batch(collection, texts, batch_size=3):
    for idx in range(0, len(texts), batch_size):
        try:
            batch_texts = texts[idx: idx + batch_size]

            embeddings = embedding_model.embed_documents(batch_texts)

            batch_ids = [str(uuid.uuid4()) for _ in batch_texts]

            collection.add(
                ids=batch_ids,
                embeddings=embeddings,
                documents=batch_texts
            )
            print(f"successfully added {len(batch_texts)} documents (Batch {idx}-{idx + batch_size - 1})")
        except Exception as e:
            print(f"Error processing batch starting at index {idx}: {e}")

In [None]:
add_data_to_collection_batch(collection, all_texts)
print(f"successfully added {len(all_texts)} documents to the Chroma collection.")

Function to now match for releveant document

In [43]:
def get_relevant_documents(query, n_results=3):
    try:
        query_embeddings = embedding_model.embed_documents([query])[0]

        results = collection.query(query_embeddings=[query_embeddings], n_results=n_results)
        print(f"Query Results: {results}")

        return results['documents'][0] if results['documents'] else []
    except Exception as e:
        print(f"Error querying: {e}")
        return []

Generate Answer

In [44]:
def generate_answer(query):
    inputs = tokenizer(query, return_tensors="pt")
    base_output = model.generate(inputs["input_ids"], max_length=150, temperature=0.1)
    response_before_rag = tokenizer.decode(base_output[0], skip_special_tokens=True)

    relevant_documents = get_relevant_documents(query)
    if not relevant_documents:
        return {
            "Before RAG Response": response_before_rag,
            "After RAG Response": "Sorry, no relevant documents found."
        }

    relevant_texts = "\n\n".join([doc for doc in relevant_documents])
    rag_prompt = f"""
    You are a helpful assistant for international students. Here are relevant documents:

    {relevant_texts}

    Please respond to the following question based on the documents above. Be conversational but concise:

    Question: {query}

    Answer:
    """

    rag_inputs = tokenizer(rag_prompt, return_tensors="pt")
    rag_output = model.generate(rag_inputs["input_ids"], max_length=500, temperature=0.1)
    response_after_rag = tokenizer.decode(rag_output[0], skip_special_tokens=True)

    return {
        "Before RAG Response": response_before_rag,
        "After RAG Response": response_after_rag
    }

Example Usage

In [45]:
test_queries = [
    "How do I apply for a study permit in Canada?",
    "Can I work while studying on a student visa?",
    "What happens if my study permit expires before I finish my program?",
    "Do I need a new study permit if I change schools?",
    "How long does it take to process a Canadian study permit?",
    "Am I allowed to work off-campus as an international student?",
    "How many hours can I work while studying in Canada?",
    "What documents do I need to apply for a co-op work permit?",
    "Can I work in Canada after I graduate?",
    "What is a Post-Graduation Work Permit (PGWP) and how do I apply?",
    "How do I apply for MSP (Medical Services Plan) in British Columbia?",
    "Is MSP mandatory for international students?",
    "What healthcare services are covered under MSP?",
    "What should I do if I get sick and don’t have insurance yet?",
    "Can I use private health insurance instead of MSP?",
    "What are my options for student housing in Vancouver?",
    "How much does rent typically cost for international students?",
    "What should I check before signing a lease in Canada?",
    "Are there any student discounts for accommodation?",
    "How can I find a roommate in Canada?",
    "How do I open a bank account as an international student?",
    "What documents do I need to get a student bank account?",
    "Can I get a credit card as an international student?",
    "How do I send money to my home country from Canada?",
    "What scholarships are available for international students?",
    "How does the Compass Card work for transit in Vancouver?",
    "Am I eligible for a U-Pass as an international student?",
    "What is the best way to get around Vancouver on a budget?",
    "Where can I find the bus and SkyTrain schedules?",
    "Are there student discounts for public transportation?",
    "Can I apply for permanent residence after graduating?",
    "What is the Canadian Experience Class (CEC) immigration program?",
    "How can I improve my chances of getting permanent residence?",
    "What are the eligibility requirements for Express Entry?",
    "Does having a Canadian degree help with PR applications?"
]

for idx, user_query in enumerate(test_queries, start=1):
    responses = generate_answer(user_query)

    print("\n" + "="*50)
    print(f"Test Query {idx}: {user_query}")
    print("="*50)
    print("Response Before RAG:\n", responses["Before RAG Response"])
    print("\nResponse After RAG:\n", responses["After RAG Response"])
    print("="*50 + "\n\n")

#add_all_texts_to_collection()


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Query Results: {'ids': [['a9e9b5f4-01e3-469b-b7c1-768022665ff5', 'd8a7253f-3659-448c-bb0c-b8489df8af47', '30a5a828-0efc-428a-989f-836ffc0c6d06']], 'embeddings': None, 'documents': [['Study Permit Application Process When you apply online, your personalized checklist will include the Application to Change Conditions, Extend my Stay or Remain in Canada as a Student [IMM 5709]. On that form, check the box to Apply for a study permit for the first time or extend my study permit and fill out the rest of the required fields.', 'Study Permit Application Process Most foreign nationals already in Canada can no longer apply for a study permit at a port of entry. You must apply online for', 'Extending a Study Permit If you want to keep studying in Canada, you must']], 'uris': None, 'data': None, 'metadatas': [[None, None, None]], 'distances': [[0.3383488655090332, 0.40008115768432617, 0.40086132287979126]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Test Query 1: How do I apply for a study permit in Canada?
Response Before RAG:
 How do I apply for a study permit in Canada? The application process typically involves several steps, which I'll outline below.

## Step 1: Determine Your Eligibility
To apply for a study permit in Canada, you must meet certain eligibility criteria. These include being a permanent resident of Canada, having a valid passport, and having sufficient funds to support yourself during your stay in Canada.

## Step 2: Choose the Correct Application Type
There are two main types of study permits: a study permit for a specific program (e.g., a bachelor's degree) and a study permit for a general program (e.g., a master's degree). You must choose the correct application type based on your program of study.

## Step 3

Response After RAG:
 
    You are a helpful assistant for international students. Here are relevant documents:

    Study Permit Application Process When you apply online, your personalized checklist 

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Query Results: {'ids': [['f49f0ece-d747-4594-a770-721355511f00', '0e1017e4-90c6-49d3-be3a-7a9428aeeafb', '22006685-c985-4364-8d8d-8e6d8268c1fb']], 'embeddings': None, 'documents': [["Work Permits for International Students If you're able to work during your studies, it'll say so in the conditions on your study permit.", "Work Permits for International Students If you're able to work during your studies, it'll say so in the conditions on your study permit.", "Work Permits for International Students If you're able to work during your studies, it'll say so in the conditions on your study permit."]], 'uris': None, 'data': None, 'metadatas': [[None, None, None]], 'distances': [[0.5540599822998047, 0.5540599822998047, 0.554060161113739]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Test Query 2: Can I work while studying on a student visa?
Response Before RAG:
 Can I work while studying on a student visa? Working while studying on a student visa is a common practice, but it's essential to understand the rules and regulations surrounding it.

**Understanding the rules:**

*   **Employment and student status:** You must be a student and not an employee to work while studying on a student visa.
*   **Work hours and duration:** You can work up to 20 hours per week, and you must not exceed 40 hours per month.
*   **Employment contract:** You must have a written employment contract with your employer, which includes the terms of your employment, working hours, and salary.
*   **Tax and benefits:** You must pay taxes on your earnings and benefits, and you

Response After RAG:
 
    You are a helpful assistant for international students. Here are relevant documents:

    Work Permits for International Students If you're able to work during your studies, it'll say so in 

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Query Results: {'ids': [['13b81354-0ec4-4fda-a068-7d9496a1ee48', '267bbdd2-8c18-4878-af02-bb4069d92172', 'c6082afc-c3b5-466e-9ce1-02a40b1b3e29']], 'embeddings': None, 'documents': [['Study Permit Expiration and Renewal What happens when your permit expires\nYou lose your student status in Canada if any of the following applies to you:', 'Extending a Study Permit What happens when your permit expires\nYou lose your student status in Canada if any of the following applies to you:', 'Extending a Study Permit What happens when your permit expires\nYou lose your student status in Canada if any of the following applies to you:']], 'uris': None, 'data': None, 'metadatas': [[None, None, None]], 'distances': [[0.38881877064704895, 0.43466609716415405, 0.43466609716415405]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Test Query 3: What happens if my study permit expires before I finish my program?
Response Before RAG:
 What happens if my study permit expires before I finish my program? Can I still enter Canada?
If your study permit expires before you finish your program, you may be eligible to apply for a new study permit. However, there are some conditions and requirements you need to meet.

Here are the steps to follow:

1. **Check if your study permit has expired**: You can check the status of your study permit online through the Immigration, Refugees and Citizenship Canada (IRCC) website.
2. **Contact the IRCC**: If your study permit has expired, contact the IRCC to inquire about the next steps. They may ask you to provide documentation, such as proof of payment for the application fee or proof of completion of your program.


Response After RAG:
 
    You are a helpful assistant for international students. Here are relevant documents:

    Study Permit Expiration and Renewal What happens when

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Query Results: {'ids': [['b93fb3dc-8910-4cbb-858b-39291370569c', '8a55474b-3669-47fb-920d-6e8d4fa73cf6', 'c6ac72f1-0967-458d-80f5-547120bf2138']], 'embeddings': None, 'documents': [["Extending a Study Permit If your study situation changes\nIf you weren't eligible to work off campus, but your study situation has now changed, you may be able to change the conditions of your study permit.", "Study Permit Application Process If your study situation changes\nIf you weren't eligible to work off campus, but your study situation has now changed, you may be able to change the conditions of your study permit.", 'Extending a Study Permit keep studying until your current permit expires or\ntransfer to another DLI\nIf you want to extend your study permit, you’ll need to enroll at a school with DLI status.']], 'uris': None, 'data': None, 'metadatas': [[None, None, None]], 'distances': [[0.6665720343589783, 0.6875311136245728, 0.7013002634048462]], 'included': [<IncludeEnum.distances: 'distances'>, 

KeyboardInterrupt: 