# RAG Fusion

In [2]:
import os
import requests
import zipfile
from io import BytesIO
import textwrap

def download_and_extract_zip(url, target_folder):
    # Ensure the target folder exists
    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    # Download the file from the URL
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to download file: {url}")

    # Unzip the file in memory
    with zipfile.ZipFile(BytesIO(response.content)) as zip_ref:
        zip_ref.extractall(target_folder)

    print(f"Files extracted to {target_folder}")


def zip_folder(folder_path, zip_file_path):
    # Create a ZipFile object in write mode
    with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Walk through the directory
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                # Create a path relative to the folder to avoid storing absolute paths
                relative_path = os.path.relpath(os.path.join(root, file), os.path.dirname(folder_path))
                # Add file to the zip file
                zipf.write(os.path.join(root, file), arcname=relative_path)

    print(f"{zip_file_path} created successfully.")



def wrap_text(text, width=90): #preserve_newlines
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

# URL of the zip file
url = "https://www.dropbox.com/scl/fi/av3nw07o5mo29cjokyp41/singapore_text_files_languages.zip?rlkey=xqdy5f1modtbnrzzga9024jyw&dl=1" # Ensure dl=1 for direct download

# Folder to save extracted files
folder = "singapore_text"

# Call the function
download_and_extract_zip(url, folder)


Files extracted to singapore_text


In [3]:
## download the chroma DB
url = 'https://www.dropbox.com/scl/fi/3kep8mo77h642kvpum2p7/singapore_chroma_db.zip?rlkey=4ry4rtmeqdcixjzxobtmaajzo&dl=1'
download_and_extract_zip(url, '.')

Files extracted to .


In [4]:
import os

from google.colab import userdata
os.environ["GOOGLE_API_KEY"] = userdata.get('')

In [5]:
!pip show langchain

Name: langchain
Version: 0.0.350
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: aiohttp, async-timeout, dataclasses-json, jsonpatch, langchain-community, langchain-core, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 


## Google

## Imports

In [6]:
from langchain.llms import GooglePalm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
import langchain



## Load in Docs

In [7]:
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

In [8]:
%%time
loader = DirectoryLoader('/content/singapore_text/Textfiles3/English/', glob="*.txt", show_progress=True)
docs = loader.load()

  0%|          | 0/646 [00:00<?, ?it/s][nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
100%|██████████| 646/646 [00:46<00:00, 13.79it/s]

CPU times: user 39 s, sys: 1.4 s, total: 40.4 s
Wall time: 46.9 s





In [9]:
len(docs)
# docs = docs[:10]
len(docs)

646

In [10]:
docs[0]

Document(page_content="Link: https://www.visitsingapore.com/festivals\n\nevents\n\nsingapore/annual\n\nhighlights/grand\n\nprix\n\nseason\n\nsingapore/\n\nTitle: Grand Prix Season Singapore\n\nGlitz, glamour and nonstop racing action return to the heart of the city, with the revving of engines marking the return of the Grand Prix Season Singapore (GPSS).\n\nBlasting into full gear from 15 – 17 September, the Formula 1 Singapore Airlines Singapore Grand Prix 2023 returns, with island-wide celebrations, four precinct parties providing all the on-and-off-track excitement you've come to expect.\n\nRace fans will get to see the bustling streets of Singapore’s city centre roar to life, with household names like Lewis Hamilton, Max Verstappen and Charles Leclerc battling it out beneath the dazzling lights of the Marina Bay Street Circuit.\n\nWhether you’re looking to cheer on your racing heroes, soak in the glitz and glamour at the track or dance the night away at parties across four of our c

In [11]:
raw_text = ''
for i, doc in enumerate(docs):
    text = doc.page_content
    if text:
        raw_text += text

In [12]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap  = 100,
    length_function = len,
    is_separator_regex = False,
)

In [13]:
texts = text_splitter.split_text(raw_text)

In [14]:
len(texts)

6217

In [15]:
texts[1]

"Blasting into full gear from 15 – 17 September, the Formula 1 Singapore Airlines Singapore Grand Prix 2023 returns, with island-wide celebrations, four precinct parties providing all the on-and-off-track excitement you've come to expect.\n\nRace fans will get to see the bustling streets of Singapore’s city centre roar to life, with household names like Lewis Hamilton, Max Verstappen and Charles Leclerc battling it out beneath the dazzling lights of the Marina Bay Street Circuit."

## BGE Embeddings

In [17]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-small-en-v1.5"
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

embedding_function = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs,

)

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## Vector DB

In [18]:
%%time
### Make the chroma and persiste to disk
# db = Chroma.from_texts(texts,
#                        embedding_function,
#                        persist_directory="./chroma_db")



### load from disk
db = Chroma(persist_directory="./chroma_db", embedding_function=embedding_function)





CPU times: user 442 ms, sys: 32.9 ms, total: 474 ms
Wall time: 584 ms


In [None]:
### Save to zip
# zip_folder('/content/chroma_db', 'chroma_db.zip')

chroma_db.zip created successfully.


In [19]:
query = "Tell me about Universal Studios Singapore?"

db.similarity_search(query, k=5)

[Document(page_content='Universal Studios Singapore'),
 Document(page_content='Universal Studios Singapore is a popular theme park that offers thrilling rides and entertainment for all ages. You can buy Universal Studios Singapore tickets and enjoy an unbeatable discount of 40% from MySingaporePass. The pass provides a hassle-free booking process, allowing you to skip the long queues and gain easy access to the park. Enjoy exhilarating roller coasters, live shows, and immersive themed zones inspired by your favorite movies. The discounted entry with My Singapore Pass'),
 Document(page_content='leisure/fun\n\nthings\n\nto\n\ndo/universal\n\nstudios\n\nsingapore/\n\nTitle: Universal Studios Singapore\n\nThe shimmering wonders of the silver screen comes to vivid life at Universal Studios Singapore, the first-ever Hollywood movie theme park in Southeast Asia.'),
 Document(page_content='The magic of Hollywood comes to life at Universal Studios Singapore (USS), a theme park filled with adren

## Setup a Retriever

In [20]:
retriever = db.as_retriever(k=5) # can add mmr fetch_k=20, search_type="mmr"

retriever.get_relevant_documents(query)[1]

Document(page_content='Universal Studios Singapore is a popular theme park that offers thrilling rides and entertainment for all ages. You can buy Universal Studios Singapore tickets and enjoy an unbeatable discount of 40% from MySingaporePass. The pass provides a hassle-free booking process, allowing you to skip the long queues and gain easy access to the park. Enjoy exhilarating roller coasters, live shows, and immersive themed zones inspired by your favorite movies. The discounted entry with My Singapore Pass')

## Chat chain

In [21]:
from operator import itemgetter

from langchain.chat_models import ChatGooglePalm

from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

In [22]:

template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

model = ChatGooglePalm()

In [23]:
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [25]:
text_reply = chain.invoke("Tell me about job opportunities in Singapore")

print(wrap_text(text_reply))

Singapore is a global financial hub and a major center for business and trade. The city-
state has a strong economy and a low unemployment rate, making it a great place to find a
job.

There are many different types of jobs available in Singapore, from finance and banking to
technology and engineering. The city-state is also home to a number of multinational
corporations, which offer a variety of opportunities for expats.

The cost of living in Singapore is high, but salaries are also high. The average salary in
Singapore is around $4,000 per month. However, salaries can vary depending on the industry
and the level of experience.

If you are looking for a job in Singapore, there are a few things you can do to increase
your chances of success. First, make sure you have a strong resume and cover letter.
Second, network with people in your field and attend industry events. Finally, be prepared
to negotiate your salary.

Here are some of the most in-demand jobs in Singapore:

* Financial s

## With RagFusion

In [26]:
from langchain.chat_models import ChatGooglePalm
from langchain.schema.output_parser import StrOutputParser
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.prompts import ChatMessagePromptTemplate, PromptTemplate

In [27]:
prompt = ChatPromptTemplate(input_variables=['original_query'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a helpful assistant that generates multiple search queries based on a single input query.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['original_query'], template='Generate multiple search queries related to: {question} \n OUTPUT (4 queries):'))])

In [29]:
generate_queries = (
    prompt | ChatGooglePalm(temperature=0) | StrOutputParser() | (lambda x: x.split("\n"))
)

In [30]:
original_query = "universal studios Singapore"

In [31]:
from langchain.load import dumps, loads


def reciprocal_rank_fusion(results: list[list], k=60):
    fused_scores = {}
    for docs in results:
        # Assumes the docs are returned in sorted order of relevance
        for rank, doc in enumerate(docs):
            doc_str = dumps(doc)
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            previous_score = fused_scores[doc_str]
            fused_scores[doc_str] += 1 / (rank + k)

    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]
    return reranked_results

In [32]:
ragfusion_chain = generate_queries | retriever.map() | reciprocal_rank_fusion

In [33]:
langchain.debug = True

In [34]:
ragfusion_chain.input_schema.schema()

{'title': 'PromptInput',
 'type': 'object',
 'properties': {'question': {'title': 'Question', 'type': 'string'}}}

In [35]:
ragfusion_chain.invoke({"question": original_query})

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "question": "universal studios Singapore"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] Entering Prompt run with input:
[0m{
  "question": "universal studios Singapore"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] [1ms] Exiting Prompt run with output:
[0m{
  "lc": 1,
  "type": "constructor",
  "id": [
    "langchain",
    "prompts",
    "chat",
    "ChatPromptValue"
  ],
  "kwargs": {
    "messages": [
      {
        "lc": 1,
        "type": "constructor",
        "id": [
          "langchain",
          "schema",
          "messages",
          "SystemMessage"
        ],
        "kwargs": {
          "content": "You are a helpful assistant that generates multiple search queries based on a single input query.",
          "additional_kwargs": {}
        }
      },
      {
        "lc": 1,
  

[(Document(page_content='Universal Studios Singapore'), 0.06666666666666667),
 (Document(page_content='Universal Studios Singapore is a popular theme park that offers thrilling rides and entertainment for all ages. You can buy Universal Studios Singapore tickets and enjoy an unbeatable discount of 40% from MySingaporePass. The pass provides a hassle-free booking process, allowing you to skip the long queues and gain easy access to the park. Enjoy exhilarating roller coasters, live shows, and immersive themed zones inspired by your favorite movies. The discounted entry with My Singapore Pass'),
  0.06558258417063283),
 (Document(page_content='Here are some of the top attractions that can be booked with the My Singapore Pass:\n\nUniversal Studios Singapore @ 40% Off'),
  0.048651507139079855),
 (Document(page_content='leisure/fun\n\nthings\n\nto\n\ndo/universal\n\nstudios\n\nsingapore/\n\nTitle: Universal Studios Singapore\n\nThe shimmering wonders of the silver screen comes to vivid lif

In [36]:
from langchain.schema.runnable import RunnablePassthrough
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

full_rag_fusion_chain = (
    {
        "context": ragfusion_chain,
        "question": RunnablePassthrough()
    }
    | prompt
    | model
    | StrOutputParser()
)

In [37]:
full_rag_fusion_chain.input_schema.schema()

{'title': 'RunnableParallelInput',
 'type': 'object',
 'properties': {'question': {'title': 'Question', 'type': 'string'}}}

In [38]:
full_rag_fusion_chain.invoke({"question": "Tell me about Universal Studio Singapore?"})

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "question": "Tell me about Universal Studio Singapore?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel] Entering Chain run with input:
[0m{
  "question": "Tell me about Universal Studio Singapore?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel > 3:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "question": "Tell me about Universal Studio Singapore?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel > 3:chain:RunnableSequence > 4:prompt:ChatPromptTemplate] Entering Prompt run with input:
[0m{
  "question": "Tell me about Universal Studio Singapore?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel > 3:chain:RunnableSequence > 4:prompt:ChatPromptTemplate] [1ms] Exiting Prompt run with output:
[0m{
  "lc": 1,

"Universal Studios Singapore is a theme park located on Sentosa Island in Singapore. It is the first Universal Studios theme park in Southeast Asia and the second in Asia after Universal Studios Japan. The park opened on 18 September 2010.\n\nUniversal Studios Singapore is divided into seven themed zones:\n\n* Hollywood: This zone is home to rides and attractions based on Hollywood movies and TV shows, such as Transformers The Ride: The Ultimate 3D Battle, Revenge of the Mummy, and Battlestar Galactica: HUMAN vs. CYLON.\n* New York: This zone is themed after New York City and features rides and attractions based on movies and TV shows set in New York, such as The Amazing Spider-Man, Despicable Me Minion Mayhem, and Shrek 4-D Adventure.\n* Sci-Fi City: This zone is themed after science fiction movies and TV shows and features rides and attractions such as TRANSFORMERS The Ride: The Ultimate 3D Battle, Battlestar Galactica: HUMAN vs. CYLON, and Jurassic Park Rapids Adventure.\n* Ancient 

In [40]:
full_rag_fusion_chain.invoke({"question": "Tell me about Machine learning jobs in singapore?"})

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "question": "Tell me about Machine learning jobs in singapore?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel] Entering Chain run with input:
[0m{
  "question": "Tell me about Machine learning jobs in singapore?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel > 3:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "question": "Tell me about Machine learning jobs in singapore?"
}[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel > 4:chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "question": "Tell me about Machine learning jobs in singapore?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel > 4:chain:RunnablePassthrough] [1ms] Exiting Chain run with output:
[0m{
  "question": "Tell me about Machin

'Machine learning is a rapidly growing field with a high demand for skilled professionals. Singapore is a major hub for technology companies, and there are many opportunities for machine learning engineers and data scientists in the city-state.\n\nThe average salary for a machine learning engineer in Singapore is around S$150,000 per year. However, salaries can vary depending on experience, qualifications, and the specific role.\n\nSome of the most common machine learning jobs in Singapore include:\n\n* Machine learning engineer: Machine learning engineers develop and build machine learning models. They work with data scientists to understand the business problem and then design and implement machine learning solutions.\n* Data scientist: Data scientists collect, clean, and analyze data. They use their findings to develop machine learning models and improve business processes.\n* Data analyst: Data analysts collect and analyze data to help businesses make better decisions. They use the