<h4> Installing Necessary Libraries </h4>

In [1]:
!pip install transformers -q
!pip install bitsandbytes -q
!pip install -U bitsandbytes -q
!pip install -U langchain-community -q
!pip install pypdf pymupdf -q
!pip install chromadb -q
!pip install sentence_transformers -q
!pip install huggingface_hub -q
!pip install tqdm -q

<h4> Importing Libraries </h4>

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import pandas as pd

import gc
import sys
import langchain
from langchain.document_loaders import PyMuPDFLoader, PyPDFLoader
import re
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from huggingface_hub import login
from huggingface_hub import snapshot_download
from tqdm import tqdm
# Prefer a token supplied through the HF_TOKEN environment variable so the
# secret does not have to be typed into (and saved with) the notebook. The
# literal placeholder is only used when the variable is not set.
hf_token = os.environ.get('HF_TOKEN', '<Enter your token here>')
login(token=hf_token)

<h4> Variable Initialization </h4>

In [100]:
# Path of the PDF that is loaded and indexed by fnCreateVectorDB.
file_path = '/content/UK_EMPLOYEE_HANDBOOK_2022.pdf'
# model_name = "meta-llama/Llama-2-13b-chat-hf"
# Hugging Face Hub repository id of the model to download and load.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
# save_directory = "./llama2-13b-chat-4bit"
# Local download directory: "./" followed by the part of model_name after the "/".
save_directory = "./" + model_name.split('/')[1]
# Directory handed to Chroma as its persist_directory.
persist_directory = './database/chroma/'
# Pages dropped before indexing (index and other irrelevant pages). The values
# are compared against each page's metadata['page'] in fnCreateVectorDB.
# NOTE(review): presumably these are the loader's 0-based page numbers - confirm.
pages_to_exclude = [1, 3, 4, 81, 82]
# chunk_size and chunk_overlap are passed to RecursiveCharacterTextSplitter.
chunk_size = 500
chunk_overlap = 100
# Passed as max_new_tokens to the text-generation pipelines.
max_new_tokens = 200

# Active prompt. It must expose the {context} and {question} placeholders
# declared in create_prompt_template. The apostrophe right after "[/INST]" on
# the last prompt line is load-bearing: extract_text_after_answer searches the
# generated text for exactly "[/INST]'" and returns "" when it is absent, so
# the two must be changed together.
prompt_template_str = """[INST] Use the following pieces of context to answer the question at the end in ONE SENTENCE ONLY.
If the context is not relevant to the question, then Helpful Answer : I don't know the answer to that". Do NOT give any explanation.
Don't try to make up an answer. Avoid Repetition.

Context: {context}

Question: {question}
Helpful Answer: [/INST]'
"""

# prompt_template_str = """[INST] Use the following pieces of context to answer the question at the end in ONE SENTENCE ONLY.
# If you don't know the answer, Helpful Answer : I don't know the answer to that".
# Don't try to make up an answer. Avoid Repetition.

# Context: {context}

# Question: {question}
# Helpful Answer: [/INST]'
# """

# prompt_template_str = """
# You are a helpful assistant. Use the following context to answer the question in one sentence immediately after "Answer:".
# If you don't know the answer, just say "I don't know." Do not repeat or add extra text. Don't start your answer with "Please note that I\'m just an AI and not a legal expert. However, based on the provided context,". Just write a single sentence response

# Context:
# {context}

# Question: {question}
# Answer:
# """


# prompt_template_str = """
# You are a helpful assistant. Use the following context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

# Context:
# {context}

# Question: {question}
# Answer:
# """

# prompt_template_str = """You are a helpful assistant. Use the context given below to answer the question in one concise sentence. Do not include any options in the answer. If you don't know the answer, just say "I don't know.". Do not make up an answer.

# Context:
# {context}

# Question: {question}
# Answer:
# """

# prompt_template_str = """Use the following context to answer the question at the end.
# If you don't know the answer, just say that you don't know.
# Don't try to make up an answer. Avoid Repetition of same pointers.

# Context: {context}

# Question: {question}
# """

In [55]:
save_directory

'./Mistral-7B-Instruct-v0.2'

<h4> Globally Declaring the model and Tokenizer </h4>

In [5]:
# Module-level placeholders; both names are rebound by the later cell that
# assigns the result of fnLoadModelandTokenizer.
model = None
tokenizer = None

<h4> Helper Functions </h4>

In [82]:
# Function to Save the Downloaded Model and Tokenizer to Disk
# def fnSaveModelandTokenizer(model_name, save_directory):
#   quantization_config = BitsAndBytesConfig(
#       load_in_4bit=True,  # Load the model in 4-bit precision
#       bnb_4bit_use_double_quant=True,  # Use double quantization for better precision
#       bnb_4bit_quant_type="nf4",  # Use 4-bit NormalFloat quantization
#       bnb_4bit_compute_dtype=torch.float16)  # Use FP16 for computation
#   tokenizer = AutoTokenizer.from_pretrained(model_name)
#   model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, device_map="auto")
#   model.save_pretrained(save_directory)
#   tokenizer.save_pretrained(save_directory)
#   torch.save(model.state_dict(), os.path.join(save_directory, "pytorch_model.bin"))
#   model.config.save_pretrained(save_directory)
#   print(f"Model and tokenizer saved to '{save_directory}")
#   fnClearModelFromGPU(model)
#   print(f"Model Cleared from the GPU Memory")

def fnSaveModelandTokenizer(model_name, save_directory, ignore_patterns=None):
  """Download a Hugging Face Hub repository snapshot into a local directory.

  Args:
    model_name: Hub repository id (for example
      "mistralai/Mistral-7B-Instruct-v0.2").
    save_directory: Local directory that receives the repository files.
    ignore_patterns: Optional glob pattern, or list of patterns, naming files
      that should not be downloaded. The default (None) downloads every file
      in the repository, exactly as before. Passing for example ["*.bin"]
      avoids fetching a second copy of the weights when the repository also
      ships safetensors shards.
  """
  snapshot_download(
      repo_id=model_name,
      local_dir=save_directory,
      ignore_patterns=ignore_patterns)

# Loading the saved model and Tokenizer from Disk
def fnLoadModelandTokenizer(model_name, save_directory):
  """Load the tokenizer and a 4-bit quantised causal LM from a local directory.

  Args:
    model_name: Accepted for interface compatibility; this function does not
      read it - everything is loaded from `save_directory`.
    save_directory: Directory containing the model, config and tokenizer files.

  Returns:
    A `(model, tokenizer)` tuple.
  """
  # 4-bit NF4 weights with double quantisation; computation runs in FP16.
  bnb_settings = {
      "load_in_4bit": True,
      "bnb_4bit_use_double_quant": True,
      "bnb_4bit_quant_type": "nf4",
      "bnb_4bit_compute_dtype": torch.float16,
  }
  bnb_config = BitsAndBytesConfig(**bnb_settings)

  print("Loading tokenizer...")
  loaded_tokenizer = AutoTokenizer.from_pretrained(save_directory)
  print("Tokenizer successfully loaded.")

  from transformers import AutoConfig
  model_config = AutoConfig.from_pretrained(save_directory)

  # Instantiate the model with the quantisation settings applied at load time.
  print("Loading model...")
  loaded_model = AutoModelForCausalLM.from_pretrained(
      save_directory,
      config=model_config,
      quantization_config=bnb_config,
      device_map="auto",
  )
  return loaded_model, loaded_tokenizer

# Function to Download the Model and Tokenizer from HuggingFace
# def fnDownloadModelandTokenizer(model_name):
#   quantization_config = BitsAndBytesConfig(
#       load_in_4bit=True,  # Load the model in 4-bit precision
#       bnb_4bit_use_double_quant=True,  # Use double quantization for better precision
#       bnb_4bit_quant_type="nf4",  # Use 4-bit NormalFloat quantization
#       bnb_4bit_compute_dtype=torch.float16)  # Use FP16 for computation
#   tokenizer = AutoTokenizer.from_pretrained(model_name)
#   model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, device_map="auto")
#   return model,tokenizer

# Function to Unload the model from GPU
def fnClearModelFromGPU(model):
  """Move a model to the CPU and release cached CUDA memory.

  Args:
    model: Object exposing a `.cpu()` method, or None. None is accepted
      because the notebook's global `model` starts out as None; in that case
      only the garbage collection and cache flush are performed.

  Note:
    `del model` removes only this function's local reference. Any reference
    the caller still holds (for example the global `model`) keeps the object
    alive, so the caller has to drop it as well for the memory to be reclaimed.
  """
  if model is not None:
    model.cpu()
    del model
  gc.collect()
  torch.cuda.empty_cache()

# Function to Clean the PDF text
def clean_text(text):
    """Strip handbook headers and page numbers, then normalise whitespace.

    Steps, in order:
      1. Remove "<number> \\nMORGAN STANLEY | UK EMPLOYEE HANDBOOK 2022".
      2. Remove any remaining "MORGAN STANLEY | UK EMPLOYEE HANDBOOK 2022".
      3. Remove lines that consist only of a number (treated as page numbers).
      4. Replace newlines with spaces and collapse runs of whitespace.

    Only number-only lines are removed in step 3. A number that merely ends a
    line of text (e.g. "entitled to 25 \\ndays") is kept; an un-anchored
    pattern would delete such figures from the content.

    Args:
        text: Raw text of one PDF page.

    Returns:
        The cleaned, single-line text without leading/trailing whitespace.
    """
    header = r'MORGAN STANLEY \| UK EMPLOYEE HANDBOOK 2022'

    # Page number directly followed by the running header.
    text = re.sub(r'\d+\s+\n' + header, '', text)

    # Running header on its own.
    text = re.sub(header, '', text)

    # Standalone numbers: the line must start (after optional blanks) with the
    # digits and contain nothing else before the newline.
    text = re.sub(r'^[ \t]*\d+\s+\n', '', text, flags=re.MULTILINE)

    # Flatten to a single line and collapse whitespace runs.
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


# Read the PDF and create and save Vector Embeddings
def fnCreateVectorDB(file_path, persist_directory, pages_to_exclude):
  """Load a PDF, clean and chunk its pages, and store them in a Chroma DB.

  Reads the module-level `chunk_size` and `chunk_overlap` settings for the
  text splitter.

  Args:
    file_path: Path of the PDF to load.
    persist_directory: Directory passed to Chroma as its persist directory.
    pages_to_exclude: Container of page numbers; a page is dropped when
      `int(metadata['page'])` is in it.

  Returns:
    The Chroma vector store built from the chunks.
  """
  pdf_loader = PyMuPDFLoader(file_path)
  # This message is emitted before the pages are actually loaded.
  print('PDF Read Successfully')
  all_pages = pdf_loader.load()
  print(f'The document has {len(all_pages)} pages.')

  kept_pages = []
  for page in all_pages:
    if int(page.metadata['page']) in pages_to_exclude:
      continue
    kept_pages.append(page)
  print(f'The document has {len(kept_pages)} pages after removing irrelevant content.')

  # Clean every remaining page in place.
  for page in kept_pages:
    page.page_content = clean_text(page.page_content)
  print("Text Cleaning completed")

  splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
  chunks = splitter.split_documents(kept_pages)
  print(f"The document has {len(chunks)} chunks")

  embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
  chroma_db = Chroma.from_documents(
      documents=chunks,
      embedding=embedder,
      persist_directory=persist_directory,
  )
  chroma_db.persist()
  print("VectorDB created and saved successfully.")
  return chroma_db

# Creating a prompt template to pass to Langchain
def create_prompt_template():
  """Wrap the module-level `prompt_template_str` in a LangChain prompt.

  Returns:
    A `PromptTemplate` declaring the "context" and "question" input variables.
  """
  template_fields = ["context", "question"]
  return PromptTemplate(template=prompt_template_str, input_variables=template_fields)

# Creating a compression retriever
def create_compression_retriever(model, tokenizer, vector_store):
    """Build a retriever that compresses retrieved documents with the LLM.

    Reads the module-level `max_new_tokens` setting.

    Args:
        model: Model handed to the text-generation pipeline.
        tokenizer: Tokenizer handed to the text-generation pipeline.
        vector_store: Store whose `as_retriever` supplies the base retriever.

    Returns:
        A `ContextualCompressionRetriever` wrapping a top-5 base retriever.
    """
    # Text-generation pipeline wrapped so LangChain can use it as an LLM.
    # NOTE(review): temperature is passed without do_sample - confirm whether
    # sampling is intended.
    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        temperature=0.1,
        device_map="auto",
    )
    wrapped_llm = HuggingFacePipeline(pipeline=generator)

    # The extractor uses the LLM to compress each retrieved document.
    extractor = LLMChainExtractor.from_llm(wrapped_llm)

    # Base retriever returns the top 5 documents.
    top_k_retriever = vector_store.as_retriever(search_kwargs={"k": 5})

    return ContextualCompressionRetriever(
        base_compressor=extractor,
        base_retriever=top_k_retriever,
    )

def create_vector_retriever(model, tokenizer, vector_store=None):
    """Create a plain similarity retriever that returns the top 2 documents.

    Args:
        model: Unused; kept so existing calls keep working.
        tokenizer: Unused; kept so existing calls keep working.
        vector_store: Store to build the retriever from. When omitted, the
            module-level `vector_store` is used, which is what the function
            always did before this parameter existed.

    Returns:
        The retriever produced by `vector_store.as_retriever(...)`.

    Raises:
        NameError: If no store is passed and no global `vector_store` exists.
    """
    if vector_store is None:
        # Backward-compatible fallback to the global created elsewhere in the
        # notebook; passing the store explicitly avoids this hidden dependency.
        try:
            vector_store = globals()["vector_store"]
        except KeyError:
            raise NameError("name 'vector_store' is not defined") from None
    return vector_store.as_retriever(search_type="similarity",
                                     search_kwargs={"k": 2})

# Creating a Q and A chain
def create_qa_chain(model, tokenizer, retriever):
    """Assemble a "stuff" RetrievalQA chain around the given model.

    Reads the module-level `max_new_tokens` setting and prints the prompt
    template text as a side effect.

    Args:
        model: Model handed to the text-generation pipeline.
        tokenizer: Tokenizer handed to the text-generation pipeline.
        retriever: Retriever that supplies context documents to the chain.

    Returns:
        A `RetrievalQA` chain configured to also return source documents.
    """
    # Generation settings for the Hugging Face pipeline.
    generation_settings = {
        "model": model,
        "tokenizer": tokenizer,
        "max_new_tokens": max_new_tokens,
        "temperature": 0.1,
        "top_p": 0.9,
        "device_map": "auto",
    }
    text_generator = pipeline("text-generation", **generation_settings)

    # LangChain-compatible wrapper around the pipeline.
    wrapped_llm = HuggingFacePipeline(pipeline=text_generator)

    qa_prompt = create_prompt_template()
    print(qa_prompt.template)

    # All retrieved documents are stuffed into a single prompt.
    return RetrievalQA.from_chain_type(
        llm=wrapped_llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": qa_prompt},
        return_source_documents=True,
    )

# Function to Answer Questions from a Pandas Dataframe
def answer_questions_from_dataframe(qa_chain, df):
    """Answer every entry of `df["question"]` and store results in `df["answer"]`.

    The frame is modified in place (an "answer" column is written) and the
    same object is returned.

    Args:
        qa_chain: Object whose `invoke(question)` returns a mapping with a
            "result" entry.
        df: DataFrame with a "question" column.

    Returns:
        `df`, now carrying the "answer" column.
    """
    # tqdm renders a progress bar while the chain processes each question.
    progress = tqdm(df["question"], desc="Processing questions")
    raw_results = [qa_chain.invoke(item)["result"] for item in progress]

    # Reduce each raw generation to the text following the prompt marker.
    df["answer"] = list(map(extract_text_after_answer, raw_results))
    return df

# Extract the actual response and ignore the instruction and context
def extract_text_after_answer(text):
    """Return the post-processed text that follows the first "[/INST]'" marker.

    Args:
        text: Full generated text, expected to contain the prompt marker.

    Returns:
        The stripped and post-processed text after the marker, or "" when the
        marker does not occur in `text`.
    """
    marker = "[/INST]'"
    _, found, remainder = text.partition(marker)
    if not found:
        # Marker missing: nothing can be identified as the answer.
        return ""
    return post_process(remainder.strip())

# Creating a new function to eliminate junk text
def post_process(text: str) -> str:
    """Trim a generated answer down to its first paragraph and tidy it up.

    Everything from the first blank line ("\\n\\n") onwards is discarded. Text
    in parentheses or square brackets is removed, as are apostrophes that sit
    directly before '.', '?' or '!' or at the very end. Whitespace runs are
    then collapsed to single spaces.

    Args:
        text: Raw answer text.

    Returns:
        The cleaned answer.
    """
    # Keep only what precedes the first blank line.
    first_paragraph, _, _ = text.partition("\n\n")

    # Applied in this order; each entry is (pattern, replacement).
    removals = (
        (r'\(.*?\)', ''),           # text in parentheses
        (r'\[.*?\]', ''),           # text in square brackets
        (r"'\s*(?=[.?!])", ''),     # apostrophe right before punctuation
        (r"'\s*$", ''),             # apostrophe at the very end
    )
    cleaned = first_paragraph
    for pattern, replacement in removals:
        cleaned = re.sub(pattern, replacement, cleaned)

    # The removals can leave doubled spaces behind.
    return re.sub(r'\s+', ' ', cleaned).strip()

In [7]:
# Download the Hub snapshot of `model_name` into `save_directory`.
fnSaveModelandTokenizer(model_name, save_directory)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.52k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

pytorch_model-00003-of-00003.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

pytorch_model-00001-of-00003.bin:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

pytorch_model-00002-of-00003.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

In [8]:
# Load the 4-bit quantised model and its tokenizer from the local snapshot.
model, tokenizer = fnLoadModelandTokenizer(model_name, save_directory)

Loading tokenizer...
Tokenizer successfully loaded.
Loading model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
# Build the Chroma vector store from the handbook PDF, skipping excluded pages.
vector_store = fnCreateVectorDB(file_path, persist_directory, pages_to_exclude)

PDF Read Successfully
The document has 83 pages.
The document has 78 pages after removing irrelevant content.
Text Cleaning completed
The document has 612 chunks


  embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

VectorDB created and saved successfully.


  vectorstore.persist()


In [101]:
# Disabled alternative: LLM-based contextual compression retriever.
# compression_retriever = create_compression_retriever(model, tokenizer, vector_store)

# Plain similarity retriever; create_vector_retriever reads the global
# `vector_store`, so the vector-store cell must have been run first.
vectordb_retriever = create_vector_retriever(model, tokenizer)

In [102]:
# The compression retriever returned a lot of junk, so the chain is built on
# the plain vector retriever instead.
# qa_chain = create_qa_chain(model, tokenizer, compression_retriever)
qa_chain = create_qa_chain(model, tokenizer, vectordb_retriever)

Device set to use cuda:0


[INST] Use the following pieces of context to answer the question at the end in ONE SENTENCE ONLY.
If the context is not relevant to the question, then Helpful Answer : I don't know the answer to that". Do NOT give any explanation.
Don't try to make up an answer. Avoid Repetition.

Context: {context}

Question: {question}
Helpful Answer: [/INST]'



In [77]:
# Evaluation set: handbook questions first, followed by generic questions the
# handbook does not cover.
questions = [ "What are the core values outlined in the Morgan Stanley UK Employee Handbook?",
    "What is the notice period required for Professional Employees to terminate their employment?",
    "What is the notice period for Vice Presidents, Executive Directors, and Managing Directors?",
    "How many days of annual leave are all Professional Employees entitled to?",
    "How many days of annual leave are Vice Presidents entitled to?",
    "How many days of annual leave are Executive Directors entitled to?",
    "How many days of annual leave are Managing Directors entitled to?",
    "What additional service-related annual leave entitlement is provided based on years of service?",
    "What is the procedure for notifying your manager about annual leave?",
    "What is the maximum number of annual leave days that can be carried over?",
    "What is the dress code policy for employees at Morgan Stanley?",
    "What constitutes appropriate business casual attire as per the handbook?",
    "What is Morgan Stanley’s policy on smoking within its premises?",
    "What guidelines are provided regarding substance abuse at Morgan Stanley?",
    "What are the rules for using internet and electronic communications at Morgan Stanley?",
    "What policies govern the use of firm systems?",
    "What does the confidentiality clause require from employees during and after employment?",
    "What responsibilities do employees have regarding intellectual property created while employed?",
    "What does the handbook state about data protection and privacy?",
    "How should employees keep their personal details updated according to the handbook?",
    "What are the guidelines on outside interests and external employment?",
    "What are the rules regarding employees' use of social media?",
    "What is the procedure for reporting sickness absence?",
    "How is sick pay determined as described in the handbook?",
    "What does the handbook specify about mandatory vacation requirements?",
    "How many consecutive weeks of annual leave are employees generally allowed to take?",
    "What is the process for handling disciplinary procedures at Morgan Stanley?",
    "What steps are outlined in the grievance procedure?",
    "How are deductions from remuneration handled in the handbook?",
    "What are the terms and conditions regarding fixed compensation and base salary?",
    "How is the base salary calculated for employees who start mid-month?",
    "What is required of employees in terms of obtaining and providing references?",
    "What does the handbook state about overtime payments?",
    "What rights do employees have under the confidentiality and intellectual property clauses?",
    "What provisions are made for employee benefits?",
    "What is the policy regarding family leave (maternity, paternity, adoption, etc.)?",
    "How are collective agreements addressed in the handbook?",
    "What guidelines are provided regarding equal opportunities?",
    "What does the handbook state about dignity at work and harassment?",
    "What is the process for suspending an employee under the handbook?",
    "Under what conditions can an employee be dismissed without notice?",
    "What is the procedure for returning Morgan Stanley property upon termination?",
    "What guidelines are given for reimbursement of business-related expenses?",
    "What does the handbook say about health and safety at work?",
    "What are the provisions for flexible working arrangements?",
    "What is the process for reporting health issues as per the handbook?",
    "How does the handbook define and manage outside interests?",
    "What standards of conduct are expected as per the Code of Conduct?",
    "What responsibilities do Senior Managers have under the regulatory rules?",
    "What is the procedure for the annual fitness and propriety assessment?",

    # 15 generic questions outside the handbook context; they exercise the
    # prompt's "I don't know the answer to that" instruction.
    "What is the capital city of France?",
    "Who is the current President of the United States?",
    "What is the tallest mountain in the world?",
    "What is the fastest land animal?",
    "How many continents are there on Earth?",
    "What is the boiling point of water in Celsius?",
    "Who wrote 'Romeo and Juliet'?",
    "What is the largest ocean on Earth?",
    "How many planets are in our solar system?",
    "What is the chemical symbol for gold?",
    "What is the speed of light in a vacuum?",
    "Who invented the telephone?",
    "What is the population of Tokyo?",
    "What is the national language of Brazil?",
    "What is the primary ingredient in traditional Japanese miso soup?"
]

# One row per question; answer_questions_from_dataframe reads the "question" column.
dfQandA = pd.DataFrame(questions, columns=["question"])


In [78]:
%%time
# Run every question through the chain; the frame gains an "answer" column.
dfQandA = answer_questions_from_dataframe(qa_chain, dfQandA)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

CPU times: user 3min 24s, sys: 5.97 s, total: 3min 30s
Wall time: 3min 35s





In [80]:
# Write the questions and answers to an Excel workbook, without the index column.
dfQandA.to_excel("RAG_Questions_and_Answers.xlsx", index=False)