# RAG framework

In [None]:
%pip -q install git+https://github.com/huggingface/transformers # need to install from github
%pip install langchain tiktoken chromadb InstructorEmbedding
%pip install accelerate loralib bitsandbytes sentencepiece xformers einops
%pip install sentence-transformers

In [None]:
# Imports
import os

import torch
import transformers
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from IPython.display import Markdown, display
import chromadb

# Select the compute device: 'cuda' when torch reports CUDA as available, otherwise 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Downloading the LLM and its tokenizer.
# The checkpoint id is defined once so the model and tokenizer cannot drift apart.
LLM_MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"

model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_ID, torch_dtype="auto", device_map='auto')
# torch_dtype is a model-loading option; it is not passed to the tokenizer
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)

In [None]:
# Load the corpus: files in corpus_directory matching "*.txt" are read with TextLoader
corpus_directory = 'Enter path to corpus'
loader = DirectoryLoader(
    corpus_directory,
    glob="*.txt",
    loader_cls=TextLoader,
)
documents = loader.load()

# Split the loaded documents into overlapping chunks (chunk_size=4096, chunk_overlap=200)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4096,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

# Report how many chunks the splitter produced
num_chunks = len(texts)
print(f"Number of text chunks: {num_chunks}")

In [None]:
# Download embedding model

from langchain.embeddings import HuggingFaceBgeEmbeddings
model_name = "BAAI/bge-large-en"
encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity

# Embedding model used both to build and to query the vector database
model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    # Use the device chosen in the imports cell ('cuda' if available, else 'cpu')
    # instead of hard-coding 'cuda', so the notebook also runs on CPU-only machines
    model_kwargs={'device': device},
    encode_kwargs=encode_kwargs
)

In [None]:
persist_directory = 'Specify the path of directory to store the embeddings' # chunksize = 4096

# Making a vector database.
# Chroma.from_documents adds the chunks to whatever is already stored in
# persist_directory, so re-running this cell would embed and store every chunk
# again and retrieval would start returning duplicates. When the directory already
# holds a database it is reopened instead; delete the directory to force a rebuild
# (e.g. after changing the corpus or the chunking parameters).
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(persist_directory=persist_directory,
                      embedding_function=model_norm)
else:
    vectordb = Chroma.from_documents(documents=texts,
                                    embedding=model_norm,
                                    persist_directory=persist_directory)

In [None]:
# If loading from existing vector database:
# reopen the store kept in persist_directory, using the same embedding model (model_norm)
# that is passed as the query-time embedding function
vectordb = Chroma(persist_directory=persist_directory, embedding_function=model_norm)

In [None]:
# Set up the retriever on top of the vector database.
# search_kwargs={"k": 3} asks for 3 results per query; these feed the RAG chain below.
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

In [None]:
# Functions to process response and return the source
import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its existing line breaks.

    The text is split on newline characters, every resulting line is wrapped on
    its own with ``textwrap.fill``, and the wrapped lines are joined back
    together with newlines.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to ``textwrap.fill`` (default 110).

    Returns:
        The re-wrapped string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """Print the wrapped answer followed by the source of every retrieved document.

    Args:
        llm_response: Mapping with a 'result' string and a 'source_documents'
            sequence whose items expose ``metadata['source']``.

    Returns:
        The first source path, or None when there are no source documents.
        The return value is kept so that callers which display it keep working.
    """
    print(wrap_text_preserve_newlines(llm_response['result']))
    print('\n\nSources:')
    # Earlier, a `return` inside the loop exited on the first document, so nothing
    # was printed under "Sources:" and all but one source were dropped.
    sources = [doc.metadata['source'] for doc in llm_response["source_documents"]]
    for source_path in sources:
        print(source_path)
    return sources[0] if sources else None

In [None]:
# Create workflow for LLM
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # NOTE(review): max_length limits prompt + generated tokens together; with long
    # retrieved context consider max_new_tokens instead — confirm against corpus size
    max_length = 5000,
    # Greedy decoding. The earlier settings (do_sample=True with temperature=0.0001 and
    # top_p=0.95) emulated this through sampling, which is not guaranteed to be
    # deterministic and can be numerically fragile at such a low temperature.
    do_sample=False,
    repetition_penalty=1.35
)
# Wrap the transformers pipeline so LangChain can use it as an LLM
local_llm = HuggingFacePipeline(pipeline = pipe)

# Create RAG chain: "stuff" chain over the retriever, also returning the source documents
qa_chain = RetrievalQA.from_chain_type(llm=local_llm,
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

In [None]:
# Enter query: replace the placeholder text with the question to ask
query = "Input query"
# Run the RAG chain; the response carries 'result' and 'source_documents'
llm_response = qa_chain(query)
# Print the wrapped answer and report where it came from
process_llm_response(llm_response)

# Importing questions from a file

In [None]:
# Placeholders: replace both strings with real paths before running the batch cell below
# Text file that is read line by line, one question per line
input_file_path = 'Enter path of input text file containing one question per line'
# File that receives each question, its response and its sources (opened in 'w' mode, so it is overwritten)
output_file_path = 'Enter path of output file to write questions and responses'

In [None]:
# Answer every question in the input file and write question, response and sources
# to the output file. The encoding is explicit so that non-ASCII characters in the
# questions or in the model output do not depend on the platform's default encoding.
with open(input_file_path, 'r', encoding='utf-8') as input_file, \
        open(output_file_path, 'w', encoding='utf-8') as output_file:
    # Loop through each line in the input file
    for line in input_file:
        # Remove leading and trailing whitespaces
        question = line.strip()

        # Blank lines are not questions; skip them rather than querying the chain
        if not question:
            continue

        # Send the question to the qa_chain function to get a response
        llm_response = qa_chain(question)

        # Write the question and response to the output file
        output_file.write(f"Question: {question}\n")
        output_file.write("Response:\n")
        output_file.write(wrap_text_preserve_newlines(llm_response['result']) + '\n')
        output_file.write('\nSources:\n')

        # Loop through the source documents in the response
        for source in llm_response["source_documents"]:
            output_file.write(source.metadata['source'] + '\n')

        output_file.write('\n\n')  # Separate each question-response pair

print("Questions and responses written to the output file.")