**Installing Required Dependencies**

In [1]:
!pip install langchain-openai langchain langchain-community langchain-core faiss-gpu PyPDF2



Collecting langchain-openai
  Downloading langchain_openai-0.1.9-py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m997.8 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.2.5-py3-none-any.whl (974 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.6/974.6 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-community
  Downloading langchain_community-0.2.5-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core
  Downloading langchain_core-0.2.9-py3-none-any.whl (321 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.8/321.8 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━

**Importing Required Modules**

In [2]:
import os
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI

from PyPDF2 import PdfReader

**Setting the Environment Variable for the API Key (set your own key for the notebook to work)**

In [3]:
import getpass

# Ask for the key only when it is not already present in the environment, so an
# existing OPENAI_API_KEY is never overwritten with an empty string and the secret
# is never stored in the notebook source.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")


**Helper Functions**

In [12]:
def get_pdf_content(documents):
    """
    Read every page of every given PDF and return all of the text as one string.

    Parameters:
    documents (list): Paths of the PDF files to read, processed in the given order.

    Returns:
    str: The page texts joined back-to-back (no separator), in document and page order.
    """
    # Walk documents and their pages lazily; each document gets its own PdfReader
    page_texts = (
        page.extract_text()
        for document in documents
        for page in PdfReader(document).pages
    )

    # Concatenate once at the end instead of growing a string page by page
    return "".join(page_texts)


In [13]:
def get_chunks(text, chunk_size=1000, chunk_overlap=200, separator="\n"):
    """
    Splits a given text into smaller, overlapping chunks for easier processing.

    Parameters:
    text (str): The text to be split into chunks.
    chunk_size (int): Maximum length of each chunk, measured with len(). Defaults to 1000.
    chunk_overlap (int): Number of characters shared between consecutive chunks. Defaults to 200.
    separator (str): String on which the text is split before chunks are assembled.
        Defaults to the newline character.

    Returns:
    list: A list of text chunks.
    """
    # The defaults reproduce the previously hard-coded configuration, so existing
    # callers that pass only `text` get exactly the same splitter as before.
    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )

    # Split the text into chunks using the configured splitter
    return text_splitter.split_text(text)


In [14]:
def get_embeddings(chunks):
    """
    Generates vector embeddings for a list of text chunks and stores them in a FAISS index.

    Parameters:
    chunks (list): A list of text chunks to be embedded. Must contain at least one chunk.

    Returns:
    FAISS: A FAISS index containing the vector embeddings of the text chunks.

    Raises:
    ValueError: If `chunks` is empty (for example, when no text could be extracted
        from the source PDFs).
    """
    # Fail early with an actionable message: an index cannot be built from zero texts,
    # and without this guard the failure surfaces later as an opaque error from the
    # vector-store construction (after an embeddings client has already been created).
    if not chunks:
        raise ValueError(
            "No text chunks to embed; check that the input PDFs contain extractable text."
        )

    # Create an OpenAIEmbeddings object to generate embeddings for the text chunks
    embeddings = OpenAIEmbeddings()

    # Build the FAISS vector store from the chunks and their embeddings
    vector_storage = FAISS.from_texts(texts=chunks, embedding=embeddings)

    # Return the FAISS index containing the vector embeddings
    return vector_storage


In [15]:
def start_conversation(vector_embeddings):
    """
    Initializes a conversational retrieval chain with a language model and memory for context.

    Parameters:
    vector_embeddings (FAISS): A FAISS index containing vector embeddings for the text chunks.

    Returns:
    ConversationalRetrievalChain: A conversational retrieval chain object for handling interactive Q&A.
    """
    # Import the maintained ChatOpenAI from langchain_openai locally: the module-level
    # name comes from the deprecated `langchain.chat_models` path, which emits a
    # deprecation warning and is scheduled for removal.
    from langchain_openai import ChatOpenAI

    # Chat model used both to condense follow-up questions and to generate answers
    llm = ChatOpenAI()

    # Conversation history buffer
    # - memory_key: the key under which the conversation history is stored
    # - return_messages: return the history as message objects rather than one string
    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True
    )

    # Assemble the retrieval chain
    # - llm: the language model to be used for generating responses
    # - retriever: retriever view over the vector store, used to find relevant chunks
    # - memory: the memory object that stores the conversation history
    conversation = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vector_embeddings.as_retriever(),
        memory=memory
    )

    # Return the initialized conversational retrieval chain
    return conversation


In [16]:
def query_system(conversation, question):
    """
    Queries the conversational retrieval system with a given question and returns the answer.

    Parameters:
    conversation (ConversationalRetrievalChain): The conversational retrieval chain object.
        Any object exposing `invoke(inputs)`, or any plain callable taking the inputs
        dict, is accepted.
    question (str): The question to be asked.

    Returns:
    str: The answer provided by the conversational retrieval system.
    """
    payload = {"question": question}

    # Prefer `.invoke()`: calling a chain object directly is deprecated and emits a
    # deprecation warning. Fall back to a direct call for objects without `invoke`.
    run_chain = getattr(conversation, "invoke", None)
    if run_chain is None:
        run_chain = conversation

    response = run_chain(payload)

    # Return the answer from the response
    return response["answer"]


**Calling Functions to create Embeddings of Document**

In [17]:
# Absolute paths of the PDF files to index, one entry per document
document_paths = [
    "/content/RAG Input Doc.pdf",
    "/content/usig_sports_rules_-_cricket.pdf",
]


In [18]:
# Extract each PDF on its own so the documents can be joined with a single space
document_texts = list(
    map(lambda pdf_path: get_pdf_content([pdf_path]), document_paths)
)
combined_text = " ".join(document_texts)

# Chunk the combined text, embed the chunks into a FAISS index, then build the chain
text_chunks = get_chunks(combined_text)
vector_embeddings = get_embeddings(text_chunks)
conversation = start_conversation(vector_embeddings)


**Answering questions using RAG**

**These are some of the sample questions that I will be typing in and asking**

In [None]:
# Sample questions for manual testing. The interactive loop in the next cell reads
# questions from input() and does not reference this list.
questions = [
    # Questions about research papers
    "Which paper received the highest number of stars per hour?",
    "What is the focus of the 'MeshAnything' project?",
    "Which paper discusses the integration of Large Language Models with Monte Carlo Tree Search?",
    "What advancements does the 'VideoLLaMA 2' paper propose?",
    "Which paper was published most recently?",
    "Identify a paper that deals with language modeling and its scalability",
    "Which paper aims at improving accuracy in Google-Proof Question Answering?",
    # Triple-quoted because the question itself contains both single and double quotes
    '''List the categories covered by the paper titled 'TextGrad: Automatic "Differentiation" via Text'.''',
    # Questions about cricket rules
    "What are umpires in cricket?",
    "How many overs are there in a T20 match?",
    "What are boundary decisions",
    "What happens when match is a tie",
    "How many overs can a bowler bowl?"
]

In [11]:
# Interactive Q&A session: keeps prompting until the user types 'exit'
while True:
    # Trim surrounding whitespace so inputs such as " exit " still end the session
    question = input("Enter your question (or 'exit' to quit): ").strip()
    if question.lower() == 'exit':
        break
    if not question:
        # Nothing was typed; prompt again instead of sending an empty query to the chain
        continue
    # Go through the shared helper so the chain is queried the same way everywhere
    answer = query_system(conversation, question)
    print(f"Answer: {answer}\n")

Enter your question (or 'exit' to quit): Which paper received the highest number of stars per hour?


  warn_deprecated(


Answer: The paper "Scalable MatMul-free Language Modeling" by ridgerchu/matmulfreellm received the highest number of stars per hour with 2,140 stars per hour.

Enter your question (or 'exit' to quit): What is the focus of the 'MeshAnything' project?"
Answer: The focus of the 'MeshAnything' project is on Artist-Created Mesh Generation with Autoregressive Transformers.

Enter your question (or 'exit' to quit): Which paper discusses the integration of Large Language Models with Monte Carlo Tree Search?
Answer: The paper that discusses the integration of Large Language Models with Monte Carlo Tree Search is titled "Accessing GPT-4 level Mathematical Olympiad Solutions via Monte Carlo Tree Self-refine with LLaMa-3 8B" by trotsky1997/mathblackbox.

Enter your question (or 'exit' to quit): What advancements does the 'VideoLLaMA 2' paper propose?
Answer: The "VideoLLaMA 2" paper proposes advancements in spatial-temporal modeling and audio understanding in Video Large Language Models (Video-LLM