In [None]:
import os
import time

# For pdf parsing, chunks and vectorization
import pdfplumber
import PyPDF4
import re              

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# For open source models
from langchain.llms import HuggingFaceHub, Cohere

# For conversational LLM
from langchain.chains import ConversationalRetrievalChain
from langchain.schema import HumanMessage, AIMessage
from langchain import PromptTemplate

# API Keys
from dotenv import load_dotenv

In [None]:
# Extracting meta data from the pdf

def extract_metadata_from_pdf(file_path):
    """Read the title, author and creation date from a PDF's info dictionary.

    Args:
        file_path: Path to the PDF file.

    Returns:
        dict with the keys "title", "author" and "creation_date". Every value
        is a stripped string; a field that is missing (or a PDF without any
        info dictionary) yields "".
    """
    with open(file_path, "rb") as pdf_file:
        reader = PyPDF4.PdfFileReader(pdf_file)
        # getDocumentInfo() returns None when the PDF has no info dictionary,
        # so fall back to an empty mapping instead of failing on None.get().
        info = reader.getDocumentInfo() or {}

        def _field(key):
            # A field may be absent or None; normalise it to a clean string.
            value = info.get(key)
            return "" if value is None else str(value).strip()

        return {
            "title": _field("/Title"),
            "author": _field("/Author"),
            "creation_date": _field("/CreationDate"),
        }

In [None]:
# Extracting individual pages from pdf

def extract_pages_from_pdf(file_path):
    """Extract the text of every non-empty page of a PDF.

    Args:
        file_path: Path to the PDF file.

    Returns:
        list of (page_number, text) tuples with page numbers starting at 1.
        Pages whose extracted text is missing, empty or whitespace-only are
        skipped.

    Raises:
        FileNotFoundError: if file_path does not point to an existing file.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    pages = []
    with pdfplumber.open(file_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # extract_text() may hand back None for a page without text
            # (seen in some pdfplumber versions); treat that as empty.
            text = page.extract_text() or ""
            if text.strip():
                pages.append((page_num, text))
    return pages

In [None]:
# Function calls for extracting meta data and individual pages

def parse_pdf(file_path):
    """Load a PDF's document metadata and its page texts in one call.

    Args:
        file_path: Path to the PDF file.

    Returns:
        (pages, metadata) where pages is the result of
        extract_pages_from_pdf and metadata is the result of
        extract_metadata_from_pdf.

    Raises:
        FileNotFoundError: if file_path does not point to an existing file.
    """
    if os.path.isfile(file_path):
        # Metadata is read first, then the pages.
        document_info = extract_metadata_from_pdf(file_path)
        page_texts = extract_pages_from_pdf(file_path)
        return page_texts, document_info

    raise FileNotFoundError(f"File not found: {file_path}")

In [None]:
# Text cleaning tasks

# Merging hyphen words
def merge_hyphenated_words(text):
    """Rejoin words that were split by a hyphen at a line break.

    A word character, a hyphen, a newline and another word character are
    reduced to just the two word characters.
    """
    hyphen_break = re.compile(r"(\w)-\n(\w)")
    return hyphen_break.sub(r"\1\2", text)

# Replacing single newlines with spaces so wrapped lines join into one string
def fix_newlines(text):
    """Turn every isolated newline into a space.

    A newline counts as isolated when it is neither preceded nor followed by
    another newline; runs of two or more newlines are left untouched.
    """
    lone_newline = re.compile(r"(?<!\n)\n(?!\n)")
    return lone_newline.sub(" ", text)

# Collapsing runs of multiple newlines into a single newline
def remove_multiple_newlines(text):
    """Collapse each run of two or more newlines into one newline."""
    newline_run = re.compile(r"\n{2,}")
    collapsed = newline_run.sub("\n", text)
    return collapsed

In [None]:
# cleaning pages and returning cleaned pages

def clean_text(pages, cleaning_functions):
    """Run every cleaning function, in order, over the text of each page.

    Args:
        pages: Iterable of (page_number, text) tuples.
        cleaning_functions: Iterable of callables, each taking a string and
            returning the cleaned string.

    Returns:
        list of (page_number, cleaned_text) tuples in the input order.
    """
    def _run_pipeline(raw):
        # Feed the output of each step into the next one.
        result = raw
        for step in cleaning_functions:
            result = step(result)
        return result

    return [(number, _run_pipeline(body)) for number, body in pages]

In [None]:
# Splitting the document into smaller chunks

def text_to_docs(text, metadata, chunk_size=1000, chunk_overlap=200):
    """Split page texts into overlapping chunks wrapped as Document objects.

    Args:
        text: Iterable of (page_number, page_text) tuples.
        metadata: Mapping of document-level fields that is copied into the
            metadata of every chunk.
        chunk_size: Chunk size handed to the text splitter (default 1000).
        chunk_overlap: Overlap between consecutive chunks handed to the text
            splitter (default 200).

    Returns:
        list of Document objects, one per chunk. Each carries the chunk text
        plus "page_number", "chunk" (index within the page), "source"
        (formatted "p<page>-<chunk>") and the entries of ``metadata``.
    """
    # The splitter configuration is the same for every page, so build it once
    # instead of once per page.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=chunk_overlap,
    )

    doc_chunks = []
    for page_num, page in text:
        for i, chunk in enumerate(text_splitter.split_text(page)):
            doc_chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page_number": page_num,
                        "chunk": i,
                        "source": f"p{page_num}-{i}",
                        **metadata,
                    },
                )
            )

    return doc_chunks

Make sure you complete the following tasks before proceeding
    - rename "copy.env" file to ".env" 
    - Add your API keys for huggingface and Cohere. 

In [None]:
# Load the API keys (HuggingFace / Cohere) from the ".env" file described in
# the setup note above.
load_dotenv()

Add your PDF to "pdfs" folder and add the path in the "file_path" variable.

In [None]:
# Parse PDF
file_path = "pdfs/example1.pdf"               # Update the file path here
raw_pages, metadata = parse_pdf(file_path)

# Title used later in the LLM prompt: the metadata title up to its first ".".
title = metadata['title'].split(".")[0]
if not title.strip():
    # Many PDFs carry no title metadata; fall back to the file name (without
    # extension) so the prompt never refers to an empty title.
    title = os.path.splitext(os.path.basename(file_path))[0]

# Create text chunks: the cleaning functions run in this order on every page
cleaning_functions = [
    merge_hyphenated_words,
    fix_newlines,
    remove_multiple_newlines,
]
cleaned_text_pdf = clean_text(raw_pages, cleaning_functions)
document_chunks = text_to_docs(cleaned_text_pdf, metadata)

In [None]:
# Embed the document chunks with the default HuggingFace embedding model and
# store them in a Chroma collection named "pdf-chat" under the "chroma" folder.
# `embeddings` is reused by make_chain() when it reopens the same collection.
embeddings = HuggingFaceEmbeddings()

# NOTE(review): re-running this cell presumably adds the same chunks to the
# persisted collection again (duplicate search hits) — confirm, and clear the
# "chroma" folder before re-indexing if so.
vector_store = Chroma.from_documents(
    document_chunks,
    embeddings,
    collection_name="pdf-chat",
    persist_directory="chroma",
)

# Save DB locally
vector_store.persist()

Run either one of the two blocks. 

Load model using HuggingFace
    - You can choose any LLM from hugging face.
    - google/flan-t5 and its variants, or gpt2, are among the many models available
    - If you change the model, some of the arguments may need to be adjusted

In my testing, Cohere's model has given consistent performance with reliable results. You may also use OpenAI's LLM by modifying the code for GPT4 / ChatGPT

In [None]:
# Function for loading parameters of the models from hugging face.

def make_chain():
    """Build a ConversationalRetrievalChain backed by a HuggingFace Hub model.

    Reads the notebook-level variables ``title`` (document title used in the
    prompt) and ``embeddings`` (embedding function for the vector store).

    Returns:
        The chain created by ConversationalRetrievalChain.from_llm, configured
        to return its source documents.
    """
    # Loading model params
    model = HuggingFaceHub(
        repo_id="google/flan-t5-xxl", # or you can choose "gpt2" etc
        verbose=False,
        model_kwargs={"temperature": 1, "max_length": 100},
    )

    # Defining LLM template.
    # Braces in the title are doubled so the template engine does not read
    # them as placeholders; the parts are separated by a space / newlines
    # (they used to run together as "titled<title>Question:").
    safe_title = str(title).replace("{", "{{").replace("}", "}}")
    template = (
        "Answer the following question using information from the document titled "
        + safe_title
        + "\nQuestion: {question}\n"
        "Answer: Let's think step by step and provide accurate answer(s)."
    )
    prompt = PromptTemplate(template=template, input_variables=["question"])

    # Utilizing the stored vector representation
    vector_store = Chroma(
        collection_name="pdf-chat",
        embedding_function=embeddings,
        persist_directory="chroma",
    )

    # NOTE(review): the prompt is passed as condense_question_prompt, which
    # presumably is the slot used to rephrase follow-up questions rather than
    # to answer them — confirm this is the intended place for it.
    return ConversationalRetrievalChain.from_llm(
        llm=model,
        condense_question_prompt=prompt,
        retriever=vector_store.as_retriever(),
        return_source_documents=True,
        verbose=False,
    )

Load model using Cohere

In [None]:
def make_chain():
    """Build a ConversationalRetrievalChain backed by a Cohere model.

    Reads the notebook-level variables ``title`` (document title used in the
    prompt) and ``embeddings`` (embedding function for the vector store).

    Returns:
        The chain created by ConversationalRetrievalChain.from_llm, configured
        to return its source documents.
    """
    # Loading model params
    model = Cohere(
        verbose=True,
        temperature=0,
        max_tokens=200,
    )

    # Defining LLM template.
    # Braces in the title are doubled so the template engine does not read
    # them as placeholders; the parts are separated by a space / newlines
    # (they used to run together as "titled<title>Question:").
    safe_title = str(title).replace("{", "{{").replace("}", "}}")
    template = (
        "Answer the following question using information from the document titled "
        + safe_title
        + "\nQuestion: {question}\n"
        "Answer: Let's think step by step and provide accurate answer(s)."
    )
    prompt = PromptTemplate(template=template, input_variables=["question"])

    # Utilizing the stored vector representation
    vector_store = Chroma(
        collection_name="pdf-chat",
        embedding_function=embeddings,
        persist_directory="chroma",
    )

    # NOTE(review): the prompt is passed as condense_question_prompt, which
    # presumably is the slot used to rephrase follow-up questions rather than
    # to answer them — confirm this is the intended place for it.
    return ConversationalRetrievalChain.from_llm(
        llm=model,
        condense_question_prompt=prompt,
        retriever=vector_store.as_retriever(),
        return_source_documents=True,
        verbose=False,
    )

Run the section below (question templates available below this block). Enter exit or quit to end the session.

In [None]:
# Interactive question/answer session against the indexed PDF.
chain = make_chain()
chat_history = []
vectordbkwargs = {"search_distance": 0.9}

while True:
    question = input("Question: ").strip()

    # "quit" / "exit" end the session regardless of letter case
    if question.lower() in ("quit", "exit"):
        print("Ending session")
        break
    if not question:
        # Nothing was typed; ask again instead of querying with an empty string
        continue

    # Running similarity search to check sources of identified parts in the document.
    # One scored search serves both printouts: the documents are the first
    # element of each (document, score) pair.
    simSearchScore = vector_store.similarity_search_with_score(query=question)
    simSearch = [doc for doc, _ in simSearchScore]
    print("\nSimilarity Search:\n{}".format(simSearch))
    if simSearchScore:
        # Score of the last returned hit
        print("Similarity Search Score:\n{}\n".format(simSearchScore[-1][-1]))
    else:
        print("Similarity Search Score:\nno matching chunks found\n")

    # Generate answer
    response = chain({"question": question, "chat_history": chat_history, "vectordbkwargs": vectordbkwargs})
    time.sleep(1)

    # Retrieve answer and record the exchange so follow-up questions have context
    answer = response["answer"]
    source = response["source_documents"]
    chat_history.append(HumanMessage(content=question))
    chat_history.append(AIMessage(content=answer))

    print(f"Question: {question}\nAnswer: {answer}\n\n")

Questions:
List some classes from the Mechanical & Aerospace Engineering Concentration for the Robotics graduate program
Is there a GRE waiver for application of Robotics program at Arizona State University
What is the email address of School of Electrical, Computer, and Energy Engineering department at Arizona State University
The Graduate Student Handbook for Robotics program is intended for which academic year?
Give an overview of the Robotics & Autonomous Systems program
Can Artificial Intelligence Concentration students opt for portfolio option?
What's the difference between Curricular Practical Training and Optional Practical Training
What is the GPA requirement for the electrical engineering program?

In [None]:
# Hard-coded question/answer pairs; each entry holds a sample "query" and its "answer"
examples = [
    {"query": query, "answer": answer}
    for query, answer in (
        (
            "Is there a GRE waiver for application of Robotics program at Arizona State University",
            "Yes",
        ),
        (
            "What is the email address of School of Electrical, Computer, and Energy Engineering department at Arizona State University",
            "eceegrad@asu.edu",
        ),
        (
            "Can Artificial Intelligence Concentration students opt for portfolio option?",
            "Yes",
        ),
        (
            "The Graduate Student Handbook for Robotics progam is intended for which academic year?",
            "2023-2024 Academic year",
        ),
        (
            "List some classes from the Mechanical & Aerospace Engineering Concentration for the Robotics graduate program",
            "MAE 502 Partial Differential Equations, MAE 503 Finite Elements in Engineering, MAE 507 Fundamentals of Control and Optimization, MAE 508 Digital Control: Design and Implementation, MAE 509 LMI Methods in Optimal and Robust Control, MAE 510 Dynamics and Vibrations, MAE 514 Vibration Analysis, MAE 520 Stress Analysis, MAE 521 Structural Optimization, MAE 542 Design Geometry and Kinematics, MAE 548 Prob Methods for Eng Des/Analy, MAE 566 Rotary-Wing Aerodynamics, MAE 598 Bio-Inspired Robots, MAE 598 Multi-Robot Systems, MAE 598 Quantum Mech Eng: SW and HW of Quantum Computers, EGR 546 Robotic Systems- II, IEE 576 Network Optimization and Algorithms",
        ),
    )
]