In [13]:
import os
import time

# For pdf parsing, chunks and vectorization
import pdfplumber
import PyPDF4
import re              

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# For open source models
from langchain.llms import HuggingFaceHub, Cohere

# For converastional LLM 
from langchain.chains import ConversationalRetrievalChain
from langchain.schema import HumanMessage, AIMessage
from langchain import PromptTemplate

# API Keys
from dotenv import load_dotenv

In [2]:
# Extracting meta data from the pdf

def extract_metadata_from_pdf(file_path):
    with open(file_path, "rb") as pdf_file:
        reader = PyPDF4.PdfFileReader(pdf_file) 
        metadata = reader.getDocumentInfo()
        return {
            "title": metadata.get("/Title", "").strip(),
            "author": metadata.get("/Author", "").strip(),
            "creation_date": metadata.get("/CreationDate", "").strip(),
        }

In [3]:
# Extracting individual pages from pdf

def extract_pages_from_pdf(file_path):
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    with pdfplumber.open(file_path) as pdf:
        pages = []
        for page_num, page in enumerate(pdf.pages):
            text = page.extract_text()
            if text.strip():  # Check if extracted text is not empty
                pages.append((page_num + 1, text))
    return pages

In [4]:
# Function calls for extracting meta data and individual pages

def parse_pdf(file_path):
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    metadata = extract_metadata_from_pdf(file_path)
    pages = extract_pages_from_pdf(file_path)

    return pages, metadata

In [5]:
# Text cleaning tasks

# Merging hyphen words
def merge_hyphenated_words(text):
    return re.sub(r"(\w)-\n(\w)", r"\1\2", text)

# Adding new lines to the same string
def fix_newlines(text):
    return re.sub(r"(?<!\n)\n(?!\n)", " ", text)

# Removing mulitple lines and adding them to the same string 
def remove_multiple_newlines(text):
    return re.sub(r"\n{2,}", "\n", text)

In [6]:
# cleaning pages and returning cleaned pages

def clean_text(pages, cleaning_functions):
    cleaned_pages = []
    for page_num, text in pages:
        for cleaning_function in cleaning_functions:
            text = cleaning_function(text)
        cleaned_pages.append((page_num, text))
    return cleaned_pages

In [7]:
# Making document into smaller cunks

def text_to_docs(text, metadata):
    doc_chunks = []

    for page_num, page in text:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
            chunk_overlap=200,
        )
        chunks = text_splitter.split_text(page)
        for i, chunk in enumerate(chunks):
            doc = Document(
                page_content=chunk,
                metadata={
                    "page_number": page_num,
                    "chunk": i,
                    "source": f"p{page_num}-{i}",
                    **metadata,
                },
            )
            doc_chunks.append(doc)

    return doc_chunks

In [8]:
# Loading API keys
load_dotenv()

True

In [9]:
# Parse PDF
file_path = "/Users/affanbinusman/Dropbox (ASU)/Mac (2)/Desktop/untitled folder 2/LangchainLLM/a2/data/example1.pdf"
raw_pages, metadata = parse_pdf(file_path)

# Create text chunks
cleaning_functions = [
    merge_hyphenated_words,
    fix_newlines,
    remove_multiple_newlines,
]
cleaned_text_pdf = clean_text(raw_pages, cleaning_functions)
document_chunks = text_to_docs(cleaned_text_pdf, metadata)

In [10]:
# Using embeddings from HuggingFace and saving this embeddings file
embeddings = HuggingFaceEmbeddings()

vector_store = Chroma.from_documents(
    document_chunks,
    embeddings,
    collection_name="resume",
    persist_directory="a2/data/chroma",
)

# Save DB locally
vector_store.persist()



Testing LLM

Load model HuggingFace

In [30]:
# Function for loading parameters of the models from hugging face.
# Two partially working models 

def make_chain():
    # Loading model parms
    model = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        # repo_id="gpt2",
        verbose=False, 
        # model_kwargs={"temperature":1, "max_length":100}
    )
    
    # Defining LLM template
    template = """Answer the following question using information from the document titled "MS-RAS Handbook Spring 2023 and 23-24 AY.docx".
    Question: {question}
    Answer: Let's think step by step and provide accurate answer(s)."""
    prompt = PromptTemplate(template=template, input_variables=["question"])

    # Utilizing the stored vector representation
    vector_store = Chroma(
        collection_name="resume",
        embedding_function=embeddings,
        persist_directory="a2/data/chroma",
    )

    # call to model with all parameters defined above
    return ConversationalRetrievalChain.from_llm(
        llm=model,
        condense_question_prompt=prompt,
        retriever=vector_store.as_retriever(),
        return_source_documents=True,
        verbose=False
    )

Load model Cohere

In [34]:
def make_chain():
    # Loading model parms
    model = Cohere(
        verbose=True, 
        temperature= 0, 
        max_tokens= 200
    )

    # Defining LLM template
    template = """Answer the following question using information from the document titled "MS-RAS Handbook Spring 2023 and 23-24 AY.docx".
    Question: {question}
    Answer: Let's think step by step and provide accurate answer(s)."""
    prompt = PromptTemplate(template=template, input_variables=["question"])
    
    # Utilizing the stored vector representation
    vector_store = Chroma(
        collection_name="resume",
        embedding_function=embeddings,
        persist_directory="a2/data/chroma",
    )

    # call to model with all parameters defined above
    return ConversationalRetrievalChain.from_llm(
        llm=model,
        condense_question_prompt=prompt,
        retriever=vector_store.as_retriever(),
        return_source_documents=True,
        verbose=False,
    )

Run the section below (question templates available below this block)

In [36]:
chain = make_chain()
chat_history = []
vectordbkwargs = {"search_distance": 0.9}

while True:
    question = input("Question: ")

    # Running similarity search to check sources of identified parts in the document
    simSearch = vector_store.similarity_search(query=question)
    print("Similarity Search:\n{}".format(simSearch))
    simSearchScore = vector_store.similarity_search_with_score(query=question)
    print(print("Similarity Search Score:\n{}\n".format(simSearchScore[-1][-1])))

    # Generate answer
    response = chain({"question": question, "chat_history": chat_history, "vectordbkwargs": vectordbkwargs})
    time.sleep(2)
    # Retrieve answer
    answer = response["answer"]
    source = response["source_documents"]
    chat_history.append(HumanMessage(content=question))
    chat_history.append(AIMessage(content=answer))

    # Display answer
    # print("\n\nSources:\n")
    # for document in source:
    #     print(f"Page: {document.metadata['page_number']}")
    #     print(f"Text chunk: {document.page_content[:160]}...\n")
    print(f"Answer: {answer}\n\n")



Similarity Search:
[Document(page_content='● Professional resumé or curriculum vitae ● GRE scores o The GRE is waived for all students with a bachelor’s degree from an ABET Accredited US Institution. If you meet those criteria, please contact the advising office for the concentration that you have applied to (contact information found here) after you have applied to the program in order to have the GRE waived o The GRE is required for any applicant who has not earned a bachelor’s degree from an ABET Accredited US Institution. GRE scores must be sent in order for your application to be reviewed. o The program does not require specific subject GRE scores. The ASU institution code is 4007, and there is no department code (use 0000 if required). ● Proof of English proficiency o The University requires all international applicants from a country whose native language is not English to provide the Test of English as a Foreign Language (TOEFL), the International English Language Testing Syste

KeyboardInterrupt: Interrupted by user

Questions:
List some classes from the Mechanical & Aerospace Engineering Concentration for the Robotics graduate program
Is there a GRE waiver for application of Robotics program at Arizona State University
What is the email address of School of Electrical, Computer, and Energy Engineering department at Arizona State University
The Graduate Student Handbook for Robotics progam is intended for which academic year?
Give an overview of the Robotics & Autonomous Systems program
Can Artificial Intelligence Concentration students opt for portfolio option?
What's the difference between Curricular Practical Training and Optional Practical Training
What is the GPA requirement for the electrical engineering program?

Results from previous session - Cohere Sessions

In [None]:
chain = make_chain()
chat_history = []
vectordbkwargs = {"search_distance": 0.9}

while True:
    question = input("Question: ")

    simSearch = vector_store.similarity_search(query=question)
    print("Similarity Search:\n{}".format(simSearch))
    simSearchScore = vector_store.similarity_search_with_score(query=question)
    print(print("Similarity Search Score:\n{}\n".format(simSearchScore[-1][-1])))

    # Generate answer
    response = chain({"question": question, "chat_history": chat_history, "vectordbkwargs": vectordbkwargs})

    # Retrieve answer
    answer = response["answer"]
    source = response["source_documents"]
    chat_history.append(HumanMessage(content=question))
    chat_history.append(AIMessage(content=answer))

    # Display answer
    # print("\n\nSources:\n")
    # for document in source:
    #     print(f"Page: {document.metadata['page_number']}")
    #     print(f"Text chunk: {document.page_content[:160]}...\n")
    print(f"Answer: {answer}\n\n")




Similarity Search:
[Document(page_content='AFFAN BIN USMAN Tempe, AZ (Open to Relocate) • (602) 756-3989 • ausman4@asu.edu • linkedin.com/in/affanbinusman • github.com/affanbinusman SUMMARY A self-motivated team player with effective communication skills, delivering value in data-driven collaborative software development for AI/ML applications with in-depth data analysis utilizing agile & CI/CD practices, Python, C/C++, PyTorch, optimization techniques & data processing. 2 years of customer-focused experience in technical engineering management driving sustainable goals. EDUCATION Master of Science - Artificial Intelligence & Robotics May 2023 Arizona State University, Tempe, AZ 4.00 GPA Artificial Intelligence (AI), Machine Learning, Neural Networks, Perception, Semiconductors & Micro-Electronics Bachelor of Engineering - Electrical Engineering June 2019 National University of Sciences and Technology, Islamabad, Pakistan 3', metadata={'page_number': 1, 'chunk': 0, 'source': 'p1-0', '

KeyboardInterrupt: Interrupted by user

In [None]:
# Hard-coded examples
examples = [
    {
        "query": "Is there a GRE waiver for application of Robotics program at Arizona State University",
        "answer": "Yes"
    },
    {
        "query": "What is the email address of School of Electrical, Computer, and Energy Engineering department at Arizona State University",
        "answer": "eceegrad@asu.edu"
    },
    {
        "query": "Can Artificial Intelligence Concentration students opt for portfolio option?",
        "answer": "Yes"
    } ,
    {
        "query": "The Graduate Student Handbook for Robotics progam is intended for which academic year?",
        "answer": "2023-2024 Academic year"
    },
    {
        "query": "List some classes from the Mechanical & Aerospace Engineering Concentration for the Robotics graduate program",
        "answer": "MAE 502 Partial Differential Equations, MAE 503 Finite Elements in Engineering, MAE 507 Fundamentals of Control and Optimization, MAE 508 Digital Control: Design and Implementation, MAE 509 LMI Methods in Optimal and Robust Control, MAE 510 Dynamics and Vibrations, MAE 514 Vibration Analysis, MAE 520 Stress Analysis, MAE 521 Structural Optimization, MAE 542 Design Geometry and Kinematics, MAE 548 Prob Methods for Eng Des/Analy, MAE 566 Rotary-Wing Aerodynamics, MAE 598 Bio-Inspired Robots, MAE 598 Multi-Robot Systems, MAE 598 Quantum Mech Eng: SW and HW of Quantum Computers, EGR 546 Robotic Systems- II, IEE 576 Network Optimization and Algorithms"
    }
]