In [5]:
# Code to mount the drive, so that it could be used to access the files and directories from google drive
from google.colab import drive
import os
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Part 1

**In the first method, I utilized Hugging Face's pipeline for question answering directly. This pipeline provides a straightforward abstraction for performing question answering tasks without the need for explicit model loading, tokenization, or inference coding.**

In [6]:
# !pip install pdfquery

In [7]:
#Provided code imports libraries related to task one, then initializes a question-answering pipeline using a pre-trained model.
from transformers import pipeline
from pdfquery import PDFQuery
oracle = pipeline(model="deepset/roberta-base-squad2")

**This code defines a function to retrieve a list of files within a specified path**

In [8]:
# Function to get a list of files in a folder
def get_files_in_folder(folder_path):
    list_of_files = []
    file_names = os.listdir(folder_path)
    for file_name in file_names:
        list_of_files.append(file_name)
    return list_of_files

folder_path = "/content/drive/MyDrive/PDF Dataset"
file_list = get_files_in_folder(folder_path)  # Get list of files in the folder


**The purpose of the code is to extract text from multiple PDF files stored in a specific directory, combine the extracted text into a single string, and store it in the variable combined_text.**

In [9]:
# Function to extract text from a PDF file
def extract_text_from_pdf(file_path):
    pdf = PDFQuery(file_path)
    pdf.load()
    text_elements = pdf.pq('LTTextLineHorizontal')
    text = [t.text for t in text_elements]
    return ' '.join(text)  # Combine extracted text into a single string

# Function to combine text from multiple PDF files
def combine_text_from_files(file_names):
    combined_text = ""
    for file_name in file_names:
        if file_name.endswith(".pdf"):  # Check if file is a PDF
            file_path = f"/content/drive/My Drive/PDF Dataset/{file_name}"  # Construct full file path
            text = extract_text_from_pdf(file_path)
            combined_text += text + "\n"  # Append extracted text to combined text separated with a line between documents
    return combined_text

combined_text = combine_text_from_files(file_list)  # Combine text from all PDF files


**Query 1**

In [10]:
answer = oracle(question="Whats the CGPA of Aadrish?", context=combined_text)#I also added my resume in the documents, to better understand the confirmation of information
print(answer)

{'score': 0.03149282559752464, 'start': 360, 'end': 367, 'answer': '3.3/4.0'}


**Query 2**

In [11]:
answer = oracle(question="Which technology is used for auditable access control for business processes?", context=combined_text)
print(answer)


{'score': 0.7474311590194702, 'start': 2220, 'end': 2231, 'answer': 'blockchains'}





# Part 2

**This code imports modules and sets up configurations for using language embeddings, vector stores, question-answering chains, document loaders, and text splitters from the langchain and langchain_community libraries, and initializes Hugging Face embeddings for natural language processing tasks.**

In [12]:
# !pip install langchain
# !pip install -U langchain-community
# !pip install transformers
# !pip install sentence-transformers
# !pip install pypdf
# !pip install faiss-gpu

In [None]:
# Importing necessary modules for language embeddings and vector stores
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
from langchain_core.documents import Document
# Importing document loaders and text splitters
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Importing Hugging Face embeddings and setting up environment variable
from langchain.embeddings import HuggingFaceEmbeddings
from google.colab import userdata
import os
api_token = userdata.get("HUGGINGFACEHUB_API_TOKEN")
os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token
embeddings = HuggingFaceEmbeddings()  # Initializing Hugging Face embeddings


**This code creates a list of documents extracted from PDF files in a specified directory, processes each PDF file to split its content into individual pages, and then creates Document objects from the extracted content along with their metadata. Finally, it uses the FAISS library to index the documents based on their embeddings for efficient search and retrieval.**

In [19]:
list_of_documents = []
# Initialize text splitter with specified parameters
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=64,
    separators=['\n\n', '\n', ' ', ''])

pdf_directory = "/content/drive/MyDrive/PDF Dataset"
for pdf_file in os.listdir(pdf_directory):
    if pdf_file.endswith(".pdf"):  # Check if file is a PDF
        print(pdf_file)
        pdf_path = os.path.join(pdf_directory, pdf_file)
        loader = PyPDFLoader(pdf_path)
        pages = loader.load_and_split()
        docs = text_splitter.split_documents(pages)
        for i, page_content in enumerate(docs):  # Iterate over each document
            page_content_str = str(page_content)
            list_of_documents.append(Document(page_content=page_content_str, metadata=dict(page=i+1)))


db = FAISS.from_documents(list_of_documents, embeddings)


Resume_Aadrish (One Page).pdf
mypdf.pdf
slides-biasws-a-framework-for-improving-web-affordability-and-inclusiveness-00.pdf
Coal_not_diamonds.pdf
rethinking-web-affordability-inclusion.pdf
2110.14205v1.pdf
weblight.pdf


**Setting up the Model**

In [20]:
llm = HuggingFaceHub(repo_id="google/flan-t5-large", model_kwargs={"temperature":1, "max_length":1000000})  # Initialize Hugging Face model
chain = load_qa_chain(llm, chain_type="stuff")  # Load question-answering chain

**Query 1**

In [21]:
query = "what is CGPA of Aadrish?"#I also added my resume in the documents, to better understand the confirmation of information
docs = db.similarity_search(query)
chain.run(input_documents=docs, question=query)

'3.3/4.0'

**Query 2**

In [22]:
query = "what was the main vulnerability ammar tahir found in google web light service?"
docs = db.similarity_search(query)
chain.run(input_documents=docs, question=query)

'Web Light can potentially build users’ browsing profiles and read website content being viewed by users'