# Data Ingestion


## This Python notebook downloads and parses PDF files from a list of URLs, uses LlamaIndex to index the parsed content, and stores the index locally

In [1]:
#Import libraries
import PyPDF2
import requests
import pandas as pd
from typing_extensions import Protocol
from llama_index.node_parser import SimpleNodeParser
import openai
from pathlib import Path
from llama_index import download_loader
from llama_index.node_parser import SimpleNodeParser
from llama_index import GPTVectorStoreIndex
import config


In [2]:
# OpenAI API key authentication: the key is read from the `openai_key` attribute of the
# local `config` module (config.py), so the secret itself is not written in this notebook.
openai.api_key = config.openai_key

In [3]:
#Parse PDF files from URLs

#Function to parse the PDF Files from URLs
def parse_pdf(url, timeout=30):
    """Download the PDF at ``url`` and return the text of all its pages.

    The response body is parsed from an in-memory buffer, so no ``temp.pdf``
    file is written to (or left behind in) the working directory and no file
    handle can leak if parsing fails.

    Args:
        url: Address of the PDF document to download.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Defaults to 30.

    Returns:
        The extracted text of every page, concatenated in page order.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``response.raise_for_status()``).
    """
    import io  # local import: only this function needs the in-memory buffer

    response = requests.get(url, timeout=timeout)
    # Fail fast on error responses instead of handing an error page to the PDF parser.
    response.raise_for_status()

    pdf_reader = PyPDF2.PdfReader(io.BytesIO(response.content))

    # `or ""` keeps the join safe if a page yields no text at all.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

# PDF sources to ingest: one research paper per URL
urls = [
    "https://dl.acm.org/doi/pdf/10.1145/3397271.3401075",
    "https://arxiv.org/pdf/2104.07186.pdf",
    "https://arxiv.org/pdf/2106.14807.pdf",
    "https://arxiv.org/pdf/2301.03266.pdf",
    "https://arxiv.org/pdf/2303.07678.pdf"
]

data = []

# Download and parse each PDF in turn; `id` is the 1-based position of the URL in `urls`
for i, url in enumerate(urls, start=1):
    record = {'id': i, 'URL': url, 'Research_Papers': parse_pdf(url)}
    data.append(record)

# Collect the parsed papers into a DataFrame (one row per paper) and show it
df = pd.DataFrame(data)
print(df)

   id                                                URL  \
0   1  https://dl.acm.org/doi/pdf/10.1145/3397271.340...   
1   2               https://arxiv.org/pdf/2104.07186.pdf   
2   3               https://arxiv.org/pdf/2106.14807.pdf   
3   4               https://arxiv.org/pdf/2301.03266.pdf   
4   5               https://arxiv.org/pdf/2303.07678.pdf   

                                     Research_Papers  
0  ColBERT: Efficient and Effective Passage Searc...  
1  COIL: Revisit Exact Lexical Match in Informati...  
2  A Few Brief Notes on DeepImpact, COIL, and a C...  
3  Doc2Query--: When Less is More\nMitko Gospodin...  
4  Query2doc: Query Expansion with Large Language...  


In [4]:
# Save the parsed PDFs to a CSV file in the working directory.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is written as an
# extra leading column; this appears to be the source of the leading "0, " in the loaded
# document text in the saved output below. Consider index=False if that column is unwanted.
df.to_csv('research_papers.csv')

In [5]:
# Fetch the PandasCSVReader data loader from LlamaHub (https://llamahub.ai/) and use it to
# load the CSV file into LlamaIndex documents.
PandasCSVReader = download_loader("PandasCSVReader")


# Instantiate the reader with no arguments and read research_papers.csv
loader = PandasCSVReader()
docs = loader.load_data(file=Path('research_papers.csv'))

In [6]:
# Inspect the loaded documents (the bare last expression is rendered as the cell output)
docs

[Document(id_='d3ebb839-f393-431d-b500-d1ce023cf399', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='6b17a06aad0a7d93b9b6f3e6024417f50fc5776a9c09d8623a3137b9590786a1', text='0, 1, https://dl.acm.org/doi/pdf/10.1145/3397271.3401075, ColBERT: Efficient and Effective Passage Search via\nContextualized Late Interaction over BERT\nOmar Khattab\nStanford University\nokhattab@stanford.eduMatei Zaharia\nStanford University\nmatei@cs.stanford.edu\nABSTRACT\nRecent progress in Natural Language Understanding (NLU) is driv-\ning fast-paced advances in Information Retrieval (IR), largely owed\nto fine-tuning deep language models (LMs) for document ranking.\nWhile remarkably effective, the ranking models based on these LMs\nincrease computational cost by orders of magnitude over prior ap-\nproaches, particularly as they must feed each query–document pair\nthrough a massive neural network to compute a single relevance\nscore. To ta

In [7]:
# Parse the documents into nodes.
# A node parser is a simple abstraction that takes a list of documents and chunks them into Node objects.

parser = SimpleNodeParser()
nodes = parser.get_nodes_from_documents(docs)

In [8]:
# Inspect the parsed nodes (the bare last expression is rendered as the cell output)
nodes

[TextNode(id_='3c9f3625-12ee-46b6-b7e7-6d9c3783708b', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='d3ebb839-f393-431d-b500-d1ce023cf399', node_type=None, metadata={}, hash='6b17a06aad0a7d93b9b6f3e6024417f50fc5776a9c09d8623a3137b9590786a1'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='90e2e4e9-9282-43c6-8606-e5c11b105f25', node_type=None, metadata={}, hash='79048ed80b10e318ac10184464e9e787938cb0f9d0e4db5d9f4bd218cb70bb96')}, hash='29d0a52345017d55a684f6bfd60f5b66ff09b8c81c30764e8e39742b1a1f5e5d', text='0, 1, https://dl.acm.org/doi/pdf/10.1145/3397271.3401075, ColBERT: Efficient and Effective Passage Search via\nContextualized Late Interaction over BERT\nOmar Khattab\nStanford University\nokhattab@stanford.eduMatei Zaharia\nStanford University\nmatei@cs.stanford.edu\nABSTRACT\nRecent progress in Natural Language Understanding (NLU) is driv-\ning fast-paced ad

In [10]:
# NOTE(review): these imports sit mid-notebook rather than in the import cell at the top, and
# the execution counter skips from In [8] to In [10]; confirm the notebook runs on a fresh kernel.
from llama_index import ServiceContext
from llama_index.llms import OpenAI

# Build a service context whose LLM is OpenAI's gpt-3.5-turbo.
# (Author's note: necessary to use the latest OpenAI models that support the function calling API.)
service_context = ServiceContext.from_defaults(llm=OpenAI(model="gpt-3.5-turbo"))

# Index the nodes, using the service context configured above
index = GPTVectorStoreIndex(nodes, service_context=service_context)

# An index is a data structure that allows us to quickly retrieve relevant context for a user query.

In [11]:
# Store the index: persist it to the local "index" directory
index.storage_context.persist(persist_dir="index")