### Master Project: Build a Research Agent with LangGraph, GPT-4o, RAG, Pinecone, ArXiv, and Google SerpAPI (Real Time Google Search API)

---------------------------------------------------------------------------------------------

### Extracting Data from ArXiv into Pandas DataFrame and Saving it as JSON

In [1]:
import requests
from pathlib import Path
import pandas as pd
import json
import xml.etree.ElementTree as ET  # parsing and manipulating XML data

# Namespace for ArXiv's Atom-based XML Format.
ARXIV_NAMESPACE = "{http://www.w3.org/2005/Atom}"


# Defining a function
def extract_from_arxiv(
    search_query: str = "cat:cs.AI",  ###by default Computer Science AI
    max_results: int = 100,
    json_file_path: str = "files/arxiv_dataset.json",  ###saves it as JSON format
):
    """
    Fetches papers from the ArXiv API based on a search query, saves them as JSON,
    and returns a pandas DataFrame.

    Args:
        search_query (str): The search query for ArXiv (by default is 'cat:cs.AI').
        max_results (int): The maximum number of results to retrieve (default is 100).
        json_file_path (str): File path where JSON data will be saved. Its parent
            directory is created when it does not exist yet.

    Returns:
        pd.DataFrame: One row per paper with the columns 'title', 'summary',
            'authors', 'arxiv_id', 'url' and 'pdf_link'.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out, or
            the API answers with an HTTP error status.
        xml.etree.ElementTree.ParseError: If the response body is not valid XML.
    """

    ## Make sure the directory that will actually receive the JSON file exists
    ## (derived from json_file_path so a custom path works too).
    json_data_path = Path(json_file_path).parent
    if json_data_path.is_dir():
        print(f"{json_data_path} already exists")
    else:
        print(f"{json_data_path} doesn't exists, creating new....")
        json_data_path.mkdir(parents=True, exist_ok=True)

    ## Send a GET request to the Arxiv API. Passing the query through `params`
    ## lets requests URL-encode it, so queries containing spaces, quotes or '&'
    ## are transmitted intact. The timeout keeps the cell from hanging forever.
    response = requests.get(
        "http://export.arxiv.org/api/query",
        params={"search_query": search_query, "max_results": max_results},
        timeout=30,
    )
    response.raise_for_status()  ### Fail loudly instead of parsing an error page

    ## Parse the XML (Atom) response
    root = ET.fromstring(response.content)

    papers = []  ### To store all the extracted paper data

    ## Each "entry" element represents a SINGLE PAPER
    for entry in root.findall(f"{ARXIV_NAMESPACE}entry"):
        ### findtext() returns the default when the element is absent, so a
        ### malformed entry yields empty strings instead of an AttributeError.
        title = entry.findtext(f"{ARXIV_NAMESPACE}title", default="").strip()
        summary = entry.findtext(f"{ARXIV_NAMESPACE}summary", default="").strip()

        ### Get the authors of the paper
        authors = [
            author.findtext(f"{ARXIV_NAMESPACE}name", default="")
            for author in entry.findall(f"{ARXIV_NAMESPACE}author")
        ]

        ### The entry id is the paper's abstract URL
        paper_url = entry.findtext(f"{ARXIV_NAMESPACE}id", default="").strip()
        ### The Arxiv ID is the last path segment of that URL.
        ### NOTE: for old-style ids such as ".../abs/cs/9308101v1" this keeps only
        ### "9308101v1" (the category prefix is dropped).
        arxiv_id = paper_url.split("/")[-1]

        ### The PDF link is the <link> element whose title attribute is "pdf";
        ### None when the entry has no such link.
        pdf_link = next(
            (
                link.attrib["href"]
                for link in entry.findall(f"{ARXIV_NAMESPACE}link")
                if link.attrib.get("title") == "pdf" and "href" in link.attrib
            ),
            None,
        )

        papers.append({
            "title": title,
            "summary": summary,
            "authors": authors,
            "arxiv_id": arxiv_id,
            "url": paper_url,
            "pdf_link": pdf_link
        })

    ## Convert list (papers) into DataFrame
    df = pd.DataFrame(papers)

    ## Save the extracted records to a JSON file
    with open(json_file_path, "w", encoding="utf-8") as f:
        json.dump(papers, f, ensure_ascii=False, indent=4)
    print(f"Data Saved to {json_file_path}")

    return df

In [2]:
# Smoke-test the function: fetch 20 results for the default query ("cat:cs.AI").
# The call also writes the records to the function's default JSON path.

df = extract_from_arxiv(max_results=20)

files already exists
Data Saved to files/arxiv_dataset.json


In [3]:
# Read the JSON file back to verify what was saved.
import json 
file_name = "files/arxiv_dataset.json"
# The file was written as UTF-8 with ensure_ascii=False, so it must be read as
# UTF-8 too; relying on the platform default encoding can fail on non-ASCII text.
with open(file_name, "r", encoding="utf-8") as file:
    data = json.load(file)

print(data)

[{'title': 'Dynamic Backtracking', 'summary': 'Because of their occasional need to return to shallow points in a search\ntree, existing backtracking methods can sometimes erase meaningful progress\ntoward solving a search problem. In this paper, we present a method by which\nbacktrack points can be moved deeper in the search space, thereby avoiding this\ndifficulty. The technique developed is a variant of dependency-directed\nbacktracking that uses only polynomial space while still providing useful\ncontrol information and retaining the completeness guarantees provided by\nearlier approaches.', 'authors': ['M. L. Ginsberg'], 'arxiv_id': '9308101v1', 'url': 'http://arxiv.org/abs/cs/9308101v1', 'pdf_link': 'http://arxiv.org/pdf/cs/9308101v1'}, {'title': 'A Market-Oriented Programming Environment and its Application to\n  Distributed Multicommodity Flow Problems', 'summary': 'Market price systems constitute a well-understood class of mechanisms that\nunder certain conditions provide effec

In [None]:
# Inspect the DataFrame returned by extract_from_arxiv: print its
# (rows, columns) shape, then display the first rows as the cell output.
print(f"\n Shape of the DataFrame: {df.shape}\n")
df.head()

## Alternative: uncomment to look at a random row instead of the first ones
# df.sample()


 Shape of the DataFrame: (20, 6)



Unnamed: 0,title,summary,authors,arxiv_id,url,pdf_link
0,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,http://arxiv.org/pdf/cs/9308101v1
1,A Market-Oriented Programming Environment and ...,Market price systems constitute a well-underst...,[M. P. Wellman],9308102v1,http://arxiv.org/abs/cs/9308102v1,http://arxiv.org/pdf/cs/9308102v1
2,An Empirical Analysis of Search in GSAT,We describe an extensive study of search in GS...,"[I. P. Gent, T. Walsh]",9309101v1,http://arxiv.org/abs/cs/9309101v1,http://arxiv.org/pdf/cs/9309101v1
3,The Difficulties of Learning Logic Programs wi...,As real logic programmers normally use cut (!)...,"[F. Bergadano, D. Gunetti, U. Trinchero]",9311101v1,http://arxiv.org/abs/cs/9311101v1,http://arxiv.org/pdf/cs/9311101v1
4,Software Agents: Completing Patterns and Const...,To support the goal of allowing users to recor...,"[J. C. Schlimmer, L. A. Hermens]",9311102v1,http://arxiv.org/abs/cs/9311102v1,http://arxiv.org/pdf/cs/9311102v1


#### Downloading the Research Papers - Download the 20 research papers fetched above, using the `pdf_link` column, and save them to the `files` folder.

In [None]:
import pandas as pd
import requests
import os
from pathlib import Path


# Defining the Function
def download_pdf(df=df, download_folder: str = "files"):
    """
    Downloads PDFs from URLs listed in the DataFrame and saves them to a specified folder.
    The file names are stored in a new column 'pdf_file_name' in the DataFrame.

    Args:
        df (DataFrame): DataFrame containing a "pdf_link" column with the URLs to download.
            NOTE: the default value is the notebook-level `df` as it exists when this
            cell is executed; pass the frame explicitly to avoid relying on that.
        download_folder: Path to the folder where PDFs will be saved (default is 'files').
    
    Returns:
        pd.DataFrame: The same DataFrame object (modified in place) with an additional
                      "pdf_file_name" column containing the paths of the downloaded PDF
                      files, or None for rows whose download failed or had no link.
    """

    if not os.path.exists(download_folder):
        print(f"Download Folder Doesn't Exists, Creating Folder {download_folder} ....")
        os.makedirs(download_folder, exist_ok=True)
        print("Folder Created")
    else:
        print(f"Downlad Folder Already Exists!")

    pdf_file_names = [] 

    ## Loop through each row to download the PDFs
    for index, row in df.iterrows():
        pdf_link = row["pdf_link"]

        ## Entries without a PDF link (None / NaN / empty) cannot be downloaded
        if not isinstance(pdf_link, str) or not pdf_link:
            print(f"Failed to download the PDF: no pdf_link for row {index}")
            pdf_file_names.append(None)
            continue

        ## Local file name: last URL segment plus the ".pdf" extension
        file_name = os.path.join(download_folder, pdf_link.split('/')[-1]) + ".pdf"

        try:
            ## The timeout prevents one stalled connection from blocking the loop
            response = requests.get(pdf_link, timeout=60)
            response.raise_for_status() 

            ## Save the downloaded PDF
            with open(file_name, "wb") as f:
                f.write(response.content)

        except (requests.exceptions.RequestException, OSError) as e:
            ## Network/HTTP errors and local write errors both mark the row as failed
            print(f"Failed to download the PDF: {e}")
            pdf_file_names.append(None)
            continue

        ## Record the path only once the file is actually on disk
        pdf_file_names.append(file_name)
        print(f"PDF downloaded successfully and saved as {file_name}")
    
    df['pdf_file_name'] = pdf_file_names

    return df

In [6]:
# Download the PDF for every row of `df`. The name `df` is rebound to the
# returned frame, which carries the added 'pdf_file_name' column.

df = download_pdf(df=df)

Downlad Folder Already Exists!
PDF downloaded successfully and saved as files/9308101v1.pdf
PDF downloaded successfully and saved as files/9308102v1.pdf
PDF downloaded successfully and saved as files/9309101v1.pdf
PDF downloaded successfully and saved as files/9311101v1.pdf
PDF downloaded successfully and saved as files/9311102v1.pdf
PDF downloaded successfully and saved as files/9312101v1.pdf
PDF downloaded successfully and saved as files/9401101v1.pdf
PDF downloaded successfully and saved as files/9402101v1.pdf
PDF downloaded successfully and saved as files/9402102v1.pdf
PDF downloaded successfully and saved as files/9402103v1.pdf
PDF downloaded successfully and saved as files/9403101v1.pdf
PDF downloaded successfully and saved as files/9406101v1.pdf
PDF downloaded successfully and saved as files/9406102v1.pdf
PDF downloaded successfully and saved as files/9408101v1.pdf
PDF downloaded successfully and saved as files/9408102v1.pdf
PDF downloaded successfully and saved as files/9408103

In [None]:
# Preview the first rows, which now include the 'pdf_file_name' column
df.head()

Unnamed: 0,title,summary,authors,arxiv_id,url,pdf_link,pdf_file_name
0,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,http://arxiv.org/pdf/cs/9308101v1,files/9308101v1.pdf
1,A Market-Oriented Programming Environment and ...,Market price systems constitute a well-underst...,[M. P. Wellman],9308102v1,http://arxiv.org/abs/cs/9308102v1,http://arxiv.org/pdf/cs/9308102v1,files/9308102v1.pdf
2,An Empirical Analysis of Search in GSAT,We describe an extensive study of search in GS...,"[I. P. Gent, T. Walsh]",9309101v1,http://arxiv.org/abs/cs/9309101v1,http://arxiv.org/pdf/cs/9309101v1,files/9309101v1.pdf
3,The Difficulties of Learning Logic Programs wi...,As real logic programmers normally use cut (!)...,"[F. Bergadano, D. Gunetti, U. Trinchero]",9311101v1,http://arxiv.org/abs/cs/9311101v1,http://arxiv.org/pdf/cs/9311101v1,files/9311101v1.pdf
4,Software Agents: Completing Patterns and Const...,To support the goal of allowing users to recor...,"[J. C. Schlimmer, L. A. Hermens]",9311102v1,http://arxiv.org/abs/cs/9311102v1,http://arxiv.org/pdf/cs/9311102v1,files/9311102v1.pdf


In [8]:
## Save the DataFrame (paper metadata plus local PDF paths) for re-usability.
## NOTE: to_csv is called with its defaults, so the row index is written as
## an extra, unnamed first column.
df.to_csv("research_papers_dataframe.csv")

### Loading and Splitting PDF Files into Chunks, Expanding the DataFrame

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Defining the Function


def load_and_chunk_pdf(pdf_file_name, chunk_size: int = 512, chunk_overlap: int = 64):
    """
    Load a PDF file and split its content into chunks of a specified size.

    Args:
        pdf_file_name (str): Path to the PDF file to be loaded.
        chunk_size (int): The max size of each chunk in characters (default is 512).
        chunk_overlap (int): Number of characters shared between consecutive
            chunks to preserve context across chunk boundaries (default is 64).

    Returns:
        List[Document]: A list of document chunks.
    """

    print(f"Loading and Splitting into chunks: {pdf_file_name}")

    ## Load the content of the PDF
    loader = PyPDFLoader(pdf_file_name)
    data = loader.load()

    ## Split the content into chunks with slight overlap to preserve context
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(data)

    return chunks

In [12]:
def expand_df(df):
    """
    Expands each row in the DataFrame by splitting PDF documents into chunks.

    Rows whose PDF is missing (no 'pdf_file_name') or cannot be loaded are
    reported and skipped, so they contribute no chunk rows.

    Args:
        df (pd.DataFrame): DataFrame containing 'pdf_file_name', 'arxiv_id', 'title', 'summary',
                            'authors', and 'url' columns.

    Returns:
        pd.DataFrame: A new DataFrame where each row represents a chunk of the original document,
                      with additional metadata such as chunk identifiers and relationships to adjacent
                      chunks. Columns: 'id', 'title', 'summary', 'authors', 'arxiv_id', 'url',
                      'chunk', 'prechunk_id', 'postchunk_id'. The columns are present even when
                      no chunk could be produced.
    """

    columns = [
        'id', 'title', 'summary', 'authors', 'arxiv_id', 'url',
        'chunk', 'prechunk_id', 'postchunk_id',
    ]
    expanded_rows = [] # List to store expanded rows with chunk information

    ## loop through each row in the DataFrame
    for idx, row in df.iterrows(): 
        pdf_file_name = row['pdf_file_name']

        ## A failed download leaves no file path (None/NaN); nothing to chunk
        if not isinstance(pdf_file_name, str) or not pdf_file_name:
            print(f"Skipping row {idx}: no PDF file available")
            continue

        try:
            chunks = load_and_chunk_pdf(pdf_file_name) 
        except Exception as e:
            ## Skip this paper. Without the `continue`, the loop below would reuse
            ## the previous paper's chunks (or hit a NameError on the first row).
            print(f"Error processing file {pdf_file_name} or extension not supported: {e}")
            continue
        
        arxiv_id = row['arxiv_id']
        last_index = len(chunks) - 1

        ### Loop over the chunks and construct a new DataFrame row for each
        for i, chunk in enumerate(chunks):
            expanded_rows.append({
                'id': f"{arxiv_id}#{i}", # unique chunk identifier
                'title': row['title'],
                'summary': row['summary'],
                'authors': row['authors'],
                'arxiv_id': arxiv_id,
                'url': row['url'],
                'chunk': chunk.page_content, # Text content of the chunk
                'prechunk_id': "" if i == 0 else f"{arxiv_id}#{i - 1}", # Previous chunk ID ("" for the first chunk)
                'postchunk_id': "" if i == last_index else f"{arxiv_id}#{i + 1}" # Next chunk ID ("" for the last chunk)
            })
    
    # Return a new expanded DataFrame
    return pd.DataFrame(expanded_rows, columns=columns)


In [13]:
# Build the chunk-level DataFrame: one row per text chunk of each downloaded PDF
expanded_df = expand_df(df)

Loading and Splitting into chunks: files/9308101v1.pdf
Loading and Splitting into chunks: files/9308102v1.pdf
Loading and Splitting into chunks: files/9309101v1.pdf
Loading and Splitting into chunks: files/9311101v1.pdf
Loading and Splitting into chunks: files/9311102v1.pdf
Loading and Splitting into chunks: files/9312101v1.pdf
Loading and Splitting into chunks: files/9401101v1.pdf
Loading and Splitting into chunks: files/9402101v1.pdf
Loading and Splitting into chunks: files/9402102v1.pdf
Loading and Splitting into chunks: files/9402103v1.pdf
Loading and Splitting into chunks: files/9403101v1.pdf
Loading and Splitting into chunks: files/9406101v1.pdf
Loading and Splitting into chunks: files/9406102v1.pdf
Loading and Splitting into chunks: files/9408101v1.pdf
Loading and Splitting into chunks: files/9408102v1.pdf
Loading and Splitting into chunks: files/9408103v1.pdf
Loading and Splitting into chunks: files/9409101v1.pdf
Loading and Splitting into chunks: files/9412101v1.pdf
Loading an

In [14]:
# Preview the first chunk rows together with their previous/next chunk ids
expanded_df.head()

Unnamed: 0,id,title,summary,authors,arxiv_id,url,chunk,prechunk_id,postchunk_id
0,9308101v1#0,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,Journal of Arti/cial In telligence Researc h ...,,9308101v1#1
1,9308101v1#1,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,"problem/. In this pap er/, w e presen t a meth...",9308101v1#0,9308101v1#2
2,9308101v1#2,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,b y earlier approac hes/.\n/1/. In tro duction...,9308101v1#1,9308101v1#3
3,9308101v1#3,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,remaining problem in t w o/. W e no w b egin t...,9308101v1#2,9308101v1#4
4,9308101v1#4,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,there is no p oin t in w asting time completin...,9308101v1#3,9308101v1#5


In [15]:
# Persist the chunk-level DataFrame so the chunking step does not have to be re-run
expanded_df.to_csv("expanded_df.csv")

### Building a Knowledge Base for RAG Systems using Embedding (using Pinecone vector DB and OpenAI Embedding Model API)

In [None]:
# --- Loading ENV file and authenticating with OpenAI API Model ---
import os
import tqdm
from openai import OpenAI
from dotenv import find_dotenv, load_dotenv
from getpass import getpass
from semantic_router.encoders import OpenAIEncoder ## for converting texts into embeddings using OpenAI Models
## semantic_router is a smart decision making layer for your LLMs and Agents

# --- Load the Environment Variables ---
dotenv_path = find_dotenv()
if not dotenv_path:
    print(".env file not found, falling back to manual input.\n")
else:
    load_dotenv(dotenv_path, override=True)
    print("Loaded environment variable.\n")

# --- Load the API Key safely ---
# Prefer the environment variable; prompt interactively only when it is absent.
api_key = os.getenv("OPENAI_API_KEY") or getpass("Enter your OpenAI API Key: ")

# --- Fail early if still missing ---
if not api_key:
    raise ValueError(
        "OPENAI_API_KEY not found. Please set it in your environment variables."
    )

# --- Initialize OpenAI Client and Encoder ---
client = OpenAI(api_key=api_key)
print("OpenAI Client initialized successfully.")

# Pass the key explicitly: when it was typed in via getpass it is NOT in the
# environment, so an encoder built without it would have no credentials.
encoder = OpenAIEncoder(name="text-embedding-3-small", openai_api_key=api_key)
print("OpenAI Encoder initialized (model: text-embedding-3-small, dim: 1536 default).")

Loaded environment variable.

OpenAI Client initialized successfully.
OpenAI Encoder initialized (model: text-embedding-3-small).


In [1]:
# testing the embedding model
## encoder("How are you Viv")

In [None]:
# Embedding dimension of the model: encode one sentence and measure the length
# of its vector. The encoder takes a LIST of documents and returns one vector
# per document, so the sentence is wrapped in a list and [0] selects its vector.
dim = len(encoder(["How are you Viv"])[0])
dim

1536

### Creating a Pinecone Index

In [8]:
import os
from pinecone import Pinecone, ServerlessSpec

# --- Resolve the Pinecone API key: environment variable first, prompt as fallback ---
api_key = os.getenv("PINECONE_API_KEY")
if not api_key:
    api_key = getpass("Enter your Pinecone API Key: ")

# --- Abort when no key could be obtained ---
if not api_key:
    raise ValueError("Pinecone API Key not found, please re-check your environment variable.")

# --- Create the Pinecone client ---
pc = Pinecone(api_key=api_key)

# --- Serverless deployment target (cloud provider and region) ---
spec = ServerlessSpec(cloud='aws', region='us-east-1')
print("Pinecone client initialized and ServerlessSpec setup successfully.")

Pinecone client initialized and ServerlessSpec setup successfully.
