# Build a Research Agent which can get information from online archive path and use serach API to create report

## 1. Intall required libs

In [6]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [33]:
import requests
import pandas as pd
import json
import xml.etree.ElementTree as ET

ARXIV_NAMESPACE = '{http://www.w3.org/2005/Atom}'


def extract_from_arxiv (search_query='cat:cs.AI', max_results=50, json_file_path='files/arxiv_dataset.json'):
    """
    Search papers from ARXIV API and save them as JSON

    Args:
        search_query (str): The search query for ArXiv (default is 'cat:cs.AI').
        max_results (int): The maximum number of results to retrieve (default is 100).
        json_file_path (str): File path where JSON data will be saved.

    Returns:
        pd.DataFrame: DataFrame containing the extracted paper information.

    """


## check documentation at https://info.arxiv.org/help/api/user-manual.html#412-python

    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start=0&max_results={max_results}'

    #response = requests.get(url)
    #print(response.text)

    # with open('files/old_response.txt', 'r', encoding='utf-8') as f:
    #     f.read(old_response)

    old_reponse_file = 'files/old_response.xml'

    #root = ET.fromstring(old_reponse_file)
    tree = ET.parse(old_reponse_file)
    root = tree.getroot()
    print(type(root))

    papers=[]

    ## find all for multiple elements and find for first single element find
    for entry in root.findall(f'{ARXIV_NAMESPACE}entry'):
        title = entry.find(f'{ARXIV_NAMESPACE}title').text.strip()
        summary = entry.find(f'{ARXIV_NAMESPACE}summary').text.strip()

        #Get all authors
        author_elements= entry.findall(f'{ARXIV_NAMESPACE}author')
        authors = [ authors.find(f'{ARXIV_NAMESPACE}name').text    for authors in author_elements]
        #print(f'authors: {authors}')

        #get paper url
        url = entry.find(f'{ARXIV_NAMESPACE}id').text.strip()
        #print(f'url: {url} \n')

        arxiv_id = url.split('/')[-1]
        #print(f'arxiv_id: {arxiv_id} \n')

        ##check for pdf link
        pdf_link_element = entry.find(f'{ARXIV_NAMESPACE}link[@title="pdf"]')
        if pdf_link_element is not None:
            pdf_link = pdf_link_element.attrib.get('href')
            print(f'pdf_link: {pdf_link} \n')
        else:
            pdf_link = None
            print(f'pdf_link NOT found: {pdf_link} \n')


        # pdf_link = entry.find(f'{ARXIV_NAMESPACE}link[@title="pdf"]').attrib.get('href')
        # #print(f'pdf_link: {pdf_link} \n')

        papers.append({
            'title': title,
            'summary': summary,
            'authors': authors,
            'arxiv_id': arxiv_id,
            'url': url,
            'pdf_link': pdf_link
        })

    df = pd.DataFrame(papers)
    
    print(df.head(1))


    # Save the DataFrame to a JSON file.
    with open(json_file_path, 'w', encoding='utf-8') as f:
        json.dump(papers, f, ensure_ascii=False, indent=4)
        print(f'Data saved to {json_file_path} ...')

    return df




In [34]:
df = extract_from_arxiv()

<class 'xml.etree.ElementTree.Element'>
pdf_link: http://arxiv.org/pdf/cs/9308101v1 

pdf_link: http://arxiv.org/pdf/cs/9308102v1 

pdf_link: http://arxiv.org/pdf/cs/9309101v1 

pdf_link: http://arxiv.org/pdf/cs/9311101v1 

pdf_link: http://arxiv.org/pdf/cs/9311102v1 

pdf_link: http://arxiv.org/pdf/cs/9312101v1 

pdf_link: http://arxiv.org/pdf/cs/9401101v1 

pdf_link: http://arxiv.org/pdf/cs/9402101v1 

pdf_link: http://arxiv.org/pdf/cs/9402102v1 

pdf_link: http://arxiv.org/pdf/cs/9402103v1 

pdf_link: http://arxiv.org/pdf/cs/9403101v1 

pdf_link: http://arxiv.org/pdf/cs/9406101v1 

pdf_link: http://arxiv.org/pdf/cs/9406102v1 

pdf_link: http://arxiv.org/pdf/cs/9408101v1 

pdf_link: http://arxiv.org/pdf/cs/9408102v1 

pdf_link: http://arxiv.org/pdf/cs/9408103v1 

pdf_link: http://arxiv.org/pdf/cs/9409101v1 

pdf_link: http://arxiv.org/pdf/cs/9412101v1 

pdf_link: http://arxiv.org/pdf/cs/9412102v1 

pdf_link: http://arxiv.org/pdf/cs/9412103v1 

pdf_link: http://arxiv.org/pdf/cs/950110

In [35]:
df.shape

(50, 6)

## 2.Download PDF files

In [None]:
import requests
import pandas as pd
import json
import os


def download_pdfs(df, download_folder='files'):
    '''
    Download PDF from df and save it in local folder
    '''
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)
    
    pdf_file_names = [] ## empty list for storing pdf file names
    
    for index, row in df.iterrows():
        pdf_link = row['pdf_link']

        try:
            response = requests.get(pdf_link)
            response.raise_for_status()

            file_name = os.path.join(download_folder, pdf_link.split('/')[-1]) + '.pdf'
            pdf_file_names.append(file_name)

            # Save the downloaded PDF
            with open(file_name, 'wb') as f:
                f.write(response.content)
            
            print(f'PDF downloaded successfully and saved as {file_name}')
        
        except requests.exceptions.RequestException as e:
            print(f'Failed to download the PDF: {e}')
            pdf_file_names.append(None)
    
    df['pdf_file_name'] = pdf_file_names

    return df


        
download_pdfs(df)

Unnamed: 0,title,summary,authors,arxiv_id,url,pdf_link,pdf_file_name
0,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,http://arxiv.org/pdf/cs/9308101v1,files\9308101v1.pdf
1,A Market-Oriented Programming Environment and ...,Market price systems constitute a well-underst...,[M. P. Wellman],9308102v1,http://arxiv.org/abs/cs/9308102v1,http://arxiv.org/pdf/cs/9308102v1,files\9308102v1.pdf
2,An Empirical Analysis of Search in GSAT,We describe an extensive study of search in GS...,"[I. P. Gent, T. Walsh]",9309101v1,http://arxiv.org/abs/cs/9309101v1,http://arxiv.org/pdf/cs/9309101v1,files\9309101v1.pdf
3,The Difficulties of Learning Logic Programs wi...,As real logic programmers normally use cut (!)...,"[F. Bergadano, D. Gunetti, U. Trinchero]",9311101v1,http://arxiv.org/abs/cs/9311101v1,http://arxiv.org/pdf/cs/9311101v1,files\9311101v1.pdf
4,Software Agents: Completing Patterns and Const...,To support the goal of allowing users to recor...,"[J. C. Schlimmer, L. A. Hermens]",9311102v1,http://arxiv.org/abs/cs/9311102v1,http://arxiv.org/pdf/cs/9311102v1,files\9311102v1.pdf
5,Decidable Reasoning in Terminological Knowledg...,Terminological knowledge representation system...,"[M. Buchheit, F. M. Donini, A. Schaerf]",9312101v1,http://arxiv.org/abs/cs/9312101v1,http://arxiv.org/pdf/cs/9312101v1,files\9312101v1.pdf
6,Teleo-Reactive Programs for Agent Control,A formalism is presented for computing and org...,[N. Nilsson],9401101v1,http://arxiv.org/abs/cs/9401101v1,http://arxiv.org/pdf/cs/9401101v1,files\9401101v1.pdf
7,Learning the Past Tense of English Verbs: The ...,Learning the past tense of English verbs - a s...,[C. X. Ling],9402101v1,http://arxiv.org/abs/cs/9402101v1,http://arxiv.org/pdf/cs/9402101v1,files\9402101v1.pdf
8,Substructure Discovery Using Minimum Descripti...,The ability to identify interesting and repeti...,"[D. J. Cook, L. B. Holder]",9402102v1,http://arxiv.org/abs/cs/9402102v1,http://arxiv.org/pdf/cs/9402102v1,files\9402102v1.pdf
9,Bias-Driven Revision of Logical Domain Theories,The theory revision problem is the problem of ...,"[M. Koppel, R. Feldman, A. M. Segre]",9402103v1,http://arxiv.org/abs/cs/9402103v1,http://arxiv.org/pdf/cs/9402103v1,files\9402103v1.pdf


## 3.Splitting pdf into chunks

In [42]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter



def load_pdf_chunks(file_path):
    '''Load pdf file and chunk it'''
    loader = PyPDFLoader(file_path)
    data= loader.load()
    
    # Initialize the RecursiveCharacterTextSplitter
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,  # Maximum size of each chunk
        chunk_overlap=50  # Overlap between chunks
    )

    # Split the text
    chunks = splitter.split_documents(data)

    return chunks


### expand DF and append chunk in current DF

In [43]:
df.head(1)

Unnamed: 0,title,summary,authors,arxiv_id,url,pdf_link,pdf_file_name
0,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,http://arxiv.org/pdf/cs/9308101v1,files\9308101v1.pdf


In [38]:
df.count()

title       50
summary     50
authors     50
arxiv_id    50
url         50
pdf_link    50
dtype: int64

In [44]:
def expand_df(df):
    '''Expand DF to chunks
    return New expanded df
    '''

    expanded_row = []

    ## loop through each row in DF
    for index,row in df.iterrows():
        file_name = row['pdf_file_name']
        #print(row)
        try:
            chunks = load_pdf_chunks(file_name)
        except:
            print(f'PDF file not found for {index} row with filename - {file_name}')
            continue

        #loop over the chunks and add it to new data frame
        #print(f'Adding {len(chunks)} chunks for {index} row')
        for i, chunk in enumerate(chunks):
            prechunk = i-1 if i > 0 else '' # Preceding chunk ID
            postchunk = i+1 if i < len(chunks) -1 else '' # Following chunk ID

            expanded_row.append(
                {
                    'id':"{}#{}".format(row['arxiv_id'],i),
                    'title':row['title'],
                    'summary':row['summary'],
                    'authors':row['authors'],
                    'arxiv_id':row['arxiv_id'],
                    'url':row['url'],
                    'chunk':chunk.page_content,
                    'prechunk_id': '' if i == 0 else "{}#{}".format(row['arxiv_id'],prechunk),
                    'postchunk_id': '' if i == len(chunks) -1 else "{}#{}".format(row['arxiv_id'],postchunk)


                }
            )

    return pd.DataFrame(expanded_row)        



#expand_df(df)

In [45]:
expanded_df = expand_df(df)

## 4.Create embeddings

In [134]:
import os
from semantic_router.encoders import OpenAIEncoder
from dotenv import load_dotenv
from openai import OpenAI


res = load_dotenv(dotenv_path="../ML_practice/cred.env")

print(res)

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# client = OpenAI(api_key=OPENAI_API_KEY)

# for model in client.models.list():
#     print(model.id)

encoder = OpenAIEncoder(name='text-embedding-3-small')


True


In [135]:
dims = encode('hello namaste')



### create pinecone index

In [127]:
from pinecone import Pinecone,ServerlessSpec

load_dotenv(dotenv_path="../ML_practice/cred.env")

PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')


pc = Pinecone(api_key=PINECONE_API_KEY)

##define serveless specification
spec = ServerlessSpec(
        cloud='aws',
        region='us-east-1'
)




In [128]:
print(pc.list_indexes().names())

['askdoc', 'askwiki', 'langgraph-research-agent']


In [115]:
import time

index_name = 'langgraph-research-agent'

if index_name not in pc.list_indexes().names():

    index = pc.create_index(
        name = index_name,
        dimension = 1536,
        metric='cosine',
        spec = spec
    )

    print(f'Index {index_name} Created')
else:
    print(f'Index {index_name} already exists')

Index langgraph-research-agent already exists


In [129]:
##get to index in variable

index = pc.Index(index_name)

In [130]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

In [133]:
expanded_df.head(1)


Unnamed: 0,id,title,summary,authors,arxiv_id,url,chunk,prechunk_id,postchunk_id
0,9308101v1#0,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,Journal of Arti/cial In telligence Researc h ...,,9308101v1#1


In [136]:
## Now from expanded_df, get chunks and upsert into pinecone index

from tqdm import tqdm

data = expanded_df
batch_size = 64

len(data) ##12276

metadata = []
ids = []
chunks =[]

##loop through data in batches in data and upsert into index

for i in tqdm(range(0,len(data),batch_size)):
    i_end = min(len(data),i+batch_size) ##end point

    batch = data[i:i_end].to_dict(orient='records')

    ##get metadata and ID for each chunk in batch
    for r in batch:
        metadata.append({'arxiv_id':r['arxiv_id'],'title':r['title'],'chunk':r['chunk']})
        ids.append(r['id'])
        chunks.append(r['chunk'])

    
    
    embeds = encoder(chunks)## openai encoder function isntead of tiktoken


    ##uload embeddings , ids and metadata
    index.upsert(vectors=zip(ids,embeds,metadata))



  3%|â–Ž         | 5/192 [00:25<15:49,  5.08s/it]


PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Tue, 11 Nov 2025 22:59:02 GMT', 'Content-Type': 'application/json', 'Content-Length': '94', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '617', 'x-pinecone-request-id': '1131134864434069597', 'x-envoy-upstream-service-time': '42', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Request size 3MB exceeds the maximum supported size of 2MB","details":[]}
