# Multi-Agent Research Assistant
**This assistant is a multi-agent system for searching, analyzing, and summarizing research papers.**

### Features:
- *Document Retrieval*: Downloading research papers in PDF format from provided URLs.
- *Text Extraction*: Extracting specific sections such as the title, abstract, methods, and conclusions.
- *Summarization*: Automatically summarizing the extracted sections of each paper.


### Instructions:
1. Enter the topic of the research papers you are looking for.
2. Choose which sections to extract.
3. View the extracted sections as output.
4. Get the summary of the required section.

In [None]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from IPython.display import display, HTML

# Function to fetch data from arXiv API with or without filters
def fetch_research_papers(query, start_date=None, end_date=None, max_results=10):
    """Query the arXiv API and return the raw Atom XML response.

    Args:
        query: Free-text search terms, searched across all arXiv fields.
        start_date: Optional lower bound on the submission date (YYYY-MM-DD).
        end_date: Optional upper bound on the submission date (YYYY-MM-DD).
            The date filter is applied only when both bounds are given.
        max_results: Maximum number of entries to request.

    Returns:
        The response body as bytes, or ``None`` when the request fails or
        the server answers with a non-200 status.
    """
    base_url = "http://export.arxiv.org/api/query"
    search_query = f"all:{query}"

    if start_date and end_date:
        # arXiv does not define start_date/end_date query parameters; a date
        # range has to be part of search_query as
        # submittedDate:[YYYYMMDDHHMM TO YYYYMMDDHHMM].
        date_from = start_date.replace("-", "") + "0000"
        date_to = end_date.replace("-", "") + "2359"
        # NOTE(review): the parentheses keep a multi-word query grouped
        # before the AND -- confirm against the arXiv query grammar.
        search_query = (
            f"({search_query}) AND submittedDate:[{date_from} TO {date_to}]"
        )

    # Passing params lets requests URL-encode spaces and punctuation.
    params = {
        "search_query": search_query,
        "start": 0,
        "max_results": max_results,
    }

    try:
        response = requests.get(base_url, params=params, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        return None

    return response.content  # Return XML content

# Function to display results in a table with clickable PDF links
def display_research_papers_in_table(xml_data):
    """Render the entries of an arXiv Atom feed as an HTML table.

    Args:
        xml_data: Atom XML (bytes or str) as returned by the arXiv API.

    Returns:
        A list of the PDF URLs found, one per entry that has a PDF link.
    """
    # Local import so the block does not depend on a new file-level import.
    from html import escape as html_escape

    atom = '{http://www.w3.org/2005/Atom}'
    root = ET.fromstring(xml_data)

    titles = []
    published_dates = []
    pdf_links = []
    pdf_urls = []

    for entry in root.findall(f'{atom}entry'):
        # findtext() returns the default instead of raising when the element
        # is missing; split/join collapses the line breaks arXiv embeds in
        # long titles.
        title = " ".join(entry.findtext(f'{atom}title', default='').split())
        published_date = entry.findtext(f'{atom}published', default='')

        # First link advertised as a PDF, if any.
        pdf_link = next(
            (
                link.get('href')
                for link in entry.findall(f'{atom}link')
                if link.get('type') == 'application/pdf'
            ),
            None,
        )

        if pdf_link:
            pdf_urls.append(pdf_link)
            safe_href = html_escape(pdf_link, quote=True)
            pdf_links.append(f'<a href="{safe_href}" target="_blank">Download PDF</a>')
        else:
            pdf_links.append('No PDF available')

        # The table is rendered with escape=False so the anchor tags work;
        # the feed-provided text must therefore be escaped by hand.
        titles.append(html_escape(title))
        published_dates.append(html_escape(published_date))

    df = pd.DataFrame({
        "Title": titles,
        "Published Date": published_dates,
        "PDF Link": pdf_links,
    })

    display(HTML(df.to_html(escape=False)))

    return pdf_urls

# Main function to ask user for input and handle filters
def main():
    """Prompt for a search topic, query arXiv and display the results.

    Returns:
        The list of PDF URLs shown in the table, or an empty list when
        arXiv returned no data.
    """
    query = input("Enter the topic you want to search for: ")
    apply_filters = input("Do you want to apply filters (yes/no)? ").strip().lower()

    if apply_filters == "yes":
        start_date = input("Enter the start date (YYYY-MM-DD): ")
        end_date = input("Enter the end date (YYYY-MM-DD): ")
        raw_count = input("Enter the number of research papers to retrieve: ")

        # The count comes straight from the keyboard; fall back to the
        # default rather than sending garbage to the API.
        try:
            max_results = int(raw_count)
        except ValueError:
            max_results = 10
        if max_results < 1:
            max_results = 10

        xml_data = fetch_research_papers(
            query,
            start_date=start_date,
            end_date=end_date,
            max_results=max_results,
        )
    else:
        xml_data = fetch_research_papers(query)

    if not xml_data:
        print("No data returned from arXiv.")
        # An empty list keeps later cells that iterate pdf_urls from crashing.
        return []

    return display_research_papers_in_table(xml_data)


if __name__ == "__main__":
    pdf_urls = main()


Enter the topic you want to search for: machine learning
Do you want to apply filters (yes/no)? no


Unnamed: 0,Title,Published Date,PDF Link
0,Lecture Notes: Optimization for Machine Learning,2019-09-08T21:49:42Z,Download PDF
1,An Optimal Control View of Adversarial Machine Learning,2018-11-11T14:28:34Z,Download PDF
2,Minimax deviation strategies for machine learning and recognition with\n short learning samples,2017-07-16T09:15:08Z,Download PDF
3,Machine Learning for Clinical Predictive Analytics,2019-09-19T22:02:00Z,Download PDF
4,Towards Modular Machine Learning Solution Development: Benefits and\n Trade-offs,2023-01-23T22:54:34Z,Download PDF
5,Introduction to Machine Learning: Class Notes 67577,2009-04-23T11:40:57Z,Download PDF
6,The Tribes of Machine Learning and the Realm of Computer Architecture,2020-12-07T23:10:51Z,Download PDF
7,"A Machine Learning Tutorial for Operational Meteorology, Part I:\n Traditional Machine Learning",2022-04-15T14:48:04Z,Download PDF
8,Position Paper: Towards Transparent Machine Learning,2019-11-12T10:49:55Z,Download PDF
9,Understanding Bias in Machine Learning,2019-09-02T20:36:19Z,Download PDF


In [None]:
!pip install PyPDF2 pdfplumber

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m 

In [None]:
import pdfplumber
import re
import requests


# Function to download the PDF from a URL
def download_pdf(url, file_name):
    """Download *url* and write the response body to *file_name*.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never saved under a ``.pdf`` name.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(file_name, 'wb') as f:
        f.write(response.content)

# Function to extract the 'Abstract' section from the PDF
def extract_abstract_from_pdf(file_path):
    """Return the abstract of the PDF at *file_path*.

    The abstract is taken to be the text from the first occurrence of
    "abstract" up to the next heading-like marker (introduction, background,
    keywords, "1.", "2." or "section N"), matched case-insensitively.

    Returns:
        The matched text, or the string "Abstract not found." when no such
        span exists.
    """
    with pdfplumber.open(file_path) as pdf:
        # extract_text() can return None for a page without text; treating
        # that as an empty string avoids a TypeError on concatenation.
        text = ''.join(page.extract_text() or '' for page in pdf.pages)

    # This assumes 'Abstract' is a heading and followed by text.
    # The inline (?i) flag already makes the pattern case-insensitive.
    abstract_pattern = re.compile(
        r"(?i)(abstract[\s\S]*?)(introduction|background|keywords|1\.|2\.|section\s\d)"
    )
    match = abstract_pattern.search(text)

    if match:
        return match.group(1).strip()
    return "Abstract not found."

# Check if pdf_urls was fetched successfully
# globals().get() also covers the case where pdf_urls exists but is None or
# empty, which a plain membership test would let through to enumerate().
found_urls = globals().get('pdf_urls')
if found_urls:
    # Loop through each PDF link
    for i, url in enumerate(found_urls, start=1):
        file_name = f"paper_{i}.pdf"

        # Download the PDF
        download_pdf(url, file_name)

        # Extract the abstract
        abstract = extract_abstract_from_pdf(file_name)

        print(f"Abstract from {file_name}:")
        print(abstract)
        print("\n")
else:
    print("No PDF URLs were found.")


Abstract from paper_1.pdf:
Abstract not found.


Abstract from paper_2.pdf:
Abstract
Idescribeanoptimalcontrolviewofadversarialmachinelearning,wherethedynamicalsystemisthe
machine learner, the input are adversarial actions, and the control costs are defined by the adversary’s
goals to do harm and be hard to detect. This view encompasses many types of adversarial machine
learning,includingtest-itemattacks,training-datapoisoning, andadversarial rewardshaping. Theview
encouragesadversarialmachinelearningresearchertoutilizeadvancesincontroltheoryandreinforcement
learning.
1 Adversarial Machine Learning is not Machine Learning
Machine learning has its mathematical foundation in concentration inequalities. This is a consequence of
the independent and identically-distributed (i.i.d.) data assumption. In contrast, I suggest that adversarial
machine learning may adopt optimal controlas its mathematical foundation [3,25]. There are telltale signs:
adversarialattacks tend to be subtle and have pe

In [None]:
!pip install --upgrade transformers




In [None]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from IPython.display import display, HTML
import pdfplumber
import re

# Function to download the PDF from a URL
def download_pdf(url, file_name):
    """Download *url* and write the response body to *file_name*.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never saved under a ``.pdf`` name.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(file_name, 'wb') as f:
        f.write(response.content)


# Function to extract a specific section from the PDF
def extract_section_from_pdf(file_path, section):
    """Extract one named section from the PDF at *file_path*.

    Args:
        file_path: Path of the PDF to read.
        section: Section name, matched case-insensitively against the keys
            of the pattern table below.

    Returns:
        The matched section text; "<Section> not found in this paper." when
        the pattern does not match; or "Invalid section selected." when
        *section* is not a known key.
    """
    with pdfplumber.open(file_path) as pdf:
        # extract_text() can return None for a page without text; treating
        # that as an empty string avoids a TypeError on concatenation.
        text = ''.join(page.extract_text() or '' for page in pdf.pages)

    # Each pattern starts at the section heading word and lazily runs up to
    # (but not including) the first following heading word in its lookahead.
    section_patterns = {
        "abstract": r"(?i)(?:abstract)\b[\s\S]*?(?=\b(?:introduction|methods|results|literature review|conclusion|references)\b)",
        "introduction": r"(?i)(?:introduction)\b[\s\S]*?(?=\b(?:methods|results|literature review|conclusion|references)\b)",
        "methods": r"(?i)(?:methods)\b[\s\S]*?(?=\b(?:results|discussion|conclusion|references)\b)",
        "results": r"(?i)(?:results)\b[\s\S]*?(?=\b(?:discussion|conclusion|references)\b)",
        "literature review": r"(?i)(?:literature review)\b[\s\S]*?(?=\b(?:methods|results|conclusion|references)\b)",
        "conclusion": r"(?i)(?:conclusion)\b[\s\S]*?(?=\b(?:references|bibliography|limitations|future scope)\b)",
        "bibliography": r"(?i)(?:references)\b[\s\S]*?(?=\b(?:appendix|bibliography|limitations)\b)",
        "future scope": r"(?i)(?:future scope)\b[\s\S]*?(?=\b(?:conclusion|bibliography|limitations)\b)",
        "limitations": r"(?i)(?:limitations)\b[\s\S]*?(?=\b(?:future work|conclusion|bibliography)\b)"
    }

    # Use the corresponding regex pattern for the requested section
    pattern = section_patterns.get(section.lower())
    if not pattern:
        return "Invalid section selected."

    match = re.search(pattern, text)
    if match:
        return match.group(0).strip()
    return f"{section.capitalize()} not found in this paper."

def _summary_to_text(result):
    """Normalize one summarizer result to a plain string.

    Accepts a string, a dict carrying a 'summary_text' key, or a list of
    either (the shape a Hugging Face summarization pipeline returns).
    """
    if isinstance(result, str):
        return result
    if isinstance(result, dict):
        return result["summary_text"]
    return " ".join(_summary_to_text(item) for item in result)


def summarize_long_text(long_text, model, max_token_length=1024):
    """Summarize *long_text* chunk by chunk and join the partial summaries.

    Args:
        long_text: Text to summarize.
        model: Callable taking one text chunk and returning its summary,
            either as a string or in pipeline form
            (``[{'summary_text': ...}]``).
        max_token_length: Maximum number of whitespace-separated words per
            chunk. Words only approximate model tokens.

    Returns:
        The chunk summaries joined with single spaces; an empty string for
        empty input.

    Raises:
        ValueError: If *max_token_length* is smaller than 1.
    """
    if max_token_length < 1:
        raise ValueError("max_token_length must be a positive integer")

    # Split the text into chunks of at most max_token_length words
    words = long_text.split()
    chunks = [
        ' '.join(words[i:i + max_token_length])
        for i in range(0, len(words), max_token_length)
    ]

    # Summarize each chunk, accepting both plain-string and pipeline output
    summarized_chunks = [_summary_to_text(model(chunk)) for chunk in chunks]

    # Combine the summaries
    return " ".join(summarized_chunks)

# Download every PDF found by the search cell. globals().get() also covers
# the case where pdf_urls exists but is None, which would crash enumerate().
for i, url in enumerate(globals().get('pdf_urls') or [], start=1):
    file_name = f"paper_{i}.pdf"

    # Download the PDF
    download_pdf(url, file_name)


# Section names the user may ask for, in the order shown in the prompt
available_sections = [
    "abstract",
    "introduction",
    "methods",
    "results",
    "literature review",
    "conclusion",
    "bibliography",
    "future scope",
    "limitations",
]

# Main function to download PDFs and extract sections based on user input
def main():
    """Extract user-selected sections from every downloaded paper and
    print one summary per section.

    Reads the module-level ``pdf_urls`` produced by the search cell. Each
    paper is downloaded, the requested sections are extracted, and once all
    papers are processed the text collected for each section is summarized.
    """
    # Ask the user which sections to extract
    selected_sections = input(f"Which sections do you want to extract? (Options: {', '.join(available_sections)})\nEnter sections separated by commas: ").lower().split(",")
    selected_sections = [section.strip() for section in selected_sections]

    # Keep only known sections (deduplicated, order preserved) so unknown
    # names are reported instead of being summarized as empty text.
    valid_sections = []
    for section in selected_sections:
        if section in available_sections:
            if section not in valid_sections:
                valid_sections.append(section)
        elif section:
            print(f"Invalid section {section!r}. Skipping...")

    # Dictionary to store extracted sections
    extracted_data = {section: [] for section in valid_sections}

    # pdf_urls may be missing or None when the search cell found nothing.
    urls = globals().get("pdf_urls") or []
    if not urls:
        print("No PDF URLs were found.")
        return

    # Loop through each PDF link
    for i, url in enumerate(urls, start=1):
        file_name = f"paper_{i}.pdf"

        # Download the PDF
        download_pdf(url, file_name)

        print(f"\nExtracted sections from {file_name}:")
        for section in valid_sections:
            extracted_data[section].append(
                extract_section_from_pdf(file_name, section)
            )

    # Summarize once, after every paper is processed. Doing this inside the
    # per-paper loop re-summarized the growing text for each paper.
    # NOTE(review): summarize_text is defined in a later cell; that cell
    # must have been run before main() is called.
    for section in valid_sections:
        print(f"\n--- Summarizing {section.capitalize()} ---")
        combined_text = " ".join(extracted_data[section])
        summarized_text = summarize_text(combined_text)
        print(summarized_text)


if __name__ == "__main__":
    main()


In [None]:
from transformers import pipeline

# Initialize the summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# NOTE(review): `all_extracted_texts_from_pdfs` is not defined anywhere in
# the visible source -- presumably it is left over from an earlier notebook
# session; confirm where it should come from before running this cell.
combined_text = " ".join(all_extracted_texts_from_pdfs)

# Function to summarize text
def summarize_text(text, max_length=150, min_length=40):
    """Summarize *text* with the module-level ``summarizer`` pipeline.

    Args:
        text: Text to summarize.
        max_length: Upper bound on the summary length, passed to the pipeline.
        min_length: Lower bound on the summary length, passed to the pipeline.

    Returns:
        The summary string, or an empty string when *text* is blank.
    """
    if not text.strip():
        return ""

    # Summarize the argument itself (not the global combined_text), and let
    # the pipeline truncate input that exceeds the model's maximum length.
    result = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        truncation=True,
    )
    # The pipeline returns a list with one dict per input.
    return result[0]['summary_text']

# NOTE(review): `extracted_text` is only ever assigned as a local inside the
# main() functions in the visible source, so this cell relies on a
# module-level value defined elsewhere -- confirm before running.
summary = summarize_text(extracted_text)

# Show the input and its summary one after the other for comparison.
print("Original Text:")
print(extracted_text)
print("\nSummarized Text:")
print(summary)

Original Text:
This work is part of an innovative e-learning project allowing
the development of an advanced digital educational tool that provides
feedback during the process of learning handwriting for young school
children (three to eight years old). In this paper, we describe a new method
for children handwriting quality analysis. It automatically detects mistakes,
gives real-time on-line feedback for children’s writing, and helps teachers
comprehend and evaluate children’s writing skills. The proposed method
adjudges five main criteria: shape, direction, stroke order, position respect
to the reference lines, and kinematics of the trace. It analyzes the
handwriting quality and automatically gives feedback based on the
combination of three extracted models: Beta-Elliptic Model (BEM) using
similarity detection (SD) and dissimilarity distance (DD) measure, Fourier
Descriptor Model (FDM), and perceptive Convolutional Neural Network
(CNN) with Support Vector Machine (SVM) comparison eng

In [None]:
import pdfplumber
import re
import requests
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import torch

# Function to download the PDF from a URL
def download_pdf(url, file_name):
    """Download *url* and write the response body to *file_name*.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never saved under a ``.pdf`` name.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(file_name, 'wb') as f:
        f.write(response.content)


# Function to extract a specific section from the PDF
def extract_section_from_pdf(file_path, section):
    """Pull a single named section out of the PDF stored at *file_path*.

    The text of all pages is concatenated (pages yielding no text are
    skipped) and searched with the regex registered for *section*
    (looked up case-insensitively).

    Returns:
        The matched text with surrounding whitespace removed;
        "<Section> not found in this paper." if the regex does not match;
        "Invalid section selected." if *section* has no registered regex.
    """
    with pdfplumber.open(file_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        full_text = ''.join(chunk for chunk in page_texts if chunk)

    # Heading word, then a lazy run up to the next heading in the lookahead
    regex_by_section = {
        "abstract": r"(?i)(?:abstract)\b[\s\S]*?(?=\b(?:introduction|methods|results|literature review|conclusion|references)\b)",
        "introduction": r"(?i)(?:introduction)\b[\s\S]*?(?=\b(?:methods|results|literature review|conclusion|references)\b)",
        "methods": r"(?i)(?:methods)\b[\s\S]*?(?=\b(?:results|discussion|conclusion|references)\b)",
        "results": r"(?i)(?:results)\b[\s\S]*?(?=\b(?:discussion|conclusion|references)\b)",
        "literature review": r"(?i)(?:literature review)\b[\s\S]*?(?=\b(?:methods|results|conclusion|references)\b)",
        "conclusion": r"(?i)(?:conclusion)\b[\s\S]*?(?=\b(?:references|bibliography|limitations|future scope)\b)",
        "bibliography": r"(?i)(?:references)\b[\s\S]*?(?=\b(?:appendix|bibliography|limitations)\b)",
        "future scope": r"(?i)(?:future scope)\b[\s\S]*?(?=\b(?:conclusion|bibliography|limitations)\b)",
        "limitations": r"(?i)(?:limitations)\b[\s\S]*?(?=\b(?:future work|conclusion|bibliography)\b)",
    }

    regex = regex_by_section.get(section.lower())
    if not regex:
        return "Invalid section selected."

    found = re.search(regex, full_text)
    if found is None:
        return f"{section.capitalize()} not found in this paper."
    return found.group(0).strip()
# Checkpoint shared by the tokenizer and the model below
_PEGASUS_CHECKPOINT = 'google/pegasus-large'

tokenizer = PegasusTokenizer.from_pretrained(_PEGASUS_CHECKPOINT)
# NOTE(review): ignore_mismatched_sizes=True is presumably a workaround for
# a load-time shape mismatch -- confirm it is still needed.
model = PegasusForConditionalGeneration.from_pretrained(
    _PEGASUS_CHECKPOINT,
    ignore_mismatched_sizes=True,
)

# Function to summarize the text using Pegasus
def summarize_long_text(long_text, max_token_length=1024):
    """Summarize *long_text* with the module-level Pegasus model.

    Args:
        long_text: Text to summarize. Input longer than *max_token_length*
            tokens is truncated by the tokenizer.
        max_token_length: Maximum number of input tokens given to the model.

    Returns:
        The decoded summary, or "The section is empty or not found." when
        *long_text* is blank.
    """
    if not long_text.strip():
        return "The section is empty or not found."

    # Tokenize the text for Pegasus
    inputs = tokenizer(
        long_text,
        return_tensors="pt",
        max_length=max_token_length,
        truncation=True,
    )

    # Aim for roughly half the input length, capped at 150 tokens. The lower
    # bound keeps very short inputs from getting a budget of 0 or 1 tokens.
    # NOTE(review): 16 is a pragmatic minimum -- tune as needed.
    input_length = inputs['input_ids'].shape[1]
    max_summary_length = max(16, min(150, input_length // 2))

    # Inference only: skip gradient tracking, and pass the attention mask
    # alongside the token ids.
    with torch.no_grad():
        summary_ids = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=max_summary_length,
        )

    # Decode summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# Section names the user may ask for, in the order shown in the prompt
available_sections = [
    "abstract",
    "introduction",
    "methods",
    "results",
    "literature review",
    "conclusion",
    "bibliography",
    "future scope",
    "limitations",
]


# Main function to download PDFs and extract sections based on user input
def main(pdf_urls=None):
    """Download papers, print the user-selected sections and summarize them.

    Args:
        pdf_urls: Optional list of PDF URLs to process. Defaults to a single
            example arXiv paper.

    Each paper is downloaded and its requested sections are printed. Once
    all papers are processed, the text collected for each section is
    summarized with ``summarize_long_text``.
    """
    if pdf_urls is None:
        # Example list of URLs for PDF files
        pdf_urls = [
            'https://arxiv.org/pdf/2104.03602.pdf',
            # Add more URLs if needed
        ]

    # Ask the user which sections to extract
    selected_sections = input(f"Which sections do you want to extract? (Options: {', '.join(available_sections)})\nEnter sections separated by commas: ").lower().split(",")
    selected_sections = [section.strip() for section in selected_sections]

    # Keep only known sections (deduplicated, order preserved) so unknown
    # names are reported instead of being summarized as empty text.
    valid_sections = []
    for section in selected_sections:
        if section in available_sections:
            if section not in valid_sections:
                valid_sections.append(section)
        elif section:
            print(f"Invalid section {section!r}. Skipping...")

    # Dictionary to store extracted sections
    extracted_data = {section: [] for section in valid_sections}

    # Loop through each PDF link
    for i, url in enumerate(pdf_urls, start=1):
        file_name = f"paper_{i}.pdf"

        # Download the PDF
        download_pdf(url, file_name)

        # Extract and display the requested sections
        print(f"\nExtracted sections from {file_name}:")
        for section in valid_sections:
            extracted_text = extract_section_from_pdf(file_name, section)
            extracted_data[section].append(extracted_text)
            print(f"\n--- {section.capitalize()} ---")
            print(extracted_text)

    # Summarize once, after every paper is processed. Doing this inside the
    # per-paper loop re-summarized the growing text for each paper.
    for section in valid_sections:
        print(f"\n--- Summarizing {section.capitalize()} ---")
        combined_text = " ".join(extracted_data[section])
        summarized_text = summarize_long_text(combined_text)
        print(summarized_text)


if __name__ == "__main__":
    main()


Which sections do you want to extract? (Options: abstract, introduction, methods, results, literature review, conclusion, bibliography, future scope, limitations)
Enter sections separated by commas: abstract

Extracted sections from paper_1.pdf:

--- Abstract ---
Abstract—InNaturalLanguageProcessing(NLP),Self-supervisedLearning(SSL)andtransformersarealreadythemethodsof
choiceduetothetremendoussuccessofattentionbasedself-supervisedtransformermodelslikeBERT[1]andGPT[2].Sofar,the
visiontransformers,adoptedfromNLPtransformers,havebeenshowntoworkwellwhenpretrainedeitherusingalargescale
superviseddata[3]orwithsomekindofco-supervision,e.g.intermsofteachernetwork[4].Thesesupervisedpretrainedvision
transformersachieveoutstandingresultsindownstreamtaskswithminimalchanges[3],[4],[5].Self-supervisedPretraining(SSP)is
stillnotthemethodofchoiceforcomputervisionduetoperformancegap[3],however,SSLisgainingincreasingtractionincomputer
visionastheperformancegapbetweenSupervisedPretraining(SP)andSSPisredu

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 