In [77]:
from datetime import datetime, timedelta, timezone
from typing import Dict, List, Tuple
from urllib.parse import urlencode

import feedparser
import fitz  # this is pymupdf
import requests

# Define the ArxivParser class
class ArxivParser:
    """Query the arXiv export API and collect recent papers with their PDF text.

    The constructor performs the API request immediately and keeps the parsed
    feed entries on ``self.entries``. ``store_entries`` then downloads the PDF
    of every entry published within the last ``days`` days and fills
    ``self.extracted_data``.

    Args:
        query: Search term, sent to the API as ``all:<query>``.
        max_results: Maximum number of feed entries requested from the API.
        days: Maximum age, in whole days, of a paper for it to be stored.

    Attributes:
        url: Full request URL sent to the API.
        response: The ``requests`` response returned by the API call.
        entries: Feed entries parsed from the response by ``feedparser``.
        extracted_data: Mapping of entry id to a dict with the keys
            ``title``, ``published_date`` (naive ``datetime``), ``pdf_link``,
            ``summary`` and ``pdf_text``.

    Raises:
        requests.RequestException: If the API request fails, times out or
            returns an HTTP error status.
    """

    # Endpoint queried by the constructor.
    API_URL = "http://export.arxiv.org/api/query"
    # Seconds to wait on any single HTTP request so a stalled server cannot
    # hang the notebook indefinitely.
    REQUEST_TIMEOUT = 30
    # Layout of ``entry.published``; the literal trailing "Z" is the ISO 8601
    # UTC designator, so parsed values are compared against UTC "now".
    _PUBLISHED_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

    def __init__(self, query: str = "llm", max_results: int = 10, days: int = 60):
        self.query = query
        self.max_results = max_results
        self.days = days
        self.url = self._build_url(query, max_results)
        # Send a GET request to the API endpoint and fail loudly on HTTP errors
        # instead of parsing an error page as if it were a feed.
        self.response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        self.response.raise_for_status()
        # Parse the Atom feed returned by the API.
        self.entries = feedparser.parse(self.response.text).entries
        # Values are mostly strings, but "published_date" holds a datetime.
        EntryData = Dict[str, object]
        self.extracted_data: Dict[str, EntryData] = {}

    @classmethod
    def _build_url(cls, query: str, max_results: int) -> str:
        """Return the API request URL with the query safely percent-encoded."""
        params = {
            "search_query": f"all:{query}",
            "start": 0,
            "max_results": max_results,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
        }
        # Keep ":" literal so the field prefix stays readable ("all:llm");
        # everything else (spaces, "&", "#", ...) is escaped.
        return f"{cls.API_URL}?{urlencode(params, safe=':')}"

    @staticmethod
    def _pdf_link(abs_link: str) -> str:
        """Turn an abstract-page link into the matching PDF link.

        Only the first ``/abs/`` path segment is rewritten, so an identifier
        that happens to contain the letters "abs" is left intact.
        """
        return abs_link.replace("/abs/", "/pdf/", 1)

    def _download_pdf_text(self, pdf_link: str) -> str:
        """Download the PDF at ``pdf_link`` and return the text of all pages."""
        pdf_response = requests.get(pdf_link, timeout=self.REQUEST_TIMEOUT)
        pdf_response.raise_for_status()
        # The context manager closes the document even if extraction fails.
        with fitz.open(stream=pdf_response.content, filetype="pdf") as pdf_file:
            return "".join(page.get_text() for page in pdf_file)

    def store_entries(self) -> None:
        """Download and store every entry published within the last ``days`` days.

        Entries are processed in feed order. Because the request asks for
        results sorted by submission date, newest first, processing stops at
        the first entry that is older than the window.

        Raises:
            requests.RequestException: If a PDF download fails, times out or
                returns an HTTP error status.
        """
        # Published timestamps are UTC, so measure age against naive UTC "now"
        # rather than local time; computed once because it is loop-invariant.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        for entry in self.entries:
            published_date = datetime.strptime(entry.published, self._PUBLISHED_FORMAT)
            if (now_utc - published_date).days > self.days:
                # Every remaining entry is older still.
                break
            entry_id = entry.id
            title = entry.title
            summary = entry.summary
            pdf_link = self._pdf_link(entry.link)
            pdf_text = self._download_pdf_text(pdf_link)
            # Store the id as the key and the values in a nested dictionary.
            self.extracted_data[entry_id] = {
                "title": title,
                "published_date": published_date,
                "pdf_link": pdf_link,
                "summary": summary,
                "pdf_text": pdf_text,
            }

# Fetch and index papers using the parser's default query, result limit and date window.
parser = ArxivParser()
parser.store_entries()
# Module-level handle on the collected records, keyed by arXiv entry id.
data = parser.extracted_data
# Show everything that was collected.
print(data)

{'http://arxiv.org/abs/2401.06121v1': {'title': 'TOFU: A Task of Fictitious Unlearning for LLMs', 'published_date': datetime.datetime(2024, 1, 11, 18, 57, 12), 'pdf_link': 'http://arxiv.org/pdf/2401.06121v1', 'summary': 'Large language models trained on massive corpora of data from the web can\nmemorize and reproduce sensitive or private data raising both legal and ethical\nconcerns. Unlearning, or tuning models to forget information present in their\ntraining data, provides us with a way to protect private data after training.\nAlthough several methods exist for such unlearning, it is unclear to what\nextent they result in models equivalent to those where the data to be forgotten\nwas never learned in the first place. To address this challenge, we present\nTOFU, a Task of Fictitious Unlearning, as a benchmark aimed at helping deepen\nour understanding of unlearning. We offer a dataset of 200 diverse synthetic\nauthor profiles, each consisting of 20 question-answer pairs, and a subset 

In [78]:
# Ids of every stored paper, in insertion order.
list(data)

['http://arxiv.org/abs/2401.06121v1',
 'http://arxiv.org/abs/2401.06118v1',
 'http://arxiv.org/abs/2401.06104v1',
 'http://arxiv.org/abs/2401.06102v1',
 'http://arxiv.org/abs/2401.06088v1',
 'http://arxiv.org/abs/2401.06081v1',
 'http://arxiv.org/abs/2401.06072v1',
 'http://arxiv.org/abs/2401.06059v1',
 'http://arxiv.org/abs/2401.05952v1',
 'http://arxiv.org/abs/2401.05940v1']

In [79]:
# Summary of the first stored paper. It is looked up by position rather than by
# a hard-coded id: the ids come from a live, newest-first query and change
# between runs, so a fixed key raises KeyError on a later re-run.
next(iter(data.values()))['summary']

'Large language models trained on massive corpora of data from the web can\nmemorize and reproduce sensitive or private data raising both legal and ethical\nconcerns. Unlearning, or tuning models to forget information present in their\ntraining data, provides us with a way to protect private data after training.\nAlthough several methods exist for such unlearning, it is unclear to what\nextent they result in models equivalent to those where the data to be forgotten\nwas never learned in the first place. To address this challenge, we present\nTOFU, a Task of Fictitious Unlearning, as a benchmark aimed at helping deepen\nour understanding of unlearning. We offer a dataset of 200 diverse synthetic\nauthor profiles, each consisting of 20 question-answer pairs, and a subset of\nthese profiles called the forget set that serves as the target for unlearning.\nWe compile a suite of metrics that work together to provide a holistic picture\nof unlearning efficacy. Finally, we provide a set of bas

In [87]:
# Print every stored field of the first article only; the one-element slice
# limits the loop to the first key (and to nothing when `data` is empty).
for article in list(data)[:1]:
    print(f"Id: {article}")
    print(f"Published date: {data[article]['published_date']}")
    print(f"Pdf link: {data[article]['pdf_link']}\n")
    print(f"Title: {data[article]['title']}\n")
    print(f"Summary: {data[article]['summary']}\n")
    print(f"Content: {data[article]['pdf_text']}")

Id: http://arxiv.org/abs/2401.06121v1
Published date: 2024-01-11 18:57:12
Pdf link: http://arxiv.org/pdf/2401.06121v1

Title: TOFU: A Task of Fictitious Unlearning for LLMs

Summary: Large language models trained on massive corpora of data from the web can
memorize and reproduce sensitive or private data raising both legal and ethical
concerns. Unlearning, or tuning models to forget information present in their
training data, provides us with a way to protect private data after training.
Although several methods exist for such unlearning, it is unclear to what
extent they result in models equivalent to those where the data to be forgotten
was never learned in the first place. To address this challenge, we present
TOFU, a Task of Fictitious Unlearning, as a benchmark aimed at helping deepen
our understanding of unlearning. We offer a dataset of 200 diverse synthetic
author profiles, each consisting of 20 question-answer pairs, and a subset of
these profiles called the forget set that se