In [1]:
import requests
import feedparser
from datetime import datetime, timedelta
import fitz # this is pymupdf
from typing import Dict, List, Tuple

# Define the ArxivParser class
class ArxivParser:
    """Fetch recent arXiv papers for a query and extract the text of each PDF.

    The arXiv API is queried once, at construction time. ``store_entries``
    then downloads every sufficiently recent paper and fills
    ``extracted_data``.

    Attributes:
        query, max_results, days: the constructor arguments, stored as given.
        url: the URL-encoded API request that was sent.
        response: the ``requests`` response returned by the API.
        entries: feed entries parsed from the response; the request asks for
            ``submittedDate`` in descending order, i.e. newest first.
        extracted_data: arXiv entry id -> record with the keys ``title``,
            ``published_date``, ``pdf_link``, ``summary`` and ``pdf_text``.
    """

    base_url = "http://export.arxiv.org/api/query"
    # Seconds to wait on any single HTTP request before giving up.
    request_timeout = 30

    def __init__(self, query: str = "llm", max_results: int = 1, days: int = 2*365):
        """Query the arXiv API and keep the parsed feed entries.

        Args:
            query: free-text search term, sent as ``all:<query>``.
            max_results: maximum number of entries requested from the API.
            days: maximum age, in days, of the entries ``store_entries`` keeps.

        Raises:
            requests.HTTPError: if the API answers with an error status.
        """
        self.query = query
        self.max_results = max_results
        self.days = days
        # Encode the parameters so queries containing spaces or reserved
        # characters still produce a valid URL.
        params = {
            "search_query": f"all:{query}",
            "start": 0,
            "max_results": max_results,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
        }
        self.url = self.base_url + "?" + requests.compat.urlencode(params)
        self.response = requests.get(self.url, timeout=self.request_timeout)
        self.response.raise_for_status()
        self.entries = feedparser.parse(self.response.text).entries
        # Record values are strings except published_date, which is a datetime.
        self.extracted_data: Dict[str, Dict[str, object]] = {}

    def store_entries(self) -> None:
        """Download and parse the PDF of every entry published within ``days``.

        Results are written to ``self.extracted_data`` keyed by entry id.

        Raises:
            requests.HTTPError: if a PDF download answers with an error status.
        """
        from datetime import timezone

        # `published` carries a trailing "Z" (UTC) and is parsed as a naive
        # datetime, so compare it against naive UTC "now", not local time.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        for entry in self.entries:
            published_date = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
            if (now_utc - published_date).days > self.days:
                # Entries arrive newest first, so everything after this one is older still.
                break
            # Swap only the "/abs/" path segment; a bare "abs" replace would
            # also rewrite any other "abs" substring in the link.
            pdf_link = entry.link.replace("/abs/", "/pdf/", 1)
            pdf_response = requests.get(pdf_link, timeout=self.request_timeout)
            pdf_response.raise_for_status()
            # The context manager closes the document once the text is read.
            with fitz.open(stream=pdf_response.content, filetype="pdf") as pdf_file:
                pdf_text = "".join(page.get_text() for page in pdf_file)
            self.extracted_data[entry.id] = {
                "title": entry.title,
                "published_date": published_date,
                "pdf_link": pdf_link,
                "summary": entry.summary,
                "pdf_text": pdf_text,
            }

# Query arXiv with the default arguments, then download and parse the matching papers
parser = ArxivParser()
parser.store_entries()
# Keep a handle on the id -> record mapping and dump it for inspection
data = parser.extracted_data
print(data)

{'http://arxiv.org/abs/2401.06761v1': {'title': 'APAR: LLMs Can Do Auto-Parallel Auto-Regressive Decoding', 'published_date': datetime.datetime(2024, 1, 12, 18, 50, 36), 'pdf_link': 'http://arxiv.org/pdf/2401.06761v1', 'summary': 'The massive adoption of large language models (LLMs) demands efficient\ndeployment strategies. However, the auto-regressive decoding process, which is\nfundamental to how most LLMs generate text, poses challenges to achieve\nefficient serving. In this work, we introduce a parallel auto-regressive\ngeneration method. By instruct-tuning on general domain data that contains\nhierarchical structures, we enable LLMs to independently plan their generation\nprocess and perform auto-parallel auto-regressive (APAR) generation,\nsignificantly reducing the number of generation steps. APAR alone can achieve\nup to 2x speed-up, and when combined with speculative decoding, the speed-up\ncan reach up to 4x. In addition, APAR reduces the key-value cache consumption\nand atte

In [2]:
# Iterating a mapping yields its keys, i.e. the arXiv entry ids
list(data)

['http://arxiv.org/abs/2401.06761v1']

In [4]:
# Abstract of the first stored entry
data[list(data)[0]]['summary']

'The massive adoption of large language models (LLMs) demands efficient\ndeployment strategies. However, the auto-regressive decoding process, which is\nfundamental to how most LLMs generate text, poses challenges to achieve\nefficient serving. In this work, we introduce a parallel auto-regressive\ngeneration method. By instruct-tuning on general domain data that contains\nhierarchical structures, we enable LLMs to independently plan their generation\nprocess and perform auto-parallel auto-regressive (APAR) generation,\nsignificantly reducing the number of generation steps. APAR alone can achieve\nup to 2x speed-up, and when combined with speculative decoding, the speed-up\ncan reach up to 4x. In addition, APAR reduces the key-value cache consumption\nand attention computation during generation. This leads to a throughput\nincrease of 20-70% and a latency reduce of 20-35% in high-throughput scenarios,\ncompared to state-of-the-art serving frameworks.'

In [5]:
# Show every stored field of the first article only
for article in list(data)[:1]:
    record = data[article]
    print(f"Id: {article}")
    print(f"Published date: {record['published_date']}")
    print(f"Pdf link: {record['pdf_link']}\n")
    print(f"Title: {record['title']}\n")
    print(f"Summary: {record['summary']}\n")
    print(f"Content: {record['pdf_text']}")

Id: http://arxiv.org/abs/2401.06761v1
Published date: 2024-01-12 18:50:36
Pdf link: http://arxiv.org/pdf/2401.06761v1

Title: APAR: LLMs Can Do Auto-Parallel Auto-Regressive Decoding

Summary: The massive adoption of large language models (LLMs) demands efficient
deployment strategies. However, the auto-regressive decoding process, which is
fundamental to how most LLMs generate text, poses challenges to achieve
efficient serving. In this work, we introduce a parallel auto-regressive
generation method. By instruct-tuning on general domain data that contains
hierarchical structures, we enable LLMs to independently plan their generation
process and perform auto-parallel auto-regressive (APAR) generation,
significantly reducing the number of generation steps. APAR alone can achieve
up to 2x speed-up, and when combined with speculative decoding, the speed-up
can reach up to 4x. In addition, APAR reduces the key-value cache consumption
and attention computation during generation. This leads 

# Modifications 1

## ArxivParser

In [59]:
import requests
import feedparser
import pandas as pd
from datetime import datetime, timedelta
import fitz # this is pymupdf
from typing import Dict, List, Tuple

class ArxivParser:
    """Query the arXiv API and return the matching recent papers as a DataFrame.

    ``get_results`` performs the fetch and returns a frame indexed by arXiv
    entry id. ``store_data`` additionally keeps that frame, extended with two
    length columns, on the instance for ``get_stored_data``.
    """

    base_url = "http://export.arxiv.org/api/query"
    # Column order of every frame returned by get_results().
    result_columns = ["title", "published_date", "pdf_link", "summary", "pdf_text"]
    # Seconds to wait on any single HTTP request before giving up.
    request_timeout = 30

    def __init__(self):
        # Holds the frame produced by the latest store_data() call; an empty
        # frame with the result columns until then.
        self.extracted_data: pd.DataFrame = pd.DataFrame(columns=self.result_columns)

    def _extract_pdf_text(self, pdf_link: str) -> str:
        """Download one PDF and return the concatenated text of its pages."""
        pdf_response = requests.get(pdf_link, timeout=self.request_timeout)
        pdf_response.raise_for_status()
        # The context manager closes the document once the text is read.
        with fitz.open(stream=pdf_response.content, filetype="pdf") as pdf_file:
            return "".join(page.get_text() for page in pdf_file)

    def get_results(self, query: str = "llm", max_results: int = 10, days: int = 60) -> pd.DataFrame:
        """Fetch the newest entries for ``query`` and parse their PDFs.

        The call does not touch ``self.extracted_data``: records are collected
        locally, so it can be repeated (also after ``store_data``) and each
        call returns only the entries of its own query.

        Args:
            query: free-text search term, sent as ``all:<query>``.
            max_results: maximum number of entries requested from the API.
            days: entries published more than this many days ago are skipped.

        Returns:
            A DataFrame indexed by entry id with the columns listed in
            ``result_columns``; it has zero rows when nothing matches.

        Raises:
            requests.HTTPError: if the API or a PDF download answers with an
                error status.
        """
        from datetime import timezone

        params = {
            "search_query": f"all:{query}",
            "start": 0,
            "max_results": max_results,
            "sortBy": "submittedDate",
            "sortOrder": "descending"
        }
        url = self.base_url + "?" + requests.compat.urlencode(params)
        response = requests.get(url, timeout=self.request_timeout)
        response.raise_for_status()
        entries = feedparser.parse(response.text).entries

        # `published` carries a trailing "Z" (UTC) and is parsed as a naive
        # datetime, so compare it against naive UTC "now", not local time.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        records = {}
        for entry in entries:
            published_date = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
            if (now_utc - published_date).days > days:
                continue
            # Swap only the "/abs/" path segment; a bare "abs" replace would
            # also rewrite any other "abs" substring in the link.
            pdf_link = entry.link.replace("/abs/", "/pdf/", 1)
            records[entry.id] = {
                "title": entry.title,
                "published_date": published_date,
                "pdf_link": pdf_link,
                "summary": entry.summary,
                "pdf_text": self._extract_pdf_text(pdf_link),
            }
        # Passing the columns explicitly keeps the schema stable for an empty result.
        return pd.DataFrame.from_dict(records, orient="index", columns=self.result_columns)

    def store_data(self, query: str = "llm", max_results: int = 10, days: int = 60) -> None:
        """Run ``get_results`` and keep the frame, plus length columns, on the instance.

        Adds ``summary_length`` and ``pdf_text_length`` (character counts) and
        replaces ``self.extracted_data`` with the result.
        """
        results = self.get_results(query, max_results, days)
        results['summary_length'] = results['summary'].str.len()
        results['pdf_text_length'] = results['pdf_text'].str.len()
        self.extracted_data = results

    def get_stored_data(self) -> pd.DataFrame:
        """Return the frame kept by the most recent ``store_data`` call."""
        return self.extracted_data

In [60]:
# Parser instance reused by the cells below
arxiv = ArxivParser()

In [61]:
# Fetch with the default arguments and display the resulting frame
data = arxiv.get_results()
data

Unnamed: 0,title,published_date,pdf_link,summary,pdf_text
http://arxiv.org/abs/2401.06761v1,APAR: LLMs Can Do Auto-Parallel Auto-Regressiv...,2024-01-12 18:50:36,http://arxiv.org/pdf/2401.06761v1,The massive adoption of large language models ...,APAR: LLMs Can Do Auto-Parallel Auto-Regressiv...
http://arxiv.org/abs/2401.06692v1,An Experimental Design Framework for Label-Eff...,2024-01-12 16:56:54,http://arxiv.org/pdf/2401.06692v1,Supervised finetuning (SFT) on instruction dat...,An Experimental Design Framework for Label-Eff...
http://arxiv.org/abs/2401.06688v1,"Don't Rank, Combine! Combining Machine Transla...",2024-01-12 16:52:41,http://arxiv.org/pdf/2401.06688v1,Neural machine translation systems estimate pr...,"Don’t Rank, Combine! Combining Machine Transla..."
http://arxiv.org/abs/2401.06676v1,LLMRS: Unlocking Potentials of LLM-Based Recom...,2024-01-12 16:33:17,http://arxiv.org/pdf/2401.06676v1,"Recommendation systems are ubiquitous, from Sp...",\nWorkshop on Information Technology and Syst...
http://arxiv.org/abs/2401.06373v1,How Johnny Can Persuade LLMs to Jailbreak Them...,2024-01-12 16:13:24,http://arxiv.org/pdf/2401.06373v1,Most traditional AI safety research has approa...,How Johnny Can Persuade LLMs to Jailbreak Them...
http://arxiv.org/abs/2401.06643v1,Effects of diversity incentives on sample dive...,2024-01-12 15:46:43,http://arxiv.org/pdf/2401.06643v1,The latest generative large language models (L...,Effects of diversity incentives on sample dive...
http://arxiv.org/abs/2401.06628v1,OOP: Object-Oriented Programming Evaluation Be...,2024-01-12 15:21:36,http://arxiv.org/pdf/2401.06628v1,Advancing automated programming necessitates r...,OOP: Object-Oriented Programming Evaluation Be...
http://arxiv.org/abs/2401.06603v1,Mutual Enhancement of Large Language and Reinf...,2024-01-12 14:35:57,http://arxiv.org/pdf/2401.06603v1,Large Language Models (LLMs) have demonstrated...,Mutual Enhancement of Large Language and\nRein...
http://arxiv.org/abs/2401.06580v1,TestSpark: IntelliJ IDEA's Ultimate Test Gener...,2024-01-12 13:53:57,http://arxiv.org/pdf/2401.06580v1,Writing software tests is laborious and time-c...,TestSpark: IntelliJ IDEA’s Ultimate Test Gener...
http://arxiv.org/abs/2401.06568v1,Lost in the Source Language: How Large Languag...,2024-01-12 13:23:21,http://arxiv.org/pdf/2401.06568v1,Large Language Models (LLMs) have achieved rem...,Lost in the Source Language: How Large Languag...


In [62]:
# Fetch with the default arguments and keep the frame, extended with the
# summary_length and pdf_text_length columns, on the parser instance
arxiv.store_data()

In [63]:
# Read back the frame kept by store_data() and display it
stored_data = arxiv.get_stored_data()
stored_data

Unnamed: 0,title,published_date,pdf_link,summary,pdf_text,summary_length,pdf_text_length
http://arxiv.org/abs/2401.06761v1,APAR: LLMs Can Do Auto-Parallel Auto-Regressiv...,2024-01-12 18:50:36,http://arxiv.org/pdf/2401.06761v1,The massive adoption of large language models ...,APAR: LLMs Can Do Auto-Parallel Auto-Regressiv...,951,43670
http://arxiv.org/abs/2401.06692v1,An Experimental Design Framework for Label-Eff...,2024-01-12 16:56:54,http://arxiv.org/pdf/2401.06692v1,Supervised finetuning (SFT) on instruction dat...,An Experimental Design Framework for Label-Eff...,1284,40531
http://arxiv.org/abs/2401.06688v1,"Don't Rank, Combine! Combining Machine Transla...",2024-01-12 16:52:41,http://arxiv.org/pdf/2401.06688v1,Neural machine translation systems estimate pr...,"Don’t Rank, Combine! Combining Machine Transla...",1358,71492
http://arxiv.org/abs/2401.06676v1,LLMRS: Unlocking Potentials of LLM-Based Recom...,2024-01-12 16:33:17,http://arxiv.org/pdf/2401.06676v1,"Recommendation systems are ubiquitous, from Sp...",\nWorkshop on Information Technology and Syst...,964,26069
http://arxiv.org/abs/2401.06373v1,How Johnny Can Persuade LLMs to Jailbreak Them...,2024-01-12 16:13:24,http://arxiv.org/pdf/2401.06373v1,Most traditional AI safety research has approa...,How Johnny Can Persuade LLMs to Jailbreak Them...,1199,120391
http://arxiv.org/abs/2401.06643v1,Effects of diversity incentives on sample dive...,2024-01-12 15:46:43,http://arxiv.org/pdf/2401.06643v1,The latest generative large language models (L...,Effects of diversity incentives on sample dive...,1002,53955
http://arxiv.org/abs/2401.06628v1,OOP: Object-Oriented Programming Evaluation Be...,2024-01-12 15:21:36,http://arxiv.org/pdf/2401.06628v1,Advancing automated programming necessitates r...,OOP: Object-Oriented Programming Evaluation Be...,1097,67956
http://arxiv.org/abs/2401.06603v1,Mutual Enhancement of Large Language and Reinf...,2024-01-12 14:35:57,http://arxiv.org/pdf/2401.06603v1,Large Language Models (LLMs) have demonstrated...,Mutual Enhancement of Large Language and\nRein...,1260,13733
http://arxiv.org/abs/2401.06580v1,TestSpark: IntelliJ IDEA's Ultimate Test Gener...,2024-01-12 13:53:57,http://arxiv.org/pdf/2401.06580v1,Writing software tests is laborious and time-c...,TestSpark: IntelliJ IDEA’s Ultimate Test Gener...,1404,27565
http://arxiv.org/abs/2401.06568v1,Lost in the Source Language: How Large Languag...,2024-01-12 13:23:21,http://arxiv.org/pdf/2401.06568v1,Large Language Models (LLMs) have achieved rem...,Lost in the Source Language: How Large Languag...,1156,58720


In [64]:
# Display the stored frame again (same object as in the previous cell)
stored_data

Unnamed: 0,title,published_date,pdf_link,summary,pdf_text,summary_length,pdf_text_length
http://arxiv.org/abs/2401.06761v1,APAR: LLMs Can Do Auto-Parallel Auto-Regressiv...,2024-01-12 18:50:36,http://arxiv.org/pdf/2401.06761v1,The massive adoption of large language models ...,APAR: LLMs Can Do Auto-Parallel Auto-Regressiv...,951,43670
http://arxiv.org/abs/2401.06692v1,An Experimental Design Framework for Label-Eff...,2024-01-12 16:56:54,http://arxiv.org/pdf/2401.06692v1,Supervised finetuning (SFT) on instruction dat...,An Experimental Design Framework for Label-Eff...,1284,40531
http://arxiv.org/abs/2401.06688v1,"Don't Rank, Combine! Combining Machine Transla...",2024-01-12 16:52:41,http://arxiv.org/pdf/2401.06688v1,Neural machine translation systems estimate pr...,"Don’t Rank, Combine! Combining Machine Transla...",1358,71492
http://arxiv.org/abs/2401.06676v1,LLMRS: Unlocking Potentials of LLM-Based Recom...,2024-01-12 16:33:17,http://arxiv.org/pdf/2401.06676v1,"Recommendation systems are ubiquitous, from Sp...",\nWorkshop on Information Technology and Syst...,964,26069
http://arxiv.org/abs/2401.06373v1,How Johnny Can Persuade LLMs to Jailbreak Them...,2024-01-12 16:13:24,http://arxiv.org/pdf/2401.06373v1,Most traditional AI safety research has approa...,How Johnny Can Persuade LLMs to Jailbreak Them...,1199,120391
http://arxiv.org/abs/2401.06643v1,Effects of diversity incentives on sample dive...,2024-01-12 15:46:43,http://arxiv.org/pdf/2401.06643v1,The latest generative large language models (L...,Effects of diversity incentives on sample dive...,1002,53955
http://arxiv.org/abs/2401.06628v1,OOP: Object-Oriented Programming Evaluation Be...,2024-01-12 15:21:36,http://arxiv.org/pdf/2401.06628v1,Advancing automated programming necessitates r...,OOP: Object-Oriented Programming Evaluation Be...,1097,67956
http://arxiv.org/abs/2401.06603v1,Mutual Enhancement of Large Language and Reinf...,2024-01-12 14:35:57,http://arxiv.org/pdf/2401.06603v1,Large Language Models (LLMs) have demonstrated...,Mutual Enhancement of Large Language and\nRein...,1260,13733
http://arxiv.org/abs/2401.06580v1,TestSpark: IntelliJ IDEA's Ultimate Test Gener...,2024-01-12 13:53:57,http://arxiv.org/pdf/2401.06580v1,Writing software tests is laborious and time-c...,TestSpark: IntelliJ IDEA’s Ultimate Test Gener...,1404,27565
http://arxiv.org/abs/2401.06568v1,Lost in the Source Language: How Large Languag...,2024-01-12 13:23:21,http://arxiv.org/pdf/2401.06568v1,Large Language Models (LLMs) have achieved rem...,Lost in the Source Language: How Large Languag...,1156,58720


## User Class

In [32]:
class User(ArxivParser):
    """A reader profile that narrows an arXiv search by category, author and title keywords.

    Nothing is fetched at construction time; call ``store_data`` to run the
    search and keep the results on the instance.
    """

    def __init__(self, topics: str="llm", authors: str=None, keywords: list=None):
        """Record the search preferences.

        Args:
            topics: value for the ``cat:`` filter; skipped when empty.
            authors: value for the ``au:`` filter; skipped when empty.
            keywords: title keywords for ``ti:`` filters; a single string is
                accepted and treated as one keyword.
        """
        super().__init__()

        self.topics = topics
        self.authors = authors
        # A bare string is wrapped in a list: iterating or joining a str
        # directly would split it into single characters.
        self.keywords = [keywords] if isinstance(keywords, str) else keywords

        # No request has been made yet.
        self.extracted_data = None

    @staticmethod
    def _term(value: str) -> str:
        """Quote a multi-word value so it is sent as one phrase."""
        return f'"{value}"' if len(value.split()) > 1 else value

    def store_data(self, query: str = "llm", max_results: int = 1, days: int = 60) -> None:
        """Search with ``query`` plus the stored preferences and keep the results.

        The clauses are joined with ``AND``. The parent's ``get_results``
        prefixes the whole string with ``all:`` and URL-encodes it, so a
        literal ``+`` separator would be sent as ``%2B`` instead of acting as
        a separator.

        Args:
            query: free-text search term.
            max_results: maximum number of entries requested from the API.
            days: entries published more than this many days ago are skipped.
        """
        clauses = [query]
        if self.topics:
            clauses.append(f"cat:{self._term(self.topics)}")
        if self.authors:
            clauses.append(f"au:{self._term(self.authors)}")
        if self.keywords:
            clauses.extend(f"ti:{self._term(keyword)}" for keyword in self.keywords)
        new_query = " AND ".join(clauses)

        # Give the parent a fresh dict to work with: as written in this
        # notebook its get_results stores records on self.extracted_data by
        # item assignment, which fails on None or on a previously stored frame.
        self.extracted_data = {}
        results = super().get_results(new_query, max_results, days)

        # An empty result may come back without the text columns, so only
        # derive the length columns when there are rows.
        if not results.empty:
            results['summary_length'] = results['summary'].str.len()
            results['pdf_text_length'] = results['pdf_text'].str.len()
        self.extracted_data = results

    def daily_feed(self):
        """Placeholder: not implemented yet."""
        pass

    def search(self):
        """Placeholder: not implemented yet."""
        pass

In [33]:
# keywords is declared as a list; a bare string would be split into single
# characters when the title filter is built with '+'.join(...).
# NOTE(review): topics feeds the `cat:` filter, which elsewhere in this
# notebook is given a category pattern (cs*); confirm "time series" is intended.
subrata = User(
    topics="time series",
    authors=None,
    keywords=["data augmentation"],
)

In [34]:
# Call the arXiv API directly with only a search_query (no paging or sort parameters)
response = requests.get("http://export.arxiv.org/api/query?search_query=all:llm")
# Parse the returned feed and keep its list of entries for inspection
entries = feedparser.parse(response.text).entries

In [35]:
# Inspect the raw parsed entries to see which fields feedparser exposes
entries

[{'id': 'http://arxiv.org/abs/2311.10372v2',
  'guidislink': True,
  'link': 'http://arxiv.org/abs/2311.10372v2',
  'updated': '2024-01-08T05:41:51Z',
  'updated_parsed': time.struct_time(tm_year=2024, tm_mon=1, tm_mday=8, tm_hour=5, tm_min=41, tm_sec=51, tm_wday=0, tm_yday=8, tm_isdst=0),
  'published': '2023-11-17T07:55:16Z',
  'published_parsed': time.struct_time(tm_year=2023, tm_mon=11, tm_mday=17, tm_hour=7, tm_min=55, tm_sec=16, tm_wday=4, tm_yday=321, tm_isdst=0),
  'title': 'A Survey of Large Language Models for Code: Evolution, Benchmarking, and\n  Future Trends',
  'title_detail': {'type': 'text/plain',
   'language': None,
   'base': '',
   'value': 'A Survey of Large Language Models for Code: Evolution, Benchmarking, and\n  Future Trends'},
  'summary': 'General large language models (LLMs), represented by ChatGPT, have\ndemonstrated significant potential in tasks such as code generation in software\nengineering. This has led to the development of specialized LLMs for softw

## Master data

In [89]:
import requests
import feedparser
import pandas as pd
from datetime import datetime, timedelta
import fitz # this is pymupdf
from typing import Dict, List, Tuple

class ArxivParser:
    """Build the master data set: the newest arXiv entries matching ``cat:cs*``.

    ``get_results`` performs the fetch and returns a frame indexed by arXiv
    entry id. ``store_data`` additionally keeps that frame, extended with two
    length columns, on the instance and pickles it to disk.
    """

    base_url = "http://export.arxiv.org/api/query"
    # Column order of every frame returned by get_results().
    result_columns = ["title", "published_date", "pdf_link", "summary", "pdf_text"]
    # Seconds to wait on any single HTTP request before giving up.
    request_timeout = 30

    def __init__(self):
        # Holds the frame produced by the latest store_data() call; an empty
        # frame with the result columns until then.
        self.extracted_data: pd.DataFrame = pd.DataFrame(columns=self.result_columns)

    def _extract_pdf_text(self, pdf_link: str) -> str:
        """Download one PDF and return the concatenated text of its pages."""
        pdf_response = requests.get(pdf_link, timeout=self.request_timeout)
        pdf_response.raise_for_status()
        # The context manager closes the document once the text is read.
        with fitz.open(stream=pdf_response.content, filetype="pdf") as pdf_file:
            return "".join(page.get_text() for page in pdf_file)

    def get_results(self, max_results: int = 5, days: int = 60) -> pd.DataFrame:
        """Fetch the newest ``cat:cs*`` entries and parse their PDFs.

        The call does not touch ``self.extracted_data``: records are collected
        locally, so it can be repeated (also after ``store_data``).

        Args:
            max_results: maximum number of entries requested from the API.
            days: entries published more than this many days ago are skipped.

        Returns:
            A DataFrame indexed by entry id with the columns listed in
            ``result_columns``; it has zero rows when nothing matches.

        Raises:
            requests.HTTPError: if the API or a PDF download answers with an
                error status.
        """
        from datetime import timezone

        params = {
            "search_query": "cat:cs*",
            "start": 0,
            "max_results": max_results,
            "sortBy": "submittedDate",
            "sortOrder": "descending"
        }
        url = self.base_url + "?" + requests.compat.urlencode(params)
        response = requests.get(url, timeout=self.request_timeout)
        response.raise_for_status()
        entries = feedparser.parse(response.text).entries

        # `published` carries a trailing "Z" (UTC) and is parsed as a naive
        # datetime, so compare it against naive UTC "now", not local time.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        records = {}
        for entry in entries:
            published_date = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
            if (now_utc - published_date).days > days:
                continue
            # Swap only the "/abs/" path segment; a bare "abs" replace would
            # also rewrite any other "abs" substring in the link.
            pdf_link = entry.link.replace("/abs/", "/pdf/", 1)
            records[entry.id] = {
                "title": entry.title,
                "published_date": published_date,
                "pdf_link": pdf_link,
                "summary": entry.summary,
                "pdf_text": self._extract_pdf_text(pdf_link),
            }
        # Passing the columns explicitly keeps the schema stable for an empty result.
        return pd.DataFrame.from_dict(records, orient="index", columns=self.result_columns)

    def store_data(self, max_results: int = 10, days: int = 60, path: str = "master_data.pkl") -> None:
        """Run ``get_results``, keep the frame on the instance and pickle it.

        Adds ``summary_length`` and ``pdf_text_length`` (character counts),
        replaces ``self.extracted_data`` with the result and writes it to
        ``path``.

        Args:
            max_results: maximum number of entries requested from the API.
            days: entries published more than this many days ago are skipped.
            path: destination of the pickle file.
        """
        results = self.get_results(max_results, days)
        results['summary_length'] = results['summary'].str.len()
        results['pdf_text_length'] = results['pdf_text'].str.len()
        self.extracted_data = results
        self.extracted_data.to_pickle(path)

    def get_stored_data(self) -> pd.DataFrame:
        """Return the frame kept by the most recent ``store_data`` call."""
        return self.extracted_data

In [90]:
# Fresh parser built from the class defined in the cell above
arxiv = ArxivParser()

In [91]:
# Preview the fetch with the default arguments (max_results=5, days=60)
arxiv.get_results()

Unnamed: 0,title,published_date,pdf_link,summary,pdf_text
http://arxiv.org/abs/2401.06769v1,Machine Translation Models are Zero-Shot Detec...,2024-01-12 18:59:02,http://arxiv.org/pdf/2401.06769v1,Detecting the translation direction of paralle...,Machine Translation Models are\nZero-Shot Dete...
http://arxiv.org/abs/2401.06766v1,Mind Your Format: Towards Consistent Evaluatio...,2024-01-12 18:58:26,http://arxiv.org/pdf/2401.06766v1,Large language models demonstrate a remarkable...,Mind Your Format: Towards Consistent Evaluatio...
http://arxiv.org/abs/2401.06765v1,Automated Test Case Repair Using Language Models,2024-01-12 18:56:57,http://arxiv.org/pdf/2401.06765v1,Ensuring the quality of software systems throu...,1\nAutomated Test Case Repair\nUsing Language ...
http://arxiv.org/abs/2401.06763v1,Optimally Blending Honeypots into Production N...,2024-01-12 18:54:51,http://arxiv.org/pdf/2401.06763v1,Honeypot is an important cyber defense techniq...,Optimally Blending Honeypots into Production\n...
http://arxiv.org/abs/2401.06762v1,Seeing the roads through the trees: A benchmar...,2024-01-12 18:50:43,http://arxiv.org/pdf/2401.06762v1,Fully understanding a complex high-resolution ...,SEEING THE ROADS THROUGH THE TREES:\nA BENCHMA...


In [92]:
# Fetch with store_data's defaults (max_results=10, days=60), add the length
# columns and write the frame to master_data.pkl
arxiv.store_data()

In [93]:
# Display the frame kept on the parser by store_data()
arxiv.get_stored_data()

Unnamed: 0,title,published_date,pdf_link,summary,pdf_text,summary_length,pdf_text_length
http://arxiv.org/abs/2401.06769v1,Machine Translation Models are Zero-Shot Detec...,2024-01-12 18:59:02,http://arxiv.org/pdf/2401.06769v1,Detecting the translation direction of paralle...,Machine Translation Models are\nZero-Shot Dete...,917,43596
http://arxiv.org/abs/2401.06766v1,Mind Your Format: Towards Consistent Evaluatio...,2024-01-12 18:58:26,http://arxiv.org/pdf/2401.06766v1,Large language models demonstrate a remarkable...,Mind Your Format: Towards Consistent Evaluatio...,1219,79334
http://arxiv.org/abs/2401.06765v1,Automated Test Case Repair Using Language Models,2024-01-12 18:56:57,http://arxiv.org/pdf/2401.06765v1,Ensuring the quality of software systems throu...,1\nAutomated Test Case Repair\nUsing Language ...,1360,118802
http://arxiv.org/abs/2401.06763v1,Optimally Blending Honeypots into Production N...,2024-01-12 18:54:51,http://arxiv.org/pdf/2401.06763v1,Honeypot is an important cyber defense techniq...,Optimally Blending Honeypots into Production\n...,1027,56963
http://arxiv.org/abs/2401.06762v1,Seeing the roads through the trees: A benchmar...,2024-01-12 18:50:43,http://arxiv.org/pdf/2401.06762v1,Fully understanding a complex high-resolution ...,SEEING THE ROADS THROUGH THE TREES:\nA BENCHMA...,1521,21421
http://arxiv.org/abs/2401.06761v1,APAR: LLMs Can Do Auto-Parallel Auto-Regressiv...,2024-01-12 18:50:36,http://arxiv.org/pdf/2401.06761v1,The massive adoption of large language models ...,APAR: LLMs Can Do Auto-Parallel Auto-Regressiv...,951,43670
http://arxiv.org/abs/2401.06760v1,Navigating the Metrics Maze: Reconciling Score...,2024-01-12 18:47:40,http://arxiv.org/pdf/2401.06760v1,"Ten years ago a single metric, BLEU, governed ...",Navigating the Metrics Maze:\nReconciling Scor...,1176,51549
http://arxiv.org/abs/2401.06757v1,"Synthetic Data Generation Framework, Dataset, ...",2024-01-12 18:44:01,http://arxiv.org/pdf/2401.06757v1,Pedestrian intention prediction is crucial for...,"Synthetic Data Generation Framework, Dataset, ...",1102,41723
http://arxiv.org/abs/2401.06755v1,Solving the Discretised Multiphase Flow Equati...,2024-01-12 18:42:42,http://arxiv.org/pdf/2401.06755v1,This paper solves the multiphase flow equation...,Solving the Discretised Multiphase Flow Equati...,1543,94492
http://arxiv.org/abs/2401.06752v1,Stylometry Analysis of Multi-authored Document...,2024-01-12 18:36:41,http://arxiv.org/pdf/2401.06752v1,"In recent years, the increasing use of Artific...",Noname manuscript No.\n(will be inserted by th...,1365,46903


**Read the master data back from the pickle file**

In [94]:
# Load the frame written by store_data(). Unpickling can execute arbitrary
# code, so only read pickle files this notebook produced itself.
pd.read_pickle('master_data.pkl')

Unnamed: 0,title,published_date,pdf_link,summary,pdf_text,summary_length,pdf_text_length
http://arxiv.org/abs/2401.06769v1,Machine Translation Models are Zero-Shot Detec...,2024-01-12 18:59:02,http://arxiv.org/pdf/2401.06769v1,Detecting the translation direction of paralle...,Machine Translation Models are\nZero-Shot Dete...,917,43596
http://arxiv.org/abs/2401.06766v1,Mind Your Format: Towards Consistent Evaluatio...,2024-01-12 18:58:26,http://arxiv.org/pdf/2401.06766v1,Large language models demonstrate a remarkable...,Mind Your Format: Towards Consistent Evaluatio...,1219,79334
http://arxiv.org/abs/2401.06765v1,Automated Test Case Repair Using Language Models,2024-01-12 18:56:57,http://arxiv.org/pdf/2401.06765v1,Ensuring the quality of software systems throu...,1\nAutomated Test Case Repair\nUsing Language ...,1360,118802
http://arxiv.org/abs/2401.06763v1,Optimally Blending Honeypots into Production N...,2024-01-12 18:54:51,http://arxiv.org/pdf/2401.06763v1,Honeypot is an important cyber defense techniq...,Optimally Blending Honeypots into Production\n...,1027,56963
http://arxiv.org/abs/2401.06762v1,Seeing the roads through the trees: A benchmar...,2024-01-12 18:50:43,http://arxiv.org/pdf/2401.06762v1,Fully understanding a complex high-resolution ...,SEEING THE ROADS THROUGH THE TREES:\nA BENCHMA...,1521,21421
http://arxiv.org/abs/2401.06761v1,APAR: LLMs Can Do Auto-Parallel Auto-Regressiv...,2024-01-12 18:50:36,http://arxiv.org/pdf/2401.06761v1,The massive adoption of large language models ...,APAR: LLMs Can Do Auto-Parallel Auto-Regressiv...,951,43670
http://arxiv.org/abs/2401.06760v1,Navigating the Metrics Maze: Reconciling Score...,2024-01-12 18:47:40,http://arxiv.org/pdf/2401.06760v1,"Ten years ago a single metric, BLEU, governed ...",Navigating the Metrics Maze:\nReconciling Scor...,1176,51549
http://arxiv.org/abs/2401.06757v1,"Synthetic Data Generation Framework, Dataset, ...",2024-01-12 18:44:01,http://arxiv.org/pdf/2401.06757v1,Pedestrian intention prediction is crucial for...,"Synthetic Data Generation Framework, Dataset, ...",1102,41723
http://arxiv.org/abs/2401.06755v1,Solving the Discretised Multiphase Flow Equati...,2024-01-12 18:42:42,http://arxiv.org/pdf/2401.06755v1,This paper solves the multiphase flow equation...,Solving the Discretised Multiphase Flow Equati...,1543,94492
http://arxiv.org/abs/2401.06752v1,Stylometry Analysis of Multi-authored Document...,2024-01-12 18:36:41,http://arxiv.org/pdf/2401.06752v1,"In recent years, the increasing use of Artific...",Noname manuscript No.\n(will be inserted by th...,1365,46903


## User

In [102]:
class User:
    """Search and browse the pickled master data for one reader.

    Attributes:
        master_data: the full frame loaded from the pickle file.
        user_data: the rows matched by the most recent ``search`` call, or
            ``None`` before the first search.
    """

    def __init__(self, path: str = 'master_data.pkl'):
        """Load the master data.

        Args:
            path: pickle file to read. Unpickling can execute arbitrary code,
                so only point this at files from a trusted source.
        """
        self.master_data = pd.read_pickle(path)
        self.user_data = None  # this will store the filtered dataframe

    def search(self, query, case=True, regex=False):
        """Keep and return the rows whose ``pdf_text`` contains ``query``.

        Args:
            query: text to look for. It is matched literally by default, so
                characters such as ``+`` or ``(`` are safe to search for.
            case: match case-sensitively when True (the default).
            regex: interpret ``query`` as a regular expression when True.

        Returns:
            The matching rows of ``master_data``; also stored in ``user_data``.
        """
        # na=False treats rows without text as non-matching, so the mask is
        # always purely boolean.
        mask = self.master_data["pdf_text"].str.contains(query, case=case, regex=regex, na=False)
        self.user_data = self.master_data[mask]
        return self.user_data

    def feed(self, n=2):
        """Return the ``n`` newest rows of the last search result.

        Falls back to the whole master data when no search has been run yet.
        """
        source = self.master_data if self.user_data is None else self.user_data
        return source.sort_values("published_date", ascending=False).head(n)

In [103]:
# Constructing a User loads master_data.pkl from the working directory
user1 = User()

In [104]:
# Rows of the master data whose extracted PDF text contains "llm"
user1.search("llm")

Unnamed: 0,title,published_date,pdf_link,summary,pdf_text,summary_length,pdf_text_length
http://arxiv.org/abs/2401.06766v1,Mind Your Format: Towards Consistent Evaluatio...,2024-01-12 18:58:26,http://arxiv.org/pdf/2401.06766v1,Large language models demonstrate a remarkable...,Mind Your Format: Towards Consistent Evaluatio...,1219,79334
http://arxiv.org/abs/2401.06762v1,Seeing the roads through the trees: A benchmar...,2024-01-12 18:50:43,http://arxiv.org/pdf/2401.06762v1,Fully understanding a complex high-resolution ...,SEEING THE ROADS THROUGH THE TREES:\nA BENCHMA...,1521,21421
http://arxiv.org/abs/2401.06761v1,APAR: LLMs Can Do Auto-Parallel Auto-Regressiv...,2024-01-12 18:50:36,http://arxiv.org/pdf/2401.06761v1,The massive adoption of large language models ...,APAR: LLMs Can Do Auto-Parallel Auto-Regressiv...,951,43670


## Test the Module

In [3]:
# Run extract_data.py as a script (its source is not shown in this notebook)
! python3 extract_data.py

                                                                               title  ... pdf_text_length
http://arxiv.org/abs/2401.06766v1  Mind Your Format: Towards Consistent Evaluatio...  ...           79334
http://arxiv.org/abs/2401.06762v1  Seeing the roads through the trees: A benchmar...  ...           21421
http://arxiv.org/abs/2401.06761v1  APAR: LLMs Can Do Auto-Parallel Auto-Regressiv...  ...           43670

[3 rows x 7 columns]


## Testing with pytest

In [7]:
# List the files in the working directory
! ls

__init__.py     extract_data.py master_data.pkl research.ipynb
