In [None]:
import arxiv
from datasets import Dataset, DatasetDict
import pandas as pd
from tqdm import tqdm
import time

# Function to fetch articles from arXiv for a specific category
def fetch_articles(category, max_results=1250, retries=3):
    """Fetch abstracts from arXiv for one category query.

    Args:
        category: Expression placed after ``cat:`` in the arXiv query
            (e.g. ``"cs.*"``).
        max_results: Upper bound on the number of results requested.
        retries: Number of times the arXiv client re-requests a failing page
            before raising. Forwarded to ``arxiv.Client(num_retries=...)``.

    Returns:
        A list of ``{"abstract": ..., "category": ...}`` dicts, one per result
        yielded by the client. ``"category"`` is the query string passed in,
        not a field read from the result.

    Raises:
        Any exception raised by ``client.results`` while paging is propagated
        unchanged (the calling cell catches ``arxiv.UnexpectedEmptyPageError``).
    """
    # NOTE(review): a `cat:` query presumably also matches cross-listed papers,
    # so the label may differ from a paper's primary category -- verify.

    # `retries` used to be accepted but ignored; hand it to the client so the
    # parameter actually controls per-page retry behaviour.
    client = arxiv.Client(num_retries=retries)
    search = arxiv.Search(
        query=f"cat:{category}",
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
    )
    progress = tqdm(
        client.results(search),
        total=max_results,
        desc=f"Fetching {category} articles",
    )
    return [{"abstract": result.summary, "category": category} for result in progress]

# arXiv archive prefixes to query; each one becomes the wildcard "<prefix>.*".
archive_prefixes = (
    "cs",       # Computer Science
    "econ",     # Economics
    "eess",     # Electrical Engineering and Systems Science
    "math",     # Mathematics
    "physics",  # Physics
    "q-bio",    # Quantitative Biology
    "q-fin",    # Quantitative Finance
    "stat",     # Statistics
)
categories = [f"{prefix}.*" for prefix in archive_prefixes]

# Fetch articles for each category and split into train and test.
PER_SPLIT = 1250     # abstracts per category in each of train / test
MAX_ATTEMPTS = 3     # fetch attempts per category before giving up
RETRY_DELAY_S = 5    # seconds to wait between attempts

train_data = []
test_data = []
failed_categories = []  # categories that ended up contributing no rows

for category in categories:
    attempts = 0
    success = False
    while attempts < MAX_ATTEMPTS and not success:
        try:
            # Fetch double the amount so the result can be halved into train/test.
            articles = fetch_articles(category, 2 * PER_SPLIT)
            train_data.extend(articles[:PER_SPLIT])
            test_data.extend(articles[PER_SPLIT:2 * PER_SPLIT])
            success = True
        except arxiv.UnexpectedEmptyPageError:
            attempts += 1
            if attempts < MAX_ATTEMPTS:
                print(f"Error fetching {category} articles. Retrying...")
                time.sleep(RETRY_DELAY_S)  # Wait before retrying
            else:
                # Do not sleep or claim a retry when no attempts remain.
                print(f"Error fetching {category} articles. Giving up after {attempts} attempts.")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            break
    if not success:
        # Record the gap instead of dropping the category silently, so an
        # incomplete dataset is visible before it is saved.
        failed_categories.append(category)

if failed_categories:
    print(f"WARNING: no articles were collected for: {', '.join(failed_categories)}")

# One DataFrame per split, built from the accumulated record lists.
train_df, test_df = pd.DataFrame(train_data), pd.DataFrame(test_data)

# Wrap each frame as a Hugging Face Dataset.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

# Bundle both splits and persist them under the "arxiv_dataset" directory.
split_map = {"train": train_dataset, "test": test_dataset}
dataset_dict = DatasetDict(split_map)
dataset_dict.save_to_disk("arxiv_dataset")


Fetching cs.* articles: 100%|██████████| 2500/2500 [01:50<00:00, 22.57it/s]
Fetching econ.* articles: 100%|██████████| 2500/2500 [01:31<00:00, 27.29it/s]
Fetching eess.* articles: 100%|██████████| 2500/2500 [01:30<00:00, 27.62it/s]
Fetching math.* articles:  72%|███████▏  | 1800/2500 [01:57<00:45, 15.35it/s]


Error fetching math.* articles. Retrying...


Fetching math.* articles:  68%|██████▊   | 1701/2500 [01:50<01:03, 12.64it/s]Bozo feed; consider handling: document declared as utf-8, but parsed as windows-1252
Fetching math.* articles:  80%|████████  | 2000/2500 [02:30<00:37, 13.29it/s]


Error fetching math.* articles. Retrying...


Fetching math.* articles:  68%|██████▊   | 1701/2500 [01:51<00:52, 15.26it/s]Bozo feed; consider handling: document declared as utf-8, but parsed as windows-1252
Fetching math.* articles:  76%|███████▌  | 1900/2500 [02:10<00:41, 14.58it/s]


Error fetching math.* articles. Retrying...


Fetching physics.* articles: 100%|██████████| 2500/2500 [02:05<00:00, 19.93it/s]
Fetching q-bio.* articles: 100%|██████████| 2500/2500 [01:39<00:00, 25.06it/s]
Fetching q-fin.* articles: 100%|██████████| 2500/2500 [01:34<00:00, 26.33it/s]
Fetching stat.* articles: 100%|██████████| 2500/2500 [01:45<00:00, 23.65it/s]


Saving the dataset (0/1 shards):   0%|          | 0/8750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8750 [00:00<?, ? examples/s]

In [None]:
import arxiv
from datasets import Dataset, DatasetDict, load_from_disk, concatenate_datasets
import pandas as pd
from tqdm import tqdm
import time

# Function to fetch articles from arXiv for a specific category with extra papers to handle errors
def fetch_articles(category, max_results=1250, extra=100):
    """Fetch up to ``max_results`` abstracts for one arXiv category query.

    This redefines the ``fetch_articles`` from the earlier cell. The search
    asks for ``max_results + extra`` results so that results skipped by the
    per-article ``except`` can be replaced by later ones.

    Args:
        category: Expression placed after ``cat:`` in the arXiv query.
        max_results: Maximum number of records to return.
        extra: Additional results requested beyond ``max_results``.

    Returns:
        A list of at most ``max_results`` dicts of the form
        ``{"abstract": ..., "category": category}``.

    Raises:
        Exceptions raised by ``client.results`` while paging are NOT caught
        here; the ``try`` only guards building a single record.
    """
    results = []
    if max_results <= 0:
        # Nothing can be returned, so skip the network round-trip entirely.
        return results

    client = arxiv.Client()
    total_results = max_results + extra
    search = arxiv.Search(
        query=f"cat:{category}",
        max_results=total_results,
        sort_by=arxiv.SortCriterion.Relevance,
    )
    # The bar counts kept records against `max_results` (not the padded
    # request size), so it reaches 100% and is closed even on the early break.
    with tqdm(total=max_results, desc=f"Fetching {category} articles") as progress:
        for result in client.results(search):
            try:
                results.append({"abstract": result.summary, "category": category})
            except Exception as e:
                print(f"Error processing article: {e}")
                continue
            progress.update(1)
            if len(results) >= max_results:
                break
    return results

# Only the math archive is fetched in this cell.
math_category = "math.*"
math_articles = fetch_articles(math_category, 2500)

# First 1250 records go to train, the next 1250 to test.
math_train_data, math_test_data = math_articles[:1250], math_articles[1250:2500]

# Records -> pandas DataFrames.
math_train_df, math_test_df = (
    pd.DataFrame(math_train_data),
    pd.DataFrame(math_test_data),
)

# DataFrames -> Hugging Face Datasets.
math_train_dataset, math_test_dataset = (
    Dataset.from_pandas(frame) for frame in (math_train_df, math_test_df)
)

# Load the dataset written by the earlier cell.
dataset_dict = load_from_disk("arxiv_dataset")

# Append the freshly fetched math rows to each existing split.
train_dataset = concatenate_datasets([dataset_dict['train'], math_train_dataset])
test_dataset = concatenate_datasets([dataset_dict['test'], math_test_dataset])

updated_dataset_dict = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

# Save the updated dataset. This must be `updated_dataset_dict`: saving
# `dataset_dict` would write the pre-merge data (without the math rows)
# under the "updated" name.
updated_dataset_dict.save_to_disk("updated_arxiv_dataset")

print("Updated dataset saved successfully!")

# Load the updated dataset back to test if it works
loaded_dataset = load_from_disk("updated_arxiv_dataset")
print(loaded_dataset)

# Print some sample data to verify
print("Sample train data:", loaded_dataset['train'][0])
print("Sample test data:", loaded_dataset['test'][0])


Fetching math.* articles:  65%|██████▌   | 1701/2600 [01:33<00:41, 21.68it/s]Bozo feed; consider handling: document declared as utf-8, but parsed as windows-1252
Fetching math.* articles:  96%|█████████▌| 2499/2600 [02:03<00:04, 20.32it/s]


Saving the dataset (0/1 shards):   0%|          | 0/8750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8750 [00:00<?, ? examples/s]

Updated dataset saved successfully!
DatasetDict({
    train: Dataset({
        features: ['abstract', 'category'],
        num_rows: 8750
    })
    test: Dataset({
        features: ['abstract', 'category'],
        num_rows: 8750
    })
})
Sample train data: {'abstract': 'The goal of this paper is to define and analyze systems which exhibit brittle\nbehavior. This behavior is characterized by a sudden and steep decline in\nperformance as the system approaches the limits of tolerance. This can be due\nto input parameters which exceed a specified input, or environmental conditions\nwhich exceed specified operating boundaries. An analogy is made between brittle\ncommmunication systems in particular and materials science.', 'category': 'cs.*'}
Sample test data: {'abstract': 'Sample-efficient machine learning (SEML) has been widely applied to find\noptimal latency and power tradeoffs for configurable computer systems. Instead\nof randomly sampling from the configuration space, SEML reduce

In [None]:
# Re-bundle the concatenated splits held in `train_dataset` / `test_dataset`
# (this cell relies on them already being defined in the kernel).
merged_splits = {"train": train_dataset, "test": test_dataset}
updated_dataset_dict = DatasetDict(merged_splits)

# Write the merged splits to the "updated_arxiv_dataset" directory.
updated_dataset_dict.save_to_disk("updated_arxiv_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
from datasets import load_from_disk
loaded_dataset = load_from_disk("updated_arxiv_dataset")

# Rename both columns on every split in a single pass:
# "abstract" -> "text", "category" -> "label".
column_renames = {"abstract": "text", "category": "label"}
loaded_dataset = loaded_dataset.rename_columns(column_renames)
print(loaded_dataset)

# Show the first record of each split as a quick sanity check.
print("Sample train data:", loaded_dataset['train'][0])
print("Sample test data:", loaded_dataset['test'][0])

# Publish the renamed dataset to the Hugging Face Hub.
loaded_dataset.push_to_hub("Voice49/arXiv-Abstract-Label-20k")

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 10000
    })
})
Sample train data: {'text': 'The goal of this paper is to define and analyze systems which exhibit brittle\nbehavior. This behavior is characterized by a sudden and steep decline in\nperformance as the system approaches the limits of tolerance. This can be due\nto input parameters which exceed a specified input, or environmental conditions\nwhich exceed specified operating boundaries. An analogy is made between brittle\ncommmunication systems in particular and materials science.', 'label': 'cs.*'}
Sample test data: {'text': 'Sample-efficient machine learning (SEML) has been widely applied to find\noptimal latency and power tradeoffs for configurable computer systems. Instead\nof randomly sampling from the configuration space, SEML reduces the search cost\nby dramatically reducing the number of c

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Voice49/arXiv-Abstract-Label-20k/commit/fb4323d849863602f6c50e0d631b4bce9e381d08', commit_message='Upload dataset', commit_description='', oid='fb4323d849863602f6c50e0d631b4bce9e381d08', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Publish whatever `loaded_dataset` currently holds to a second Hub repository.
# NOTE(review): the captured output below reports 2 and 1 parquet batches,
# versus 10 per split in the previous cell, so `loaded_dataset` presumably
# held different contents when this was run -- confirm before re-running.
hub_repo_id = "Voice49/arXiv-Abstract-primaryCategory"
loaded_dataset.push_to_hub(hub_repo_id)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Voice49/arXiv-Abstract-primaryCategory/commit/1a69aa0412428eb464d35f15e7f2e30414e2f18e', commit_message='Upload dataset', commit_description='', oid='1a69aa0412428eb464d35f15e7f2e30414e2f18e', pr_url=None, pr_revision=None, pr_num=None)