In [7]:
import pandas as pd
import numpy as np
import json
from tqdm.notebook import tqdm

In [8]:
# https://www.kaggle.com/balraj98/arxiv-abstract-translation-to-french-marian-nmt##1.-Abstract-Translation
data_file = '/Users/aidancurley/Downloads/archive/arxiv-metadata-oai-snapshot.json'

""" Using `yield` to load the JSON file in a loop to prevent Python memory issues if JSON is loaded directly"""

def get_metadata():
    with open(data_file, 'r') as f:
        for line in f:
            yield line

In [9]:
titles = []
abstracts = []

# Consider specific paper categories to be used during training and prediction
paper_categories = ["cs.AI", # Artificial Intelligence
                    "cs.LG", # Machine Learning
                    "physics.acc-ph", # Accelerator Physics
                    "physics.hist-ph", #History and Philosophy of Physics
                    "cs.NE", # Neural and Evolutionary Computing
                    "cs.HC", # Human-Computer Interaction
                    "cs.GT", # Computer Science and Game Theory
                    "cs.ET", # Emerging Technologies
                    "cs.IT", # Information Theory
                    "astro-ph", # Astrophysics
                    "astro-ph.EP", # Earth and Planetary Astrophysics
                    "astro-ph.HE", # High Energy Astrophysical Phenomena
                   ] 

metadata = get_metadata()
for paper in metadata:
    paper_dict = json.loads(paper)
    category = paper_dict.get('categories')
    try:
        year = int(paper_dict.get('journal-ref')[-4:])
        if category in paper_categories and 2000<year<2021:
            titles.append(paper_dict.get('title'))
            abstracts.append(paper_dict.get('abstract').replace("\n",""))
    except:
        pass 

len(titles), len(abstracts)

(20165, 20165)

In [11]:
titles[17000]

'The impact of mergers on relaxed X-ray clusters - I. Dynamical evolution\n  and emergent transient structures'

In [22]:
data = pd.DataFrame({'abstract': abstracts})

In [23]:
data

Unnamed: 0,abstract
0,We discuss the results from the combined IRA...
1,Results from spectroscopic observations of t...
2,"The very nature of the solar chromosphere, i..."
3,We derive masses and radii for both componen...
4,We show that the globular cluster mass funct...
...,...
20160,We discuss the features of a crystalline und...
20161,One approach to future high energy particle ...
20162,In this paper we introduce an optical approx...
20163,"In a companion report, we have derived a met..."


In [24]:
data['abstract'][0]

"  We discuss the results from the combined IRAC and MIPS c2d Spitzer Legacyobservations of the Serpens star-forming region. In particular we present a setof criteria for isolating bona fide young stellar objects, YSO's, from theextensive background contamination by extra-galactic objects. We then discussthe properties of the resulting high confidence set of YSO's. We find 235 suchobjects in the 0.85 deg^2 field that was covered with both IRAC and MIPS. Anadditional set of 51 lower confidence YSO's outside this area is identifiedfrom the MIPS data combined with 2MASS photometry. We describe two sets ofresults, color-color diagrams to compare our observed source properties withthose of theoretical models for star/disk/envelope systems and our own modelingof the subset of our objects that appear to be star+disks. These objectsexhibit a very wide range of disk properties, from many that can be fit withactively accreting disks to some with both passive disks and even possiblydebris disks. 

In [25]:
new_abs = []
for row in data["abstract"]:
    new_abs.append("<|startoftext|>" + row + "<|endoftext|>")

In [28]:
data['abs'] = new_abs

In [30]:
data.drop(columns='abstract',inplace = True)

In [31]:
data

Unnamed: 0,abs
0,<|startoftext|> We discuss the results from t...
1,<|startoftext|> Results from spectroscopic ob...
2,<|startoftext|> The very nature of the solar ...
3,<|startoftext|> We derive masses and radii fo...
4,<|startoftext|> We show that the globular clu...
...,...
20160,<|startoftext|> We discuss the features of a ...
20161,<|startoftext|> One approach to future high e...
20162,<|startoftext|> In this paper we introduce an...
20163,"<|startoftext|> In a companion report, we hav..."


In [33]:
data.to_csv('../data/abstracts.txt', header=None, index=None, sep=' ', mode='a')