In [7]:
import pandas as pd
import numpy as np
import json
from tqdm.notebook import tqdm

In [34]:
# https://www.kaggle.com/balraj98/arxiv-abstract-translation-to-french-marian-nmt##1.-Abstract-Translation
data_file = '/Users/aidancurley/Downloads/archive/arxiv-metadata-oai-snapshot.json'

""" Using `yield` to load the JSON file in a loop to prevent Python memory issues if JSON is loaded directly"""

def get_metadata():
    with open(data_file, 'r') as f:
        for line in f:
            yield line

In [36]:
titles = []
abstracts = []

# Consider specific paper categories to be used during training and prediction
paper_categories = ['cs.AI', # 'Artificial Intelligence',
                'cs.AR', # 'Hardware Architecture',
                'cs.CC', # 'Computational Complexity',
                'cs.CE', # 'Computational Engineering, Finance, and Science',
                'cs.CG', # 'Computational Geometry',
                'cs.CL', # 'Computation and Language',
                'cs.CR', # 'Cryptography and Security',
                'cs.CV', # 'Computer Vision and Pattern Recognition',
                'cs.CY', # 'Computers and Society',
                'cs.DB', # 'Databases',
                'cs.DC', # 'Distributed, Parallel, and Cluster Computing',
                'cs.DL', # 'Digital Libraries',
                'cs.DM', # 'Discrete Mathematics',
                'cs.DS', # 'Data Structures and Algorithms',
                'cs.ET', # 'Emerging Technologies',
                'cs.FL', # 'Formal Languages and Automata Theory',
                'cs.GL', # 'General Literature',
                'cs.GT', # 'Computer Science and Game Theory',
                'cs.HC', # 'Human-Computer Interaction',
                'cs.IR', # 'Information Retrieval',
                'cs.IT', # 'Information Theory',
                'cs.LG', # 'Machine Learning',
                'cs.LO', # 'Logic in Computer Science',
                'cs.MA', # 'Multiagent Systems',
                'cs.MM', # 'Multimedia',
                'cs.MS', # 'Mathematical Software',
                'cs.NA', # 'Numerical Analysis',
                'cs.NE', # 'Neural and Evolutionary Computing',
                'cs.NI', # 'Networking and Internet Architecture',
                'cs.OH', # 'Other Computer Science',
                'cs.OS', # 'Operating Systems',
                'cs.PF', # 'Performance',
                'cs.PL', # 'Programming Languages',
                'cs.RO', # 'Robotics',
                'cs.SC', # 'Symbolic Computation',
                'cs.SD', # 'Sound',
                'cs.SE', # 'Software Engineering',
                'cs.SI', # 'Social and Information Networks',
                'cs.SY', # 'Systems and Control', # Information Theory
                   ] 

metadata = get_metadata()
for paper in metadata:
    paper_dict = json.loads(paper)
    category = paper_dict.get('categories')
    try:
        year = int(paper_dict.get('journal-ref')[-4:])
        if category in paper_categories and 2000<year<2021:
            titles.append(paper_dict.get('title'))
            abstracts.append(paper_dict.get('abstract').replace("\n",""))
    except:
        pass 

len(titles), len(abstracts)

(6594, 6594)

In [39]:
abstracts[1000]

'  This paper presents a method to understand spoken Tunisian dialect based onlexical semantic. This method takes into account the specificity of theTunisian dialect which has no linguistic processing tools. This method isontology-based which allows exploiting the ontological concepts for semanticannotation and ontological relations for speech interpretation. Thiscombination increases the rate of comprehension and limits the dependence onlinguistic resources. This paper also details the process of building theontology used for annotation and interpretation of Tunisian dialect in thecontext of speech understanding in dialogue systems for restricted domain.'

In [40]:
data = pd.DataFrame({'abstract': abstracts})

In [41]:
data

Unnamed: 0,abstract
0,"When looking for a solution, deterministic m..."
1,The semiring-based constraint satisfaction p...
2,We show how to test whether a graph with n v...
3,This paper has been withdrawn Abstract: This...
4,This paper describes experiments on learning...
...,...
6589,Bedwyr is a generalization of logic programm...
6590,We develop bounds on the capacity of wireles...
6591,For academics and practitioners concerned wi...
6592,This paper analyzes the distribution of cycl...


In [42]:
data['abstract'][0]

'  When looking for a solution, deterministic methods have the enormousadvantage that they do find global optima. Unfortunately, they are veryCPU-intensive, and are useless on untractable NP-hard problems that wouldrequire thousands of years for cutting-edge computers to explore. In order toget a result, one needs to revert to stochastic algorithms, that sample thesearch space without exploring it thoroughly. Such algorithms can find verygood results, without any guarantee that the global optimum has been reached;but there is often no other choice than using them. This chapter is a shortintroduction to the main methods used in stochastic optimization.'

In [43]:
# Add <|startoftext|> to beginning of text and <|endoftext|> to end of text
new_abs = []
for row in data["abstract"]:
    new_abs.append("<|startoftext|>" + row + "<|endoftext|>")

In [44]:
data['abs'] = new_abs

In [45]:
data.drop(columns='abstract',inplace = True)

In [46]:
data

Unnamed: 0,abs
0,"<|startoftext|> When looking for a solution, ..."
1,<|startoftext|> The semiring-based constraint...
2,<|startoftext|> We show how to test whether a...
3,<|startoftext|> This paper has been withdrawn...
4,<|startoftext|> This paper describes experime...
...,...
6589,<|startoftext|> Bedwyr is a generalization of...
6590,<|startoftext|> We develop bounds on the capa...
6591,<|startoftext|> For academics and practitione...
6592,<|startoftext|> This paper analyzes the distr...


In [47]:
data.to_csv('../data/abstracts.txt', header=None, index=None, sep=' ', mode='a')