In [None]:
'''
This notebook uses arxiv metadata from https://www.kaggle.com/datasets/Cornell-University/arxiv?resource=download.
Current version is 139. Check for a later version before running.
'''
import json
import os
import pathlib
import re
from datetime import datetime, timezone
from os import listdir
from os.path import isfile, join

import pandas as pd

In [None]:
# Name of the corpus to process; it selects which corpora/<CORPUS>.properties
# file is loaded. Uncomment one of the alternatives to switch corpus.
CORPUS = 'ArxivHealthcareNLP'
#CORPUS = 'arxiv_cl'
#CORPUS = 'aiml'

In [None]:
def load_properties(filepath, sep='=', comment_char='#'):
    '''
    Parse a simple ``key<sep>value`` properties file into a dictionary.

    Blank lines and lines starting with ``comment_char`` are skipped.
    The key is the text before the first ``sep``; the value is everything
    after it, with surrounding whitespace and double quotes removed.
    A line without ``sep`` yields the whole line as key and '' as value.
    '''
    props = {}
    with open(filepath, "rt") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if not entry or entry.startswith(comment_char):
                continue
            # Split on the first separator only, so values may contain it.
            name, _, remainder = entry.partition(sep)
            props[name.strip()] = remainder.strip().strip('"')
    return props
def save_properties(properties, filepath, sep='=', comment_char='#'):
    '''
    Save a dictionary as a properties file; use to remember the latest processed id.

    Each entry is written on its own line as ``key <sep> value`` (with a
    single space on either side of the separator). Any existing file at
    ``filepath`` is overwritten.

    ``comment_char`` is accepted but currently unused.
    TODO store comments
    '''
    with open(filepath, 'w') as f:
        for key, value in properties.items():
            f.write(f'{key} {sep} {value}\n')

In [None]:
# Read the key/value settings for the selected corpus.
corpus_properties = load_properties(f"corpora/{CORPUS}.properties")
# Bare expression so the notebook displays the loaded settings.
corpus_properties

In [None]:
# Base directory of the corpus, taken from the 'corpus_base' property;
# the pdf / pdf_nd / json_raw paths are built underneath it.
CORPUS_BASE = corpus_properties['corpus_base']

In [None]:
# Working directories derived from the corpus base directory.
PDF_BASE = f'{CORPUS_BASE}/pdf'
PDF_ND_BASE = f'{CORPUS_BASE}/pdf_nd'
# The trailing slash is required: write_json_file() builds its target path
# by plain string concatenation (JSON_BASE + filename).
JSON_BASE = f'{CORPUS_BASE}/json_raw/'

# Fail fast when an expected directory is missing instead of failing
# half-way through the run.
if not os.path.exists(PDF_BASE):
    raise Exception('Please download the corpus first.')

if not os.path.exists(JSON_BASE):
    raise Exception('Please convert the corpus to raw json first.')

# PDF_ND_BASE is listed and used as a move target further down, so it has
# to exist as well.
# NOTE(review): assumes the directory is created by an earlier step - confirm.
if not os.path.exists(PDF_ND_BASE):
    raise Exception('Please run filter_nd first.')

In [None]:
# Load the papers metadata: the snapshot holds one JSON object per line.
# Set MAX_METADATA_RECORDS to a small int (e.g. 5) for a quick dry run;
# None loads the whole file.
MAX_METADATA_RECORDS = None

metadata_records = []
# JSON text is UTF-8 by specification, so do not rely on the platform's
# default encoding when reading it.
with open("arxiv-metadata-oai-snapshot.json", encoding="utf-8") as f1:
    for line in f1:
        if MAX_METADATA_RECORDS is not None and len(metadata_records) >= MAX_METADATA_RECORDS:
            break
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        if not line.strip():
            continue
        metadata_records.append(json.loads(line))

# Build the frame once from the collected records.
metadata_df = pd.DataFrame(metadata_records)
metadata_df.shape

In [None]:
def _version_number(tag):
    '''Return the integer in a version tag such as 'v12'; 0 if it cannot be parsed.'''
    try:
        return int(str(tag).lstrip('vV'))
    except ValueError:
        return 0


def kaggle_search(paper_id, metadata=None):
    '''
    Search for a paper in the metadata frame.

    Parameters:
        paper_id: identifier compared for equality against the 'id' column.
        metadata: DataFrame to search; defaults to the notebook-level
            ``metadata_df`` when omitted.

    Returns:
        A dict with the keys 'id', 'title', 'versions', 'abstract',
        'license' (copied unchanged from the first matching row) and
        'latest_version' (the highest version tag in 'versions', or 'v1'
        if there is none). An empty dict is returned, and a message is
        printed, when no row matches.

    Note: every call scans the whole 'id' column.
    '''
    if metadata is None:
        metadata = metadata_df

    row = metadata.loc[metadata['id'] == paper_id]
    if row.empty:
        print(f'Paper {paper_id} not found. Perhas should download a new metadata db version?')
        return {}

    paper = {
        field: row[field].values[0]
        for field in ('id', 'title', 'versions', 'abstract', 'license')
    }

    # Compare version tags numerically: as plain strings 'v10' sorts before
    # 'v2', which would report the wrong latest version for papers with ten
    # or more revisions.
    latest_version = 'v1'
    latest_number = 1
    for version in paper['versions']:
        number = _version_number(version['version'])
        if number > latest_number:
            latest_number = number
            latest_version = version['version']
    paper['latest_version'] = latest_version

    return paper

# Smoke test: look up a single id and display the resulting dictionary.
#paper = kaggle_search('2212.09410')
paper = kaggle_search('0704.0001')
paper

In [None]:
def write_json_file(filename, content):
    '''
    Write ``content`` (a string) as UTF-8 bytes to JSON_BASE + filename.

    Leading and trailing whitespace is stripped from the encoded bytes
    before writing. The target path is built by plain string concatenation.
    '''
    target = pathlib.Path(JSON_BASE + filename)
    payload = content.encode('utf-8').strip()
    target.write_bytes(payload)

def save_abstract(title,content):
    '''
    Store ``content`` as a one-page JSON document named ``<title>.json``.

    The document has the keys 'title', 'text', 'extraction_date' (current
    UTC time as a string) and 'num_pages' (always 1), and is written via
    write_json_file().
    '''
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # timestamp and drop the tzinfo so the stored string keeps the same
    # 'YYYY-MM-DD HH:MM:SS.ffffff' form as before.
    extracted_at = datetime.now(timezone.utc).replace(tzinfo=None)
    document_dict = {
        'title': title,
        'text': content,
        'extraction_date': str(extracted_at),
        'num_pages': 1,
    }
    write_json_file(title + '.json', json.dumps(document_dict))

In [None]:
# Keep only the regular files found directly in PDF_BASE; sub-directories
# are ignored.
pdf_files = [
    name
    for name in listdir(PDF_BASE)
    if isfile(join(PDF_BASE, name))
]
print(f'Analyzing {len(pdf_files)} PDF files.')

In [None]:
# For every downloaded PDF whose license contains 'nd': store the abstract
# as a JSON document and move the PDF from PDF_BASE to PDF_ND_BASE.
for pdf_name in pdf_files:
    pdf_id = '.'.join(pdf_name.split('.')[:2])
    # Identifiers have either four or five digits after the dot
    # ('0704.0001', '2212.09410'), so a fixed 10-character slice would keep
    # the 'v' of the version suffix for the shorter form.
    # NOTE(review): assumes names look like '<id>v<n>.pdf' - confirm.
    id_match = re.match(r'\d{4}\.\d{4,5}', pdf_name)
    paper_id = id_match.group(0) if id_match else pdf_id[:10]
    paper = kaggle_search(paper_id)
    if not paper:
        continue
    # The license may be missing in the metadata; only test real strings.
    license_url = paper['license']
    if isinstance(license_url, str) and 'nd' in license_url:
        print(pdf_name)
        print(license_url)
        print(paper['abstract'])
        save_abstract(pdf_name, paper['abstract'])
        in_file_name = f'{PDF_BASE}/{pdf_name}'
        out_file_name = f'{PDF_ND_BASE}/{pdf_name}'
        pathlib.Path(in_file_name).rename(out_file_name)


In [None]:
# Keep only the regular files found directly in PDF_ND_BASE;
# sub-directories are ignored.
pdf_files = [
    name
    for name in listdir(PDF_ND_BASE)
    if isfile(join(PDF_ND_BASE, name))
]
print(f'Analyzing {len(pdf_files)} PDF files.')

In [None]:
# For every PDF in PDF_ND_BASE whose license contains 'nd', (re)write the
# abstract as a JSON document. Nothing is moved in this pass.
for pdf_name in pdf_files:
    pdf_id = '.'.join(pdf_name.split('.')[:2])
    # Identifiers have either four or five digits after the dot
    # ('0704.0001', '2212.09410'), so a fixed 10-character slice would keep
    # the 'v' of the version suffix for the shorter form.
    # NOTE(review): assumes names look like '<id>v<n>.pdf' - confirm.
    id_match = re.match(r'\d{4}\.\d{4,5}', pdf_name)
    paper_id = id_match.group(0) if id_match else pdf_id[:10]
    paper = kaggle_search(paper_id)
    if not paper:
        continue
    # The license may be missing in the metadata; only test real strings.
    license_url = paper['license']
    if isinstance(license_url, str) and 'nd' in license_url:
        print(pdf_name)
        print(license_url)
        save_abstract(pdf_name, paper['abstract'])