In [None]:
'''
This notebook uses arxiv metadata from https://www.kaggle.com/datasets/Cornell-University/arxiv?resource=download.
Current version is 139. Check for a later version before running.
'''
#based on https://jrashford.com/2023/02/13/how-to-scrape-mastodon-timelines-using-python-and-pandas/
import os
from os import listdir
from os.path import isfile, join
import re
import time
import json
import requests
import pandas as pd

import arxiv

In [None]:
# Name of the corpus to process. It selects the properties file
# corpora/<CORPUS>.properties and names the <CORPUS>.csv export.
# Alternative corpora (uncomment exactly one):
#CORPUS = 'ArxivHealthcareNLP'
#CORPUS = 'arxiv_cl'
CORPUS = 'aiml'

In [None]:
def load_properties(filepath, sep='=', comment_char='#'):
    '''
    Parse a simple ``key<sep>value`` properties file into a dictionary.

    Blank lines and lines starting with ``comment_char`` are ignored.
    Only the first ``sep`` on a line splits key from value, so values may
    themselves contain ``sep``. Keys and values are whitespace-stripped and
    surrounding double quotes are removed from values. A line without
    ``sep`` yields the whole line as key with an empty-string value.
    All values are returned as strings.
    '''
    props = {}
    with open(filepath, "rt") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if not entry or entry.startswith(comment_char):
                continue
            name, _, remainder = entry.partition(sep)
            props[name.strip()] = remainder.strip().strip('"')
    return props
def save_properties(properties, filepath, sep='=', comment_char='#'):
    '''
    Save a dictionary as a properties file; use to remember the latest processed id.

    Each entry is written as ``key <sep> value`` on its own line, which
    ``load_properties`` reads back (it strips the surrounding spaces).
    The target file is overwritten.

    properties   -- mapping of keys to values; values are formatted with %s.
    filepath     -- path of the file to write.
    sep          -- separator placed between key and value.
    comment_char -- currently unused; kept for symmetry with load_properties.

    TODO store comments
    '''
    with open(filepath, 'w') as f:
        for key, value in properties.items():
            f.write('%s %s %s\n' % (key, sep, value))

In [None]:
# Load the per-corpus settings; the following cell reads the
# 'account', 'latest' and 'corpus_base' keys from this dictionary.
corpus_properties = load_properties(f"corpora/{CORPUS}.properties")
corpus_properties

In [None]:
# Account handle passed to user_lookup() to resolve the account id.
ACCOUNT = corpus_properties['account']
# Id of the last processed toot; used as 'since_id' when fetching statuses.
LATEST = int(corpus_properties['latest'])
# Base directory of the corpus; downloaded PDFs go into <CORPUS_BASE>/pdf.
CORPUS_BASE = corpus_properties['corpus_base']

In [None]:
# Directory that holds the downloaded PDFs; created on first use.
PDF_BASE = f'{CORPUS_BASE}/pdf'
if not os.path.exists(PDF_BASE):
    print(f'{PDF_BASE} does not exist. Creating.')
    os.makedirs(PDF_BASE)

In [None]:
def user_lookup(acct):
    '''
    Resolve an account handle to its account record on mastodon.social.

    acct -- the account handle, sent as the 'acct' query parameter.

    Returns the decoded JSON body of the lookup response (a dict; the
    caller reads its 'id' field).

    Raises requests.HTTPError on a non-2xx response, so that a failed
    lookup surfaces here instead of as a KeyError on 'id' later on, and
    requests.Timeout if the server does not answer within 30 seconds.
    '''
    URL = 'https://mastodon.social/api/v1/accounts/lookup'
    params = {
        'acct': acct
    }

    r = requests.get(URL, params=params, timeout=30)
    # Error payloads are JSON too; fail loudly rather than return them.
    r.raise_for_status()
    user = json.loads(r.text)

    return user

In [None]:
# Resolve the configured account; its id is used to build the statuses URL.
user = user_lookup(acct=ACCOUNT)
user_id = user['id']

In [None]:
# Fetch every toot of the account that is newer than LATEST.
# Pages are requested newest-first; after each page 'max_id' is set to the
# last toot id of that page so the next request continues below it, while
# 'since_id' keeps the lower bound at the previously processed toot.
URL = f'https://mastodon.social/api/v1/accounts/{user_id}/statuses'
params = {
    'limit': 40,
    'since_id':  LATEST
#    'min_id':  LATEST
}

results = []
# Id of the newest toot seen in this run; stays None when nothing is new so
# that the stored LATEST checkpoint is not overwritten with 0.
newest_id = None

while True:
    print(params)
    r = requests.get(URL, params=params, timeout=30)
    # An error response is a JSON object, not a list of toots; without this
    # check its keys would be appended to results and toots[-1] would fail.
    r.raise_for_status()
    toots = json.loads(r.text)

    if len(toots) == 0:
        break

    results.extend(toots)

    max_id = toots[-1]['id']
    params['max_id'] = max_id
    if newest_id is None:
        # remember the highest toot id processed (first toot of first page)
        newest_id = toots[0]["id"]
    print(f'first:{toots[0]["id"]} last:{max_id}')

if newest_id is not None:
    LATEST = newest_id

df = pd.DataFrame(results)
print(f'Latest: {LATEST}; Total new toots: {df.shape[0]}')
df.head(2)

In [None]:
# Deliberately abort a "Run All" here when there is nothing new, so the
# cells below do not run on an empty frame.
if(df.shape[0] == 0):
    raise ValueError("No new toots to process. Stopping.")

In [None]:
def update_content(row):
    '''
    Some toots are reblogs; we would need to bring their content to the content column for normalization.

    row -- one toot as a pandas Series with 'content' and 'reblog' entries.

    If 'reblog' is a dict carrying string 'content', that text is appended
    to the row's own 'content'. Rows whose 'reblog' is None, NaN or any
    other non-dict value are returned unchanged. A non-string own content
    (None/NaN) is treated as empty when appending.

    Returns the (possibly updated) row.
    '''
    reblog_dict = row['reblog']
    # isinstance guards against NaN, which is truthy but not a container.
    if isinstance(reblog_dict, dict):
        reblog_content = reblog_dict.get('content')
        if isinstance(reblog_content, str):
            own_content = row['content'] if isinstance(row['content'], str) else ''
            row['content'] = own_content + reblog_content
    return row

# Fold reblogged text into 'content' for every toot (row-wise).
df = df.apply(update_content, axis=1)
df.head(2)

In [None]:
#pattern = r'(https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]*)' 
# some toots do not have the protocol or www; for these there must be at least 2 '/' to match the arxiv pattern
# this picks up the first 2 links although it matches all links
# pattern = r'((https?:\/\/(?:www\.)?)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]{2,256})'
# restrict to arxiv articles with creative versioning
# Two alternatives: an arxiv.<tld>/... link (protocol and www optional), or a
# bare article id such as 2301.12345v2. Both live inside ONE capturing group
# so that capture group 1 (column 0 of Series.str.extract) holds whichever
# matched. Previously the bare-id alternative was outside the group, so such
# toots came back as NaN, and a bare id preceding a link hid the link.
# NOTE: the bare-id alternative also matches any other "dddd.d+" number.
pattern = r'((?:https?:\/\/(?:www\.)?)?arxiv\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]{2,256}|[0-9]{4,4}\.[0-9]+[a-zA-Z]*[0-9]*)'

In [None]:
# Keep the first capture group of the first regex match per toot;
# toots without a match get NaN.
df['links'] = df["content"].str.extract(pattern, expand=True)[0]
# TODO - are there several different arxiv articles in the same toot?
#df = df.join(df["content"].str.extract(pattern, expand=True))
df.head(2)

In [None]:
# extract article id to use for download
ARXIV_PREFIX_1 = "https://arxiv.org/"
ARXIV_PREFIX_2 = "arxiv.org/"
ARXIV_PREFIX_3 = "arXiv"

# NOTE: get_article_id no longer needs numpy; the import is kept so the
# name `np` stays defined for any other cell that relies on it.
import numpy as np

def get_article_id(col_str):
    '''
    Extract the arXiv article id (e.g. '2301.12345' or '2301.12345v2')
    from an extracted link or id string.

    col_str -- value of the 'links' column: a string, or a missing value
               (NaN/None) when the toot had no match.

    Handling by form of the input:
      * starts with 'https://arxiv.org/' or 'arxiv.org/': the last path
        segment is taken, cut to its first two dot-separated parts (drops
        extensions such as '.pdf') and searched for an id;
      * starts with 'arXiv': the text after the last ':' is cut to its
        first two dot-separated parts and returned as is;
      * anything else: the first id-like substring in the string.

    Returns the id string, or None (after printing a message) when the
    input is not a string or contains no id.
    '''
    id_pattern = r'[0-9]{4,4}\.[0-9]+[a-zA-Z]*[0-9]*'

    # Missing values arrive as NaN or None; an identity test against np.nan
    # would miss None and other NaN objects and crash on .startswith below.
    if not isinstance(col_str, str):
        print(f'Not an arxiv article: {col_str}')
        return None

    if col_str.startswith(ARXIV_PREFIX_1) or col_str.startswith(ARXIV_PREFIX_2):
        #an arxiv article
        last_segment = col_str.split('/')[-1]
        #some articles have an extension
        candidate = '.'.join(last_segment.split('.')[:2])
        found = re.search(id_pattern, candidate)
        if found is None:
            print(f'{col_str} not an arxiv article.')
            return None
        return found.group(0)

    if col_str.startswith(ARXIV_PREFIX_3):
        #an arxiv article
        after_colon = col_str.split(':')[-1]
        #some article have an extension
        return '.'.join(after_colon.split('.')[:2])

    found = re.search(id_pattern, col_str)
    if found is None:
        print(f'Not an arxiv article: {col_str}')
        return None
    return found.group(0)

# Derive the article id for every extracted link; rows without an id get None.
df['article_id'] = df['links'].apply(get_article_id)

df.head(2)


In [None]:
# Article ids that occur more than once (every occurrence after the first).
is_repeat = df['article_id'].duplicated()
duplicates = df.loc[is_repeat, 'article_id']
duplicates.shape

In [None]:

# Names of the regular files already present in the PDF directory.
pdf_files = list(filter(lambda name: isfile(join(PDF_BASE, name)), listdir(PDF_BASE)))
len(pdf_files)

In [None]:
# Reduce each file name to its first two dot-separated parts, truncated to
# 10 characters; the download functions compare paper_id[:10] against this.
downloaded_article_ids = [
    '.'.join(file_name.split('.', 2)[:2])[:10]
    for file_name in pdf_files
]
#downloaded_article_ids
len(downloaded_article_ids)

In [None]:
from google.cloud import storage

# Anonymous client for the 'arxiv-dataset' bucket (no credentials used).
client = storage.Client.create_anonymous_client()
bucket = client.bucket('arxiv-dataset')

def google_cloud_download(paper_id, file_name):
    '''
    Download a paper's PDF from the 'arxiv-dataset' bucket to file_name.

    paper_id must contain the required version e.g. 2211.00350v3
    file_name is the local path the PDF is written to.

    Blob names are tried in this order until one downloads:
      1. the id exactly as given,
      2. the first 10 characters of the id followed by 'v1' (the declared
         version may not be available for download),
      3. the first 10 characters of the id without a version.
    Identical candidates are tried only once. Every failure is printed.
    TODO perhaps find the most recent available version

    Returns True if a download succeeded, False if every candidate failed
    (best effort: no exception is propagated).
    '''
    # blob = bucket.blob("arxiv/arxiv/pdf/2211/2211.00350v3.pdf")
    # The part before the first '.' is the folder name inside the bucket.
    folder = paper_id.split('.')[0]
    base_id = paper_id[:10]

    candidates = []
    for candidate in (paper_id, f'{base_id}v1', base_id):
        if candidate not in candidates:
            candidates.append(candidate)

    for attempt, candidate in enumerate(candidates):
        if attempt > 0:
            print(f'Trying: {candidate}')
        try:
            blob = bucket.blob(f"arxiv/arxiv/pdf/{folder}/{candidate}.pdf")
            blob.download_to_filename(file_name)
            return True
        except Exception as e:
            print(e)
    return False

#google_cloud_download('2211.00350v3', 'test.pdf')

In [None]:
# Load the metadata snapshot: one JSON document per line.
# The records are collected in a list and converted to a DataFrame once.
metadata_records = []
with open("arxiv-metadata-oai-snapshot.json", encoding="utf-8") as f1:
    for line in f1:
        line = line.strip()
        # A blank (e.g. trailing) line is not valid JSON; skip it.
        if not line:
            continue
        metadata_records.append(json.loads(line))

metadata_df = pd.DataFrame(metadata_records)
metadata_df.shape

In [None]:
# Preview the metadata columns (kaggle_search reads id, title, versions, abstract).
metadata_df.head(2)

In [None]:
# search for paper in the metadata_df
def kaggle_search(paper_id):
    '''
    Look up a paper by its exact id in metadata_df.

    paper_id -- article id without version, compared for equality with
                the 'id' column.

    Returns a dict with the keys 'id', 'title', 'versions', 'abstract'
    (taken from the first matching row) and 'latest_version', the label of
    the highest version listed in 'versions' ('v1' if none is higher).
    Returns None, after printing a message, when the id is not present.
    '''
    row = metadata_df.loc[metadata_df['id'] == paper_id]
    if row.empty:
        print(f'Paper {paper_id} not found. Perhas should download a new metadata db version?')
        return None

    record = row.iloc[0]
    paper = {
        'id': record['id'],
        'title': record['title'],
        'versions': record['versions'],
        'abstract': record['abstract'],
    }

    latest_version = 'v1'
    latest_number = 1
    for version in paper['versions']:
        label = version['version']
        # Compare numerically: as strings 'v10' sorts before 'v9'.
        try:
            number = int(label.lstrip('v'))
        except ValueError:
            continue
        if number > latest_number:
            latest_version = label
            latest_number = number
    paper['latest_version'] = latest_version

    return paper

# Quick manual check of the lookup against a known id.
#paper = kaggle_search('2212.09410')
paper = kaggle_search('0704.0001')
paper

In [None]:
# download papers based on the id; handles arxiv rates limits
# running counter of processed rows, shared by both download functions
i = 0

def download_paper_arxiv_api(paper_id):
    '''
    Download one paper through the arxiv package unless a version of it is
    already present.

    Increments the global counter i on every call. Does nothing else for an
    empty/None id, or when paper_id[:10] is in downloaded_article_ids.
    Otherwise the paper is looked up, the PDF is saved in PDF_BASE as
    '<short id>.<sanitised title>.pdf' (skipped if that file exists), and
    the function sleeps 5 seconds after the lookup either way.
    '''
    global i
    global downloaded_article_ids
    i += 1

    if not paper_id:
        return
    if paper_id[:10] in downloaded_article_ids:
        print(f'{i} A version of {paper_id} exists.')
        return

    paper = next(arxiv.Search(id_list=[paper_id]).results())
    # keep only letters and digits of the title for the file name
    paper_title = re.sub('[^a-zA-Z0-9]', '_', paper.title)
    short_id = paper.get_short_id()
    file_name = f"{short_id}.{paper_title}.pdf"
    long_file_name = f"{PDF_BASE}/{short_id}.{paper_title}.pdf"

    already_there = os.path.exists(long_file_name)
    if already_there:
        print(f'{i} File exists. Skipping {file_name}')
    else:
        print(f'{i} Downloading {file_name}')
        # this might hit arxiv's rate limits
        paper.download_pdf(dirpath = PDF_BASE, filename=file_name)
    time.sleep(5)

def download_paper_kaggle(paper_id):
    '''
    Download one paper from the public bucket, using the metadata to pick
    the latest version, unless a version of it is already present.

    Increments the global counter i on every call. Does nothing else for an
    empty/None id, or when paper_id[:10] is in downloaded_article_ids.

    When kaggle_search(paper_id[:10]) finds the paper, the PDF is saved as
    '<PDF_BASE>/<id><latest version>.<sanitised title>.pdf' (skipped if that
    file exists). When the paper is not in the metadata, a download of
    paper_id as given is attempted into '<PDF_BASE>/<paper_id>.pdf'.
    Sleeps 5 seconds after every download attempt.

    NOTE: the [:10] truncation assumes ids of the form dddd.ddddd.
    '''
    global i
    global downloaded_article_ids
    i = i+1

    if not paper_id:
        return
    if paper_id[:10] in downloaded_article_ids:
        print(f'{i} A version of {paper_id} exists.')
        return

    paper = kaggle_search(paper_id[:10])
    if paper:
        # keep only letters and digits of the title for the file name
        paper_title = re.sub('[^a-zA-Z0-9]', '_', paper['title'])
        short_id = f'{paper["id"]}{paper["latest_version"]}'
        long_file_name = f"{PDF_BASE}/{short_id}.{paper_title}.pdf"
        file_name = f"{short_id}.{paper_title}.pdf"
        if(os.path.exists(long_file_name)):
            print(f'{i} File exists. Skipping {file_name}')
        else:
            print(f'{i} Downloading {file_name}')
            # this might hit arxiv's rate limits
            google_cloud_download(short_id, long_file_name)
            time.sleep(5)
    else:
        # TODO missed papers - write them down in a file for later download
        print(f'Paper {paper_id} not in metadata, probably not on gcloud yet.')
        # Save next to the other PDFs with a .pdf extension; writing to the
        # bare id in the working directory meant the file was never found
        # again when listing PDF_BASE on later runs.
        fallback_file_name = f"{PDF_BASE}/{paper_id}.pdf"
        google_cloud_download(paper_id, fallback_file_name)
        time.sleep(5)


# Run the download for every row; apply is used for its side effects only.
df['article_id'].apply(download_paper_kaggle)

In [None]:
# Export the processed toots (including 'links' and 'article_id') to <CORPUS>.csv.
df.to_csv(f"{CORPUS}.csv", index=False)

In [None]:
# Persist the id of the newest processed toot so the next run starts after it.
corpus_properties['latest'] = LATEST
save_properties(corpus_properties, f'corpora/{CORPUS}.properties')