In [1]:
#based on https://jrashford.com/2023/02/13/how-to-scrape-mastodon-timelines-using-python-and-pandas/
import os
from os import listdir
from os.path import isfile, join
import re
import time
import json
import requests
import pandas as pd

import arxiv

In [2]:
#CORPUS = 'ArxivHealthcareNLP'
CORPUS = 'arxiv_cl'

In [3]:
def load_properties(filepath, sep='=', comment_char='#'):
    '''
    Read the file passed as parameter as a properties file.
    '''
    props = {}
    with open(filepath, "rt") as f:
        for line in f:
            l = line.strip()
            if l and not l.startswith(comment_char):
                key_value = l.split(sep)
                key = key_value[0].strip()
                value = sep.join(key_value[1:]).strip().strip('"') 
                props[key] = value 
    return props
'''
Save a dictionary as a properties file; use to remember the latest processed id.
TODO store comments
'''
def save_properties(properties, filepath, sep='=', comment_char='#'):
    with open(filepath, 'w') as f: 
        for key, value in properties.items(): 
            f.write('%s %s %s\n' % (key, sep, value))

In [4]:
corpus_properties = load_properties(f"corpora/{CORPUS}.properties")
corpus_properties

{'account': '@arxiv_cl@creative.ai',
 'latest': '0',
 'corpus_base': '/home/arylwen/datasets/documents/arxiv_cl'}

In [5]:
ACCOUNT = corpus_properties['account']
LATEST = int(corpus_properties['latest'])
CORPUS_BASE = corpus_properties['corpus_base']

In [6]:
PDF_BASE = f'{CORPUS_BASE}/pdf'
if not os.path.exists(PDF_BASE):
    print(f'{PDF_BASE} does not exist. Creating.')
    os.makedirs(PDF_BASE)

In [7]:
def user_lookup(acct):
    URL = f'https://mastodon.social/api/v1/accounts/lookup'
    params = {
        'acct': acct
    }

    r = requests.get(URL, params=params)
    user = json.loads(r.text)
    
    return user

In [8]:
user = user_lookup(acct=ACCOUNT)
user_id = user['id']
#user

In [9]:
#LATEST = 110735899976782801
URL = f'https://mastodon.social/api/v1/accounts/{user_id}/statuses'
params = {
    'limit': 40,
    'since_id':  LATEST
#    'min_id':  LATEST
}

results = []
LATEST = 0

while True:
    print(params)
    r = requests.get(URL, params=params)
    toots = json.loads(r.text)

    if len(toots) == 0:
        break
    
    results.extend(toots)
    
    max_id = toots[-1]['id']
    params['max_id'] = max_id
    if(LATEST == 0):
        # remember the highest toot id processed
        LATEST = toots[0]["id"]
    print(f'first:{toots[0]["id"]} last:{max_id}')
    
df = pd.DataFrame(results)
print(f'Latest: {LATEST}; Total new toots: {df.shape[0]}')
df.head(2)

{'limit': 40, 'since_id': 0}
first:110799421247107699 last:110783576822036958
{'limit': 40, 'since_id': 0, 'max_id': '110783576822036958'}
first:110783301167278791 last:110773749513723081
{'limit': 40, 'since_id': 0, 'max_id': '110773749513723081'}
first:110773459001191187 last:110765110274708008
{'limit': 40, 'since_id': 0, 'max_id': '110765110274708008'}
first:110763809292503598 last:110745902920180065
{'limit': 40, 'since_id': 0, 'max_id': '110745902920180065'}
first:110745862204215190 last:110730678429536062
{'limit': 40, 'since_id': 0, 'max_id': '110730678429536062'}
first:110730515694228985 last:110712684309190707
{'limit': 40, 'since_id': 0, 'max_id': '110712684309190707'}
first:110712330164904134 last:110699566175047761
{'limit': 40, 'since_id': 0, 'max_id': '110699566175047761'}
first:110698985947924427 last:110677499383561776
{'limit': 40, 'since_id': 0, 'max_id': '110677499383561776'}
first:110677027480589712 last:110667019983541891
{'limit': 40, 'since_id': 0, 'max_id': '11

Unnamed: 0,id,created_at,in_reply_to_id,in_reply_to_account_id,sensitive,spoiler_text,visibility,language,uri,url,...,edited_at,content,reblog,account,media_attachments,mentions,tags,emojis,card,poll
0,110799421247107699,2023-07-29T21:09:55.000Z,,,False,,public,,https://creative.ai/users/arxiv_cl/statuses/11...,,...,,,"{'id': '110799419181451151', 'created_at': '20...","{'id': '109581051765732628', 'username': 'arxi...",[],[],[],[],,
1,110798092795335545,2023-07-29T15:32:05.000Z,,,False,,public,,https://creative.ai/users/arxiv_cl/statuses/11...,,...,,,"{'id': '110798091712871710', 'created_at': '20...","{'id': '109581051765732628', 'username': 'arxi...",[],[],[],[],,


In [10]:
#df['reblog'][0]

In [11]:
def update_content(row):
    #print(row)
    reblog_dict = row['reblog']
    if reblog_dict and ('content' in reblog_dict):
        row['content'] += reblog_dict['content']
    return row

df = df.apply(lambda row: update_content(row), axis = 1)
df.head(2)

Unnamed: 0,id,created_at,in_reply_to_id,in_reply_to_account_id,sensitive,spoiler_text,visibility,language,uri,url,...,edited_at,content,reblog,account,media_attachments,mentions,tags,emojis,card,poll
0,110799421247107699,2023-07-29T21:09:55.000Z,,,False,,public,,https://creative.ai/users/arxiv_cl/statuses/11...,,...,,<p>Getting rid of softmax.</p><p>[2307.14995] ...,"{'id': '110799419181451151', 'created_at': '20...","{'id': '109581051765732628', 'username': 'arxi...",[],[],[],[],,
1,110798092795335545,2023-07-29T15:32:05.000Z,,,False,,public,,https://creative.ai/users/arxiv_cl/statuses/11...,,...,,<p>Investigating &quot;Secret Language&quot;(S...,"{'id': '110798091712871710', 'created_at': '20...","{'id': '109581051765732628', 'username': 'arxi...",[],[],[],[],,


In [12]:
#pattern = r'(https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]*)' 
# some toots do not have the protocol or www; for these there must be at list 2 '/' to match the arxiv pattern
# this picks up the first 2 links although it matches all links
# pattern = r'((https?:\/\/(?:www\.)?)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]{2,256})'
# restrict to arxiv articles with creative versioning
pattern = r'((https?:\/\/(?:www\.)?)?(arxiv)\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]{2,256})|[0-9]{4,4}\.[0-9]+[a-zA-Z]*[0-9]*'

In [13]:
df['links'] = df["content"].str.extract(pattern, expand=True)[0]
# TODO - are there several different arxiv articles in the same toot?
#df = df.join(df["content"].str.extract(pattern, expand=True))
df.head(2)

Unnamed: 0,id,created_at,in_reply_to_id,in_reply_to_account_id,sensitive,spoiler_text,visibility,language,uri,url,...,content,reblog,account,media_attachments,mentions,tags,emojis,card,poll,links
0,110799421247107699,2023-07-29T21:09:55.000Z,,,False,,public,,https://creative.ai/users/arxiv_cl/statuses/11...,,...,<p>Getting rid of softmax.</p><p>[2307.14995] ...,"{'id': '110799419181451151', 'created_at': '20...","{'id': '109581051765732628', 'username': 'arxi...",[],[],[],[],,,
1,110798092795335545,2023-07-29T15:32:05.000Z,,,False,,public,,https://creative.ai/users/arxiv_cl/statuses/11...,,...,<p>Investigating &quot;Secret Language&quot;(S...,"{'id': '110798091712871710', 'created_at': '20...","{'id': '109581051765732628', 'username': 'arxi...",[],[],[],[],,,https://arxiv.org/abs/2307.12507


In [14]:
# extract article id to use for download
ARXIV_PREFIX_1 = "https://arxiv.org/"
ARXIV_PREFIX_2 = "arxiv.org/"
ARXIV_PREFIX_3 = "arXiv"

import numpy as np

def get_article_id(col_str):
    #print(col_str)
    if col_str is np.nan:
         print(f'Not an arxiv article: {col_str}')
         return
    if col_str.startswith(ARXIV_PREFIX_1) | col_str.startswith(ARXIV_PREFIX_2):
        #an arxiv article
        article_id = col_str.split('/')[-1]
        #some articles have an extension
        article_id = '.'.join(article_id.split('.')[:2])
        articles = re.findall(r'[0-9]{4,4}\.[0-9]+[a-zA-Z]*[0-9]*', article_id)
        #print(articles)
        article_id = articles[0]
        return article_id
    elif col_str.startswith(ARXIV_PREFIX_3):
        #an arxiv article
        article_id = col_str.split(':')[-1]
        #some article have an extension
        article_id = '.'.join(article_id.split('.')[:2])
        return article_id
    elif re.search(r'[0-9]{4,4}\.[0-9]+[a-zA-Z]*[0-9]*', col_str):
        articles = re.findall(r'[0-9]{4,4}\.[0-9]+[a-zA-Z]*[0-9]*', col_str)
        #print(articles)
        return articles[0]
    else:
        print(f'Not an arxiv article: {col_str}')

df['article_id'] = df['links'].apply(get_article_id)

df.head(2)


Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan
Not an arxiv article: nan


Unnamed: 0,id,created_at,in_reply_to_id,in_reply_to_account_id,sensitive,spoiler_text,visibility,language,uri,url,...,reblog,account,media_attachments,mentions,tags,emojis,card,poll,links,article_id
0,110799421247107699,2023-07-29T21:09:55.000Z,,,False,,public,,https://creative.ai/users/arxiv_cl/statuses/11...,,...,"{'id': '110799419181451151', 'created_at': '20...","{'id': '109581051765732628', 'username': 'arxi...",[],[],[],[],,,,
1,110798092795335545,2023-07-29T15:32:05.000Z,,,False,,public,,https://creative.ai/users/arxiv_cl/statuses/11...,,...,"{'id': '110798091712871710', 'created_at': '20...","{'id': '109581051765732628', 'username': 'arxi...",[],[],[],[],,,https://arxiv.org/abs/2307.12507,2307.12507


In [15]:

pdf_files = [f for f in listdir(PDF_BASE) if isfile(join(PDF_BASE, f))]
len(pdf_files)

635

In [16]:
downloaded_article_ids = ['.'.join(f.split('.')[:2])[:10] for f in pdf_files]
downloaded_article_ids

['1706.03762',
 '2010.10820',
 '2104.09864',
 '2106.09685',
 '2109.01537',
 '2201.11903',
 '2203.02155',
 '2205.11916',
 '2209.00840',
 '2210.03629',
 '2211.05655',
 '2211.07524',
 '2211.09110',
 '2212.03551',
 '2212.08011',
 '2212.08061',
 '2212.08718',
 '2212.09410',
 '2212.09744',
 '2212.10071',
 '2212.10403',
 '2212.10559',
 '2212.10562',
 '2301.06627',
 '2302.03494',
 '2302.12611',
 '2303.08774',
 '2303.11156',
 '2303.13379',
 '2304.00612',
 '2304.01373',
 '2304.02819',
 '2304.02819',
 '2304.09102',
 '2304.11079',
 '2304.11111',
 '2305.00118',
 '2305.01625',
 '2305.07759',
 '2305.08283',
 '2305.08298',
 '2305.09800',
 '2305.10601',
 '2305.10998',
 '2305.11206',
 '2305.11738',
 '2305.11778',
 '2305.12182',
 '2305.12295',
 '2305.12544',
 '2305.13009',
 '2305.13252',
 '2305.13281',
 '2305.13304',
 '2305.13504',
 '2305.13534',
 '2305.13735',
 '2305.14224',
 '2305.14233',
 '2305.14325',
 '2305.14337',
 '2305.14591',
 '2305.14726',
 '2305.14825',
 '2305.15374',
 '2305.15507',
 '2305.163

In [17]:
from google.cloud import storage

client = storage.Client.create_anonymous_client()
bucket = client.bucket('arxiv-dataset')

def google_cloud_download(paper_id, file_name):
    # blob = bucket.blob("arxiv/arxiv/pdf/2211/2211.00350v3.pdf")
    found = False
    vn = 1
    year = paper_id.split('.')[0]
    while not found:
        try:
            blob = bucket.blob(f"arxiv/arxiv/pdf/{year}/{paper_id}v{vn}.pdf")
            blob.download_to_filename(file_name)
            found = True
        except Exception as e:
            print(e)
            if vn == 10 : break
            vn = vn+1


#google_cloud_download('2211.00350', 'test.pdf')

In [None]:
#prepare json file
with open("arxiv-metadata-oai-snapshot.json") as f1:
    with open("arxiv-metadata-oai-snapshot-pysondb.json", "w") as f2:
        f2.write('{"data":[')
        for line in f1:
            f2.write(line+',')
        f2.write(']}')

In [18]:
from pysondb import db

a=db.getDb('arxiv-metadata-oai-snapshot.json')

In [20]:
a.get(2)

[{'': ''}]

In [None]:
# download papers based on the id; handles arxiv rates limits
i = 0
#arxiv_client = arxiv.Client(
#  delay_seconds = 1,
#  num_retries = 5
#)

def download_paper_arxiv_api(paper_id):
    global i
    global downloaded_article_ids
    i = i+1
    if(paper_id):
        if paper_id[:10] in downloaded_article_ids:
            print(f'{i} A version of {paper_id} exists.')
        else:
            paper = next(arxiv.Search(id_list=[paper_id]).results())
            #paper = next(arxiv_client.results(arxiv.Search(id_list=[paper_id])))
            paper_title = re.sub('[^a-zA-Z0-9]', '_', paper.title)
            short_id = paper.get_short_id()
            long_file_name = f"{PDF_BASE}/{short_id}.{paper_title}.pdf"
            file_name = f"{short_id}.{paper_title}.pdf"
            #print(paper.title)
            #print(file_name)
            if(os.path.exists(long_file_name)):
                print(f'{i} File exists. Skipping {file_name}')
            else:
                print(f'{i} Downloading {file_name}')
                # this might hit arxiv's rate limits
                paper.download_pdf(dirpath = PDF_BASE, filename=file_name)
            time.sleep(5)

def download_paper_kaggle(paper_id):
    global i
    global downloaded_article_ids
    i = i+1
    if(paper_id):
        if paper_id[:10] in downloaded_article_ids:
            print(f'{i} A version of {paper_id} exists.')
        else:
            paper = next(arxiv.Search(id_list=[paper_id]).results())
            #paper = next(arxiv_client.results(arxiv.Search(id_list=[paper_id])))
            paper_title = re.sub('[^a-zA-Z0-9]', '_', paper.title)
            short_id = paper.get_short_id()
            long_file_name = f"{PDF_BASE}/{short_id}.{paper_title}.pdf"
            file_name = f"{short_id}.{paper_title}.pdf"
            #print(paper.title)
            #print(file_name)
            if(os.path.exists(long_file_name)):
                print(f'{i} File exists. Skipping {file_name}')
            else:
                print(f'{i} Downloading {file_name}')
                # this might hit arxiv's rate limits
                #paper.download_pdf(dirpath = PDF_BASE, filename=file_name)
                google_cloud_download(paper_id, long_file_name)
            time.sleep(5)


df['article_id'].apply(download_paper_kaggle)

In [None]:
df.to_csv(f"{CORPUS}.csv", index=False)

In [None]:
corpus_properties['latest'] = LATEST
save_properties(corpus_properties, f'corpora/{CORPUS}.properties')