In [1]:
#based on https://jrashford.com/2023/02/13/how-to-scrape-mastodon-timelines-using-python-and-pandas/
import json
import requests
import pandas as pd

import arxiv

In [2]:
def user_lookup(acct):
    URL = f'https://mastodon.social/api/v1/accounts/lookup'
    params = {
        'acct': acct
    }

    r = requests.get(URL, params=params)
    user = json.loads(r.text)
    
    return user

In [3]:
user = user_lookup(acct='@ArxivHealthcareNLP@sigmoid.social')
user_id = user['id']
user

{'id': '109304918957671447',
 'username': 'ArxivHealthcareNLP',
 'acct': 'ArxivHealthcareNLP@sigmoid.social',
 'display_name': 'Arxiv CS-CL Healthcare NLP',
 'locked': False,
 'bot': True,
 'discoverable': True,
 'group': False,
 'created_at': '2022-11-07T00:00:00.000Z',
 'note': '<p>All new arxiv articles about Biomedical NLP published in CS-CL</p><p><a href="https://sigmoid.social/tags/NLP" class="mention hashtag" rel="nofollow noopener noreferrer" target="_blank">#<span>NLP</span></a> <a href="https://sigmoid.social/tags/NLProc" class="mention hashtag" rel="nofollow noopener noreferrer" target="_blank">#<span>NLProc</span></a> <a href="https://sigmoid.social/tags/Healthcare" class="mention hashtag" rel="nofollow noopener noreferrer" target="_blank">#<span>Healthcare</span></a> <a href="https://sigmoid.social/tags/Biomedical" class="mention hashtag" rel="nofollow noopener noreferrer" target="_blank">#<span>Biomedical</span></a> <a href="https://sigmoid.social/tags/Clinical" class="me

In [4]:
LATEST = 110735899976782801
URL = f'https://mastodon.social/api/v1/accounts/{user_id}/statuses'
params = {
    'limit': 40
}

results = []

while True:
    r = requests.get(URL, params=params)
    toots = json.loads(r.text)

    if len(toots) == 0:
        break
    
    results.extend(toots)
    
    max_id = toots[-1]['id']
    params['max_id'] = max_id
    print(f'first:{toots[0]["id"]} last{max_id}')
    
df = pd.DataFrame(results)
df.head(2)

first:110735899976782801 last110497700689379380
first:110497696607649185 last110224847905369958
first:110219506187081968 last109936060962095713
first:109936053273775121 last109426960004409997
first:109426959889192816 last109304909720242439


Unnamed: 0,id,created_at,in_reply_to_id,in_reply_to_account_id,sensitive,spoiler_text,visibility,language,uri,url,...,edited_at,content,reblog,account,media_attachments,mentions,tags,emojis,card,poll
0,110735899976782801,2023-07-18T15:55:38.000Z,,,True,:arxiv: An empirical study of using radiology ...,public,en,https://sigmoid.social/users/ArxivHealthcareNL...,https://sigmoid.social/@ArxivHealthcareNLP/110...,...,,"<p>In this work, we build survival prediction ...",,"{'id': '109304918957671447', 'username': 'Arxi...",[],[],"[{'name': 'nlp', 'url': 'https://mastodon.soci...","[{'shortcode': 'arxiv', 'url': 'https://files....",,
1,110735893620091684,2023-07-18T15:54:01.000Z,,,True,:arxiv: The Potential and Pitfalls of using a ...,public,en,https://sigmoid.social/users/ArxivHealthcareNL...,https://sigmoid.social/@ArxivHealthcareNLP/110...,...,,<p>We performed two analyses using ChatGPT and...,,"{'id': '109304918957671447', 'username': 'Arxi...",[],[],"[{'name': 'nlp', 'url': 'https://mastodon.soci...","[{'shortcode': 'arxiv', 'url': 'https://files....",,


In [5]:
pattern = pattern = r'(https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]*)' 

In [6]:
df['links'] = df["content"].str.extract(pattern, expand=True)
df.head(2)

Unnamed: 0,id,created_at,in_reply_to_id,in_reply_to_account_id,sensitive,spoiler_text,visibility,language,uri,url,...,content,reblog,account,media_attachments,mentions,tags,emojis,card,poll,links
0,110735899976782801,2023-07-18T15:55:38.000Z,,,True,:arxiv: An empirical study of using radiology ...,public,en,https://sigmoid.social/users/ArxivHealthcareNL...,https://sigmoid.social/@ArxivHealthcareNLP/110...,...,"<p>In this work, we build survival prediction ...",,"{'id': '109304918957671447', 'username': 'Arxi...",[],[],"[{'name': 'nlp', 'url': 'https://mastodon.soci...","[{'shortcode': 'arxiv', 'url': 'https://files....",,,https://arxiv.org/pdf/2307.07513
1,110735893620091684,2023-07-18T15:54:01.000Z,,,True,:arxiv: The Potential and Pitfalls of using a ...,public,en,https://sigmoid.social/users/ArxivHealthcareNL...,https://sigmoid.social/@ArxivHealthcareNLP/110...,...,<p>We performed two analyses using ChatGPT and...,,"{'id': '109304918957671447', 'username': 'Arxi...",[],[],"[{'name': 'nlp', 'url': 'https://mastodon.soci...","[{'shortcode': 'arxiv', 'url': 'https://files....",,,https://arxiv.org/pdf/2307.08152


In [10]:
ARXIV_PREFIX = "https://arxiv.org/"

def get_article_id(col_str):
    if col_str.startswith(ARXIV_PREFIX):
        #an arxiv article
        #return col_str.split(ARXIV_PREFIX,1)[1]
        article_id = col_str.split('/')[-1]
        #some article have an extension
        article_id = '.'.join(article_id.split('.')[:2])
        return article_id
    else:
        print(f'Not an arxiv article: {col_str}')

df['article_id'] = df['links'].apply(get_article_id)

df.head(2)


Unnamed: 0,id,created_at,in_reply_to_id,in_reply_to_account_id,sensitive,spoiler_text,visibility,language,uri,url,...,reblog,account,media_attachments,mentions,tags,emojis,card,poll,links,article_id
0,110735899976782801,2023-07-18T15:55:38.000Z,,,True,:arxiv: An empirical study of using radiology ...,public,en,https://sigmoid.social/users/ArxivHealthcareNL...,https://sigmoid.social/@ArxivHealthcareNLP/110...,...,,"{'id': '109304918957671447', 'username': 'Arxi...",[],[],"[{'name': 'nlp', 'url': 'https://mastodon.soci...","[{'shortcode': 'arxiv', 'url': 'https://files....",,,https://arxiv.org/pdf/2307.07513,2307.07513
1,110735893620091684,2023-07-18T15:54:01.000Z,,,True,:arxiv: The Potential and Pitfalls of using a ...,public,en,https://sigmoid.social/users/ArxivHealthcareNL...,https://sigmoid.social/@ArxivHealthcareNLP/110...,...,,"{'id': '109304918957671447', 'username': 'Arxi...",[],[],"[{'name': 'nlp', 'url': 'https://mastodon.soci...","[{'shortcode': 'arxiv', 'url': 'https://files....",,,https://arxiv.org/pdf/2307.08152,2307.08152


In [11]:

PDF_BASE = '/home/arylwen/datasets/documents/ArxivHealthcareNLP/pdf'

def download_paper(paper_id):
    paper = next(arxiv.Search(id_list=[paper_id]).results())
    paper.download_pdf(dirpath = PDF_BASE)

df['article_id'].apply(download_paper)

0      None
1      None
2      None
3      None
4      None
       ... 
186    None
187    None
188    None
189    None
190    None
Name: article_id, Length: 191, dtype: object

In [12]:
df.to_csv("ArxivHealthcareNLP.csv", index=False)