In [None]:
!pip install beautifulsoup4

Defaulting to user installation because normal site-packages is not writeable


In [None]:
#| hide
#| default_exp medrxiv

# medRxiv

> The preprint server for health sciences.

In [None]:
#| export

import warnings
warnings.filterwarnings("ignore")

import os
import logging

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
#| export
# Locate the directory this code runs from, then keep `logs/` and `data/`
# one level above it.
try:
    # Running as a script / imported module: __file__ is defined.
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # Running inside a Jupyter kernel: __file__ does not exist, so fall back
    # to the current working directory.
    script_dir = os.getcwd()

parent_dir = os.path.abspath(os.path.join(script_dir, os.pardir))
log_dir = os.path.join(parent_dir, 'logs')
data_dir = os.path.join(parent_dir, 'data')

# exist_ok=True creates the directory or accepts an existing one in a single
# call, so there is no window between "check" and "create" in which another
# process could create it and make makedirs raise FileExistsError.
os.makedirs(log_dir, exist_ok=True)
os.makedirs(data_dir, exist_ok=True)

# Append INFO-and-above records to logs/medrxiv.log.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename=os.path.join(log_dir, 'medrxiv.log'),
    filemode='a',
)

In [None]:
#| export

# medRxiv XML feed endpoint, requested with subject=all.
url = "https://connect.medrxiv.org/medrxiv_xml.php?subject=all"

# requests applies no timeout by default; without one a stalled server would
# block this cell (and any run of the exported module) indefinitely.
response = requests.get(url, timeout=30)

# Raise on 4xx/5xx so an error page is never parsed as if it were the feed.
response.raise_for_status()

In [None]:
#| export

# Parse the raw response bytes with the "lxml-xml" parser, then collect every
# <item> element found in the document.
soup = BeautifulSoup(markup=response.content, features="lxml-xml")
items = soup.find_all(name="item")

In [None]:
#| export

# Flatten each <item> into a plain dict: one key per direct child tag, plus
# whatever XML attributes the <item> element itself carries.
data = []
for item in items:
    item_data = {}
    # recursive=False restricts the walk to direct children of the item.
    for child in item.find_all(recursive=False):
        tag_name = child.name
        raw_text = child.text  # read once instead of twice
        # Empty text is stored as None rather than "".
        tag_value = raw_text.strip() if raw_text else None
        item_data[tag_name] = tag_value
    # Applied last, so an attribute overrides a child tag of the same name.
    item_data.update(item.attrs)
    data.append(item_data)

# Preview the first record. This cell is exported, so a bare `data[0]` would
# raise IndexError at import time whenever the feed contains no items.
data[0] if data else None

{'title': 'Age-related differences in psychopathology within sex chromosome trisomies',
 'link': 'http://medrxiv.org/cgi/content/short/2024.11.22.24317803v1?rss=1',
 'description': 'Sex chromosome trisomies (SCTs) are a group of genetic disorders characterized by presence of a supernumerary sex chromosome, resulting in karyotypes other than XX or XY. These include XXX (Trisomy X), XXY (Klinefelter syndrome), and XYY (Jacobs syndrome). Sex chromosome trisomies have been linked to increased risk for psychopathology; however, this relationship warrants additional research. Specifically, little is known regarding potential age-related variation in risk for psychopathology and how this may differ across karyotypes and subdomains of psychopathology, which has relevance for psychoeducation, personalized care, and mechanistic research. Thus, we used the Child Behavior Checklist (CBCL) to estimate age-related variation in psychopathology in a large cross-sectional sample of individuals with SCT

In [None]:
#| export

# Merge the freshly fetched records into the JSON Lines file under data_dir,
# keeping one row per "identifier".
filepath = os.path.join(data_dir, "medrxiv.jsonl")

new_data = pd.DataFrame(data)
if os.path.exists(filepath):
    # Read stored values back verbatim. read_json's defaults re-infer dtypes
    # and parse date-like column names (e.g. "date") into timestamps, which
    # would then be re-serialised differently from the freshly scraped strings.
    existing_data = pd.read_json(
        filepath, lines=True, dtype=False, convert_dates=False
    )
    combined_data = pd.concat([existing_data, new_data], ignore_index=True)
else:
    combined_data = new_data

if combined_data.empty:
    # An empty feed with no stored file yields a frame with no columns at all;
    # drop_duplicates(subset="identifier") would raise KeyError on it.
    logging.warning('No records fetched and no existing data; nothing written to %s', filepath)
    deduplicated = combined_data
else:
    # Stored rows come first in the concat, so the default keep="first"
    # retains the stored copy of any identifier seen again.
    deduplicated = combined_data.drop_duplicates(subset="identifier")
    deduplicated.to_json(filepath, orient="records", lines=True)

# Lazy %-style arguments: the message is only formatted if the record is emitted.
logging.info('Total number of records: %s', deduplicated.shape[0])

In [None]:
#| hide
# Run nbdev's export step for this project.
import nbdev

nbdev.nbdev_export()