In [2]:
import json
import pandas as pd
import requests
import os
import re

In [3]:
# Path to the arXiv metadata snapshot. Despite the name, this is the JSON
# file itself (it is passed to open() when loading), not a directory.
data_dir = '../../data/arXiv/arxiv-metadata-oai-snapshot.json'

In [4]:
# Load the metadata into `df`. The first run parses the JSON file line by line
# and stores a parquet copy next to it; later runs read that parquet copy
# instead of re-parsing the JSON.
parquet_path = data_dir.replace(".json", ".parquet")

if os.path.exists(parquet_path):
    print("Reading parquet..")
    df = pd.read_parquet(parquet_path)
    print("Done!")
else:
    print("Reading json...")
    # One JSON document per line. The encoding is given explicitly so the
    # platform's default codec is not used to decode the file, and
    # whitespace-only lines are skipped instead of crashing json.loads.
    with open(data_dir, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(data)
    df.to_parquet(parquet_path)
    print("Done!")

Reading parquet..


In [5]:
# Show the column names of the loaded metadata frame.
df.columns

Index(['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi',
       'report-no', 'categories', 'license', 'abstract', 'versions',
       'update_date', 'authors_parsed'],
      dtype='object')

In [12]:
# Keep the astro-ph papers that mention methanol (by name or as CH3OH) in the
# title or in the abstract. Matching is case-insensitive and missing values
# are treated as "no match".
title_hit = df['title'].str.contains(r'methanol|CH3OH', case=False, regex=True, na=False)
abstract_hit = df['abstract'].str.contains(r'methanol|CH3OH', case=False, regex=True, na=False)
astro_ph = df['categories'].str.contains(r'\bastro-ph\b', case=False, na=False)

filtered_df = df[(title_hit | abstract_hit) & astro_ph]

In [19]:
# Summarize the selection: number of rows, per-column non-null counts and dtypes.
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1223 entries, 987 to 2263724
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              1223 non-null   object
 1   submitter       1222 non-null   object
 2   authors         1223 non-null   object
 3   title           1223 non-null   object
 4   comments        1124 non-null   object
 5   journal-ref     404 non-null    object
 6   doi             1095 non-null   object
 7   report-no       11 non-null     object
 8   categories      1223 non-null   object
 9   license         1003 non-null   object
 10  abstract        1223 non-null   object
 11  versions        1223 non-null   object
 12  update_date     1223 non-null   object
 13  authors_parsed  1223 non-null   object
dtypes: object(14)
memory usage: 143.3+ KB


In [15]:
def download_arxiv_pdfs(filtered_df, download_folder='../../data/arXiv/pdfs',
                        timeout=30, skip_existing=False):
    """Download the arXiv PDF for every row of ``filtered_df``.

    For each row the URL ``https://arxiv.org/pdf/<id>`` is requested and, on
    HTTP 200, the response body is written to ``download_folder`` under a
    file name derived from the row's title: every character other than word
    characters, ``-``, ``_``, ``.`` and space is replaced by ``_`` and
    ``.pdf`` is appended. Rows whose sanitized titles coincide therefore map
    to the same file.

    The function is best-effort: a non-200 status or any exception raised
    while handling a row is printed and the loop continues with the next row.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Frame with at least the columns ``id`` and ``title``.
    download_folder : str
        Target directory; created if it does not exist.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot block the
        loop forever. Defaults to 30 seconds.
    skip_existing : bool, optional
        When True, rows whose target file already exists are skipped without
        making a request, so an interrupted run can be resumed. Defaults to
        False (always download and overwrite).

    Returns
    -------
    None
    """
    os.makedirs(download_folder, exist_ok=True)

    for _, row in filtered_df.iterrows():
        # Defined before the try block so the error message below can always
        # refer to it, even if reading the row itself fails.
        arxiv_id = None
        try:
            arxiv_id = row['id']
            title = row['title']
            link = f'https://arxiv.org/pdf/{arxiv_id}'

            sanitized_title = re.sub(r'[^\w\-_\. ]', '_', title)
            filename = os.path.join(download_folder, f'{sanitized_title}.pdf')

            if skip_existing and os.path.exists(filename):
                print(f'Already downloaded, skipping: {filename}')
                continue

            response = requests.get(link, timeout=timeout)

            if response.status_code == 200:
                with open(filename, 'wb') as f:
                    f.write(response.content)

                print(f'Successfully downloaded: {filename}')
            else:
                # Report the arXiv id (the thing that was actually requested);
                # the DOI is missing for many rows and is not part of the URL.
                print(f'Failed to download PDF for arXiv id: {arxiv_id}, \nTitle: {title}. Status code: {response.status_code}')

        except Exception as e:
            # Deliberately broad: one bad row must not abort the whole batch.
            print(f'Error processing arXiv id {arxiv_id}: {e}')

# Usage: fetch the PDF of every paper in filtered_df into the default folder.
# This issues one HTTP request per row, so it needs network access and is slow.
download_arxiv_pdfs(filtered_df)

Successfully downloaded: ../../data/arXiv/pdfs\Evidence for a Massive Protocluster in S255N.pdf
Successfully downloaded: ../../data/arXiv/pdfs\Observations of chemical differentiation in clumpy molecular clouds.pdf
Successfully downloaded: ../../data/arXiv/pdfs\A multi-transition molecular line study of candidate massive young_  stellar objects associated with methanol masers.pdf
Successfully downloaded: ../../data/arXiv/pdfs\The molecular environment of massive star forming cores associated with_  Class II methanol maser emission.pdf
Successfully downloaded: ../../data/arXiv/pdfs\Multi-wavelength observations of Southern Hot Molecular Cores traced by_  methanol masers - I. Ammonia and 24 GHz Continuum Data.pdf
Successfully downloaded: ../../data/arXiv/pdfs\A search for OH 6 GHz maser emission towards southern supernova remnants.pdf
Successfully downloaded: ../../data/arXiv/pdfs\Gas-grain chemistry in cold interstellar cloud cores with a microscopic_  Monte Carlo approach to surface ch