In [65]:
# runs in jupyter container 
import os
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
from multiprocessing import Pool
from concurrent.futures import ProcessPoolExecutor, as_completed

In [66]:
!pip install langdetect




In [67]:
import csv
import json

from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from tqdm import tqdm

In [68]:
# Directory holding the metadata snapshot and the file listing; override with METADATA_DIR.
meta_data_dir = os.getenv("METADATA_DIR", "/mnt/metadata")
# Directory that receives the cleaned CSV; override with OUTPUT_DIR.
# The default keeps the previously hard-coded location.
output_dir = os.getenv("OUTPUT_DIR", "/home/jovyan/work/")

input_path = os.path.join(meta_data_dir, 'arxiv-metadata-oai-snapshot.json')
output_path = os.path.join(output_dir, 'arxiv_cleaned_v3.csv')
pdf_filenames_txt = os.path.join(meta_data_dir, 'all_files_list.txt')

print("Input Path:", input_path)
print("Output Path:", output_path)
print("PDF Filenames TXT:", pdf_filenames_txt)

Input Path: /mnt/metadata/arxiv-metadata-oai-snapshot.json
Output Path: /home/jovyan/work/arxiv_cleaned_v3.csv
PDF Filenames TXT: /mnt/metadata/all_files_list.txt


In [69]:
# Show what is available in the metadata directory. os.listdir() returns entries in
# arbitrary order, so sort them to keep the printed listing stable between runs.
print(sorted(os.listdir(meta_data_dir)))

['all_files_list.txt', 'arxiv-metadata-oai-snapshot.json', 'authors-parsed.json', 'internal-citations.json']


In [70]:
# langdetect's algorithm is non-deterministic by default; pinning the seed before the
# first detection makes the English/non-English decision reproducible across runs.
DetectorFactory.seed = 0


def is_english(text):
    """Return True when langdetect classifies ``text`` as English.

    Args:
        text: Candidate text to classify.

    Returns:
        bool: True if the detected language code is ``'en'``. False when the
        input is not a string, is blank, or when langdetect raises
        ``LangDetectException``.
    """
    # Reject inputs langdetect cannot classify before handing them to detect().
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False


In [71]:
def extract_latest_version_info(versions):
    """Return ``(version, created)`` for the last entry of ``versions``.

    The last list element is treated as the latest version. It may be either:

    * a string  -> returned as the version, with an empty created date;
    * a dict    -> its ``'version'`` and ``'created'`` values (``''`` if absent).

    Anything else (empty/non-list input, or an element of another type)
    yields ``("", "")``.
    """
    if not (versions and isinstance(versions, list)):
        return "", ""

    newest = versions[-1]

    if isinstance(newest, dict):
        version_tag = newest.get('version', '')
        created_on = newest.get('created', '')
        return version_tag, created_on

    if isinstance(newest, str):
        # Plain string entries carry no creation date.
        return newest, ""

    return "", ""


In [72]:
def flatten_authors(authors_parsed):
    """Collapse a list of author entries into one comma-separated string.

    Each entry is an iterable of name parts; its non-empty parts are joined
    with single spaces, and the resulting names are joined with ``", "``.
    Empty or non-list input gives an empty string.
    """
    if not (authors_parsed and isinstance(authors_parsed, list)):
        return ""

    names = []
    for name_parts in authors_parsed:
        # Drop falsy parts (e.g. empty strings) before joining.
        names.append(" ".join(part for part in name_parts if part))
    return ", ".join(names)

In [73]:
# Build the set of known file names from the listing, one name per line.
# Surrounding whitespace is stripped and blank lines are ignored.
pdf_filenames_set = set()
with open(pdf_filenames_txt, 'r', encoding='utf-8') as listing:
    stripped_names = (raw_line.strip() for raw_line in listing)
    pdf_filenames_set.update(name for name in stripped_names if name)

print(f"Loaded {len(pdf_filenames_set)} PDF filenames.")

Loaded 364068 PDF filenames.


In [74]:
# Single pass over the metadata file to count its lines.
with open(input_path, 'r', encoding='utf-8') as metadata_file:
    line_tally = 0
    for raw_line in metadata_file:
        line_tally += 1
    total_lines = line_tally

print(f"Total lines in metadata file: {total_lines}")

Total lines in metadata file: 2720631


In [75]:
# english_count: running tally of records written to the CSV.
# fieldnames_written: flag set once the CSV header row has been written.
english_count, fieldnames_written = 0, False

In [76]:
# Number of leading records to pretty-print as a sanity check of the file format.
preview_count = 1

with open(input_path, 'r', encoding='utf-8') as f:
    # zip() stops at whichever runs out first: the requested count or the file itself.
    for i, line in zip(range(preview_count), f):
        try:
            record = json.loads(line)
            print(json.dumps(record, indent=2))  # Pretty-print each record
        except json.JSONDecodeError:
            print(f"⚠️ Line {i+1}: Invalid JSON, skipping...")

{
  "id": "0704.0001",
  "submitter": "Pavel Nadolsky",
  "authors": "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  "title": "Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies",
  "comments": "37 pages, 15 figures; published version",
  "journal-ref": "Phys.Rev.D76:013009,2007",
  "doi": "10.1103/PhysRevD.76.013009",
  "report-no": "ANL-HEP-PR-07-12",
  "categories": "hep-ph",
  "license": null,
  "abstract": "  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with

In [77]:
# Stream the metadata file and write to CSV every record that
#   1. has its "<id><latest_version>.txt" name in pdf_filenames_set, and
#   2. has a title/abstract that is_english() accepts.
#
# The counters are reset here so that re-running this cell starts from zero instead
# of adding to the totals left over from a previous run.
english_count = 0
skipped_count = 0  # records dropped because of an exception caught below

with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = None  # created lazily: the header is taken from the first kept record

    # Using tqdm as a context manager closes the bar even if the loop raises.
    with tqdm(infile, total=total_lines, desc=f"Stored: {english_count}", dynamic_ncols=True) as progress:
        for line in progress:
            try:
                record = json.loads(line)

                # Build the expected file name first: the set lookup filters out
                # records before the (presumably costlier) language detection runs.
                latest_version, latest_created = extract_latest_version_info(record.get("versions", []))
                pdf_filename = f"{record['id']}{latest_version}.txt"
                if pdf_filename not in pdf_filenames_set:
                    continue

                # `or ''` also covers fields that are present but null, which would
                # otherwise be rendered as the literal string "None".
                title = record.get('title') or ''
                abstract = record.get('abstract') or ''
                text = f"{title} {abstract}".strip()
                if not text or not is_english(text):
                    continue

                # Replace the raw version list with the flattened latest-version fields.
                record["latest_version"] = latest_version
                record["latest_created"] = latest_created
                record["pdf_filename"] = pdf_filename
                record.pop("versions", None)

                if writer is None:
                    fieldnames = list(record.keys())
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                    writer.writeheader()
                    fieldnames_written = True

                writer.writerow(record)
                english_count += 1

                # Keep the running total visible in the progress bar.
                progress.set_description(f"Stored: {english_count}")

            except (json.JSONDecodeError, UnicodeEncodeError, KeyError):
                # Best-effort pass: skip broken lines or missing keys, but keep count
                # so the loss is reported instead of silently ignored.
                skipped_count += 1
                continue

print(f"\n✅ English records written: {english_count}")
if skipped_count:
    print(f"⚠️ Records skipped (invalid JSON, missing key, or encoding error): {skipped_count}")
print(f"📁 Saved cleaned data to: {output_path}")

Stored: 227494: 100%|██████████| 2720631/2720631 [33:47<00:00, 1341.54it/s] 


✅ English records written: 227494
📁 Saved cleaned data to: /home/jovyan/work/arxiv_cleaned_v3.csv



