In [1]:
# runs in jupyter container 
import os
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
from multiprocessing import Pool
from concurrent.futures import ProcessPoolExecutor, as_completed

In [2]:
!pip install langdetect




In [3]:
import csv
import json

from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from tqdm import tqdm

In [4]:
# Metadata root directory; the METADATA_DIR environment variable overrides the default.
meta_data_dir = os.getenv("METADATA_DIR", "/mnt/meta-data")

# Fixed locations under the Jupyter work directory: the metadata file that is
# read, the CSV that is written, and the text file listing available filenames.
input_path = os.path.join('/home/jovyan/work/data-pipeline-files/metadata-v5/', 'arxiv-metadata-oai.json')
output_path = os.path.join('/home/jovyan/work/', 'arxiv_cleaned_v2.csv')
pdf_filenames_txt = os.path.join('/home/jovyan/work/data-pipeline-files', 'all_files_list.txt')

# Echo the resolved paths so they are visible in the cell output.
for label, path in (
    ("Input Path:", input_path),
    ("Output Path:", output_path),
    ("PDF Filenames TXT:", pdf_filenames_txt),
):
    print(label, path)

Input Path: /home/jovyan/work/data-pipeline-files/metadata-v5/arxiv-metadata-oai.json
Output Path: /home/jovyan/work/arxiv_cleaned_v2.csv
PDF Filenames TXT: /home/jovyan/work/data-pipeline-files/all_files_list.txt


In [5]:
# Show the entries of the work directory.
work_dir_entries = os.listdir('/home/jovyan/work/')
print(work_dir_entries)

['.DS_Store', 'data-pipeline-files', '.git', 'System Diagram.png', 'README.md', '.ipynb_checkpoints']


In [6]:
# Show the entries of the data-pipeline-files directory.
pipeline_dir_entries = os.listdir('/home/jovyan/work/data-pipeline-files')
print(pipeline_dir_entries)

['store_text_to_chunks.ipynb', 'docker-compose-meta-data-db.yaml', 'venv', 'docker-compose-meta-data-db-jupyter.yaml', 'clear_metadata.ipynb', 'data-preprocessing.ipynb', 'rearrage_folders.sh', 'tar_files_list.txt', 'pdf-to-text-from-folder.py', 'pull-data-script.sh', 'read_me_setup_dbs.md', 'metadata-v5', 'create-tar-files-list.sh', 'docker-compose-download-raw-pdfs.yaml', 'docker-compose-pdf-to-text.yaml', 'folders_list.txt', 'download-dependensies.sh', 'process-meta-data.ipynb', 'create_text_files_list.sh', 'all_files_list.txt', '.ipynb_checkpoints', 'process_tar_files.sh']


In [7]:
# Show the entries of the configured metadata directory.
meta_data_entries = os.listdir(meta_data_dir)
print(meta_data_entries)

['meta-data', 'metadata', 'raw-data', 'text-files-data']


In [8]:
# langdetect's detection is randomised by default; fixing the seed makes the
# English/non-English decision reproducible across runs of this notebook.
DetectorFactory.seed = 0


def is_english(text):
    """Return True when langdetect classifies ``text`` as English.

    Args:
        text: The string to classify.

    Returns:
        True if the detected language code is ``'en'``. False for empty or
        whitespace-only text, for any other detected language, and when
        langdetect raises ``LangDetectException``.
    """
    # Nothing to detect in empty text; skip the detector call entirely.
    if not text or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False


In [9]:
def extract_latest_version_info(versions):
    """Return ``(version, created)`` for the last entry of ``versions``.

    The last list element is treated as the latest version.

    Args:
        versions: A list whose elements are either version strings or dicts
            with optional ``'version'`` and ``'created'`` keys.

    Returns:
        A 2-tuple of strings. ``("", "")`` when ``versions`` is empty, is not
        a list, or its last element is neither a str nor a dict. For a str
        element the created part is ``""`` because no date is available.
    """
    if not (versions and isinstance(versions, list)):
        return "", ""

    newest = versions[-1]

    if isinstance(newest, dict):
        return newest.get('version', ''), newest.get('created', '')
    if isinstance(newest, str):
        return newest, ""
    return "", ""


In [10]:
def flatten_authors(authors_parsed):
    """Collapse a list of author name-part sequences into one string.

    Args:
        authors_parsed: A list in which each element is a sequence of name
            parts (strings) for one author.

    Returns:
        The authors joined by ``", "``, where each author is its non-empty
        name parts joined by a single space in their given order. Returns
        ``""`` when ``authors_parsed`` is empty or not a list.
    """
    if not (authors_parsed and isinstance(authors_parsed, list)):
        return ""

    names = []
    for parts in authors_parsed:
        # Drop falsy parts (empty strings, None) before joining.
        names.append(" ".join(part for part in parts if part))
    return ", ".join(names)

In [11]:
# Load the filename listing into a set: one entry per non-blank line, with
# surrounding whitespace removed.
with open(pdf_filenames_txt, 'r', encoding='utf-8') as listing:
    pdf_filenames_set = {name for name in (raw.strip() for raw in listing) if name}

print(f"Loaded {len(pdf_filenames_set)} PDF filenames.")

Loaded 449322 PDF filenames.


In [12]:
# Count the lines of the metadata file with a single pass over it.
total_lines = 0
with open(input_path, 'r', encoding='utf-8') as metadata_file:
    for _ in metadata_file:
        total_lines += 1

print(f"Total lines in metadata file: {total_lines}")

Total lines in metadata file: 3360984


In [13]:
# Tally of records written to the CSV, and a flag that is set once the CSV
# header has been written.
english_count, fieldnames_written = 0, False

In [14]:
# Pretty-print the first few metadata records to inspect their structure.
preview_count = 1  # number of leading lines to show

with open(input_path, 'r', encoding='utf-8') as f:
    # zip stops as soon as either the counter or the file is exhausted, so a
    # file with fewer than preview_count lines simply ends the preview early.
    for i, line in zip(range(preview_count), f):
        try:
            record = json.loads(line)
            print(json.dumps(record, indent=2))  # Pretty-print each record
        except json.JSONDecodeError:
            print(f"⚠️ Line {i+1}: Invalid JSON, skipping...")

{
  "id": "0704.0001",
  "submitter": "Pavel Nadolsky",
  "authors": "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  "title": "Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies",
  "comments": "37 pages, 15 figures; published version",
  "journal-ref": "Phys.Rev.D76:013009,2007",
  "doi": "10.1103/PhysRevD.76.013009",
  "abstract": "  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermilab Tevatron, and predictions are made for\nmore detailed te

In [None]:
# Stream the metadata file and write a cleaned CSV that keeps only records whose
# latest-version text file is listed in pdf_filenames_set and whose
# title + abstract is detected as English.

# Count lines first, before the output file is opened (and truncated), so the
# progress bar can show a total.
with open(input_path, 'r', encoding='utf-8') as f:
    total_lines = sum(1 for _ in f)

# Initialise the counters in this cell so that re-running it starts from zero
# instead of adding to the totals left behind by a previous run.
english_count = 0
skipped_count = 0
fieldnames_written = False

with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = None
    progress = tqdm(infile, total=total_lines, desc="Stored: 0", dynamic_ncols=True)

    for line in progress:
        try:
            record = json.loads(line)
            text = f"{record.get('title', '')} {record.get('abstract', '')}".strip()

            # Extract version info
            latest_version, latest_created = extract_latest_version_info(record.get("versions", []))
            pdf_filename = f"{record['id']}{latest_version}.txt"

            # The set lookup and emptiness check short-circuit before the
            # language detection call.
            if pdf_filename in pdf_filenames_set and text and is_english(text):
                record["latest_version"] = latest_version
                # record["latest_created"] = latest_created  # Uncomment if needed
                record["txt_filename"] = pdf_filename
                # Take YYMM from the identifier itself: the four characters after
                # the last '/' cover both new-style ids ("0704.0001") and
                # old-style ids ("hep-th/9901001", "math.GT/0309136"). Splitting
                # the filename on '.' only worked for the new style.
                record["created_yymm"] = str(record["id"]).rsplit("/", 1)[-1][:4]
                record.pop("versions", None)

                if writer is None:
                    # The first stored record fixes the CSV columns and their order.
                    fieldnames = list(record.keys())
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                    writer.writeheader()
                    fieldnames_written = True

                # NOTE(review): nested values (e.g. an authors_parsed list, if
                # records carry one) are written as their Python repr;
                # flatten_authors is defined in this notebook but not applied
                # here - confirm whether that is intended.
                writer.writerow(record)
                english_count += 1
                progress.set_description(f"Stored: {english_count}")

        except (json.JSONDecodeError, UnicodeEncodeError, KeyError):
            # Best-effort: skip corrupted lines or records missing required
            # keys, but keep a tally so the loss is visible in the summary.
            skipped_count += 1
            continue

print(f"\n✅ English records written: {english_count}")
print(f"Skipped lines (invalid JSON, missing keys, or encoding errors): {skipped_count}")
print(f"📁 Cleaned data saved to: {output_path}")

Stored: 23384:   1%|          | 23531/3360984 [01:58<4:16:36, 216.77it/s]