In [1]:
!pip install langdetect


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m


In [2]:
import json
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from tqdm import tqdm
import csv

In [14]:
# --- Inputs ---------------------------------------------------------------
# Metadata dump that is read line by line by the processing cell.
input_path = '/mnt/object/metadata/arxiv-metadata-oai.json'
# Text file listing the PDF filenames that are available, one per line.
pdf_filenames_txt = '/Users/riyagarg/Downloads/all_files_list.txt'
# NOTE(review): not referenced by any visible cell, and the directory is
# 'Download' here but 'Downloads' above -- confirm whether this is still needed.
pdf_filenames_csv = '/Users/riyagarg/Download/all_files_list.txt'

# --- Output ---------------------------------------------------------------
# Destination CSV for the filtered records.
output_path = '/home/cc/arxiv_cleaned_v1.csv'

In [9]:
def is_english(text):
    """Return True when langdetect classifies ``text`` as English.

    Args:
        text: The string to classify. Non-string or blank input is treated
            as "not English" without calling the detector.

    Returns:
        bool: True if the detected language code is ``'en'``; False if the
        input is unusable or the detector raises ``LangDetectException``.
    """
    # Nothing to classify: avoid a TypeError on non-strings and skip the
    # detector call for blank input (which it would reject anyway).
    if not isinstance(text, str) or not text.strip():
        return False

    # langdetect is documented as non-deterministic unless a seed is set,
    # which makes the number of kept records vary between runs. Seed it once,
    # but respect a seed that has already been configured elsewhere.
    from langdetect import DetectorFactory
    if DetectorFactory.seed is None:
        DetectorFactory.seed = 0

    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False


In [10]:
def extract_latest_version_info(versions):
    """Return ``(version, created)`` taken from the last entry of ``versions``.

    The last list element is treated as the latest version.

    Args:
        versions: A list of dicts that may carry ``'version'`` and
            ``'created'`` keys.

    Returns:
        tuple: ``(version, created)``. Both are ``""`` when ``versions`` is
        empty, is not a list, or its last entry is not a dict. A field that
        is missing or null is returned as ``""``.
    """
    if not versions or not isinstance(versions, list):
        return "", ""

    latest = versions[-1]
    # A malformed trailing entry (e.g. a bare string) has no .get(); report
    # "no version info" instead of raising AttributeError.
    if not isinstance(latest, dict):
        return "", ""

    version = latest.get('version')
    created = latest.get('created')
    # A JSON null would otherwise surface as None and be rendered as the
    # literal text "None" when interpolated into a string.
    return (
        version if version is not None else "",
        created if created is not None else "",
    )


In [11]:
def flatten_authors(authors_parsed):
    """Collapse a list of author name-part sequences into one string.

    Each author's non-empty parts are joined with a single space, in the
    order given; the resulting names are joined with ``", "``.

    Args:
        authors_parsed: A list whose items are sequences of name-part strings.

    Returns:
        str: The combined author string, or ``""`` when the input is empty
        or is not a list.
    """
    if not authors_parsed:
        return ""
    if not isinstance(authors_parsed, list):
        return ""

    full_names = []
    for name_parts in authors_parsed:
        # Drop empty/falsy parts so they do not produce doubled spaces.
        kept_parts = [part for part in name_parts if part]
        full_names.append(" ".join(kept_parts))
    return ", ".join(full_names)

In [16]:
# Build the set of available PDF filenames from the listing file: one name
# per line, surrounding whitespace removed, blank lines ignored.
pdf_filenames_set = set()
with open(pdf_filenames_txt, 'r', encoding='utf-8') as listing:
    stripped_names = (raw_line.strip() for raw_line in listing)
    pdf_filenames_set.update(name for name in stripped_names if name)

print(f"Loaded {len(pdf_filenames_set)} PDF filenames.")

Loaded 364068 PDF filenames.


In [17]:
# Sanity check: number of distinct filenames held in the set.
print(len(pdf_filenames_set))

364068


In [95]:
# Rough line count of the metadata file; passed to tqdm as `total`, so it
# only affects the progress display (adjust if needed).
total_lines = 2_600_000
# Starting values for the stored-record counter and the header flag.
english_count, fieldnames_written = 0, False

In [96]:
# Stream the metadata file line by line (one JSON record per line) and write
# to CSV every record that (a) has a matching entry in pdf_filenames_set and
# (b) whose title + abstract is detected as English.
#
# Reads:  input_path, total_lines, pdf_filenames_set
# Writes: output_path (CSV, header taken from the first stored record)
# Sets:   english_count (rows written), skipped_count (unusable lines/rows)
with open(input_path, 'r', encoding='utf-8') as infile, \
        open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = None
    fieldnames_written = False
    english_count = 0
    skipped_count = 0  # lines that could not be parsed or rows that could not be written

    # Context manager so the progress bar is closed even if the loop raises.
    with tqdm(infile, total=total_lines, desc=f"Stored: {english_count}") as progress:
        for line in progress:
            # Keep the try narrow: only JSON parsing is expected to fail here,
            # so unrelated errors are not silently swallowed.
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                skipped_count += 1
                continue

            # A line that parses to a non-object, or a record without an id,
            # cannot be matched to a PDF filename.
            if not isinstance(record, dict) or 'id' not in record:
                skipped_count += 1
                continue

            # Get version info FIRST to construct the PDF filename
            latest_version, latest_created = extract_latest_version_info(record.get("versions", []))
            pdf_filename = f"{record['id']}{latest_version}.pdf"

            # Only keep if the file is in the allowed PDF names. This cheap
            # set lookup runs before the (expensive) language detection.
            if pdf_filename not in pdf_filenames_set:
                continue

            # Now check if it's English
            text = f"{record.get('title', '')} {record.get('abstract', '')}".strip()
            if not text or not is_english(text):
                continue

            # Replace the nested fields with flat, CSV-friendly columns.
            record["latest_version"] = latest_version
            record["latest_created"] = latest_created
            record["pdf_filename"] = pdf_filename
            record.pop("versions", None)

            record["authors_combined"] = flatten_authors(record.get("authors_parsed", []))
            record.pop("authors_parsed", None)

            if writer is None:
                # The header is fixed by the first stored record. Keys that only
                # appear in later records are dropped (extrasaction='ignore')
                # instead of raising ValueError and aborting the whole run;
                # keys missing from later records are written as empty cells.
                writer = csv.DictWriter(
                    csvfile,
                    fieldnames=list(record.keys()),
                    extrasaction='ignore',
                )
                writer.writeheader()
                fieldnames_written = True

            try:
                writer.writerow(record)
            except UnicodeEncodeError:
                # Text that cannot be encoded as UTF-8: skip the row.
                skipped_count += 1
                continue

            english_count += 1

            # 🔄 Update progress bar description
            progress.set_description(f"Stored: {english_count}")

print(f"\n✅ English records written: {english_count}")
print(f"📁 Saved cleaned data to: {output_path}")
print(f"Skipped unusable lines/records: {skipped_count}")


Stored: 36346: : 2710806it [04:38, 9750.27it/s]                            


✅ English records written: 36346
📁 Saved cleaned data to: /kaggle/working/english_arxiv_full_cleaned_final6.csv



