In [9]:
import json


# Load the arXiv metadata snapshot (one JSON record per line) and keep only the
# id plus whichever of the optional fields are present in each record.
optional_fields = ("title", "abstract", "journal-ref", "categories")
text_for_embedding = []
# JSON interchange files are UTF-8 (RFC 8259); pinning the encoding keeps the
# read from depending on the platform's locale default.
with open("../data/arxiv-metadata-oai-snapshot.json", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing inside json.loads
        record = json.loads(line)
        paper = {"id": record["id"]}  # "id" is required: a record without it raises KeyError
        for field in optional_fields:
            if field in record:
                paper[field] = record[field]
        text_for_embedding.append(paper)

In [10]:
# Persist the selected fields as JSON Lines: one paper object per line
with open("../data/text_for_embedding.json", "w") as f:
    for paper in text_for_embedding:
        json.dump(paper, f)
        f.write("\n")

In [14]:
import json
import re
from nltk.corpus import stopwords
import nltk

# Only invoke the NLTK downloader when the stopword corpus is not installed
# yet, so re-running the notebook does not repeat the download step each time.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")
# English stopwords as a set, so membership tests in clean_text are O(1)
stop_words = set(stopwords.words("english"))


# Function to clean the text
def clean_text(text, stopword_set=None):
    """Normalise a text field into lowercase, letters-only, stopword-free tokens.

    Args:
        text: Value to clean. Anything that is not a ``str`` (for example
            ``None``) yields an empty string.
        stopword_set: Optional collection of lowercase words to drop. When
            omitted, the module-level ``stop_words`` set is used, so existing
            single-argument calls behave exactly as before.

    Returns:
        The surviving tokens joined by single spaces, or ``""`` when nothing
        is left.
    """
    if not isinstance(text, str):
        return ""
    active_stopwords = stop_words if stopword_set is None else stopword_set
    # Everything except ASCII letters and whitespace is deleted (not replaced
    # by a space), so tokens fuse across punctuation and digits:
    # "hep-ph" -> "hepph", "cs.LG" -> "cslg".
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text.lower())
    # split() with no argument splits on any run of whitespace, newlines
    # included, so no separate newline replacement is needed.
    return " ".join(
        token for token in letters_only.split() if token not in active_stopwords
    )


# Load the intermediate JSON Lines file: each line decodes to one paper dict
with open("../data/text_for_embedding.json", "r") as file:
    papers = list(map(json.loads, file))

# Build one cleaned record per paper: the id is copied through unchanged and
# every text field is passed through clean_text (absent fields become "").
cleaned_papers = []
for paper in papers:
    cleaned_paper = {"id": paper.get("id", "")}
    for field in ("title", "abstract", "journal-ref", "categories"):
        cleaned_paper[field] = clean_text(paper[field]) if field in paper else ""
    cleaned_papers.append(cleaned_paper)

# Write the cleaned records as JSON Lines: one paper object per line
with open("../data/cleaned_texts_for_embedding.json", "w") as file:
    for paper in cleaned_papers:
        file.write(json.dumps(paper) + "\n")

print("Cleaning complete. The cleaned text is saved to cleaned_texts_for_embedding.json")

[nltk_data] Downloading package stopwords to /Users/arjan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cleaning complete. The cleaned text is saved to cleaned_texts_for_embedding.json
