In [None]:
import os
import requests
import json
import re

import pandas as pd

import kagglehub # to access the data
import fitz  # PyMuPDF to extract text from PDFs

## Explore Kaggle arXiv dataset

In [None]:
# Download arXiv dataset
path = kagglehub.dataset_download("Cornell-University/arxiv")

print("Path to dataset files:", path)

In [25]:
# Load the metadata - need to be chunks cause it is huge
chunk_size = 10000
json_chunks = pd.read_json('test/arxiv-metadata-oai-snapshot.json', lines=True, chunksize=chunk_size)

# Process each chunk
for chunk in json_chunks:
    # Perform operations on each chunk here
    print(chunk.columns)
    break
    print(chunk["reference"].head())
    # print(chunk.head())

Index(['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi',
       'report-no', 'categories', 'license', 'abstract', 'versions',
       'update_date', 'authors_parsed'],
      dtype='object')


## Generate a subset of ML papers

Generated a subset of papers which were tagged with one of the categories ["cs.LG", "stat.ML", "cs.AI", "stat.ML", "cs.CV", "cs.NE"] - can choose different field or include more / less depending on how big we want the dataset 

The format is a jason similar to that from Kaggle

In [21]:
# Define a list of Machine Learning categories (from arXiv)
ml_categories = ["cs.LG", "stat.ML", "cs.AI", "stat.ML", "cs.CV", "cs.NE"]

# Create an empty list to store filtered papers
ml_papers = []

# Open and process the JSON file line by line (assuming one paper per line)
with open('test/arxiv-metadata-oai-snapshot.json', 'r') as file:
    for line in file:
        try:
            # Parse each line as a JSON object
            paper = json.loads(line)
            categories = paper.get('categories', []).split()
            # Check if any of the categories match the Machine Learning categories
            if any(category in ml_categories for category in categories):
                ml_papers.append(paper)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e} - skipping this line.")

# Save the filtered papers to a new JSON file
with open('arxiv_ml_papers.json', 'w') as outfile:
    json.dump(ml_papers, outfile, indent=2)

print(f"Found {len(ml_papers)} Machine Learning papers.")


Found 359531 Machine Learning papers.


### Scrape pdf for references 

This function returns the reference section of a paper - might be usefull for generating relations, but need a LM to produce the relations - maybe it is possible for the model to generate the relations with just the raw text from the papers - would be nicer

In [48]:
### example of extracting references from a pdf
def extract_references_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    references = []

    # Loop through all pages of the PDF
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text = page.get_text("text")
        
        # Look for references by searching for patterns (e.g., "References", "Bibliography")
        # The exact pattern depends on the format of the paper. 
        # A simple example is to look for the string "References":
        if "REFERENCES" in text:
            references_section = text.split("REFERENCES")[1]  # Get text after "References"
            # Combine the text after "REFERENCES" into one block of text
            references_block = " ".join(references_section.split("\n"))
            
            # Append the block as a single reference section
            references.append(references_block)
            
    return references

# Example usage
pdf_path = "/Users/emilienilsson/Documents/DTU/9semester/02456-dl/project/arxiv_ml_papers/0704.0047.pdf"
references = extract_references_from_pdf(pdf_path)
for ref in references:
    print(ref)


 [1] Chan, Y. T. Ho, K. C. 1994 , A simple and efﬁcient estimator for hy- perbolic location, IEEE Transactions on Signal Processing 42(8), 1905– 1915. [2] Cherkassky, V. Mulier, F. 1998 , Leraning from Data: Concepts, Theory, and Methods, John Wiley & Sons inc., New York. [3] Friedlander, B. 1987 , A passive localization algorithm and its accuracy analysis, IEEE Journal of Oceanic Engineering OE-12(1), 234–245. [4] Grabec, I. Antoloviˇc, B. 1994 , Intelligent locator of AE sources, in T. Kishi, Y. Mori M. Enoki, eds, The 12th International Acoustic Emission Symposium, Vol. 7 of Progress in Acoustic Emission, The Japanese Society for Non-Destructive Inspection, Tokyo, Japan, pp. 565–570. [5] Grabec, I. Sachse, W. 1991 , ‘Automatic modeling of physical phenomena: Application to ultrasonic data’, J. Appl. Phys. 69(9), 6233–6244. [6] Grabec, I. Sachse, W. 1997 , Synergetics of Measurement, Prediction and Control, Springer-Verlag, Berlin. [7] Kosel, T. Grabec, I. 1998 , Intelligent locator 

This funciton should be used if we want to seperate the references - should make it easier to make the relations manually

In [43]:
def extract_references_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    references = []
    
    # Regex pattern to match references by index (e.g., [1], [2], [3], etc.)
    ref_pattern = re.compile(r'\[(\d+)\]')  # Matches [1], [2], [3], etc.
    
    # Variable to temporarily store reference content
    current_reference = []
    inside_references = False
    
    # Loop through all pages of the PDF
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text = page.get_text("text")
        
        # Look for references section
        if "REFERENCES" in text or "BIBLIOGRAPHY" in text:
            inside_references = True
            # Get the part of the text after the "References" section
            references_section = text.split("REFERENCES")[1] if "REFERENCES" in text else text.split("BIBLIOGRAPHY")[1]
        
        if inside_references:
            # Split the section into lines and process each line
            for line in references_section.split("\n"):
                # Check if the line contains a reference index (e.g., [1], [2], etc.)
                if ref_pattern.search(line):
                    # If there's a current reference being built, save it
                    if current_reference:
                        references.append(" ".join(current_reference))  # Join the lines of the reference
                    # Start a new reference
                    current_reference = [line.strip()]  # Start new reference with the current line
                else:
                    # If it's part of the current reference, add it to the current reference
                    current_reference.append(line.strip())

            # After finishing all lines in this section, add the last reference
            if current_reference:
                references.append(" ".join(current_reference))
            break  # We can stop processing after the references section

    return references

# Example usage
pdf_path = "/Users/emilienilsson/Documents/DTU/9semester/02456-dl/project/arxiv_ml_papers/0704.0047.pdf"
references = extract_references_from_pdf(pdf_path)
for i, ref in enumerate(references):
    print(f"Reference {i+1}: {ref}")

Reference 1: 
Reference 2: [1] Chan, Y. T. Ho, K. C. 1994 , A simple and efﬁcient estimator for hy- perbolic location, IEEE Transactions on Signal Processing 42(8), 1905– 1915.
Reference 3: [2] Cherkassky, V. Mulier, F. 1998 , Leraning from Data: Concepts, Theory, and Methods, John Wiley & Sons inc., New York.
Reference 4: [3] Friedlander, B. 1987 , A passive localization algorithm and its accuracy analysis, IEEE Journal of Oceanic Engineering OE-12(1), 234–245.
Reference 5: [4] Grabec, I. Antoloviˇc, B. 1994 , Intelligent locator of AE sources, in T. Kishi, Y. Mori M. Enoki, eds, The 12th International Acoustic Emission Symposium, Vol. 7 of Progress in Acoustic Emission, The Japanese Society for Non-Destructive Inspection, Tokyo, Japan, pp. 565–570.
Reference 6: [5] Grabec, I. Sachse, W. 1991 , ‘Automatic modeling of physical phenomena: Application to ultrasonic data’, J. Appl. Phys. 69(9), 6233–6244.
Reference 7: [6] Grabec, I. Sachse, W. 1997 , Synergetics of Measurement, Prediction

## Download Full-Text PDFs from arXiv

download all papers to pdf format from the machine learning papers - this is a lot do it on hpc

In [None]:
# Example: Downloading the PDF for a given arXiv ID
def download_pdf(arxiv_id, save_path):
    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    response = requests.get(url)
    
    if response.status_code == 200:
        pdf_path = os.path.join(save_path, f"{arxiv_id}.pdf")
        with open(pdf_path, "wb") as f:
            f.write(response.content)
        #print(f"Downloaded {arxiv_id}.pdf")
    else:
        print(f"Failed to download {arxiv_id}.pdf")

# Download PDFs for all Machine Learning papers
save_path = "arxiv_ml_papers"
os.makedirs(save_path, exist_ok=True)

for paper in ml_papers:
    arxiv_id = paper.get("id")
    download_pdf(arxiv_id, save_path)

## Extract Text from PDFs

extract text from the pdfs - should this be saved as a json or txt?
also run on hpc this is a lot 

In [None]:
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    # Open the PDF file
    doc = fitz.open(pdf_path)
    text = ""
    
    # Extract text from each page
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text += page.get_text()
    
    return text

# Example usage
# pdf_path = "arxiv_pdf/2101.00001.pdf"  # Replace with your downloaded PDF
# paper_text = extract_text_from_pdf(pdf_path)
# print(paper_text[:1000])  # Print the first 1000 characters of the extracted text

# extract text from all pdfs and save each in a json file
pdf_dir = "arxiv_ml_papers"
text_dir = "arxiv_ml_text"
os.makedirs(text_dir, exist_ok=True)

for paper in ml_papers:
    try:
        arxiv_id = paper.get("id")
        pdf_path = os.path.join(pdf_dir, f"{arxiv_id}.pdf")
        text = extract_text_from_pdf(pdf_path)
        text_path = os.path.join(text_dir, f"{arxiv_id}.json")
        with open(text_path, "w") as f:
            json.dump({"text": text}, f)
    except Exception as e:
        print(f"Error processing {arxiv_id}: {e}")
