In [39]:
import os
import glob

latest_versions = {}

os.chdir("../Raw_data")

pdf_files = glob.glob("*.pdf")

for pdf_file in pdf_files:
    base_filename = pdf_file[:-6] 

    version_number = int(pdf_file[-5]) 

    if base_filename not in latest_versions or version_number > latest_versions[base_filename]:
        latest_versions[base_filename] = version_number

for pdf_file in pdf_files:
    base_filename = pdf_file[:-6]
    version_number = int(pdf_file[-5])

    if version_number < latest_versions[base_filename]:
        os.remove(pdf_file)
        print(f"Deleted old version: {pdf_file}")



In [43]:
import os
import glob
import PyPDF2
import json

import re

def extract_references_content(text):
    
    pattern = r'R\s*E\s*F\s*E\s*R\s*E\s*N\s*C\s*E\s*S(.+)'

    
    match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)

    
    if match:
        
        return match.group(1).strip()
    else:
        return None

def extract_references(text):
    
    references_content = extract_references_content(text)

    
    if references_content:
        
        pattern = r'\[\d+\]\s*([^“]+?)“([^”]+)”'

        
        matches = re.findall(pattern, references_content, re.DOTALL)

        
        cleaned_matches = []
        for authors, title in matches:
            
            authors_list = [author.strip().replace(',', '') for author in authors.split(',')]

            
            authors_list = [author.replace(' and ', ' ').replace(' et al.', '') for author in authors_list]

            
            authors_list = list(filter(None, authors_list))

            
            cleaned_title = title.replace('\n', '').rstrip(',')

            cleaned_matches.append({'authors': authors_list, 'title': cleaned_title.strip()})

        return cleaned_matches
    else:
        return None

def extract_information(pdf_file):
    
    with open(pdf_file, 'rb') as file:
        
        pdf_reader = PyPDF2.PdfReader(file)

        
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text = page.extract_text()

            
            references = extract_references(text)

        
        file.close()
        return references

def process_remaining_pdfs(raw_data_folder, json_file_path):
    
    pdf_metadata_dict = {}

    
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        for line in json_file:
            
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                print("Error decoding JSON:", line)
                continue  

            
            pdf_id = data.get('id', '')
            if not pdf_id.startswith("2003"):
                continue  

            
            pdf_filename = pdf_id + ".pdf"

            
            title, authors_parsed, doi = extract_metadata_from_json(data)

            
            pdf_metadata_dict[pdf_filename] = {
                'title': title,
                'authors': authors_parsed,
                'doi': doi
            }

    
    os.chdir(raw_data_folder)

    
    pdf_files = glob.glob("*.pdf")

    
    for pdf_file in pdf_files:
        
        pdf_filename = os.path.splitext(pdf_file)[0][:-2]+".pdf"

        
        if pdf_filename in pdf_metadata_dict:
            
            references = extract_information(pdf_file)

            
            metadata = pdf_metadata_dict[pdf_filename]
            title = metadata['title']
            authors_parsed = metadata['authors']
            doi = metadata['doi']
            pdf_metadata_dict['references'] = references


    return pdf_metadata_dict
    

def extract_metadata_from_json(data):
    
    title = data.get('title', '')
    authors_parsed = data.get('authors_parsed', [])
    doi = data.get('doi', '')

    authors_parsed = [f"{author[1]} {author[0]}" if len(author) > 1 else author[0] for author in authors_parsed]

    return title, authors_parsed, doi


raw_data_folder = "../Raw_data"


json_file_path = "../arxiv_metadata_dump//arxiv-metadata-oai-snapshot.json"


data_dict = process_remaining_pdfs(raw_data_folder, json_file_path)






2003.00958.pdf
Title: Score Engineered Logistic Regression
Authors Parsed: ['Bruce Hoadley']
DOI: None
References: None
--------------------------------------------------
2003.00600.pdf
Title: A Lobster-inspired Hybrid Actuator With Rigid and Soft Components
Authors Parsed: ['Yaohui Chen', 'Sing Le', 'Qiao Chu Tan', 'Oscar Lau', 'Chaoyang Song']
DOI: 10.1115/detc2017-68082
References: None
--------------------------------------------------
2003.01081.pdf
Title: On-the-fly Optimization of Parallel Computation of Symbolic Symplectic
  Invariants
Authors Parsed: ['Joseph Ben Geloun', 'Camille Coti', 'Allen D. Malony']
DOI: None
References: []
--------------------------------------------------
2003.00667.pdf
Title: MVP: Unified Motion and Visual Self-Supervised Learning for Large-Scale
  Robotic Navigation
Authors Parsed: ['Marvin Chancán', 'Michael Milford']
DOI: None
References: None
--------------------------------------------------
2003.00309.pdf
Title: Possible Lattice and Charge Orde

unknown widths : 
[0, IndirectObject(501, 0, 140433016204944)]
unknown widths : 
[0, IndirectObject(503, 0, 140433016204944)]
unknown widths : 
[0, IndirectObject(506, 0, 140433016204944)]
unknown widths : 
[0, IndirectObject(508, 0, 140433016204944)]
unknown widths : 
[0, IndirectObject(510, 0, 140433016204944)]
unknown widths : 
[0, IndirectObject(512, 0, 140433016204944)]
unknown widths : 
[0, IndirectObject(515, 0, 140433016204944)]
unknown widths : 
[0, IndirectObject(503, 0, 140433016204944)]
unknown widths : 
[0, IndirectObject(522, 0, 140433016204944)]
unknown widths : 
[0, IndirectObject(525, 0, 140433016204944)]
unknown widths : 
[0, IndirectObject(528, 0, 140433016204944)]
unknown widths : 
[0, IndirectObject(530, 0, 140433016204944)]
unknown widths : 
[0, IndirectObject(532, 0, 140433016204944)]
unknown widths : 
[0, IndirectObject(512, 0, 140433016204944)]
unknown widths : 
[0, IndirectObject(534, 0, 140433016204944)]
unknown widths : 
[0, IndirectObject(536, 0, 1404330162

2003.00963.pdf
Title: Piecewise Linear Valued Constraint Satisfaction Problems with Fixed
  Number of Variables
Authors Parsed: ['Manuel Bodirsky', 'Marcello Mamino', 'Caterina Viola']
DOI: None
References: None
--------------------------------------------------
2003.00893.pdf
Title: Gated Fusion Network for Degraded Image Super Resolution
Authors Parsed: ['Xinyi Zhang', 'Hang Dong', 'Zhe Hu', 'Wei-Sheng Lai', 'Fei Wang', 'Ming-Hsuan Yang']
DOI: None
References: None
--------------------------------------------------
2003.00971.pdf
Title: Graphing Website Relationships for Risk Prediction: Identifying Derived
  Threats to Users Based on Known Indicators
Authors Parsed: ['Philip H. Kulp', 'Nikki E. Robinson']
DOI: 10.1007/978-3-030-63089-8
References: [{'authors': ['D. Chiba', 'K. Tobe', 'T. Mori', 'and S. Goto. Detecting Malicious Websites by Learning IP Address Features. \n2012 IEEE/IPSJ 12th International Symposium on Applications the Internet', '29 –39. \n[2] T. Fawcett'], 'title': 

KeyboardInterrupt: 