In [1]:
import os
import glob

latest_versions = {}

os.chdir("../Raw_data_testing")

pdf_files = glob.glob("*.pdf")

for pdf_file in pdf_files:
    base_filename = pdf_file[:-6] 

    version_number = int(pdf_file[-5]) 

    if base_filename not in latest_versions or version_number > latest_versions[base_filename]:
        latest_versions[base_filename] = version_number

for pdf_file in pdf_files:
    base_filename = pdf_file[:-6]
    version_number = int(pdf_file[-5])

    if version_number < latest_versions[base_filename]:
        os.remove(pdf_file)
        print(f"Deleted old version: {pdf_file}")



In [2]:
import os
import glob
import PyPDF2
import json

import re

def extract_references_content(text):
    
    pattern = r'R\s*E\s*F\s*E\s*R\s*E\s*N\s*C\s*E\s*S(.+)'

    
    match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)

    
    if match:
        
        return match.group(1).strip()
    else:
        return None

def extract_references(text):
    
    references_content = extract_references_content(text)

    
    if references_content:
        
        pattern = r'\[\d+\]\s*([^“]+?)“([^”]+)”'

        
        matches = re.findall(pattern, references_content, re.DOTALL)

        
        cleaned_matches = []
        for authors, title in matches:
            
            authors_list = [author.strip().replace(',', '') for author in authors.split(',')]

            
            authors_list = [author.replace(' and ', ' ').replace(' et al.', '') for author in authors_list]

            
            authors_list = list(filter(None, authors_list))

            
            cleaned_title = title.replace('\n', '').rstrip(',')

            cleaned_matches.append({'authors': authors_list, 'title': cleaned_title.strip()})

        return cleaned_matches
    else:
        return None

def extract_information(pdf_file):
    
    with open(pdf_file, 'rb') as file:
        
        pdf_reader = PyPDF2.PdfReader(file)

        
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            try:
                text = page.extract_text()
            except:
                return None

            
            references = extract_references(text)

        
        file.close()
        return references

def process_remaining_pdfs(raw_data_folder, json_file_path):
    
    pdf_metadata_dict = {}

    
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        for line in json_file:
            
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                print("Error decoding JSON:", line)
                continue  

            
            pdf_id = data.get('id', '')
            if not pdf_id.startswith("2003"):
                continue  

            
            pdf_filename = pdf_id + ".pdf"

            
            title, authors_parsed, doi = extract_metadata_from_json(data)

            
            pdf_metadata_dict[pdf_filename] = {
                'title': title,
                'authors': authors_parsed,
                'doi': doi
            }

    
    os.chdir(raw_data_folder)

    
    pdf_files = glob.glob("*.pdf")

    result_dict = {}

    
    for pdf_file in pdf_files:
        
        pdf_filename = os.path.splitext(pdf_file)[0][:-2]+".pdf"

        
        if pdf_filename in pdf_metadata_dict:
            
            references = extract_information(pdf_file)

            metadata = pdf_metadata_dict[pdf_filename]
            metadata['references'] = references
            result_dict[pdf_filename] = metadata    
            
    return result_dict
    

def extract_metadata_from_json(data):
    
    title = data.get('title', '')
    authors_parsed = data.get('authors_parsed', [])
    doi = data.get('doi', '')

    authors_parsed = [f"{author[1]} {author[0]}" if len(author) > 1 else author[0] for author in authors_parsed]

    return title, authors_parsed, doi


raw_data_folder = "../Raw_data_testing"


json_file_path = "../arxiv_metadata_dump/arxiv-metadata-oai-snapshot.json"


data_dict = process_remaining_pdfs(raw_data_folder, json_file_path)





FloatObject (b'0.000-167979') invalid; use 0.0 instead


In [3]:
for key, value in data_dict.items():
    print(key, ":", value)


2003.01609.pdf : {'title': 'SELD-TCN: Sound Event Localization & Detection via Temporal\n  Convolutional Networks', 'authors': ['Karim Guirguis', 'Christoph Schorn', 'Andre Guntoro', 'Sherif Abdulatif', 'Bin Yang'], 'doi': '10.23919/Eusipco47968.2020.9287716', 'references': [{'authors': ['T. Marques', 'L. Thomas', 'S. Martin', 'D. Mellinger', 'J. A Ward', 'D. Moretti', 'D. Harris', 'and P. Tyack'], 'title': 'Estimating animal population density usingpassive acoustics'}, {'authors': ['E. Benetos T. Weyde'], 'title': 'An Efﬁcient Temporally-Constrained Proba-bilistic Model for Multiple-Instrument Music Transcription'}, {'authors': ['T. Heittola', 'A. Klapuri', 'and T. Virtanen'], 'title': 'Musical Instrument Recog-nition in Polyphonic Audio Using Source-Filter Model for SoundSeparation.'}, {'authors': ['D. Rossiter', 'G. Lam', 'and B. Mak'], 'title': 'Automatic Audio Indexing and AudioPlayback Speed Control as Tools for Language Learning'}, {'authors': ['R. Roden', 'S. Gerlach', 'N. Mori

In [4]:
os.chdir("../")



transformed_list = []


for pdf_filename, pdf_data in data_dict.items():
    
    title = pdf_data['title']
    authors = pdf_data['authors']
    doi = pdf_data['doi']
    references = pdf_data['references']

    
    main_author = authors[0]

    
    co_authors = authors[1:]

    
    cite_count = sum(1 for entry in data_dict.values() if title in entry['title'])

    
    published_article_count = sum(1 for entry in data_dict.values() if main_author in entry['authors'])

    
    all_titles = [entry['title'] for entry in data_dict.values()] + [ref['title'] for ref in references]
    total_cite_count = sum(1 for entry_title in all_titles if title in entry_title)

    
    transformed_entry = {
        "author": main_author,
        "title": title,
        "coAuthor": co_authors,
        "citeCount": total_cite_count,
        "references": {ref['title']: ref['authors'] for ref in references},
        "doi": doi,
        "publishedArticle": published_article_count
    }

    
    transformed_list.append(transformed_entry)


output_file = 'DLdata_scraped.json'
with open(output_file, 'w') as json_file:
    json.dump(transformed_list, json_file, indent=2)