Filtering Papers on the Basis of Selected Topics

In [8]:
import pandas as pd
import os
# Load the full paper metadata export.
main_df = pd.read_csv('metadata.csv')

# Load the filter criteria file
filter_df = pd.read_csv('topics_filtered.csv')

# Topics to keep. Blank cells are dropped first: Series.isin() matches NaN
# against NaN, so a stray empty row in the criteria file would otherwise let
# every paper with a missing primary topic through the filter.
filter_values = filter_df['relevant_topics'].dropna().tolist()

# Keep only the papers whose primary topic is one of the selected topics ...
filtered_df = main_df[main_df['primary_topic.display_name'].isin(filter_values)]

# ... and whose language code is 'en'.
filtered_df = filtered_df[filtered_df['language'] == 'en']

# Save the filtered DataFrame to a new CSV file
filtered_df.to_csv('filtered_metadata.csv', index=False)

  main_df = pd.read_csv('metadata.csv')


PDF Data Scraping

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
import re

df = pd.read_csv('filtered_metadata.csv')
pdf_url_column = 'primary_location.pdf_url'
title_column = 'title'

# Rows without a PDF URL cannot be downloaded, so drop them up front.
df = df[df[pdf_url_column].notnull()]

# Ensure the directory to save PDFs exists. exist_ok=True makes this a single
# call instead of a separate exists() check followed by makedirs(), which
# could fail if the directory appeared between the two steps.
output_dir = 'final_pdfs'
os.makedirs(output_dir, exist_ok=True)

# Create a directory for MDPI PDFs
mdpi_output_dir = 'final_mdpi_pdfs'
os.makedirs(mdpi_output_dir, exist_ok=True)

# Path to the log file (one downloaded URL per line)
log_path = 'downloaded_files.log'

# Load already downloaded files if they exist
downloaded_files = set()
if os.path.exists(log_path):
    with open(log_path, 'r') as log:
        downloaded_files = set(log.read().splitlines())

# List to store URLs that couldn't be downloaded
failed_urls = []

# Function to log downloaded files
def log_downloaded_file(log_path, url):
    """Append ``url`` as a new line at the end of the file at ``log_path``.

    The file is opened in append mode, so it is created when missing and
    existing entries are kept.
    """
    with open(log_path, 'a') as handle:
        entry = url + '\n'
        handle.write(entry)

# Function to sanitize filenames
def sanitize_filename(filename):
    """Return ``filename`` with reserved characters replaced by underscores.

    Every backslash, slash, asterisk, question mark, colon, double quote,
    angle bracket and pipe becomes ``_``; all other characters are returned
    unchanged.
    """
    reserved_chars = re.compile(r'[\\/*?:"<>|]')
    return reserved_chars.sub('_', filename)

# Function to download file
def download_file(session, url, output_dir, filename, timeout=30):
    """Fetch ``url`` with ``session`` and save the response body in ``output_dir``.

    Args:
        session: Object exposing ``get(url, timeout=...)``; the notebook
            passes a ``requests.Session``.
        url: Address to download.
        output_dir: Directory the file is written into.
        filename: Desired file name; it is passed through
            ``sanitize_filename`` before being joined to ``output_dir``.
        timeout: Seconds handed to ``session.get`` as its ``timeout``
            argument. Defaults to 30.

    Returns:
        ``(url, True)`` when the response status was 200 and the body was
        written to disk, ``(url, False)`` otherwise. Exceptions are not
        propagated: they are printed and reported as a failed download.
    """
    try:
        # Without a timeout a server that never answers would block this
        # worker thread indefinitely and stall the whole pool.
        response = session.get(url, timeout=timeout)
        if response.status_code == 200:
            filepath = os.path.join(output_dir, sanitize_filename(filename))
            with open(filepath, 'wb') as f:
                f.write(response.content)
            print(f'Successfully downloaded: {filepath}')
            return url, True
        else:
            print(f'Failed to download: {url}')
            return url, False
    except Exception as e:
        # Deliberately broad: one bad URL must not abort the batch.
        print(f'An error occurred for URL: {url} - {e}')
        return url, False

# Prepare a list of tasks for concurrent execution
# Pending downloads; each entry is (session, url, target directory, filename).
tasks = []

# A single shared session so connections can be reused between requests.
session = requests.Session()

# Walk over every (url, title) pair and queue the ones not fetched yet.
for url, title in zip(df[pdf_url_column], df[title_column]):
    filename = f"{title}.pdf"

    # Skip URLs already recorded in the download log.
    if url in downloaded_files:
        print(f'Already downloaded: {url}')
        continue

    # URLs containing 'mdpi' go to their own directory and get their own
    # "already downloaded" message; everything else uses the default one.
    if 'mdpi' in url.lower():
        target_dir = mdpi_output_dir
        skip_message = f'Already downloaded MDPI PDF: {filename}'
    else:
        target_dir = output_dir
        skip_message = f'Already downloaded: {filename}'

    # Skip files that are already present on disk under the sanitized name.
    if os.path.exists(os.path.join(target_dir, sanitize_filename(filename))):
        print(skip_message)
        continue

    tasks.append((session, url, target_dir, filename))

# Function to execute the download tasks
def execute_task(task):
    """Run one queued download.

    ``task`` is unpacked into exactly four items,
    ``(session, url, output_dir, filename)``, which are forwarded in that
    order to ``download_file``; its result is returned unchanged.
    """
    http_session, pdf_url, target_dir, target_name = task
    return download_file(http_session, pdf_url, target_dir, target_name)

# Execute download tasks concurrently
# Run the queued downloads on a pool of 10 worker threads.
with ThreadPoolExecutor(max_workers=10) as executor:
    # Remember which URL each future belongs to, so a task that raises can
    # still be reported by URL.
    future_to_url = {}
    for task in tasks:
        future_to_url[executor.submit(execute_task, task)] = task[1]

    # Handle results in completion order rather than submission order.
    for future in as_completed(future_to_url):
        url = future_to_url[future]
        try:
            result_url, success = future.result()
            if not success:
                failed_urls.append(result_url)
            else:
                # Record the URL so later runs skip it.
                log_downloaded_file(log_path, result_url)
        except Exception as e:
            print(f'An error occurred for URL: {url} - {e}')
            failed_urls.append(url)

# Write every failed URL, one per line, to a report file (overwritten each run).
failed_urls_file = 'failed_urls.txt'
failed_report = '\n'.join(failed_urls)
with open(failed_urls_file, 'w') as report:
    report.write(failed_report)
print(f'Failed URLs saved to: {failed_urls_file}')


JSON Data Scraping

In [None]:
from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch
import json
    
## Load configuration
# The with-block closes config.json even when json.load() raises.
with open("config.json") as con_file:
    config = json.load(con_file)

## Initialize client
client = ElsClient(config['apikey'])
client.inst_token = config['insttoken']

# NOTE(review): `df` and `filter_values` are not defined in this cell; they
# come from the earlier cells. `df` is the frame from the PDF cell, which has
# already dropped rows without a PDF URL - confirm that this is intended for
# the Elsevier full-text lookup.
df = df[df['primary_topic.display_name'].isin(filter_values)]

# Keep records hosted by 'Elsevier BV' that are flagged as having full text.
elsevier_df = df[(df['primary_location.source.host_organization_name'] == 'Elsevier BV') & (df['has_fulltext'] == True)]
dois = elsevier_df['doi'].dropna()

for doi in dois:
    # Strip the "doi.org/" resolver prefix when present. Taking the last
    # piece (instead of index 1) also accepts a bare DOI, which previously
    # raised IndexError and aborted the whole loop.
    doi = doi.split("doi.org/")[-1]
    print(doi)
    doi_doc = FullDoc(doi = doi)
    if doi_doc.read(client):
        print ("doi_doc.title: ", doi_doc.title)
        doi_doc.write()
    else:
        print (f'Read document failed for {doi}')