In [1]:
pip install pyalex

Collecting pyalex
  Downloading pyalex-0.18-py3-none-any.whl.metadata (14 kB)
Downloading pyalex-0.18-py3-none-any.whl (13 kB)
Installing collected packages: pyalex
Successfully installed pyalex-0.18
Note: you may need to restart the kernel to use updated packages.


In [4]:
## Test run
import pyalex
from pyalex import Works

# Optional: Set your email for the OpenAlex polite pool
pyalex.config.email = "bennis.yiu@connect.polyu.hk"

# List of DOIs to query
dois_to_query = [
    "10.1177/10963480241229235",
    "10.1002/adfm.202413884",
    "10.1109/TNNLS.2023.3336563",
    "10.1016/j.esci.2024.100281",
    "10.1109/TEVC.2023.3278132"
]

def get_openalex_data_for_dois(doi_list):
    """
    Retrieves publication data from OpenAlex for a list of DOIs.

    Args:
        doi_list: A list of DOI strings.

    Returns:
        A list of dictionaries, where each dictionary contains
        information for a successfully found DOI. Returns an empty
        list if no data is found or an error occurs.
    """
    results = []
    print(f"Querying OpenAlex for {len(doi_list)} DOIs...")

    for doi in doi_list:
        print(f"\n--- Processing DOI: {doi} ---")
        try:
            # Construct the full DOI URL for querying
            # pyalex often works better with the full URL format
            full_doi_url = f"https://doi.org/{doi.lower()}" # Lowercase is good practice

            # Query OpenAlex Works endpoint by DOI
            work = Works()[full_doi_url] # Using dict-like access requires the full URL

            if not work:
                print(f"  DOI not found in OpenAlex: {doi}")
                continue

            # --- Extract desired information ---
            # Basic Info
            title = work.get('title', 'N/A')
            pub_year = work.get('publication_year', 'N/A')
            openalex_id = work.get('id', 'N/A')
            cited_by_count = work.get('cited_by_count', 0)
            journal_name = work.get('host_venue', {}).get('display_name', 'N/A')
            journal_issn = work.get('host_venue', {}).get('issn_l', 'N/A') # Linking ISSN

            # Authors and Affiliations
            authors_info = []
            if work.get('authorships'):
                for authorship in work['authorships']:
                    author_name = authorship.get('author', {}).get('display_name', 'N/A')
                    author_orcid = authorship.get('author', {}).get('orcid') # Might be None
                    institutions = authorship.get('institutions', [])
                    institution_names = [inst.get('display_name', 'N/A') for inst in institutions]
                    authors_info.append({
                        "name": author_name,
                        "orcid": author_orcid,
                        "institutions": institution_names
                    })

            # Concepts (Topics/Subjects) - Let's take the top 3
            concepts_info = []
            if work.get('concepts'):
                # Sort concepts by score (descending) and take top 3
                sorted_concepts = sorted(work['concepts'], key=lambda x: x.get('score', 0), reverse=True)
                for concept in sorted_concepts[:3]:
                    concepts_info.append({
                        "name": concept.get('display_name', 'N/A'),
                        "level": concept.get('level', 'N/A'),
                        "score": concept.get('score', 'N/A')
                    })

            # Store extracted data
            extracted_data = {
                "doi": doi,
                "openalex_id": openalex_id,
                "title": title,
                "publication_year": pub_year,
                "cited_by_count": cited_by_count,
                "journal": journal_name,
                "journal_issn_l": journal_issn,
                "authors": authors_info,
                "concepts": concepts_info,
                # Add more fields here if needed by exploring the 'work' object
                # e.g., 'type', 'abstract_inverted_index', 'referenced_works', 'related_works'
            }
            results.append(extracted_data)

            # --- Print some key retrieved info ---
            print(f"  Title: {title}")
            print(f"  Journal: {journal_name} ({pub_year})")
            print(f"  Authors: {', '.join([a['name'] for a in authors_info])}")
            print(f"  Top Concepts: {', '.join([c['name'] for c in concepts_info])}")
            print(f"  Cited By: {cited_by_count}")
            print(f"  OpenAlex ID: {openalex_id}")

        except Exception as e:
            print(f"  Error processing DOI {doi}: {e}")
            # This could be a network error, API error, or the DOI truly not existing

    print("\n--- Finished Querying ---")
    
    return results

# --- Main execution ---
if __name__ == "__main__":
    # Call the function with the list of DOIs
    retrieved_data = get_openalex_data_for_dois(dois_to_query)

    # You can now work with the 'retrieved_data' list, which contains
    # dictionaries of information for each successfully processed DOI.
    print(f"\nSuccessfully retrieved data for {len(retrieved_data)} out of {len(dois_to_query)} DOIs.")

    # Example: Print the title of the first result if available
    if retrieved_data:
        print(f"\nExample - Title of first result: {retrieved_data[0].get('title', 'N/A')}")

    # Example: Print author names and institutions for the second result if available
    if len(retrieved_data) > 1:
        print("\nExample - Authors/Institutions of second result:")
        for author_info in retrieved_data[1].get('authors', []):
            print(f"  - {author_info['name']} ({', '.join(author_info['institutions'])})")

Querying OpenAlex for 5 DOIs...

--- Processing DOI: 10.1177/10963480241229235 ---
  Title: Artificial Intelligence in Hospitality and Tourism: Insights From Industry Practices, Research Literature, and Expert Opinions
  Journal: N/A (2024)
  Authors: Hyunsu Kim, Kevin Kam Fung So, Seunghun Shin, Jing Li
  Top Concepts: Hospitality, Tourism, Hospitality industry
  Cited By: 30
  OpenAlex ID: https://openalex.org/W4391822953

--- Processing DOI: 10.1002/adfm.202413884 ---
  Title: Hierarchical Engineering on Built‐In Electric Field of Bimetallic Zeolitic Imidazolate Derivatives Towards Amplified Dielectric Loss
  Journal: N/A (2024)
  Authors: Shijie Zhang, Jiajun Zheng, Di Lan, Zhenguo Gao, Xiaowei Liang, Qingfeng Tian, Zhiwei Zhao, Guanglei Wu
  Top Concepts: Materials science, Bimetallic strip, Zeolitic imidazolate framework
  Cited By: 26
  OpenAlex ID: https://openalex.org/W4403192979

--- Processing DOI: 10.1109/TNNLS.2023.3336563 ---
  Title: Learning to Aggregate Multi-Scale Con

In [6]:
retrieved_data

[{'doi': '10.1177/10963480241229235',
  'openalex_id': 'https://openalex.org/W4391822953',
  'title': 'Artificial Intelligence in Hospitality and Tourism: Insights From Industry Practices, Research Literature, and Expert Opinions',
  'publication_year': 2024,
  'cited_by_count': 30,
  'journal': 'N/A',
  'journal_issn_l': 'N/A',
  'authors': [{'name': 'Hyunsu Kim',
    'orcid': 'https://orcid.org/0000-0003-0103-9313',
    'institutions': ['California State University, Fullerton']},
   {'name': 'Kevin Kam Fung So',
    'orcid': 'https://orcid.org/0000-0002-4846-7481',
    'institutions': ['Oklahoma State University', 'Kyung Hee University']},
   {'name': 'Seunghun Shin',
    'orcid': 'https://orcid.org/0000-0001-7022-6732',
    'institutions': ['Hong Kong Polytechnic University']},
   {'name': 'Jing Li',
    'orcid': 'https://orcid.org/0000-0003-3621-0838',
    'institutions': ['Texas Tech University']}],
  'concepts': [{'name': 'Hospitality', 'level': 3, 'score': 0.8348868},
   {'name'

In [None]:
## Revise version:

import pyalex
from pyalex import Works
import pandas as pd
import time
import csv
import os  # To check if CSV exists

# --- Configuration ---
# Optional: Set your email for the OpenAlex polite pool (recommended)
pyalex.config.email = "bennis.yiu@connect.polyu.hk"

# Define the headers for the output CSV based on the data we plan to extract
# This ensures consistency even if some records lack certain fields
CSV_HEADERS = [
    "input_doi", # Keep track of the DOI we queried with
    "openalex_id",
    "title",
    "publication_year",
    "cited_by_count",
    "journal",
    "journal_issn_l",
    "authors_names", # Comma-separated string of names
    "authors_orcids", # Comma-separated string of ORCIDs (or N/A)
    "institutions", # Comma-separated string of unique institutions
    "top_concept_1_name",
    "top_concept_1_level",
    "top_concept_1_score",
    "top_concept_2_name",
    "top_concept_2_level",
    "top_concept_2_score",
    "top_concept_3_name",
    "top_concept_3_level",
    "top_concept_3_score",
    "retrieval_status" # Indicate success or failure for this DOI
]

def fetch_and_save_openalex_data(
    df,
    doi_column='prism_doi',
    output_csv_path='openalex_results.csv',
    max_calls=4000,
    sleep_time=0.1 # Small delay between calls to be polite
    ):
    """
    Retrieves publication data from OpenAlex for DOIs in a DataFrame,
    limits API calls, and saves results incrementally to a CSV.

    Args:
        df (pd.DataFrame): Input DataFrame containing DOIs.
        doi_column (str): Name of the column containing the DOIs.
        output_csv_path (str): Path to save the output CSV file.
        max_calls (int): Maximum number of OpenAlex API calls for this run.
        sleep_time (float): Seconds to wait between API calls.

    Returns:
        int: The number of DOIs successfully processed and saved in this run.
    """
    if doi_column not in df.columns:
        print(f"Error: DOI column '{doi_column}' not found in DataFrame.")
        return 0

    dois_to_process = df[doi_column].dropna().unique() # Get unique, non-null DOIs
    print(f"Found {len(dois_to_process)} unique non-null DOIs in column '{doi_column}'.")

    processed_count = 0
    calls_made = 0

    # Check if CSV exists to decide whether to write headers
    file_exists = os.path.isfile(output_csv_path)

    # Open CSV file in append mode ('a')
    # Use newline='' to prevent extra blank rows in Windows
    # Use utf-8 encoding for broad compatibility
    try:
        with open(output_csv_path, 'a', newline='', encoding='utf-8') as csvfile:
            # Use DictWriter for easy writing from dictionaries
            writer = csv.DictWriter(csvfile, fieldnames=CSV_HEADERS)

            # Write header only if the file is new
            if not file_exists:
                writer.writeheader()
                print(f"Created new output file: {output_csv_path}")
            else:
                print(f"Appending to existing file: {output_csv_path}")

            # Iterate through the unique DOIs
            for doi in dois_to_process:
                if calls_made >= max_calls:
                    print(f"\nReached maximum API calls limit ({max_calls}). Stopping.")
                    break

                if not isinstance(doi, str) or not doi.strip():
                    print(f"Skipping invalid DOI value: {doi}")
                    continue

                doi_cleaned = doi.strip().lower() # Clean and normalize DOI
                print(f"\n--- Processing DOI ({calls_made + 1}/{max_calls}): {doi_cleaned} ---")

                calls_made += 1
                extracted_data = {header: 'N/A' for header in CSV_HEADERS} # Initialize with defaults
                extracted_data["input_doi"] = doi # Store the original queried DOI
                retrieval_successful = False

                try:
                    # Construct the full DOI URL
                    full_doi_url = f"https://doi.org/{doi_cleaned}"

                    # Query OpenAlex Works endpoint by DOI
                    work = Works()[full_doi_url]
                    time.sleep(sleep_time) # Pause before next potential call

                    if work and work.get('id'): # Check if a valid work object was returned
                        # --- Extract desired information ---
                        extracted_data["openalex_id"] = work.get('id', 'N/A')
                        extracted_data["title"] = work.get('title', 'N/A')
                        extracted_data["publication_year"] = work.get('publication_year', 'N/A')
                        extracted_data["cited_by_count"] = work.get('cited_by_count', 0)
                        extracted_data["journal"] = work.get('host_venue', {}).get('display_name', 'N/A')
                        extracted_data["journal_issn_l"] = work.get('host_venue', {}).get('issn_l', 'N/A')

                        # Authors and Affiliations
                        author_names = []
                        author_orcids = []
                        all_institutions = set() # Use a set to store unique institution names
                        if work.get('authorships'):
                            for authorship in work['authorships']:
                                author_names.append(authorship.get('author', {}).get('display_name', 'N/A'))
                                author_orcids.append(authorship.get('author', {}).get('orcid', 'N/A') or 'N/A') # Handle None ORCID
                                institutions = authorship.get('institutions', [])
                                for inst in institutions:
                                    inst_name = inst.get('display_name', 'N/A')
                                    if inst_name != 'N/A':
                                        all_institutions.add(inst_name)

                        extracted_data["authors_names"] = ", ".join(author_names)
                        extracted_data["authors_orcids"] = ", ".join(author_orcids)
                        extracted_data["institutions"] = ", ".join(sorted(list(all_institutions)))

                        # Concepts (Top 3)
                        if work.get('concepts'):
                            sorted_concepts = sorted(work['concepts'], key=lambda x: x.get('score', 0), reverse=True)
                            for i, concept in enumerate(sorted_concepts[:3]):
                                extracted_data[f"top_concept_{i+1}_name"] = concept.get('display_name', 'N/A')
                                extracted_data[f"top_concept_{i+1}_level"] = concept.get('level', 'N/A')
                                extracted_data[f"top_concept_{i+1}_score"] = concept.get('score', 'N/A')

                        print(f"  Success: Found data for {doi_cleaned}")
                        extracted_data["retrieval_status"] = "Success"
                        retrieval_successful = True
                        processed_count += 1

                    else:
                        print(f"  Failed: DOI not found or no data in OpenAlex: {doi_cleaned}")
                        extracted_data["retrieval_status"] = "Failed - Not Found"

                except Exception as e:
                    print(f"  Error processing DOI {doi_cleaned}: {e}")
                    extracted_data["retrieval_status"] = f"Failed - Error: {type(e).__name__}"
                    # Optional: Add a longer sleep after an error
                    # time.sleep(1)

                # Write the row to CSV regardless of success/failure, using the status field
                writer.writerow(extracted_data)
                csvfile.flush() # Ensure data is written to disk periodically

    except IOError as e:
        print(f"Error opening or writing to CSV file {output_csv_path}: {e}")
        return 0 # Indicate failure to write
    except Exception as e:
        print(f"An unexpected error occurred during processing: {e}")
        return processed_count # Return count up to the point of failure

    print(f"\n--- Finished Run ---")
    print(f"Total API calls made in this run: {calls_made}")
    print(f"DOIs successfully processed and saved in this run: {processed_count}")
    print(f"Results saved to: {output_csv_path}")

    return processed_count


# --- Main execution ---
if __name__ == "__main__":

    # --- 1. Load your Scopus data ---
    # Replace 'your_scopus_data.csv' with the actual path to your file
    scopus_data_path = 'your_scopus_data.csv'
    output_file = 'openalex_enhanced_data.csv'
    doi_col_name = 'prism_doi' # The name of the DOI column in your Scopus file
    api_limit = 4000 # Set your desired limit per run

    try:
        print(f"Loading Scopus data from: {scopus_data_path}")
        input_df = pd.read_csv(scopus_data_path)
        print(f"Loaded {len(input_df)} rows.")
    except FileNotFoundError:
        print(f"Error: Input file not found at {scopus_data_path}")
        # Example: Create a dummy DataFrame for testing if file not found
        print("Creating a dummy DataFrame for testing.")
        input_df = pd.DataFrame({
            'prism_doi': [
                "10.1177/10963480241229235",
                "10.1002/adfm.202413884",
                "10.1109/TNNLS.2023.3336563",
                "10.1016/j.esci.2024.100281",
                "10.1109/TEVC.2023.3278132",
                "invalid_doi", # Example of an invalid DOI
                None, # Example of a missing DOI
                "10.1000/nodata_doi", # Example of a DOI potentially not in OpenAlex
            ],
            'other_scopus_data': range(8)
        })
        doi_col_name = 'prism_doi' # Ensure this matches the dummy frame
    except Exception as e:
        print(f"Error loading CSV: {e}")
        exit() # Exit if we can't load the data

    # --- 2. Fetch data from OpenAlex and save to CSV ---
    successfully_processed = fetch_and_save_openalex_data(
        df=input_df,
        doi_column=doi_col_name,
        output_csv_path=output_file,
        max_calls=api_limit
    )

    # --- 3. Load the results from the CSV into a DataFrame ---
    if successfully_processed > 0 or os.path.isfile(output_file):
        try:
            print(f"\nLoading results from {output_file} into a DataFrame...")
            results_df = pd.read_csv(output_file)
            print(f"Successfully loaded {len(results_df)} rows from the CSV.")
            print("\nFirst 5 rows of the results DataFrame:")
            print(results_df.head())

            # You can now work with results_df
            # For example, merge it back with your original Scopus data if needed:
            # merged_df = pd.merge(input_df, results_df, left_on=doi_col_name, right_on='input_doi', how='left')
            # print("\nPreview of merged data:")
            # print(merged_df.head())

        except FileNotFoundError:
            print(f"Error: Output file {output_file} not found after processing. Cannot load DataFrame.")
        except Exception as e:
            print(f"Error reading results CSV into DataFrame: {e}")
    else:
        print("\nNo new data was processed or saved, skipping loading results DataFrame.")