In [1]:
pip install pyalex

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pyalex
from pyalex import Works
import pandas as pd
import time
import csv
import os  # To check if CSV exists

In [6]:
## Test run

# Optional: Set your email for the OpenAlex polite pool
pyalex.config.email = "bennis.yiu@connect.polyu.hk"

# List of DOIs to query
dois_to_query = [
    "10.1177/10963480241229235",
    "10.1002/adfm.202413884",
    "10.1109/TNNLS.2023.3336563",
    "10.1016/j.esci.2024.100281",
    "10.1109/TEVC.2023.3278132"
]

def get_openalex_data_for_dois(doi_list):
    """
    Retrieves publication data from OpenAlex for a list of DOIs.

    Args:
        doi_list: A list of DOI strings.

    Returns:
        A list of dictionaries, where each dictionary contains
        information for a successfully found DOI. Returns an empty
        list if no data is found or an error occurs.
    """
    results = []
    print(f"Querying OpenAlex for {len(doi_list)} DOIs...")

    for doi in doi_list:
        print(f"\n--- Processing DOI: {doi} ---")
        try:
            # Construct the full DOI URL for querying
            # pyalex often works better with the full URL format
            full_doi_url = f"https://doi.org/{doi.lower()}" # Lowercase is good practice

            # Query OpenAlex Works endpoint by DOI
            work = Works()[full_doi_url] # Using dict-like access requires the full URL

            if not work:
                print(f"  DOI not found in OpenAlex: {doi}")
                continue

            # --- Extract desired information ---
            # Basic Info
            title = work.get('title', 'N/A')
            pub_year = work.get('publication_year', 'N/A')
            openalex_id = work.get('id', 'N/A')
            cited_by_count = work.get('cited_by_count', 0)
            journal_name = work.get('host_venue', {}).get('display_name', 'N/A')
            journal_issn = work.get('host_venue', {}).get('issn_l', 'N/A') # Linking ISSN

            # Authors and Affiliations
            authors_info = []
            if work.get('authorships'):
                for authorship in work['authorships']:
                    author_name = authorship.get('author', {}).get('display_name', 'N/A')
                    author_orcid = authorship.get('author', {}).get('orcid') # Might be None
                    institutions = authorship.get('institutions', [])
                    institution_names = [inst.get('display_name', 'N/A') for inst in institutions]
                    authors_info.append({
                        "name": author_name,
                        "orcid": author_orcid,
                        "institutions": institution_names
                    })

            # Concepts (Topics/Subjects) - Let's take the top 3
            concepts_info = []
            if work.get('concepts'):
                # Sort concepts by score (descending) and take top 3
                sorted_concepts = sorted(work['concepts'], key=lambda x: x.get('score', 0), reverse=True)
                for concept in sorted_concepts[:3]:
                    concepts_info.append({
                        "name": concept.get('display_name', 'N/A'),
                        "level": concept.get('level', 'N/A'),
                        "score": concept.get('score', 'N/A')
                    })

            # Store extracted data
            extracted_data = {
                "doi": doi,
                "openalex_id": openalex_id,
                "title": title,
                "publication_year": pub_year,
                "cited_by_count": cited_by_count,
                "journal": journal_name,
                "journal_issn_l": journal_issn,
                "authors": authors_info,
                "concepts": concepts_info,
                # Add more fields here if needed by exploring the 'work' object
                # e.g., 'type', 'abstract_inverted_index', 'referenced_works', 'related_works'
            }
            results.append(extracted_data)

            # --- Print some key retrieved info ---
            print(f"  Title: {title}")
            print(f"  Journal: {journal_name} ({pub_year})")
            print(f"  Authors: {', '.join([a['name'] for a in authors_info])}")
            print(f"  Top Concepts: {', '.join([c['name'] for c in concepts_info])}")
            print(f"  Cited By: {cited_by_count}")
            print(f"  OpenAlex ID: {openalex_id}")

        except Exception as e:
            print(f"  Error processing DOI {doi}: {e}")
            # This could be a network error, API error, or the DOI truly not existing

    print("\n--- Finished Querying ---")
    
    return results


In [None]:

# --- Main execution ---
if __name__ == "__main__":
    # Call the function with the list of DOIs
    retrieved_data = get_openalex_data_for_dois(dois_to_query)

    # You can now work with the 'retrieved_data' list, which contains
    # dictionaries of information for each successfully processed DOI.
    print(f"\nSuccessfully retrieved data for {len(retrieved_data)} out of {len(dois_to_query)} DOIs.")

    # Example: Print the title of the first result if available
    if retrieved_data:
        print(f"\nExample - Title of first result: {retrieved_data[0].get('title', 'N/A')}")

    # Example: Print author names and institutions for the second result if available
    if len(retrieved_data) > 1:
        print("\nExample - Authors/Institutions of second result:")
        for author_info in retrieved_data[1].get('authors', []):
            print(f"  - {author_info['name']} ({', '.join(author_info['institutions'])})")

In [4]:
retrieved_data_df = pd.DataFrame(retrieved_data)

In [5]:
retrieved_data_df.head()

Unnamed: 0,doi,openalex_id,title,publication_year,cited_by_count,journal,journal_issn_l,authors,concepts
0,10.1177/10963480241229235,https://openalex.org/W4391822953,Artificial Intelligence in Hospitality and Tou...,2024,30,,,"[{'name': 'Hyunsu Kim', 'orcid': 'https://orci...","[{'name': 'Hospitality', 'level': 3, 'score': ..."
1,10.1002/adfm.202413884,https://openalex.org/W4403192979,Hierarchical Engineering on Built‐In Electric ...,2024,26,,,"[{'name': 'Shijie Zhang', 'orcid': 'https://or...","[{'name': 'Materials science', 'level': 0, 'sc..."
2,10.1109/TNNLS.2023.3336563,https://openalex.org/W4391130239,Learning to Aggregate Multi-Scale Context for ...,2024,33,,,"[{'name': 'Ye Liu', 'orcid': 'https://orcid.or...","[{'name': 'Computer science', 'level': 0, 'sco..."
3,10.1016/j.esci.2024.100281,https://openalex.org/W4398247259,Heterogeneous structure design for stable Li/N...,2024,29,,,"[{'name': 'Hongyang Chen', 'orcid': 'https://o...","[{'name': 'Battery (electricity)', 'level': 3,..."
4,10.1109/TEVC.2023.3278132,https://openalex.org/W4381983181,Knowledge Learning for Evolutionary Computation,2023,27,,,"[{'name': 'Yi Jiang', 'orcid': 'https://orcid....","[{'name': 'Computer science', 'level': 0, 'sco..."


### OpenAlex - PolyU's Institutional Repository

In [None]:
# Useful url: 
# Find your intitution ID
'https://api.openalex.org/institutions?search=hong%kong%polytechnic%university'

# OpenAlex Institute ID: https://openalex.org/I14243506
# ROR ID: https://ror.org/0030zas98 

# All works of the institution
'https://api.openalex.org/works?filter=institutions.id:https://openalex.org/I14243506'

# Works for specific years
'https://api.openalex.org/works?filter=institutions.id:https://openalex.org/I14243506,publication_year:2020-2025&sort=publication_date:desc'

In [7]:
import requests
import pandas as pd
import time
import json # For handling complex data structures if needed

In [8]:
def fetch_and_normalize_openalex_data(api_url, csv_filename="output_publications.csv"):
    """
    Fetches publication data from the OpenAlex API using pagination,
    normalizes the JSON results, saves them to a CSV file, and returns
    a pandas DataFrame.

    Args:
        api_url (str): The base OpenAlex API URL with filters and sorting.
                       Pagination parameters ('page', 'per_page') will be added.
        csv_filename (str): The name for the output CSV file.

    Returns:
        pandas.DataFrame or None: A DataFrame containing the normalized
                                  publication data, or None if an error occurs during fetch.
    """
    all_results = []
    page = 1
    # Use a larger page size for fewer requests (OpenAlex max is 200)
    per_page = 200 
    
    # IMPORTANT: Add your email to the headers for API politeness (as recommended by OpenAlex)
    # Replace 'your_email@example.com' with your actual email address.
    headers = {
        'User-Agent': 'PolyUAnalysisPortfolioProject/0.1 (mailto:bennis.yiu@connect.polyu.hk)',
        'Accept': 'application/json'
    }
    
    print(f"Starting data fetch from OpenAlex...")
    print(f"Base URL: {api_url}")
    print("INFO: Using headers with User-Agent and mailto. PLEASE REPLACE 'bennis.yiu@connect.polyu.hk' in the code with your actual email.")

    while True:
        # Construct the URL with pagination parameters
        paginated_url = f"{api_url}&page={page}&per_page={per_page}"
        print(f"Fetching page {page} (up to {per_page} results per page)...")

        try:
            response = requests.get(paginated_url, headers=headers, timeout=60) # Increased timeout
            response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
            
            # Handle potential empty response or non-JSON response gracefully
            if not response.content:
                print(f"Warning: Received empty response for page {page}. Assuming end of results.")
                break
                
            try:
                data = response.json()
            except json.JSONDecodeError:
                print(f"Error: Could not decode JSON from response for page {page}.")
                print("Response first 500 chars:", response.text[:500])
                # Decide how to handle: break, continue, or return None
                print("Stopping data fetch due to JSON decode error.")
                return None # Or potentially return df with data fetched so far

            results = data.get('results', [])

            if not results:
                print("No more results found on this page. Fetching complete.")
                break # Exit loop if no results on this page

            all_results.extend(results)
            total_fetched = len(all_results)
            total_count = data.get('meta', {}).get('count', 'unknown')
            print(f"Fetched {len(results)} results from page {page}. Total results so far: {total_fetched} / {total_count}")

            page += 1
            time.sleep(0.2) # Be polite to the API - wait 200ms between requests

        except requests.exceptions.Timeout:
            print(f"Request timed out on page {page}. Retrying after a longer wait...")
            time.sleep(10) # Wait longer before retry
            continue # Retry the same page
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data from {paginated_url}: {e}")
            print("Please check the URL, your internet connection, and API rate limits.")
            # Optionally return the data fetched so far, or None
            if all_results:
                 print("Returning data fetched up to the point of error.")
                 break # Exit loop and process collected data
            else:
                 return None # Return None if no data was fetched before error
        except Exception as e: # Catch unexpected errors
            print(f"An unexpected error occurred during fetching page {page}: {e}")
            return None


    if not all_results:
        print("No data was fetched or collected.")
        return pd.DataFrame() # Return empty DataFrame

    print(f"\nTotal results fetched: {len(all_results)}")
    print("Normalizing data into a DataFrame...")

    try:
        # Use pandas json_normalize for flattening the structure
        # It handles nested dicts well, creating columns like 'author.name'
        df = pd.json_normalize(all_results)

        print(f"Initial normalization resulted in {df.shape[1]} columns.")

        # --- Optional Data Cleaning & Simplification ---
        # Some fields like 'authorships', 'concepts', 'locations' etc. remain complex 
        # (lists of dictionaries). You might want to extract specific info from them.
        
        # Example: Extract author display names into a comma-separated string
        if 'authorships' in df.columns:
            try:
                df['author_names'] = df['authorships'].apply(
                    lambda x: ', '.join([auth.get('author', {}).get('display_name', '') 
                                         for auth in x if isinstance(auth, dict)]) 
                    if isinstance(x, list) else None
                )
            except Exception as e:
                print(f"Warning: Could not extract author names cleanly: {e}")
                df['author_names'] = None


        # Example: Extract concept display names
        if 'concepts' in df.columns:
            try:
                df['concept_names'] = df['concepts'].apply(
                    lambda x: ', '.join([concept.get('display_name', '') 
                                         for concept in x if isinstance(concept, dict) and concept.get('level', 10) == 0]) # Top level concepts only
                    if isinstance(x, list) else None
                )
            except Exception as e:
                print(f"Warning: Could not extract concept names cleanly: {e}")
                df['concept_names'] = None

        # Example: Extract primary source display name
        if 'primary_location.source.display_name' in df.columns:
            df['source_display_name'] = df['primary_location.source.display_name']
        elif 'primary_location' in df.columns: # Handle cases where source might be missing
             df['source_display_name'] = df['primary_location'].apply(lambda x: x.get('source', {}).get('display_name') if isinstance(x, dict) else None)
        else:
             df['source_display_name'] = None
             
        # Example: Simplify institutions from authorships
        if 'authorships' in df.columns:
            try:
                df['institution_names'] = df['authorships'].apply(
                    lambda x: list(set( # Use set to get unique names
                        inst.get('display_name', '') 
                        for auth in x if isinstance(auth, dict) and isinstance(auth.get('institutions'), list) 
                        for inst in auth['institutions'] if isinstance(inst, dict)
                    )) if isinstance(x, list) else None
                )
                # Convert list to comma separated string for easier CSV viewing
                df['institution_names_str'] = df['institution_names'].apply(lambda x: ', '.join(x) if isinstance(x, list) else None)

            except Exception as e:
                print(f"Warning: Could not extract institution names cleanly: {e}")
                df['institution_names'] = None
                df['institution_names_str'] = None


        # Select and potentially reorder columns you care most about
        # This makes the final CSV/DataFrame cleaner. Add/remove based on your needs.
        core_columns = [
            'id', 'doi', 'title', 'display_name', 'publication_year', 'publication_date', 
            'type', 'cited_by_count', 'is_oa', 'oa_status', 
            'source_display_name', # Simplified source name
            'author_names', # Simplified author names
            'concept_names', # Simplified concepts
            'institution_names_str', # Simplified institution names
            # Add other potentially useful flattened columns from json_normalize:
            'primary_location.source.id', 'primary_location.landing_page_url', 'primary_location.pdf_url',
            'biblio.volume', 'biblio.issue', 'biblio.first_page', 'biblio.last_page',
            'language', 'is_retracted', 'is_paratext',
            # You might want to keep some original complex fields for deeper analysis later
            # 'authorships', 'concepts', 'locations', 'grants', 'referenced_works', 'related_works' 
        ]
        
        # Filter DataFrame to keep only existing core columns + any extra ones created
        existing_cols = [col for col in core_columns if col in df.columns]
        # Add any columns in df not explicitly mentioned in core_columns (optional)
        # extra_cols = [col for col in df.columns if col not in existing_cols]
        # df_final = df[existing_cols + extra_cols] 
        df_final = df[existing_cols] # Keep only selected/created columns for simplicity


        print("Normalization and simplification complete.")
        print("Final DataFrame columns:", df_final.columns.tolist())
        print("Final DataFrame shape:", df_final.shape)

    except Exception as e:
        print(f"An error occurred during data normalization/processing: {e}")
        print("Returning the DataFrame from initial normalization if available, otherwise None.")
        # If normalization itself failed hard, df might not exist
        if 'df' in locals():
             return df # Return the result of initial json_normalize
        else:
             return None 

    try:
        print(f"Saving data to {csv_filename}...")
        # Use utf-8 encoding for compatibility with special characters
        df_final.to_csv(csv_filename, index=False, encoding='utf-8')
        print(f"Data successfully saved to {csv_filename}")
    except Exception as e:
        print(f"An error occurred while saving the CSV file: {e}")
        # Still return the DataFrame even if saving fails
    
    return df_final

In [9]:
# --- How to Use ---
if __name__ == "__main__":
    # The URL provided by the user (without pagination parameters)
    polyu_api_base_url = "https://api.openalex.org/works?filter=institutions.id:https://openalex.org/I14243506,publication_year:2020-2025&sort=publication_date:desc"
    
    output_csv_file = "polyu_publications_2020-2025.csv"

    # IMPORTANT: Remember to replace 'your_email@example.com' inside the function's header definition.
    
    # Call the function to fetch, normalize, save, and get the DataFrame
    polyu_dataframe = fetch_and_normalize_openalex_data(polyu_api_base_url, output_csv_file)

    if polyu_dataframe is not None:
        print("\n--- DataFrame Info ---")
        polyu_dataframe.info()
        print("\n--- DataFrame Head (first 5 rows) ---")
        # Display more columns if needed for verification
        with pd.option_context('display.max_rows', 5, 'display.max_columns', 15): 
            print(polyu_dataframe.head())
    else:
        print("\nProcess failed. No DataFrame was returned.")

Starting data fetch from OpenAlex...
Base URL: https://api.openalex.org/works?filter=institutions.id:https://openalex.org/I14243506,publication_year:2020-2025&sort=publication_date:desc
INFO: Using headers with User-Agent and mailto. PLEASE REPLACE 'bennis.yiu@connect.polyu.hk' in the code with your actual email.
Fetching page 1 (up to 200 results per page)...
Fetched 200 results from page 1. Total results so far: 200 / 44972
Fetching page 2 (up to 200 results per page)...
Fetched 200 results from page 2. Total results so far: 400 / 44972
Fetching page 3 (up to 200 results per page)...
Fetched 200 results from page 3. Total results so far: 600 / 44972
Fetching page 4 (up to 200 results per page)...
Fetched 200 results from page 4. Total results so far: 800 / 44972
Fetching page 5 (up to 200 results per page)...
Fetched 200 results from page 5. Total results so far: 1000 / 44972
Fetching page 6 (up to 200 results per page)...
Fetched 200 results from page 6. Total results so far: 1200 /

: 

In [None]:
polyu_dataframe.head()