In [None]:
# pip install -r requirements.txt

In [1]:
import pandas as pd
import json
import requests
import pytz
import psycopg2
import os
import time
import re
import json
import ast
from datetime import timedelta, datetime
from pandas import json_normalize
from dotenv import load_dotenv
from requests.exceptions import Timeout, RequestException
from psycopg2 import sql
from psycopg2.extras import execute_values

In [2]:
def get_credentials():
    """Load Scopus and database credentials from environment variables.

    The ``.env`` file is loaded first via ``load_dotenv()``. Only the Scopus
    API key is mandatory; every other value is returned as ``None`` when the
    corresponding variable is not set.

    Returns:
        tuple[dict, dict]: ``(scopus_credentials, db_credentials)`` where
            ``scopus_credentials`` has the keys ``access_token`` and
            ``scopus_label`` and ``db_credentials`` has the keys ``hostname``,
            ``port`` (int or None), ``username``, ``password``, ``database``
            and ``schema``.

    Raises:
        ValueError: If ``SCOPUS_API_KEY`` is missing/empty, or if ``DB_PORT``
            is set but is not an integer.
    """

    print('get_credentials() method')
    load_dotenv()  # Load .env file

    # Scopus credentials
    scopus_api_key = os.getenv("SCOPUS_API_KEY")
    if not scopus_api_key:
        raise ValueError("SCOPUS_API_KEY is missing in .env!")

    scopus_credentials = {
        "access_token": scopus_api_key,
        "scopus_label": os.getenv("SCOPUS_LABEL")
    }

    # DB_PORT is optional: int(None) would raise TypeError and make the whole
    # function unusable for callers that only need the Scopus key.
    raw_port = os.getenv("DB_PORT")
    if raw_port is None or not raw_port.strip():
        db_port = None
    else:
        try:
            db_port = int(raw_port)
        except ValueError as exc:
            raise ValueError(
                f"DB_PORT must be an integer, got {raw_port!r}") from exc

    # Database credentials
    db_credentials = {
        "hostname": os.getenv("DB_HOST"),
        "port": db_port,
        "username": os.getenv("DB_USER"),
        "password": os.getenv("DB_PASSWORD"),
        "database": os.getenv("DB_NAME"),
        "schema": os.getenv("DB_SCHEMA")
    }

    return scopus_credentials, db_credentials

In [3]:
def scopus_api_caller(url, params, headers, max_retries=3, timeout=20):
    """Download every page of a Scopus search and return the combined entries.

    Starting from ``url``, the ``search-results.entry`` list of each response
    is appended to the result and the link whose ``@ref`` is ``next`` is
    followed until no further page is advertised.

    Args:
        url (str): First URL to request.
        params (dict): Query parameters for the first request only; follow-up
            requests use the ``next`` link as-is. The dict is not mutated.
        headers (dict): HTTP headers sent with every request.
        max_retries (int): Consecutive failed attempts tolerated for one page
            before giving up. The counter resets after each successful page.
        timeout (int | float): Per-request timeout passed to ``requests.get``.

    Returns:
        list: Entries collected so far. The list may be partial when the
            retries for a page are exhausted, and empty when nothing was found.
    """
    print('scopus_api_caller() method')
    all_data = []
    retry_count = 0

    while url and retry_count < max_retries:
        try:
            print(f'Making request to URL: {url}')
            response = requests.get(
                url, params=params, headers=headers, timeout=timeout)
            print(f'Response status code: {response.status_code}')

            if response.status_code != 200:
                print(f'Error response content: {response.text}')

            response.raise_for_status()
            data = response.json()
        except RequestException as e:
            # Timeout is a RequestException subclass, so one clause covers both.
            retry_count += 1
            print(
                f"Request failed: {e}. Retry attempt {retry_count} of {max_retries}")
            if retry_count == max_retries:
                print("Max retries reached. Exiting.")
                break
            time.sleep(2 ** retry_count)  # Exponential backoff
            continue

        search_results = data.get('search-results') if isinstance(data, dict) else None
        entries = search_results.get('entry') if isinstance(search_results, dict) else None

        # NOTE(review): Scopus appears to report an empty result set as a single
        # placeholder entry carrying an 'error' key - confirm against the API
        # docs. Such entries are not publications, so they are dropped here.
        if isinstance(entries, list):
            records = [
                entry for entry in entries
                if not (isinstance(entry, dict) and 'error' in entry)
            ]
        else:
            records = []

        if not records:
            print("No data found in response")
            break

        all_data.extend(records)
        print(f"Collected {len(records)} items. Total: {len(all_data)}")

        # Check if there are more pages. Use .get() so a link object without
        # '@ref' or '@href' ends pagination instead of raising KeyError.
        links = search_results.get('link') or []
        next_href = next(
            (link.get('@href') for link in links
             if isinstance(link, dict) and link.get('@ref') == 'next'),
            None)
        if next_href:
            url = next_href
            params = {}  # Clear params as they're included in the next URL
        else:
            url = None
            print("No more pages")

        retry_count = 0  # Reset retry count on successful request

    print(f'Exiting scopus_api_caller. Total items collected: {len(all_data)}')
    return all_data

In [4]:
def scopus_search(scopus_credentials, query, start=0, count=25, sort='citedby-count', max_results=5000):
    """Run a Scopus Search API query and return the raw result entries.

    Pagination is delegated to ``scopus_api_caller``, which follows the
    ``next`` links of the response itself, so a single call retrieves every
    page. The returned list is capped at ``max_results`` entries.

    Args:
        scopus_credentials (dict): Must contain the API key under
            ``'access_token'``.
        query (str): Scopus search query string.
        start (int): Offset of the first result to request.
        count (int): Page size requested from the API.
        sort (str): Sort field sent to the API.
        max_results (int): Upper bound on the number of entries returned; a
            value of 0 or less performs no request and returns ``[]``.

    Returns:
        list: Result entries as returned by ``scopus_api_caller`` (empty when
            nothing was found).

    Raises:
        KeyError: If ``scopus_credentials`` has no ``'access_token'`` key.
    """
    print('scopus_search() method')

    scopus_api_key = scopus_credentials['access_token']
    url = 'https://api.elsevier.com/content/search/scopus'

    headers = {
        'X-ELS-APIKey': scopus_api_key,
        'Accept': 'application/json'
    }

    # list of fields
    fields = [
        # 'link ref=self', # Content Abstract Retrieval API URI
        # 'link ref=scopus', # Scopus abstract detail page URL
        # 'link ref=scopus-citedby', # Scopus Cited By Results URL
        'prism:url', # Content Abstract Retrieval API URI
        'dc:identifier',  # Unique identifier: Scopus ID
        'eid', # Electronic ID
        'dc:title',  # Article Title
        'prism:aggregationType', # Source Type
        'prism:doi',      # DOI for Abstract Retrieval API
        'prism:coverDate',  # Publication date
        'citedby-count',  # Citation count
        'prism:publicationName',  # Journal or conference name
        'affiliation',  # Author affiliation
        'prism:isbn', # Source Identifier
        'prism:issn', # Source Identifier
        'prism:volume', # Volume
        'prism:issueIdentifier', # Issue
        'prism:pageRange', # Page
        'pii', # Publication Item Identifier
        'pubmed-id', # MEDLINE Identifier
        # 'orcid', # ORCID
        'dc:creator', # First Author
        'subtype', # Document Type code
        'subtypeDescription', # Type of publication (e.g., Article, Conference Paper)
        'openaccess' # Open Access Status
    ]

    params = {
        'query': query,
        'field': ','.join(fields),
        'count': count,
        'start': start,
        'sort': sort
    }

    all_results = []

    if max_results > 0:
        print(f'Full URL: {url}')
        # Never echo the API key: notebook outputs are saved and shared.
        loggable_headers = {**headers, 'X-ELS-APIKey': '***REDACTED***'}
        print(f'Headers: {loggable_headers}')
        print(f'Params: {params}')

        # Fetch the data. scopus_api_caller already walks every 'next' link,
        # and it returns bare entries (not the response envelope), so there is
        # no 'opensearch:totalResults' to inspect and no second request to make.
        batch_results = scopus_api_caller(url, params, headers)

        if not batch_results:
            print("No results returned from API. Stopping search.")
        else:
            if len(batch_results) > max_results:
                print(f"Result set truncated to max_results={max_results}.")
            all_results = list(batch_results[:max_results])

    print(f'{len(all_results)} of SCOPUS data will be processed.')

    if not all_results:
        print("No results found for the given query.")

    return all_results

In [5]:
def process_list_item(item):
    """Flatten one value into a list of strings.

    * ``str``  -> returned as a one-element list, even when empty.
    * ``dict`` -> the string form of every truthy value.
    * ``list`` -> the string form of every truthy element.
    * anything else -> ``[str(item)]`` when truthy, otherwise ``[]``.
    """
    # Strings are passed through untouched (no truthiness filter).
    if isinstance(item, str):
        return [item]

    if isinstance(item, dict):
        candidates = item.values()
    elif isinstance(item, list):
        candidates = item
    else:
        # Treat any other scalar/object as a single candidate.
        candidates = (item,)

    return [str(candidate) for candidate in candidates if candidate]

In [6]:
# def process_scopus_search_results(all_data):
#     """Process Scopus search results data
#     Return a dataframe with selected columns and database-friendly names
#     """
#     if not all_data:
#         print("No data received from Scopus API")
#         return pd.DataFrame()

#     # Convert to DataFrame
#     df = pd.json_normalize(all_data)

#     # Function to clean column names
#     def clean_column_name(name):
#         # Replace non-alphanumeric characters with underscores
#         name = re.sub(r'[^a-zA-Z0-9]', '_', name)
#         # Replace multiple underscores with a single underscore
#         name = re.sub(r'_+', '_', name)
#         # Remove leading or trailing underscores
#         name = name.strip('_')
#         # Convert to lowercase
#         return name.lower()

#     # Clean column names
#     df.columns = [clean_column_name(col) for col in df.columns]

#     # Ensure all fields from scopus_search() are present
#     expected_fields = [
#         'dc_identifier',
#         'prism_doi',
#         'prism_coverdate',
#         'citedby_count',
#         'prism_publicationname',
#         'subtype',
#         'subtypedescription'
#     ]

#     for field in expected_fields:
#         if field not in df.columns:
#             df[field] = None
#             print(
#                 f"Warning: '{field}' not found in API response. Added as empty column.")

#     # Convert numeric fields
#     if 'citedby_count' in df.columns:
#         df['citedby_count'] = pd.to_numeric(
#             df['citedby_count'], errors='coerce')

#     # Convert date fields
#     if 'prism_coverdate' in df.columns:
#         df['prism_coverdate'] = pd.to_datetime(
#             df['prism_coverdate'], errors='coerce')

#     # Add a column for publication year
#     if 'prism_coverdate' in df.columns:
#         df['publication_year'] = df['prism_coverdate'].dt.year
#     else:
#         print(
#             "Warning: 'prism_coverdate' not found in the data. Using 2100 as fallback year.")
#         df['publication_year'] = 2100

#     # Ensure publication_year is always an integer
#     df['publication_year'] = df['publication_year'].fillna(2100).astype(int)

#     # Print column names and their types for debugging
#     print("Column names and types:")
#     print(df.dtypes)

#     # Print the first few rows for debugging
#     print("First few rows of the processed dataframe:")
#     print(df.head())

#     return df

In [7]:
# ## Revised I - process_scopus_search_results()
# def process_scopus_search_results(all_data):
#     """
#     Process Scopus search results data based on specifically requested fields.
#     Normalizes nested data (affiliation, creator), cleans column names,
#     extracts key links, converts types, and adds publication year/month.

#     Args:
#         all_data (list): List of raw result dictionaries from Scopus API.

#     Returns:
#         pandas.DataFrame: Processed DataFrame with cleaned names and added columns.
#     """
#     if not all_data:
#         print("No data received from Scopus API to process.")
#         return pd.DataFrame()

#     print(f"Processing {len(all_data)} raw Scopus records.")

#     # --- Pre-processing Step: Extract Specific Links ---
#     # This makes handling links easier than relying solely on json_normalize
#     processed_data = []
#     for item in all_data:
#         new_item = item.copy() # Work on a copy
#         links = new_item.get('link', [])
#         if isinstance(links, list):
#             for link_info in links:
#                 if isinstance(link_info, dict):
#                     ref = link_info.get('@ref')
#                     href = link_info.get('@href')
#                     if ref and href:
#                         # Create specific keys for the links we want
#                         if ref == 'self':
#                             new_item['link_self_href'] = href
#                         elif ref == 'scopus':
#                             new_item['link_scopus_href'] = href
#                         elif ref == 'scopus-citedby':
#                             new_item['link_scopus_citedby_href'] = href
#             # Remove the original complex 'link' field after extraction
#             if 'link' in new_item:
#                 del new_item['link']
#         processed_data.append(new_item)
#     all_data = processed_data # Use the pre-processed data from now on
#     # --- End Pre-processing ---


#     # Convert to DataFrame using json_normalize to handle potential nested structures
#     # esp. for 'affiliation' and 'dc:creator'
#     try:
#         df = pd.json_normalize(all_data, sep='_') # Use underscore separator
#         print(f"Normalized data into DataFrame shape: {df.shape}")
#         # print("Initial columns after normalize:", df.columns.tolist()) # Debug
#     except Exception as e:
#         print(f"Error during pandas json_normalize: {e}")
#         print("Attempting basic DataFrame creation (may lose nested data).")
#         try:
#             df = pd.DataFrame(all_data)
#         except Exception as e2:
#             print(f"Basic DataFrame creation also failed: {e2}")
#             return pd.DataFrame()

#     # --- Robust Column Name Cleaning Function ---
#     def clean_column_name(name):
#         name = str(name) # Ensure string
#         # Replace problematic characters (:, -, @, .) with underscores
#         # Keeping @ handling just in case, although pre-processing links helps
#         name = re.sub(r'[:\-@\.]', '_', name)
#         # Handle CamelCase by inserting underscore before capitals (except first char)
#         # name = re.sub(r'(?<!^)(?=[A-Z])', '_', name) # Optional: uncomment if you see CamelCase cols
#         # Remove characters that are not alphanumeric or underscore
#         name = re.sub(r'[^a-zA-Z0-9_]', '', name)
#         # Replace multiple underscores with a single underscore
#         name = re.sub(r'_+', '_', name)
#         # Remove leading or trailing underscores
#         name = name.strip('_')
#         # Convert to lowercase
#         return name.lower()

#     # Clean column names
#     df.columns = [clean_column_name(col) for col in df.columns]
#     print(f"Cleaned column names. Shape remains: {df.shape}")
#     # print("Cleaned columns:", df.columns.tolist()) # Debug

#     # --- Define expected columns (cleaned names) based on fields requested ---
#     # Ensure these match the cleaned versions of fields from scopus_search + pre-processed links
#     expected_fields_cleaned = [
#         'link_self_href',             # Extracted pre-processing
#         'link_scopus_href',           # Extracted pre-processing
#         'link_scopus_citedby_href',   # Extracted pre-processing
#         'prism_url',
#         'dc_identifier',
#         'eid',
#         'dc_title',
#         'prism_aggregationtype',
#         'prism_doi',
#         'prism_coverdate',
#         'citedby_count',
#         'prism_publicationname',
#         # 'affiliation' itself might not exist after normalize if it was nested
#         # Check for normalized affiliation columns instead (e.g., affiliation_0_affilname)
#         'prism_isbn',               # Note: ISBN might be list, handled later
#         'prism_issn',               # Note: ISSN might be list, handled later
#         'prism_volume',
#         'prism_issueidentifier',
#         'prism_pagerange',
#         'pii',
#         'pubmed_id',
#         'orcid',
#         # 'dc_creator' might not exist if nested, check for 'dc_creator_$' or similar
#         'subtype',
#         'subtypedescription',
#         'openaccess',
#         # Add publication year/month which are created here
#         'publication_year',
#         'publication_month'
#     ]

#     # --- Ensure Core/Expected Columns Exist (add if missing) ---
#     # We focus on ensuring columns exist, normalization might create others (like affiliation parts)
#     for field in expected_fields_cleaned:
#          if field not in df.columns:
#             # Don't automatically add affiliation/creator base names if normalized versions exist
#             if field not in ['affiliation', 'dc_creator']:
#                  # Check if a related normalized column exists (simple check)
#                  related_prefix = field.split('_')[0] # e.g., 'link'
#                  if not any(col.startswith(related_prefix) for col in df.columns):
#                       df[field] = None
#                       # print(f"Warning: Column '{field}' not found and no related cols found. Added as empty.")


#     # --- Type Conversions ---
#     numeric_cols = ['citedby_count', 'openaccess', 'pubmed_id'] # pubmed_id is often numeric
#     for col in numeric_cols:
#         if col in df.columns:
#             df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) # Coerce errors to NaN, then fill with 0
#             # Convert to integer if no NaNs were introduced or after filling
#             # Check if column contains only integer values (or NaN filled as 0)
#             if df[col].dropna().mod(1).eq(0).all():
#                  df[col] = df[col].astype(int)
#             else:
#                  df[col] = df[col].astype(float) # Keep as float if decimals exist


#     date_cols = ['prism_coverdate']
#     for col in date_cols:
#         if col in df.columns:
#             df[col] = pd.to_datetime(df[col], errors='coerce') # Coerce errors to NaT


#     # --- Add Publication Year and Month ---
#     if 'prism_coverdate' in df.columns and pd.api.types.is_datetime64_any_dtype(df['prism_coverdate']):
#         df['publication_year'] = df['prism_coverdate'].dt.year.fillna(0).astype(int) # Use 0 for missing
#         df['publication_month'] = df['prism_coverdate'].dt.month.fillna(0).astype(int) # Use 0 for missing
#         print("Added 'publication_year' and 'publication_month' columns.")
#     else:
#         print("Warning: 'prism_coverdate' not found or not in expected date format. Cannot add year/month reliably.")
#         # Ensure columns exist even if calculation fails
#         if 'publication_year' not in df.columns: df['publication_year'] = 0
#         if 'publication_month' not in df.columns: df['publication_month'] = 0
#         df['publication_year'] = df['publication_year'].fillna(0).astype(int)
#         df['publication_month'] = df['publication_month'].fillna(0).astype(int)


#     # --- Handle potential list/dict columns (flatten to string) ---
#     # Important for ISBN, ISSN, and potentially affiliation/creator if normalization wasn't perfect
#     # Check if ANY value in a column is a list or dict
#     for col in df.select_dtypes(include=['object']).columns: # Check only object columns
#         if df[col].apply(lambda x: isinstance(x, (list, dict))).any():
#             print(f"Column '{col}' contains lists/dicts. Converting to string representation.")
#             # Safely convert to string, handling None/NaN
#             df[col] = df[col].apply(lambda x: str(x) if x is not None else None)


#     # --- Simplify Affiliation/Author (Example: Extract First) ---
#     # This depends on the output of json_normalize. Check your actual column names.
#     # Common patterns: 'affiliation_0_affilname', 'dc_creator_$'

#     # Attempt to find the first affiliation name column
#     affil_name_cols = sorted([col for col in df.columns if 'affiliation' in col and 'affilname' in col])
#     if affil_name_cols:
#         df['first_affiliation_name'] = df[affil_name_cols[0]]
#         print(f"Extracted first affiliation name into 'first_affiliation_name' from '{affil_name_cols[0]}'.")

#     # Attempt to find the first author name column (often 'dc_creator_$' from normalize)
#     creator_col = 'dc_creator_' # Common result from normalize on dc:creator if it's simple text
#     if creator_col in df.columns:
#          df['first_author_name'] = df[creator_col]
#          print(f"Extracted first author name into 'first_author_name' from '{creator_col}'.")
#     else:
#          # Fallback check if dc:creator wasn't normalized or missing
#          if 'dc_creator' in df.columns:
#               df['first_author_name'] = df['dc_creator'].astype(str) # Convert potential dict/list to string
#               print("Copied 'dc_creator' to 'first_author_name' (structure unknown).")


#     # --- Final Check and Select Columns ---
#     # Optionally, select only the columns you definitively want in the final output
#     # This prevents unexpected columns from normalization cluttering the result.
#     # Create a final list of desired columns based on expected_fields_cleaned + created ones
#     final_columns_to_keep = [
#         # Core IDs & Links
#         'dc_identifier', 'eid', 'prism_doi', 'pii', 'pubmed_id', 'orcid',
#         'link_self_href', 'link_scopus_href', 'link_scopus_citedby_href', 'prism_url',
#         # Title & Creator
#         'dc_title', 'first_author_name', # Use simplified author name
#         # Publication Details
#         'prism_publicationname', 'prism_aggregationtype', 'prism_issn', 'prism_isbn',
#         'prism_volume', 'prism_issueidentifier', 'prism_pagerange',
#         # Dates & Citation
#         'prism_coverdate', 'publication_year', 'publication_month', 'prism_coverdisplaydate',
#         'citedby_count',
#         # Document Type
#         'subtype', 'subtypedescription',
#         # Affiliation
#         'first_affiliation_name', # Use simplified affiliation name
#         # (Optionally add more normalized affiliation columns if needed, e.g., 'affiliation_0_city')
#         # Open Access
#         'openaccess',
#         # 'openaccessflag' # Not requested in your fields list, but often comes with 'openaccess'
#     ]

#     # Filter DataFrame to keep only desired columns, adding missing ones as None
#     final_df = pd.DataFrame()
#     for col in final_columns_to_keep:
#         if col in df.columns:
#             final_df[col] = df[col]
#         else:
#             final_df[col] = None # Add column if it wasn't created/found

#     print("\nFinal selected columns:")
#     print(final_df.columns.tolist())

#     # Print final column names and types for debugging
#     print("\nFinal columns and data types after processing & selection:")
#     print(final_df.dtypes)

#     # Print the first few rows for debugging
#     print("\nFirst few rows of the final processed dataframe:")
#     print(final_df.head())

#     return final_df

In [8]:
## Revised II

def process_scopus_search_results(all_data):
    """
    Turn raw Scopus search entries into a flat, database-friendly DataFrame.

    Steps performed, in order:
      1. Flatten the records with ``pd.json_normalize`` (falling back to a
         plain ``pd.DataFrame`` if that fails).
      2. Clean column names (lowercase, ``[a-z0-9_]`` only).
      3. Add every requested field that is absent as an all-``None`` column.
      4. Convert ``citedby_count``, ``openaccess`` and ``pubmed_id`` to
         numbers; unparseable or missing values become 0.
      5. Parse ``prism_coverdate`` as a datetime (invalid values -> ``NaT``).
      6. Add ``publication_year`` and ``publication_month`` (0 when the date
         is missing).
      7. Replace any object column that holds lists/dicts with its string
         representation.

    Args:
        all_data (list): List of raw result dictionaries from Scopus API.

    Returns:
        pandas.DataFrame: Processed DataFrame containing every column produced
            by normalization plus the derived ones. Empty when ``all_data`` is
            empty or no DataFrame could be built.
    """
    if not all_data:
        print("No data received from Scopus API to process.")
        return pd.DataFrame()

    print(f"Processing {len(all_data)} raw Scopus records.")

    # json_normalize flattens nested *dicts* into '<parent>_<child>' columns.
    # Lists (e.g. a list of affiliation dicts) are NOT expanded; they stay in
    # a single column and are stringified in the last step below.
    try:
        df = pd.json_normalize(all_data, sep='_')
        print(f"Normalized data into DataFrame shape: {df.shape}")
        # print("Initial columns after normalize:", df.columns.tolist()) # Debug
    except Exception as e:
        print(f"Error during pandas json_normalize: {e}")
        print("Attempting basic DataFrame creation (may lose nested data).")
        try:
            df = pd.DataFrame(all_data)
        except Exception as e2:
            print(f"Basic DataFrame creation also failed: {e2}")
            return pd.DataFrame() # Cannot process

    # --- Column Name Cleaning Function ---
    def clean_column_name(name):
        """Return ``name`` as a lowercase identifier made of [a-z0-9_]."""
        name = str(name) # Ensure string
        # Replace problematic characters (:, -, @, .) with underscores
        name = re.sub(r'[:\-@\.]', '_', name)
        # Optional: Handle CamelCase if observed in column names
        # name = re.sub(r'(?<!^)(?=[A-Z])', '_', name)
        # Remove characters that are not alphanumeric or underscore
        name = re.sub(r'[^a-zA-Z0-9_]', '', name)
        # Replace multiple underscores with a single underscore
        name = re.sub(r'_+', '_', name)
        # Remove leading or trailing underscores
        name = name.strip('_')
        # Convert to lowercase
        return name.lower()

    # Clean column names
    df.columns = [clean_column_name(col) for col in df.columns]
    print(f"Cleaned column names. Shape remains: {df.shape}")
    # print("Cleaned columns:", df.columns.tolist()) # Debug

    # --- Ensure columns related to requested fields exist ---
    # Original (uncleaned) names of the fields requested in scopus_search.
    base_requested_fields = [
        'prism:url', 'dc:identifier', 'eid', 'dc:title', 'prism:aggregationType',
        'prism:doi', 'prism:coverDate', 'citedby-count', 'prism:publicationName',
        'affiliation', 'prism:isbn', 'prism:issn', 'prism:volume',
        'prism:issueIdentifier', 'prism:pageRange', 'pii', 'pubmed-id',
        'dc:creator', 'subtype', 'subtypeDescription', 'openaccess'
    ]
    for field_original in base_requested_fields:
        field_cleaned = clean_column_name(field_original)
        has_exact_column = field_cleaned in df.columns
        # A flattened nested dict shows up as '<field>_<subkey>' columns.
        has_flattened_column = any(
            col.startswith(field_cleaned + '_') for col in df.columns)
        # Add an empty column only when neither form is present.
        if not has_exact_column and not has_flattened_column:
            df[field_cleaned] = None
            # print(f"Warning: Column for '{field_original}' (cleaned: '{field_cleaned}') not found. Added as empty.")

    # --- Basic Type Conversions ---
    numeric_cols = ['citedby_count', 'openaccess', 'pubmed_id']
    for col in numeric_cols:
        if col in df.columns:
            # Convert to numeric, coercing errors; fill resulting NaNs with 0.
            # Note: a missing value is therefore indistinguishable from a real 0.
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
            # Use an integer dtype when every value is whole
            if df[col].dropna().mod(1).eq(0).all():
                try:
                    df[col] = df[col].astype(int)
                except pd.errors.IntCastingNaNError: # Should not happen after fillna(0) but safety first
                    df[col] = df[col].astype(float) # Keep as float if conversion fails
            else:
                df[col] = df[col].astype(float) # Keep as float if decimals exist

    date_cols = ['prism_coverdate']
    for col in date_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce') # Coerce errors to NaT

    # --- Add Publication Year and Month ---
    if 'prism_coverdate' in df.columns and pd.api.types.is_datetime64_any_dtype(df['prism_coverdate']):
        df['publication_year'] = df['prism_coverdate'].dt.year.fillna(0).astype(int) # Use 0 for missing
        df['publication_month'] = df['prism_coverdate'].dt.month.fillna(0).astype(int) # Use 0 for missing
        print("Added 'publication_year' and 'publication_month' columns.")
    else:
        print("Warning: 'prism_coverdate' not found or not in expected date format. Cannot add year/month reliably.")
        if 'publication_year' not in df.columns:
            df['publication_year'] = 0
        if 'publication_month' not in df.columns:
            df['publication_month'] = 0
        # Ensure integer type even if defaults are used
        df['publication_year'] = df['publication_year'].fillna(0).astype(int)
        df['publication_month'] = df['publication_month'].fillna(0).astype(int)

    # --- Convert Remaining List/Dict Columns to String ---
    # Checks only object columns for efficiency
    for col in df.select_dtypes(include=['object']).columns:
        # Check if *any* non-null entry in the column is a list or dict
        if df[col].dropna().apply(lambda x: isinstance(x, (list, dict))).any():
            print(f"Column '{col}' contains lists/dicts. Converting to string representation.")
            # Stringify the lists/dicts first ...
            df[col] = df[col].apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)
            # ... then cast the whole column to str and map the textual forms of
            # missing values ('None'/'nan') back to None. Every remaining value
            # in such a column ends up as a string.
            df[col] = df[col].astype(str).replace({'None': None, 'nan': None})

    # --- Final Output ---
    print("\nFinal columns and data types after processing:")
    # Displaying dtypes gives a good overview of the result
    print(df.dtypes)

    print("\nFirst few rows of the processed dataframe:")
    print(df.head())

    # Return the DataFrame with all columns resulting from normalization and processing
    return df

In [9]:
def exclude_existing_results(new_results, existing_df):
    """Drop raw results whose Scopus identifier is already stored.

    Args:
        new_results (list): Raw result dicts; the identifier is read from the
            ``'dc:identifier'`` key.
        existing_df (pandas.DataFrame): Previously saved records; identifiers
            are read from the ``'dc_identifier'`` column.

    Returns:
        list: ``new_results`` itself when ``existing_df`` is empty, otherwise
            a new list holding only the results not present in ``existing_df``.
    """
    if not existing_df.empty:
        known_ids = set(existing_df['dc_identifier'].tolist())
        unseen = []
        for record in new_results:
            if record.get('dc:identifier') not in known_ids:
                unseen.append(record)
        return unseen

    # Nothing stored yet: every new result is kept as-is.
    return new_results



In [10]:
# def scopus_research_procedures(years_to_process):
#     try:
#         scopus_credentials, db_credentials = get_credentials()  # pylint: disable=unused-variable

#         max_results_per_api_call = 5000

#         csv_file = 'polyu_research_output.csv'
#         existing_df = pd.DataFrame()
#         try:
#             existing_df = pd.read_csv(csv_file)
#             if 'publication_year' not in existing_df.columns:
#                 print("Adding missing publication_year column to existing data")
#                 existing_df['publication_year'] = pd.to_datetime(existing_df['prism_coverdate']).dt.year
#             print(f"Loaded {len(existing_df)} existing records from {csv_file}")
#         except FileNotFoundError:
#             print(f"No existing file found at {csv_file}. Starting fresh.")
#         except Exception as e:
#             print(f"Error reading CSV file: {e}")

#         all_new_results = []
#         latest_df = existing_df.copy()  # Initialize latest_df with existing data

#         for year in years_to_process:
#             query = f"AFFIL(\"The Hong Kong Polytechnic University\") AND PUBYEAR = {year}"
#             print(f"\nExecuting Scopus search with query: {query}")

#             year_results = []
#             start = 0
#             while True:
#                 try:
#                     polyu_results = scopus_search(
#                         scopus_credentials, query, start=start, max_results=max_results_per_api_call)

#                     if not polyu_results:
#                         print(f"No more results found for {year}.")
#                         break

#                     year_results.extend(polyu_results)
#                     start += len(polyu_results)

#                     print(f"Retrieved {len(polyu_results)} results for {year}. Total for {year}: {len(year_results)}")

#                     if len(polyu_results) < max_results_per_api_call:
#                         print(f"Reached the end of available results for {year}.")
#                         break

#                 except Exception as e:
#                     print(f"Error during API call: {e}")
#                     print("Saving current results and moving to next year.")
#                     break

#             # Process and save results for this year
#             if year_results:
#                 try:
#                     new_df = process_scopus_search_results(year_results)
#                     if 'publication_year' not in new_df.columns:
#                         # Ensure publication_year is added
#                         new_df['publication_year'] = year

#                     # Combine with existing data
#                     latest_df = pd.concat([latest_df, new_df], ignore_index=True)
#                     latest_df.drop_duplicates(subset='dc_identifier', keep='last', inplace=True)

#                     # Try to save the results to a CSV file
#                     try:
#                         latest_df.to_csv(csv_file, index=False)
#                         print(f"\nResults saved to '{csv_file}'. Total records: {len(latest_df)}")
#                     except Exception as e:
#                         print(f"Error saving CSV file: {e}")
#                         print("Continuing with in-memory DataFrame.")

#                 except Exception as e:
#                     print(f"Error processing results for {year}: {e}")

#             all_new_results.extend(year_results)

#         if not all_new_results:
#             print("No new results to process across all years.")
#         else:
#             print(f"Total new results across all years: {len(all_new_results)}")

#         return latest_df  # Return the latest DataFrame

#     except Exception as e:
#         print(f"An unexpected error occurred: {e}")
#         import traceback
#         traceback.print_exc()
#         return pd.DataFrame()  # Return an empty DataFrame in case of overall failure

In [11]:
## Revised I - parameters: publication year + list of subtypedescription

def scopus_research_procedures(publication_year, document_types):
    """
    Fetch Scopus research output for PolyU for one publication year.

    For every entry in ``document_types`` a Scopus query restricted to the
    given year and subtype is run through ``scopus_search``. The raw results of
    all types are processed together with ``process_scopus_search_results``,
    a ``publication_month`` column is added, the result is merged with any
    previously saved data for that year, deduplicated on ``dc_identifier``
    and written back to ``polyu_research_output_<year>.csv``.

    Args:
        publication_year (int): The year to fetch publications for.
        document_types (list): Strings used as the SUBTYPE(...) value of the
                                query (e.g., ['Article', 'Conference Paper']).
                                A single string is treated as a one-item list.

    Returns:
        pandas.DataFrame: The combined, deduplicated data for the year.
                          When nothing new was fetched or processing failed,
                          the data loaded from the existing CSV (possibly
                          empty) is returned instead. An empty DataFrame is
                          returned on an unexpected failure.
    """
    # Imported here because the error handlers below call traceback.print_exc()
    # and the notebook's import cell does not import `traceback`; without it the
    # handlers themselves raised NameError and masked the original error.
    import traceback

    try:
        scopus_credentials, _ = get_credentials()  # database credentials are not needed here

        # A bare string would otherwise be iterated character by character.
        if isinstance(document_types, str):
            document_types = [document_types]

        # Upper bound on results PER document-type query; passed down to scopus_search.
        max_results_per_type_query = 5000

        # Year-specific CSV file used both for loading earlier runs and for saving.
        csv_file = f'polyu_research_output_{publication_year}.csv'
        existing_df = pd.DataFrame()
        try:
            existing_df = pd.read_csv(csv_file)
            # Files written before 'publication_month' existed get the column back-filled.
            if 'prism_coverdate' in existing_df.columns and 'publication_month' not in existing_df.columns:
                print(f"Adding missing 'publication_month' column to existing data in {csv_file}")
                # prism_coverdate must be datetime before the month can be extracted
                existing_df['prism_coverdate'] = pd.to_datetime(existing_df['prism_coverdate'], errors='coerce')
                # 0 marks a missing/unparseable month
                existing_df['publication_month'] = existing_df['prism_coverdate'].dt.month.fillna(0).astype(int)
            print(f"Loaded {len(existing_df)} existing records from {csv_file}")
        except FileNotFoundError:
            print(f"No existing file found at {csv_file}. Starting fresh for year {publication_year}.")
        except Exception as e:
            print(f"Error reading CSV file {csv_file}: {e}. Starting fresh for year {publication_year}.")
            existing_df = pd.DataFrame()  # make sure a failed read leaves an empty frame

        # Raw (unprocessed) entries collected for this year across all document types
        all_new_raw_results_for_year = []

        # --- One Scopus search per document type ---
        for doc_type in document_types:
            query = f'AFFIL("The Hong Kong Polytechnic University") AND PUBYEAR = {publication_year} AND SUBTYPE("{doc_type}")'
            print(f"\nExecuting Scopus search for Year: {publication_year}, Type: '{doc_type}'")
            print(f"Query: {query}")

            try:
                # Each document type starts from offset 0; count and sort are
                # left at scopus_search's defaults.
                type_results = scopus_search(
                    scopus_credentials,
                    query,
                    start=0,
                    max_results=max_results_per_type_query
                )

                if type_results:
                    print(f"Retrieved {len(type_results)} raw results for '{doc_type}'.")
                    all_new_raw_results_for_year.extend(type_results)
                else:
                    print(f"No results returned by scopus_search for '{doc_type}'.")

            except Exception as e:
                # A failure for one document type must not abort the remaining types.
                print(f"Error during scopus_search call for {doc_type} (Year: {publication_year}): {e}")
                print(f"Skipping document type '{doc_type}' and continuing...")

        # --- Nothing fetched: hand back whatever was loaded at the start ---
        if not all_new_raw_results_for_year:
            print(f"\nNo new raw results collected for year {publication_year} across specified document types.")
            return existing_df

        print(f"\nTotal new raw results collected across all types for {publication_year}: {len(all_new_raw_results_for_year)}")

        # --- Process the combined raw list ---
        try:
            new_df = process_scopus_search_results(all_new_raw_results_for_year)

            if new_df.empty:
                print("Processing raw results returned an empty DataFrame. No new data to add.")
                return existing_df

            # --- Add the 'publication_month' column ---
            if 'prism_coverdate' in new_df.columns:
                # Coerce on a temporary series so month extraction also works if
                # the column is not datetime-typed; the stored column is untouched.
                cover_dates = pd.to_datetime(new_df['prism_coverdate'], errors='coerce')
                new_df['publication_month'] = cover_dates.dt.month.fillna(0).astype(int)
                print("Added 'publication_month' column to new data.")
            else:
                print("Warning: 'prism_coverdate' column not found in processed DataFrame. Cannot add 'publication_month'.")
                new_df['publication_month'] = 0  # default for unknown month

            # --- Combine with the data loaded earlier (concat fills column gaps with NaN) ---
            combined_df = pd.concat([existing_df, new_df], ignore_index=True)

            # --- Deduplicate on the Scopus ID ---
            if 'dc_identifier' in combined_df.columns:
                initial_rows = len(combined_df)
                # keep='last' so a freshly fetched row replaces the stored one
                combined_df = combined_df.drop_duplicates(subset='dc_identifier', keep='last')
                dedup_rows = len(combined_df)
                print(f"Deduplicated records based on 'dc_identifier'. Kept {dedup_rows} out of {initial_rows} total records.")
            else:
                print("Warning: 'dc_identifier' column not found in combined DataFrame. Cannot deduplicate effectively.")

            # --- Save to the year-specific CSV; a failed save still returns the data ---
            try:
                combined_df.to_csv(csv_file, index=False)
                print(f"\nResults saved to '{csv_file}'. Total records for {publication_year}: {len(combined_df)}")
            except Exception as e:
                print(f"Error saving CSV file '{csv_file}': {e}")
                print("Returning in-memory DataFrame without saving.")

            return combined_df

        except Exception as e:
            print(f"Error during process_scopus_search_results for year {publication_year}: {e}")
            traceback.print_exc()
            # Previously loaded data is safer to return than partially processed data.
            print("Returning existing data due to processing error.")
            return existing_df

    except Exception as e:
        print(f"An unexpected error occurred in scopus_research_procedures: {e}")
        traceback.print_exc()
        return pd.DataFrame()  # empty DataFrame signals a major failure

In [12]:
# --- Example Usage (using the modified function) ---

# Define the document types based on your image/Scopus fields
# Scopus document types (SUBTYPE values) to query, one search per entry.
doc_types_to_process = [
    "Article",
    "Book",
    "Book Chapter",
    "Conference Paper",
    "Data Paper",
    "Editorial",
    "Erratum",
    "Letter",
    "Note",
    "Retracted",
    "Review",
    "Short Survey",
]

# doc_types_to_process = [
#     "Article"
# ]



In [13]:
# target_year = 2024 # Example: Process data for 2024

# print(f"\n--- Starting Scopus Processing for Year: {target_year} ---")

# # Call the modified function for the specific year and document types
# yearly_data_df = scopus_research_procedures(target_year, doc_types_to_process)

# print(f"\n--- Processing Finished for Year: {target_year} ---")

# if not yearly_data_df.empty:
#     print(f"Final DataFrame shape for {target_year}: {yearly_data_df.shape}")
#     print(f"Columns: {yearly_data_df.columns.tolist()}")
#     print("\nSample data (first 5 rows):")
#     print(yearly_data_df.head())
#     if 'publication_month' in yearly_data_df.columns:
#         print("\nPublication count per month (0 = month unknown/missing):")
#         # Value counts and sort by month index
#         print(yearly_data_df['publication_month'].value_counts().sort_index())
# else:
#     print(f"No data returned or generated for {target_year}.")

In [None]:
# results_df = scopus_research_procedures()

In [None]:
# results_df.head()

In [None]:
def check_duplicates(df1, df2):
    """
    Return the rows of df1 whose 'dc_identifier' also occurs in df2.

    Both frames must have a 'dc_identifier' column, which is used as the
    unique record key. The number of matching rows is printed.
    """
    known_ids = df2['dc_identifier']
    overlap_mask = df1['dc_identifier'].isin(known_ids)
    duplicates = df1.loc[overlap_mask]
    print(f"Number of duplicate records: {len(duplicates)}")
    return duplicates

In [None]:
# Call get_credentials() once and keep both returned credential dicts as
# notebook-level variables for the cells below.
scopus_credentials, db_credentials = get_credentials() 

In [None]:
def scopus_search_data_uploader(db_credentials, df, table_name='scopus_search_output'):
    """
    Upload Scopus data to Postgres database.
    Creates the table if it doesn't exist, then upserts data based on dc_identifier.

    The caller's DataFrame is not modified: preprocessing happens on a copy.
    Before upload, 'prism_coverdate' is parsed to datetime (invalid values
    become NULL), a missing 'publication_year' value is replaced by 9999,
    rows sharing a 'dc_identifier' are reduced to the last one, and every
    NaN/NaT cell is sent as SQL NULL.

    Args:
        db_credentials (dict): Needs the keys 'hostname', 'database',
            'username', 'password', 'port' and 'schema'.
        df (pandas.DataFrame): Records to upload; must contain a
            'dc_identifier' column. All DataFrame columns are inserted.
            NOTE(review): the CREATE TABLE below only defines ten columns, so
            extra DataFrame columns (e.g. 'publication_month') presumably need
            to exist in the target table already - confirm.
        table_name (str): Target table inside the configured schema.

    Raises:
        ValueError: If 'dc_identifier' is missing from df.
        psycopg2.Error: On any database failure (after rollback).
        Exception: Any other error is re-raised after rollback.
    """
    conn = None
    cursor = None
    print('scopus_search_data_uploader() method')

    try:
        if 'dc_identifier' not in df.columns:
            raise ValueError("DataFrame must contain a 'dc_identifier' column to upsert on.")

        # Data preprocessing on a copy so the caller's frame stays untouched
        upload_df = df.copy()

        date_columns = ['prism_coverdate']
        for col in date_columns:
            if col in upload_df.columns:
                upload_df[col] = pd.to_datetime(upload_df[col], errors='coerce')

        if 'publication_year' in upload_df.columns:
            # 9999 is the placeholder for an unknown publication year
            upload_df['publication_year'] = upload_df['publication_year'].fillna(9999).astype(int)

        # A single INSERT ... ON CONFLICT DO UPDATE may not touch the same key
        # twice, so only the last row per dc_identifier is uploaded.
        upload_df = upload_df.drop_duplicates(subset='dc_identifier', keep='last')

        # Plain Python objects with None for missing cells, so the driver sends
        # NULL instead of NaN/NaT values.
        cells = upload_df.astype(object).to_numpy(copy=True)
        cells[upload_df.isna().to_numpy()] = None

        # Connect to the Postgres database
        conn = psycopg2.connect(
            host=db_credentials['hostname'],
            database=db_credentials['database'],
            user=db_credentials['username'],
            password=db_credentials['password'],
            port=db_credentials['port'],
            connect_timeout=30
        )
        cursor = conn.cursor()

        # Set the schema
        cursor.execute(sql.SQL("SET search_path TO {};").format(
            sql.Identifier(db_credentials['schema'])
        ))

        # Create the table on first use
        cursor.execute(sql.SQL("""
            CREATE TABLE IF NOT EXISTS {} (
                fa BOOLEAN,
                prism_url TEXT,
                dc_identifier TEXT PRIMARY KEY,
                prism_publicationname TEXT,
                prism_coverdate DATE,
                prism_doi TEXT,
                citedby_count INTEGER,
                subtype TEXT,
                subtypedescription TEXT,
                publication_year INTEGER
            )
        """).format(sql.Identifier(table_name)))

        # Prepare the data for insertion
        columns = upload_df.columns.tolist()
        update_columns = [col for col in columns if col != 'dc_identifier']

        # With no non-key columns an empty SET list would be invalid SQL.
        if update_columns:
            conflict_action = sql.SQL("DO UPDATE SET {}").format(
                sql.SQL(', ').join(
                    sql.SQL("{0} = EXCLUDED.{0}").format(sql.Identifier(col))
                    for col in update_columns
                )
            )
        else:
            conflict_action = sql.SQL("DO NOTHING")

        # Construct the INSERT ... ON CONFLICT query
        insert_query = sql.SQL("""
            INSERT INTO {} ({})
            VALUES %s
            ON CONFLICT (dc_identifier) {}
        """).format(
            sql.Identifier(table_name),
            sql.SQL(', ').join(map(sql.Identifier, columns)),
            conflict_action
        )

        # Insert or update data in chunks
        chunk_size = 50
        for i in range(0, len(cells), chunk_size):
            values = [tuple(row) for row in cells[i:i + chunk_size]]
            execute_values(cursor, insert_query, values)
            print(f"Processed chunk of {len(values)} records")

        conn.commit()
        print(f"Successfully uploaded/updated data for {len(cells)} records")

    except psycopg2.Error as e:
        print(f"Database error: {e}")
        if conn:
            conn.rollback()
        raise
    except Exception as e:
        print(f"Unexpected error: {e}")
        if conn:
            conn.rollback()
        raise
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()

In [None]:
# test DB connection
# import psycopg2
# from dotenv import load_dotenv
# import os

# # Load environment variables
# load_dotenv()

# # Retrieve credentials
# db_credentials = {
#     'hostname': os.getenv('DB_HOST'),
#     'port': int(os.getenv('DB_PORT', 5432)),
#     'username': os.getenv('DB_USER'),
#     'password': os.getenv('DB_PASSWORD'),
#     'database': os.getenv('DB_NAME')
# }

# try:
#     # Print credentials for debugging
#     print("Hostname:", db_credentials['hostname'])
#     print("Port:", db_credentials['port'])
#     print("Username:", db_credentials['username'])
#     print("Password:", "*****")
#     print("Database:", db_credentials['database'])

#     # Attempt to connect to the database
#     conn = psycopg2.connect(
#         host=db_credentials['hostname'],
#         port=db_credentials['port'],
#         user=db_credentials['username'],
#         password=db_credentials['password'],
#         database=db_credentials['database']
#     )
#     print("Connection successful!")
#     conn.close()
# except Exception as e:
#     print(f"Error connecting to the database: {e}")

In [None]:
# Upsert the rows of `latest_df` into the 'scopus_search_output' table.
# NOTE(review): no active (uncommented) cell in the reviewed part of this
# notebook assigns `latest_df` - confirm it is defined before running this cell.
scopus_search_data_uploader(db_credentials, latest_df, table_name='scopus_search_output')

### Abstract Retrieval API Here

In [14]:
def abstract_retrieval(scopus_credentials, doi, timeout=20):
    """
    Fetch one record from the Scopus Abstract Retrieval API by DOI.

    Args:
        scopus_credentials (dict): Must contain the API key under 'access_token'.
        doi (str): DOI of the document, e.g. '10.1016/j.rcim.2023.102626'.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        dict: The decoded JSON response.

    Raises:
        requests.exceptions.HTTPError: If the response status is an error.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # Local import: `quote` is not part of the notebook's import cell.
    from urllib.parse import quote

    print(f'abstract_retrieval() method for DOI: {doi}')

    scopus_api_key = scopus_credentials['access_token']
    # DOIs may contain characters such as '<', '>', '#' or '?' that would
    # otherwise be read as URL syntax; '/' stays literal as the DOI separator.
    encoded_doi = quote(str(doi).strip(), safe='/')
    url = f'https://api.elsevier.com/content/abstract/doi/{encoded_doi}'

    headers = {
        'X-ELS-APIKey': scopus_api_key,
        'Accept': 'application/json'
    }

    # Without a timeout a stalled connection would block the whole DOI loop.
    response = requests.get(url, headers=headers, timeout=timeout)
    print(f"Response status code: {response.status_code}")
    response.raise_for_status()

    json_response = response.json()
    # Print first 500 characters
    print(
        f"Response structure: {json.dumps(json_response, indent=2)[:500]}...")

    return json_response

In [15]:
def process_abstract_retrieval_results(abstract_data):
    """
    Flatten an Abstract Retrieval API response into one record dict.

    Args:
        abstract_data (dict | list): The decoded JSON response. If a list is
            given, only its first element is used.

    Returns:
        dict: ``{}`` when no usable data was passed. Otherwise a dict with the
        selected 'coredata' fields (``None`` for absent ones), plus
        'affiliations' (list of dicts with 'name', 'city', 'country') and
        'authors' (list of dicts with 'name' and 'affiliation'). An author's
        'affiliation' is the '@id' when the source value is a dict and the
        raw source value otherwise.

    Missing, null or wrongly typed sections are treated as empty instead of
    raising; list entries that are not dicts are skipped.
    """
    if not abstract_data:
        print("No data received from Abstract Retrieval API")
        return {}

    # Handle case where abstract_data is a list
    if isinstance(abstract_data, list):
        abstract_data = abstract_data[0] if abstract_data else {}

    if not isinstance(abstract_data, dict):
        print("No data received from Abstract Retrieval API")
        return {}

    # A key that is present but null (or not an object) is treated like a missing key.
    response = abstract_data.get('abstracts-retrieval-response')
    if not isinstance(response, dict):
        response = {}
    coredata = response.get('coredata')
    if not isinstance(coredata, dict):
        coredata = {}

    core_fields = (
        'dc:identifier',
        'dc:title',
        'prism:doi',
        'prism:coverDate',
        'citedby-count',
        'prism:publicationName',
        'subtypeDescription',
        'prism:volume',
        'prism:issueIdentifier',
        'prism:pageRange',
        'openaccess',
        'pubmed-id',
    )
    processed_data = {field: coredata.get(field) for field in core_fields}

    # Process affiliation data (a single affiliation arrives as a dict, not a list)
    affiliations = response.get('affiliation', [])
    if not isinstance(affiliations, list):
        affiliations = [affiliations] if affiliations else []
    processed_data['affiliations'] = [
        {
            'name': aff.get('affilname'),
            'city': aff.get('affiliation-city'),
            'country': aff.get('affiliation-country')
        }
        for aff in affiliations
        if isinstance(aff, dict)
    ]

    # Process author data
    authors = coredata.get('dc:creator', {})
    if isinstance(authors, dict):
        authors = authors.get('author', [])
    if not isinstance(authors, list):
        authors = [authors] if authors else []

    author_records = []
    for author in authors:
        if not isinstance(author, dict):
            continue
        author_affiliation = author.get('affiliation')
        if isinstance(author_affiliation, dict):
            author_affiliation = author_affiliation.get('@id')
        author_records.append({
            'name': author.get('ce:indexed-name'),
            'affiliation': author_affiliation
        })
    processed_data['authors'] = author_records

    return processed_data

In [None]:
# def abstract_retrieval_procedures():
#     try:
#         scopus_credentials, db_credentials = get_credentials()

#         # Test DOIs
#         test_dois = [
#             "10.1016/j.rcim.2023.102626",
#             "10.1038/s41467-024-46022-3",
#             "10.1038/s41586-024-07161-1",
#             "10.1016/j.xinn.2024.100612",
#             "10.1016/j.apcatb.2023.123312",
#             "10.1016/j.apcatb.2023.123335",
#             "10.1002/adma.202311970",
#             "10.1109/JIOT.2024.3361173",
#             "10.1016/j.engstruct.2023.117193",
#             "10.1002/adma.202310918",
#             "10.1002/adma.202307404",
#             "10.1007/s00170-022-10767-2",
#             "10.1038/s41560-023-01415-4",
#             "10.1021/acsnano.3c10674",
#             "10.1002/adma.202300034",
#             "10.1021/jacs.3c10516",
#             "10.1109/TEVC.2022.3215743",
#             "10.1002/adma.202313548",
#             "10.1016/j.joule.2023.12.009",
#             "10.1016/j.knosys.2023.111158"
#         ]

#         print(f"Testing abstract retrieval for {len(test_dois)} DOIs")

#         abstract_results = []
#         for doi in test_dois:
#             try:
#                 print(f"\nProcessing DOI: {doi}")
#                 abstract_data = abstract_retrieval(scopus_credentials, doi)
#                 processed_abstract = process_abstract_retrieval_results(
#                     abstract_data)
#                 abstract_results.append(processed_abstract)
#                 print(
#                     f"Successfully retrieved and processed abstract for DOI: {doi}")

#                 # Print some details of the processed abstract
#                 print("Abstract details:")
#                 print(f"Title: {processed_abstract.get('dc:title', 'N/A')}")
#                 print(
#                     f"Publication Name: {processed_abstract.get('prism:publicationName', 'N/A')}")
#                 print(
#                     f"Cover Date: {processed_abstract.get('prism:coverDate', 'N/A')}")
#                 print(
#                     f"Cited by Count: {processed_abstract.get('citedby-count', 'N/A')}")
#                 print("---")
#             except Exception as e:
#                 print(f"Error processing abstract for DOI {doi}: {e}")

#         if not abstract_results:
#             print("No abstract results to process.")
#             return

#         # Convert abstract results to DataFrame
#         abstract_df = pd.DataFrame(abstract_results)

#         # Print the first few rows of the abstract DataFrame
#         print("\nFirst few rows of the abstract DataFrame:")
#         print(abstract_df.head())

#         # Print DataFrame info
#         print("\nAbstract DataFrame info:")
#         abstract_df.info()

#         # Save abstract results to CSV
#         abstract_csv_file = 'scopus_abstract_output_test.csv'
#         abstract_df.to_csv(abstract_csv_file, index=False)
#         print(f"\nAbstract results saved to '{abstract_csv_file}'")
        
#         return abstract_df

#     except Exception as e:
#         print(f"An unexpected error occurred: {e}")
#         import traceback
#         traceback.print_exc()

In [None]:
def abstract_retrieval_procedures(csv_file='polyu_scopus_search_output_2020_2025.csv',
                                  abstract_csv_file='scopus_abstract_output.csv'):
    """
    Retrieve Scopus abstract metadata for every unique DOI in a CSV file.

    The unique non-null values of the 'prism_doi' column of ``csv_file`` are
    passed one by one to ``abstract_retrieval``; each response is flattened
    with ``process_abstract_retrieval_results``. A DOI that fails is reported
    and skipped. The collected records are written to ``abstract_csv_file``.

    Args:
        csv_file (str): Path of the CSV that holds the 'prism_doi' column.
        abstract_csv_file (str): Path the abstract records are saved to.

    Returns:
        pandas.DataFrame | None: The abstract records, or None when the input
        file cannot be read, no record could be retrieved, or an unexpected
        error occurred (the error is printed, not raised).

    Note: this notebook defines ``abstract_retrieval_procedures`` in more than
    one cell; whichever definition was executed last is the one in effect.
    """
    import traceback

    try:
        # Load the CSV file with DOIs first: without it there is nothing to fetch.
        try:
            df = pd.read_csv(csv_file)
            print(f"Loaded {len(df)} records from {csv_file}")
        except FileNotFoundError:
            print(
                f"No file found at {csv_file}. Please run scopus_research_procedures first.")
            return None
        except Exception as e:
            print(f"Error reading CSV file: {e}")
            return None

        # Get unique DOIs
        dois = df['prism_doi'].dropna().unique()
        print(f"Found {len(dois)} unique DOIs to process")

        # Only the Scopus credentials are needed; nothing is written to the database here.
        scopus_credentials, _ = get_credentials()

        abstract_results = []
        for doi in dois:
            try:
                abstract_data = abstract_retrieval(scopus_credentials, doi)
                processed_abstract = process_abstract_retrieval_results(
                    abstract_data)
                if not processed_abstract:
                    # An empty record would only add an all-NaN row to the output.
                    continue
                abstract_results.append(processed_abstract)
                print(
                    f"Successfully retrieved and processed abstract for DOI: {doi}")
            except Exception as e:
                # One failing DOI must not stop the remaining ones.
                print(f"Error processing abstract for DOI {doi}: {e}")

        if not abstract_results:
            print("No abstract results to process.")
            return None

        # Convert abstract results to DataFrame
        abstract_df = pd.DataFrame(abstract_results)

        # Print the first few rows of the abstract DataFrame
        print("\nFirst few rows of the abstract DataFrame:")
        print(abstract_df.head())

        # Print DataFrame info
        print("\nAbstract DataFrame info:")
        abstract_df.info()

        # Save abstract results to CSV
        abstract_df.to_csv(abstract_csv_file, index=False)
        print(f"\nAbstract results saved to '{abstract_csv_file}'")

        # The database upload step (abstract_retrieval_data_uploader) is disabled;
        # results are only saved to CSV and returned.
        return abstract_df

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        traceback.print_exc()
        return None

In [26]:
# Run the abstract retrieval pipeline and keep its return value for inspection.
# NOTE(review): the output saved under this cell starts with "Testing abstract
# retrieval for 20 DOIs", a message only the commented-out test-DOI version of
# the function prints - the saved output looks stale; confirm by re-running.
abstract_test_df = abstract_retrieval_procedures()

get_credentials() method
Testing abstract retrieval for 20 DOIs

Processing DOI: 10.1016/j.rcim.2023.102626
abstract_retrieval() method for DOI: 10.1016/j.rcim.2023.102626


Response status code: 200
Response structure: {
  "abstracts-retrieval-response": {
    "affiliation": [
      {
        "affiliation-city": "Morgantown",
        "affilname": "Benjamin M. Statler College of Engineering and Mineral Resources",
        "affiliation-country": "United States"
      },
      {
        "affiliation-city": "Hangzhou",
        "affilname": "State Key Laboratory of Fluid Power and Mechatronic Systems",
        "affiliation-country": "China"
      },
      {
        "affiliation-city": "Hangzhou",
        "affilname...
Successfully retrieved and processed abstract for DOI: 10.1016/j.rcim.2023.102626
Abstract details:
Title: Human Digital Twin in the context of Industry 5.0
Publication Name: Robotics and Computer-Integrated Manufacturing
Cover Date: 2024-02-01
Cited by Count: 183
---

Processing DOI: 10.1038/s41467-024-46022-3
abstract_retrieval() method for DOI: 10.1038/s41467-024-46022-3
Response status code: 200
Response structure: {
  "abstracts-retrieval-re

In [27]:
# Preview the first rows of the retrieved abstract records.
# NOTE(review): abstract_retrieval_procedures() has bare `return` paths, so
# `abstract_test_df` may be None here - confirm the run succeeded first.
abstract_test_df.head()

Unnamed: 0,dc:identifier,dc:title,prism:doi,prism:coverDate,citedby-count,prism:publicationName,subtypeDescription,prism:volume,prism:issueIdentifier,prism:pageRange,openaccess,pubmed-id,affiliations,authors
0,SCOPUS_ID:85166028251,Human Digital Twin in the context of Industry 5.0,10.1016/j.rcim.2023.102626,2024-02-01,183,Robotics and Computer-Integrated Manufacturing,Review,85,,,0,,[{'name': 'Benjamin M. Statler College of Engi...,"[{'name': 'Wang B.', 'affiliation': [{'@id': '..."
1,SCOPUS_ID:85186172651,Rational molecular and device design enables o...,10.1038/s41467-024-46022-3,2024-12-01,152,Nature Communications,Article,15,1.0,,1,38418862.0,[{'name': 'Chongqing Institute of Green and In...,"[{'name': 'Fu J.', 'affiliation': '60008928'}]"
2,SCOPUS_ID:85188802733,"A three-dimensional liquid diode for soft, int...",10.1038/s41586-024-07161-1,2024-04-04,126,Nature,Article,628,8006.0,84-92,0,38538792.0,"[{'name': 'City University of Hong Kong', 'cit...","[{'name': 'Zhang B.', 'affiliation': [{'@id': ..."
3,SCOPUS_ID:85191897734,Emerging contaminants: A One Health perspective,10.1016/j.xinn.2024.100612,2024-07-01,123,Innovation,Review,5,4.0,,1,,"[{'name': 'Hong Kong Baptist University', 'cit...","[{'name': 'Wang F.', 'affiliation': [{'@id': '..."
4,SCOPUS_ID:85171646525,Surface reconstruction and directed electron t...,10.1016/j.apcatb.2023.123312,2024-02-01,113,Applied Catalysis B: Environmental,Article,341,,,0,,[{'name': 'Guangdong University of Technology'...,"[{'name': 'Xu X.', 'affiliation': '60007155'}]"


In [None]:
def abstract_retrieval_procedures(csv_file='polyu_research_output.csv',
                                  abstract_csv_file='scopus_abstract_output.csv'):
    """
    Retrieve Scopus abstract metadata for every unique DOI in a CSV file.

    The unique non-null values of the 'prism_doi' column of ``csv_file`` are
    passed one by one to ``abstract_retrieval``; each response is flattened
    with ``process_abstract_retrieval_results``. A DOI that fails is reported
    and skipped. The collected records are written to ``abstract_csv_file``.

    Args:
        csv_file (str): Path of the CSV that holds the 'prism_doi' column.
        abstract_csv_file (str): Path the abstract records are saved to.

    Returns:
        pandas.DataFrame | None: The abstract records, or None when the input
        file cannot be read, no record could be retrieved, or an unexpected
        error occurred (the error is printed, not raised).

    Note: an earlier cell of this notebook defines a function with the same
    name; running this cell replaces that definition.
    """
    import traceback

    try:
        # Load the CSV file with DOIs first: without it there is nothing to fetch.
        try:
            df = pd.read_csv(csv_file)
            print(f"Loaded {len(df)} records from {csv_file}")
        except FileNotFoundError:
            print(
                f"No file found at {csv_file}. Please run scopus_research_procedures first.")
            return None
        except Exception as e:
            print(f"Error reading CSV file: {e}")
            return None

        # Get unique DOIs
        dois = df['prism_doi'].dropna().unique()
        print(f"Found {len(dois)} unique DOIs to process")

        # Only the Scopus credentials are needed; nothing is written to the database here.
        scopus_credentials, _ = get_credentials()

        abstract_results = []
        for doi in dois:
            try:
                abstract_data = abstract_retrieval(scopus_credentials, doi)
                processed_abstract = process_abstract_retrieval_results(
                    abstract_data)
                if not processed_abstract:
                    # An empty record would only add an all-NaN row to the output.
                    continue
                abstract_results.append(processed_abstract)
                print(
                    f"Successfully retrieved and processed abstract for DOI: {doi}")
            except Exception as e:
                # One failing DOI must not stop the remaining ones.
                print(f"Error processing abstract for DOI {doi}: {e}")

        if not abstract_results:
            print("No abstract results to process.")
            return None

        # Convert abstract results to DataFrame
        abstract_df = pd.DataFrame(abstract_results)

        # Print the first few rows of the abstract DataFrame
        print("\nFirst few rows of the abstract DataFrame:")
        print(abstract_df.head())

        # Print DataFrame info
        print("\nAbstract DataFrame info:")
        abstract_df.info()

        # Save abstract results to CSV
        abstract_df.to_csv(abstract_csv_file, index=False)
        print(f"\nAbstract results saved to '{abstract_csv_file}'")

        # The database upload step (abstract_retrieval_data_uploader) is disabled;
        # results are only saved to CSV and returned.
        return abstract_df

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        traceback.print_exc()
        return None