In [1]:
# Merge all data sheets but keeping the method information

import pandas as pd
import glob
import os
import re

# Define the folder path
folder_path = 'data/scopus-download-cited-by-2025-07-17' 


# Define the file pattern
file_pattern = os.path.join(folder_path, "scopus_*_*.csv")

# List all matching files
files = glob.glob(file_pattern)

# List to hold individual DataFrames
df_list = []

# Regular expression to extract the XAI method name from filename
pattern = re.compile(r"scopus_(.*?)_.*\.csv")

for file in files:
    match = pattern.search(os.path.basename(file))
    if match:
        cited_xai_method = match.group(1)
        df = pd.read_csv(file)
        df["cited_XAI_method"] = cited_xai_method 
        df_list.append(df)

# Merge all dataframes
merged_df = pd.concat(df_list, ignore_index=True)

# Remove duplicates based on 'EID' column

merged_df.shape[0]

106682

In [2]:
# Remove duplicates based on 'EID' column
# Drop duplicates on all columns except 'cited-xai-method'
df_duplicatefree = merged_df.groupby('EID').agg(
    lambda x: ', '.join(sorted(set(x))) if x.name == 'cited_XAI_method' else x.iloc[0]
).reset_index()

df_duplicatefree.shape[0]

81676

In [3]:
# Get frequency of each unique combination
combo_counts = df_duplicatefree['cited_XAI_method'].value_counts().reset_index()
combo_counts.columns = ['cited_XAI_method-combo', 'count']
print(combo_counts)

                                cited_XAI_method-combo  count
0                                                  PDP  19069
1                                     GradCAMSelvaraju  10997
2                                             DeepSHAP  10711
3                                            DeConvNet   8502
4                                                 LIME   5392
..                                                 ...    ...
962  Anchors, DeepSHAP, GradCAMSelvaraju, GradCAMZh...      1
963       DeConvNet, GradCAMZhou, LIME, LRP, gradcam++      1
964  ConditionalVariableImportance, DeConvNet, Deep...      1
965                                   BP, GBP, IG, PDP      1
966                 BP, DeepSHAP, LIME, LRP, gradcam++      1

[967 rows x 2 columns]


In [47]:
# Keep only English papers
df_filtered1 = df_duplicatefree[df_duplicatefree['Language of Original Document'] == 'English']
df_filtered1.shape[0]

79753

In [48]:
# Keep only articles
df_filtered2 = df_filtered1[df_filtered1["Document Type"] == "Article"]
df_filtered2.shape[0]

50632

In [61]:
# Keep only published articles
df_filtered3 = df_filtered2[df_filtered2["Publication Stage"] == "Final"]
df_filtered3.shape[0]

49437

In [63]:
# Calculate citations per year

df_filtered3['years_ago'] = 2025 - df_filtered3['Year']

df_filtered3['citations_per_year'] = df_filtered3.apply(
    lambda row: row['Cited by'] if row['years_ago'] <= 0 else row['Cited by'] / row['years_ago'],
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered3['years_ago'] = 2025 - df_filtered3['Year']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered3['citations_per_year'] = df_filtered3.apply(


In [64]:
# Save results
df_filtered3.to_csv('data/scopus-download-cited-by-2025-07-17/merged_filtered.csv', index=False)

In [51]:
df_filtered3 = pd.read_csv('data/scopus-download-cited-by-2025-07-17/merged_filtered.csv')

In [83]:
# Filtering with keywords in both Title and Abstract 

# Keywords should limit to the medical field
# And limit to imaging field, i.e. avoiding biomarkers


# Keywords Core Imaging Modalities, results: 2981
# keywords = ["x-ray", "x-rays", "ct", "mri", "pet", "spect", "ultrasound"]

# Keywords main methods with full names, results: 3309
# keywords = ["x-ray", "x-rays", "ct", "mri", "pet", "spect", "ultrasound", "computed tomography", "magnetic resonance imaging", "Positron Emission Tomography", "Single-Photon Emission Computed Tomography"]


# Keywords including Specialized or Derived Imaging Techniques, results: 3897
# keywords = ["ct", "tomography", "x-ray", "x-rays", "radiographs", "radiology", "mri", "magnetic resonance imaging", "pet", "spect", "fmri","ultrasound", "sonography" "mammography", "scintigraphy", "fluoroscopy", "histopathology", "ophthalmology", "Ultrasonography", "Elastography", "Scintigraphy", "scint", "echocardiogram", "fundus photography", "oct", "dermoscopy", "endoscopy"]

# Radlex keywords
# keywords = ["fluoroscopy", "magnetic resonance imaging", "mri", "spectroscopy", "nuclear medicine imaging", "panographic radiograph", "projection radiography", "spectroscopy", "tomography", "ct", "ultrasound"]

# Test single keywords
keywords = ["Fundoscopy"]

# Full matching only (not part of a word)
# \b means space, punctuation, or the start/end of a string
pattern = r"\b(" + "|".join(keywords) + r")\b"

mask = df_filtered3['Title'].str.contains(pattern, flags=re.IGNORECASE, regex=True) | \
       df_filtered3['Abstract'].str.contains(pattern, flags=re.IGNORECASE, regex=True)


df_keywordFiltered = df_filtered3[mask]
df_keywordFiltered.shape[0]

  mask = df_filtered3['Title'].str.contains(pattern, flags=re.IGNORECASE, regex=True) | \
  df_filtered3['Abstract'].str.contains(pattern, flags=re.IGNORECASE, regex=True)


2

In [71]:
df_filtered_citedLim = df_keywordFiltered[df_keywordFiltered["citations_per_year"] > 10]

df_filtered_citedLim.shape[0]

804