In [None]:
# Merge all data sheets but keeping the method information

import pandas as pd
import glob
import os
import re

# Define the folder path
folder_path = 'data/scopus-download-cited-by-2025-07-17' 

# Define the file pattern
file_pattern = os.path.join(folder_path, "scopus_*_*.csv")

# List all matching files
files = glob.glob(file_pattern)

# List to hold individual DataFrames
df_list = []

# Regular expression to extract the XAI method name from filename
pattern = re.compile(r"scopus_(.*?)_.*\.csv")

for file in files:
    match = pattern.search(os.path.basename(file))
    if match:
        cited_xai_method = match.group(1)
        df = pd.read_csv(file)
        df["cited_XAI_method"] = cited_xai_method 
        df_list.append(df)

# Merge all dataframes
merged_df = pd.concat(df_list, ignore_index=True)

# Remove duplicates based on 'EID' column

merged_df.shape[0]

106682

In [2]:
# Remove duplicates based on 'EID' column
# Drop duplicates on all columns except 'cited-xai-method'
df_duplicatefree = merged_df.groupby('EID').agg(
    lambda x: ', '.join(sorted(set(x))) if x.name == 'cited_XAI_method' else x.iloc[0]
).reset_index()

df_duplicatefree.shape[0]

81676

In [3]:
# Get frequency of each unique combination
combo_counts = df_duplicatefree['cited_XAI_method'].value_counts().reset_index()
combo_counts.columns = ['cited_XAI_method-combo', 'count']
print(combo_counts)

                                cited_XAI_method-combo  count
0                                                  PDP  19069
1                                     GradCAMSelvaraju  10997
2                                             DeepSHAP  10711
3                                            DeConvNet   8502
4                                                 LIME   5392
..                                                 ...    ...
962  Anchors, DeepSHAP, GradCAMSelvaraju, GradCAMZh...      1
963       DeConvNet, GradCAMZhou, LIME, LRP, gradcam++      1
964  ConditionalVariableImportance, DeConvNet, Deep...      1
965                                   BP, GBP, IG, PDP      1
966                 BP, DeepSHAP, LIME, LRP, gradcam++      1

[967 rows x 2 columns]


In [47]:
# Keep only English papers
df_filtered1 = df_duplicatefree[df_duplicatefree['Language of Original Document'] == 'English']
df_filtered1.shape[0]

79753

In [48]:
# Keep only articles
df_filtered2 = df_filtered1[df_filtered1["Document Type"] == "Article"]
df_filtered2.shape[0]

50632

In [61]:
# Keep only published articles
df_filtered3 = df_filtered2[df_filtered2["Publication Stage"] == "Final"]
df_filtered3.shape[0]

49437

In [None]:
# Calculate citations per year

df_filtered3['years_ago'] = 2025 - df_filtered3['Year']

df_filtered3['citations_per_year'] = df_filtered3.apply(
    lambda row: row['Cited by'] if row['years_ago'] <= 0 else row['Cited by'] / row['years_ago'],
    axis=1
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered3['years_ago'] = 2025 - df_filtered3['Year']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered3['citations_per_year'] = df_filtered3.apply(


In [64]:
# Save results
df_filtered3.to_csv('data/scopus-download-cited-by-2025-07-17/merged_filtered.csv', index=False)

In [1]:
import pandas as pd

df_filtered3 = pd.read_csv('data/scopus-download-cited-by-2025-07-17/merged_filtered.csv')

In [2]:
import re
# Filtering with keywords in both Title and Abstract 

# KEYWORDS
# All keywords from MeSH plus abbreviations

keywords_ends_with_graph = [ "Radio", "Tomo","Angio", "Echocardio", "Ultrasono", "Angiocardio", "Mammo",  "Thermo",  "Holo", "Cholangiopancreato", "Uro", "Phlebo", "Cholangio", "Aorto", "Microtomo", "Ventriculo", "Myelo", "Endosono",  "Arthro", "Lympho", "Reno", "Broncho", "Hysterosalpingo", "Lymphoscinti", "Echoencephalo","Cysto", "Cineangio", "Colono", "Cholecysto", "Cineradio", "Photomicro", "Microradio", "Pneumoencephalo", "Porto",  "Sialo", "Defeco", "Electrokymo", "Xeroradio",  "Neuroradio", "Dacryocysto", "Photofluoro", "Moire Topo", "Xeromammo", "Pneumoradio"] 

keywords_ending_scop = ["Spectro", "Micro","Fluoro", "Dermo","Strobo","Angio"] 

keywords_ending_metr =[ "Absorptio","Radiostereo", "Photogram"]

keywords_fixed = ["Magnetic Resonance Imaging", "ultrasound", "Neuroimaging", "Brain Mapping", "Radionuclide", "Molecular Imaging", "Optical Imaging", "Diffusion Tensor Imaging", "Multimodal Imaging", "Perfusion Imaging", "Electron Probe Microanalysis", "Radiomics", "Elasticity Imaging", "Barium Enema", "Echo-Planar Imaging", "Whole Body Imaging", "Cell Tracking", "Narrow Band Imaging", "Transillumination", "Terahertz Imaging", "Quantitative Phase Imaging", "Laser Speckle Contrast Imaging", "Radioimmunodetection", "Gated Blood-Pool Imaging", "Voltage-Sensitive Dye Imaging", "Laser Scanning Cytometry", "Postmortem Imaging", "Forensic Imaging", "Brain Cortical Thickness", "Dopaminergic Imaging",  "cat","ct", "mri", "pet", "cxr", "echo"]

keywords_plural_with_s_possible = ["x-ray","microwave", "Nuchal Translucency Measurement", "Cervical Length Measurement"]

# REGEX PATTERNS
# \b means space, punctuation, or the start/end of a string
# ?: means non-capturing group, i.e. not saving the results

pattern_graph = r'\b(?:' + '|'.join(f"{k}(?:graph|graphs|gram|grams|graphy|graphies|graphic|graphical)" for k in keywords_ends_with_graph) + r')\b'

pattern_scop = r'\b(?:' + '|'.join(f"{k}(?:scope|scopy|scopic|scopical)" for k in keywords_ending_scop) + r')\b'

pattern_metr = r'\b(?:' + '|'.join(f"{k}(?:metry|metric|metrical|meter)" for k in keywords_ending_metr) + r')\b'

pattern_fixed = r'\b(?:' + '|'.join(keywords_fixed) + r')\b'

pattern_plural = r'\b(?:' + '|'.join(f"{k}s?" for k in keywords_plural_with_s_possible) + r')\b'

# SEARCH
# Combine both columns of interest for search
combined = df_filtered3['Title'] + ' ' + df_filtered3['Abstract']

# Perform searches
mask_graph = combined.str.contains(pattern_graph, flags=re.IGNORECASE, regex=True)
mask_scop = combined.str.contains(pattern_scop, flags=re.IGNORECASE, regex=True)
mask_metr = combined.str.contains(pattern_metr, flags=re.IGNORECASE, regex=True)
mask_fixed = combined.str.contains(pattern_fixed, flags=re.IGNORECASE, regex=True)
mask_plural = combined.str.contains(pattern_plural, flags=re.IGNORECASE, regex=True)

# Combine all match masks
final_mask = mask_graph | mask_scop | mask_metr | mask_fixed | mask_plural

# Keep only matching rows
df_keywordFiltered = df_filtered3[final_mask].copy()

# Show the number of entries
df_keywordFiltered.shape[0]

5531

In [3]:
# Save results

df_keywordFiltered['citations_per_year'] = df_keywordFiltered['citations_per_year'].round().astype(int)


df_keywordFiltered.to_csv('data/scopus-download-cited-by-2025-07-17/keyword_filtered_2025-08-26.csv', index=False)

In [6]:
df_keywordFiltered.to_excel('data/scopus-download-cited-by-2025-07-17/keyword_filtered_2025-08-26.xlsx', index=False)

In [4]:
df_filtered_citedLim = df_keywordFiltered[df_keywordFiltered["citations_per_year"] > 20]

df_filtered_citedLim.shape[0]

366

In [None]:
# Backup graph keywords
# keywords_ends_with_graph_full = [ "Radiograph", "Tomograph","Angiograph", "Echocardiograph", "Ultrasonograph", "Angiocardiograph", "Mammograph",  "Thermograph",  "Holograph", "Cholangiopancreatograph", "Urograph", "Phlebograph", "Cholangiograph", "Aortograph", "Microtomograph", "Ventriculograph", "Myelograph", "Endosonograph",  "Arthrograph", "Lymphograph", "Renograph", "Bronchograph", "Hysterosalpingograph", "Lymphoscintigraph", "Echoencephalograph","Cystograph", "Cineangiograph", "Colonograph", "Cholecystograph", "Cineradiograph", "Photomicrograph", "Microradiograph", "Pneumoencephalograph", "Portograph",  "Sialograph", "Defecograph", "Electrokymograph", "Xeroradiograph",  "Neuroradiograph", "Dacryocystograph", "Photofluorograph", "Moire Topograph", "Xeromammograph", "Pneumoradiograph"] 