In [None]:
from semanticscholar import SemanticScholar
import pickle as pkl
import csv
import json
from difflib import SequenceMatcher as sm
from tqdm.notebook import tqdm
import pandas as pd
from collections import Counter

from IPython.display import display, HTML
# Inject a CSS rule into the notebook page: `white-space: pre` on <pre>
# elements inside `div.output_area`, so long printed lines are not wrapped.
display(HTML("<style>div.output_area pre {white-space: pre;}</style>"))


In [None]:
# Paper fields passed as `fields=` to the keyword search below.
fields = [
    'title',
    'abstract',
    'year',
    'venue',
    'publicationVenue',
    'externalIds',
    'url',
    'journal',
    'referenceCount',
    'citationCount',
    'influentialCitationCount',
    'fieldsOfStudy',
    'authors',
    's2FieldsOfStudy',
    'publicationTypes',
]

# Semantic Scholar client shared by the retrieval cells, built with default arguments.
ss = SemanticScholar()

In [None]:
# Retrieve candidate papers from Semantic Scholar with keyword queries.
#
# Each entry of `keywords_xai` is used twice: as a search query, and as a
# substring filter on the lower-cased "title + abstract" text of every hit.
# That text is padded with a space on both sides so that the " xai " keyword
# only matches the standalone word, including at the very start or end.

keywords_xai = [
    " xai ", "(xai)", "hcxai", "explainability", 
    "interpretability", 
    "explainable ai",
    "explainable artificial intelligence",
    "interpretable ml", "interpretable machine learning", "interpretable model",
    "feature attribution", "feature importance", "global explanation", "local explanation",
    "local interpretation", "global interpretation",
    "model explanation", "model interpretation", "saliency", "counterfactual explanation"]

# Any of these substrings disqualifies a paper outright, regardless of how
# many keywords it matches.
banned = ["/xai/xai", "xai-xai", "xai xai", "workshop", "proceedings"]

# paperId -> paper record (plain dict); the first query that finds a paper wins.
papers = {}

for query in keywords_xai:
    print("Retrieving papers with query:", query)

    res = ss.search_paper(query, fields=fields)

    # iter() keeps tqdm in "unknown total" mode, exactly as wrapping the results
    # in enumerate() did, without carrying an unused index around.
    for x in tqdm(iter(res)):
        # Guard against records without a title or without an abstract.
        title = x.title or ''
        abstract = x.abstract or ''
        title_lower = f" {title.lower()} {abstract.lower()} "

        # A banned substring vetoes the paper no matter how many keywords match.
        if any(term in title_lower for term in banned):
            continue

        # Keep only papers mentioning at least two distinct keywords.
        count = sum(keyword in title_lower for keyword in keywords_xai)
        if count < 2:
            continue

        # Read the id while `x` is still a paper object: once converted to a
        # plain dict, attribute access (`x.paperId`) raises AttributeError.
        paper_id = x.paperId

        x = dict(x)
        # Drop the embedding (if present) from the stored record.
        x.pop('embedding', None)

        if paper_id not in papers:
            papers[paper_id] = x

    print("# papers:", len(papers))

In [None]:
# This cell performs one round of citation expansion based on a set of seed papers.
# For the XAI-Scholar dataset, I performed continuous expansion runs until an
# empty round (no new papers found).
#
# For every seed paper, its references and citations are screened with the same
# keyword filter as the search cell (at least two keywords, no banned substring);
# papers that pass are fetched in full and added to `expanded_papers`.

citation_expansion_seed = papers

# paperId -> paper record (plain dict): every seed paper plus the newly found ones.
expanded_papers = {}

# Number of papers found in this round that were not already seeds.
new_paper_count = 0

for pid in tqdm(citation_expansion_seed):
    # The paper dicts returned by "search_paper()" don't include references and citations,
    # so we need to retrieve them separately.
    full_paper = dict(ss.get_paper(pid))
    expanded_papers[pid] = full_paper

    # Treat a missing key and an explicit None the same way: nothing to expand.
    refs = full_paper.get('references') or []
    cites = full_paper.get('citations') or []

    for paper in refs + cites:
        pid2 = paper['paperId']

        # Entries without an id cannot be fetched; known ids were already handled.
        if pid2 is None or pid2 in expanded_papers or pid2 in citation_expansion_seed:
            continue

        # Build the filter text from THIS reference/citation only; title and
        # abstract may be None. Padding lets " xai " match at either end.
        title = paper['title'] or ''
        abstract = paper['abstract'] or ''
        title_lower = f" {title.lower()} {abstract.lower()} "

        # A banned substring vetoes the paper no matter how many keywords match.
        if any(term in title_lower for term in banned):
            continue

        # Keep only papers mentioning at least two distinct keywords.
        count = sum(keyword in title_lower for keyword in keywords_xai)
        if count < 2:
            continue

        # Fetch the full record and store THAT (not the abbreviated entry from
        # the reference/citation list), without its embedding.
        expanded = dict(ss.get_paper(pid2))
        expanded.pop('embedding', None)

        expanded_papers[pid2] = expanded
        new_paper_count += 1

# A round that reports 0 new papers is the "empty round" that ends the expansion.
print("# new papers:", new_paper_count)