# Influential Papers
In this notebook we perform a citation analysis to identify the most influential papers. A series of visualisations and tables are produced for the main paper.

In [None]:
import swifter

import os
import json
import random
import time
import re
from datetime import datetime
from collections import defaultdict

import string 

import matplotlib.pyplot as plt

import Stemmer

import random
import requests
from itertools import chain
from more_itertools import sliced

import pandas as pd
import matplotlib as mpl

from matplotlib.pylab import plt
from matplotlib_venn import venn2, venn3
import matplotlib as mpl
from matplotlib.cm import ScalarMappable

import numpy as np

from glob import glob, iglob
from pathlib import Path
                         
from loguru import logger
from IPython.display import display, clear_output

from multiprocessing import Pool

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


import nltk
nltk.download('punkt')

from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')

import seaborn as sns

from itables import init_notebook_mode, show, options
init_notebook_mode(all_interactive=False)

import networkx as nx


import seaborn as sns
sns.set_context('paper', font_scale=1.25)

pd.__version__

# Load the RecSys Papers

In [None]:
# Load the RecSys papers table (with topic annotations) from the processed-data folder.
recsys_papers_df = pd.read_feather('../data/processed/2410_recsys_papers_with_topics.feather')

# Show (rows, columns) as the cell output.
recsys_papers_df.shape

In [None]:
# Derive each paper's BibTeX citation key: the text between the first '{' and
# the next ',' of its BibTeX entry. Papers without an entry get ''.
recsys_papers_df['bibtex_key'] = recsys_papers_df['bibtex'].map(
    lambda entry: '' if entry is None else entry.split('{')[1].split(',')[0]
)

In [None]:
# Load the per-topic summary table.
papers_by_topic_df = pd.read_feather('../data/processed/3410_papers_by_topic.feather')

# Display the topics flagged as growing in both papers and citations,
# highest momentum first.
(
    papers_by_topic_df
    .loc[lambda topics: topics['growing_papers'] & topics['growing_citations']]
    .sort_values(by='momentum', ascending=False)
)

# Build Citation Graph

In [None]:
def build_citation_graph(df):
    """Build a directed citation graph from a papers table.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``paperId`` column and an ``updated_citations`` column that
        holds, per paper, a list-like of the ids of the papers citing it.

    Returns
    -------
    networkx.DiGraph
        One edge per (citer, cited) pair, pointing from the citing paper to
        the cited paper. Papers with no citation entries contribute no edges.
    """
    # One row per (cited paper, citing paper) pair; missing entries are dropped.
    edge_list = (
        df
        .set_index('paperId')['updated_citations']
        .explode()
        .dropna()
        .rename('citer')
        .rename_axis('cited')
        .reset_index()
    )

    # Edges run citer -> cited.
    return nx.from_pandas_edgelist(
        edge_list,
        source="citer",
        target="cited",
        edge_attr=None,
        create_using=nx.DiGraph(),
    )
    

# Build the directed citation graph from the RecSys papers table.
recsys_papers_graph = build_citation_graph(recsys_papers_df)

# Display the graph object as the cell output.
recsys_papers_graph

In [None]:
# Graph size next to the number of distinct ids found in the papers table.
# NOTE(review): this counts ids from the 'citations' column, whereas the graph
# is built from 'updated_citations' -- confirm which column is intended.
(
    recsys_papers_graph.number_of_nodes(),
    recsys_papers_graph.number_of_edges(),
    len(
        set(recsys_papers_df['paperId'].unique())
        | set(recsys_papers_df['citations'].explode().values)
    ),
)

# Calculate Centrality Measures

In [None]:
# A wrapper that makes it easy to set the Katz attenuation factor `alpha`.
# The function name matters: get_centrality_scores uses `__name__` as the
# column name of the resulting scores.
def katz_centrality(G, alpha=.05):
    """Return the Katz centrality of every node in ``G``.

    Parameters
    ----------
    G : networkx graph
        Graph to score.
    alpha : float, default 0.05
        Attenuation factor passed to ``nx.katz_centrality``. Note that this
        default is smaller than networkx's own default of 0.1.

    Returns
    -------
    dict
        Whatever ``nx.katz_centrality`` returns: a mapping of node to score.
    """
    return nx.katz_centrality(G, alpha=alpha)

def get_centrality_scores(G):
    """Compute several centrality scores for every node of ``G``.

    The measures are in-degree centrality, PageRank and Katz centrality.

    Parameters
    ----------
    G : networkx.DiGraph
        The citation graph to score.

    Returns
    -------
    pandas.DataFrame
        One row per node (indexed by node id) and one column per measure.
        Columns are named after the measure functions:
        ``in_degree_centrality``, ``pagerank_centrality`` and
        ``katz_centrality``.
    """

    # Named wrapper so that the result column is called 'pagerank_centrality'.
    def pagerank_centrality(G): return nx.pagerank(G)

    centrality_measures = [
        nx.in_degree_centrality,
        pagerank_centrality,
        katz_centrality,
    ]
    measure_names = [measure.__name__ for measure in centrality_measures]

    score_dicts = []

    for measure_name, centrality_measure in zip(measure_names, centrality_measures):
        # Replace the previous progress message in the notebook output.
        clear_output()
        logger.info(measure_name)

        score_dicts.append(centrality_measure(G))

    # Each dict becomes a row keyed by node id; transpose to one row per node.
    scores_df = pd.DataFrame(score_dicts).T
    scores_df.columns = measure_names

    return scores_df


In [None]:
# Score every node of the citation graph with each centrality measure.
recsys_papers_centrality_df = get_centrality_scores(recsys_papers_graph)
# Show (nodes, measures) as the cell output.
recsys_papers_centrality_df.shape

In [None]:
# Display the centrality scores table.
recsys_papers_centrality_df

## Add to the RecSys Papers DF & Calculate Ranks
There are more than just RecSys papers in this graph, but we are most interested in the RecSys papers. We add the various centrality estimates to the RecSys papers for further analysis.

In [None]:
# Attach the centrality scores to the papers by paperId. The left join keeps
# every paper row; papers that are not graph nodes get NaN scores.
recsys_papers_df = recsys_papers_df.set_index('paperId').join(recsys_papers_centrality_df, how='left').reset_index()
# Preview the first rows as the cell output.
recsys_papers_df.head()

In [None]:
# Rank each centrality score among the papers in the table (rank 1 = highest
# score) and append the ranks as `<measure>_rank` columns.
centrality_ranks_df = (
    recsys_papers_df
    .filter(like='_centrality')
    .rank(ascending=False)
    .add_suffix('_rank')
)

recsys_papers_df = pd.concat([recsys_papers_df, centrality_ranks_df], axis=1)

# Aggregate over the per-measure rank columns only. Re-selecting with
# filter(like='centrality_rank') after each assignment would also match the
# aggregate columns created here (e.g. 'centrality_rank_sum') and distort the
# mean and standard deviation.
recsys_papers_df['centrality_rank_sum'] = centrality_ranks_df.sum(axis=1)
recsys_papers_df['centrality_rank_mean'] = centrality_ranks_df.mean(axis=1)
recsys_papers_df['centrality_rank_std'] = centrality_ranks_df.std(axis=1)

# Harmonic mean of the three ranks (all terms are ranks, not raw scores).
recsys_papers_df['centrality_rank_harmonic_mean'] = 3/(
    (1/recsys_papers_df['pagerank_centrality_rank'])+
    (1/recsys_papers_df['katz_centrality_rank'])+
    (1/recsys_papers_df['in_degree_centrality_rank'])
)

recsys_papers_df.head()

# The Most Influential RecSys Papers
We calculate overall influence based on a combination of centrality ranks.

In [None]:
def top_papers_table(top_papers, max_title_chars=42):
    """Build a compact, LaTeX-friendly summary table of the given papers.

    Parameters
    ----------
    top_papers : pandas.DataFrame
        Papers to list, already in display order. Needs the columns
        ``year``, ``title``, ``author_names`` (a sequence of full names per
        paper), ``citationCount`` and ``bibtex_key``.
    max_title_chars : int, default 42
        Titles longer than this are cut to this many characters and marked
        with a trailing ' ...'. Shorter titles are kept unchanged.

    Returns
    -------
    pandas.DataFrame
        Columns ``Year``, ``Title`` (title followed by a LaTeX cite command
        for the paper's BibTeX key), ``Authors`` (first author's surname,
        plus 'et al.' when there are several authors) and ``Cites``
        (thousands-separated citation count), with a 1-based index.
        The input frame is not modified.
    """

    def shorten_title(title):
        # Only add the ellipsis when something was actually cut off.
        if len(title) <= max_title_chars:
            return title
        return title[:max_title_chars] + ' ...'

    def author_label(author_names):
        # Just take the first author's surname to save space; papers with no
        # recorded authors get an empty label instead of raising.
        if author_names is None or len(author_names) == 0:
            return ''
        surname = author_names[0].split()[-1]
        return surname + ' et al.' if len(author_names) > 1 else surname

    table = top_papers[['year', 'title', 'author_names', 'citationCount', 'bibtex_key']].copy()

    table['year'] = table['year'].map(int)

    # Raw string: '\c' is not a valid escape sequence in a normal literal.
    table['title'] = (
        table['title'].map(shorten_title)
        + table['bibtex_key'].map(r'\cite{{{}}}'.format)
    )

    table['author_names'] = table['author_names'].map(author_label)
    table['citationCount'] = table['citationCount'].map('{:,.0f}'.format)

    table = table[['year', 'title', 'author_names', 'citationCount']]
    table.columns = ['Year', 'Title', 'Authors', 'Cites']
    table.index = range(1, len(table) + 1)

    return table
    

# Rank columns drawn as the stacked bar segments, in stacking order.
rank_cols = ['pagerank_centrality_rank', 'katz_centrality_rank', 'in_degree_centrality_rank']


def plot_impact_bars(ax, ranks_df, rank_cols=rank_cols, top_n=25):
    """Draw a stacked bar chart of centrality ranks for the top papers.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    ranks_df : pandas.DataFrame
        Papers with a ``centrality_rank_sum`` column, the columns named in
        ``rank_cols``, and ``title``, ``author_names`` and ``year``. An
        optional ``paper_type`` column marks papers whose value is
        ``'outside'`` with a leading '* ' in the tick label.
    rank_cols : list of str
        Rank columns to stack, bottom to top.
    top_n : int, default 25
        Number of papers to show (those with the lowest rank sum).

    Returns
    -------
    pandas.DataFrame
        The ``top_n`` rows of ``ranks_df`` with the smallest
        ``centrality_rank_sum``, in plotted order.
    """
    # Legend text per rank column. Looking labels up by column keeps the
    # legend in step with the order in which the segments are stacked.
    legend_names = {
        'in_degree_centrality_rank': 'Citations/In-Degree',
        'pagerank_centrality_rank': 'PageRank',
        'katz_centrality_rank': 'Katz',
    }

    top_n_papers = ranks_df.sort_values(by='centrality_rank_sum', ascending=True).head(top_n)

    top_n_papers[rank_cols].plot(ax=ax, kind='bar', stacked=True, alpha=.9, ec='k', lw=1, width=0.75)

    # Over-print an outline around each stacked bar.
    top_n_papers['centrality_rank_sum'].plot(ax=ax, kind='bar', ec='k', lw=1, color='none', width=0.75)

    def tag_title(paper):
        # Flag papers whose optional 'paper_type' is 'outside'.
        if 'paper_type' in paper and paper['paper_type'] == 'outside':
            return '* '
        return ''

    def get_author_label(paper):
        # First author's surname, plus 'et al.' when there are co-authors;
        # empty when no authors are recorded.
        author_names = paper['author_names']
        if author_names is None or len(author_names) == 0:
            return ''
        surname = author_names[0].split()[-1]
        return surname if len(author_names) == 1 else surname+' et al.'

    # Tick label: (optional tag) + title cut to 25 characters, author, year.
    labels = top_n_papers.apply(
        lambda paper: '{}{}, {},({:.0f})'.format(
            tag_title(paper),
            paper['title'][:25] if len(paper['title'])<=25 else paper['title'][:25]+'... ',
            get_author_label(paper),
            paper['year'],
        ), axis=1)

    ax.set_xticklabels(labels, ha='center', va='top')

    # One legend entry per stacked segment, in plotting order.
    ax.legend(
        [legend_names.get(col, col) for col in rank_cols],
        ncol=len(rank_cols), loc='upper left', frameon=False,
    )

    return top_n_papers


In [None]:
# Boolean row masks over recsys_papers_df used to select presentable papers.
# NOTE(review): `with_authors` is defined here but not applied in the
# selections visible below -- confirm whether it should be.

# Title present and made of more than two words.
with_titles = recsys_papers_df['title'].map(
    lambda title: title is not None and len(title.split()) > 2
)
# At least one author recorded.
with_authors = recsys_papers_df['author_names'].map(
    lambda authors: authors is not None and len(authors) > 0
)
# Venue present and longer than three characters.
with_venues = recsys_papers_df['venue'].map(
    lambda v: v is not None and len(v) > 3
)
# Topic assigned.
with_topic = recsys_papers_df['recsys_adj_topic_name'].notna()

# Every column whose name contains 'centrality_rank' is populated.
with_centrality_ranks = (
    recsys_papers_df
    .filter(like='centrality_rank')
    .notna()
    .all(axis=1)
)

with_centrality_ranks.sum()

## Top RecSys Papers Overall

In [None]:
# Bar chart and LaTeX table of the 25 papers with the lowest centrality rank sum.
fig, ax = plt.subplots(figsize=(10, 10))

# Only consider papers that have ranks, a usable title, a topic and a venue.
top_n_papers_overall = plot_impact_bars(ax, recsys_papers_df[with_centrality_ranks & with_titles & with_topic & with_venues], top_n=25)

# Write (up to) the first two topic names vertically above each bar;
# the +10 lifts the text clear of the bar top.
for x, (_, paper) in enumerate(top_n_papers_overall.iterrows()):
    ax.text(x, paper['centrality_rank_sum']+10, ', '.join(paper['recsys_adj_topic_name'].split(', ')[:2]), rotation=90, fontstyle='italic')

# Fixed y-range, hard-coded for this figure.
ax.set_ylim(0, 260)
ax.set_ylabel('Rank Sum')

fig.tight_layout()

fig.savefig('../graphs/3500_top_n_all_time_influential_recsys_papers.png', dpi=300, bbox_inches='tight')

# Print the matching table as LaTeX source, indexed by year.
print(top_papers_table(top_n_papers_overall).set_index('Year').to_latex())

In [None]:
# Print the BibTeX entries of the top papers, one entry after another.
print('\n'.join(top_n_papers_overall['bibtex'].values))

In [None]:
# Number of distinct topic labels among the top papers, and papers per label.
top_n_papers_overall['recsys_adj_topic_name'].nunique(), top_n_papers_overall['recsys_adj_topic_name'].value_counts()

## Top n Recent Influential Papers (2014-2024)

In [None]:
# Same chart and table as the overall ranking, restricted to recent papers.
fig, ax = plt.subplots(figsize=(10, 10))

# "Recent" means published after 2013, i.e. from 2014 onwards.
is_recent = recsys_papers_df['year']>2013

top_n_papers_recent = plot_impact_bars(ax, recsys_papers_df[with_centrality_ranks & with_titles & with_topic & with_venues & is_recent], top_n=25)

# Write (up to) the first two topic names vertically above each bar;
# the +30 lifts the text clear of the bar top.
for x, (_, paper) in enumerate(top_n_papers_recent.iterrows()):
    ax.text(x, paper['centrality_rank_sum']+30, ', '.join(paper['recsys_adj_topic_name'].split(', ')[:2]), rotation=90, fontstyle='italic')

# Fixed y-range, hard-coded for this figure.
ax.set_ylim(0, 1600)
ax.set_ylabel('Rank Sum')

fig.tight_layout()

fig.savefig('../graphs/3500_top_n_recent_influential_recsys_papers.png', dpi=300, bbox_inches='tight')

# Print the matching table as LaTeX source, indexed by year.
print(top_papers_table(top_n_papers_recent).set_index('Year').to_latex())

In [None]:
# Print the BibTeX entries of the top recent papers, one entry after another.
print('\n'.join(top_n_papers_recent['bibtex'].values))

# Save RecSys Papers with Ranks

In [None]:
# Persist the papers table, now including centrality scores and ranks.
recsys_papers_df.to_feather('../data/processed/3500_recsys_papers_with_influence_ranks.feather')
# Show (rows, columns) as the cell output.
recsys_papers_df.shape