# Initial Papers/Authors/Venues Analysis
In this notebook we focus on the number of recsys papers, authors, and venues by year and produce several visualisations to show how the field has progressed since 1990.

In [None]:
import swifter
import math
import os
import json
import random
import time
from datetime import datetime

import string 

import matplotlib.pyplot as plt

import Stemmer

import random
import requests
from itertools import chain
from more_itertools import sliced

import pandas as pd
import matplotlib as mpl
from matplotlib.pylab import plt
from matplotlib import colormaps
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from matplotlib.cm import ScalarMappable
from matplotlib.lines import Line2D
from matplotlib.patches import Patch


import numpy as np

from glob import glob, iglob
from pathlib import Path
                         
from loguru import logger
from IPython.display import display, clear_output

from multiprocessing import Pool

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import nltk
from nltk.stem import WordNetLemmatizer

import seaborn as sns

from itables import init_notebook_mode, show, options
init_notebook_mode(all_interactive=False)


pd.__version__

In [None]:
# Use seaborn's 'paper' plotting context with fonts scaled up by 1.25 for the figures below.
sns.set_context('paper', font_scale=1.25)

# Setup

## Load the Papers Dataset

In [None]:
# Load the papers dataset from its feather file; the trailing expression
# displays (rows, columns) as the cell output.
papers_dataset = '../data/processed/2300_recsys_universe_papers.feather'
papers_df = pd.read_feather(papers_dataset)
papers_df.shape

In [None]:
# Load the RecSys papers dataset from its feather file; the trailing
# expression displays (rows, columns) as the cell output.
recsys_papers_dataset = '../data/processed/2300_inside_outside_papers.feather'
recsys_papers_df = pd.read_feather(recsys_papers_dataset)
recsys_papers_df.shape

In [None]:
# While we are here, we add the bibtex key because this is useful later when
# producing cite keys. The key is the text that follows the first '{' of the
# BibTeX record, cut at the first ','; papers without a record get ''.
recsys_papers_df['bibtex_key'] = [
    '' if bibtex is None else bibtex.split('{')[1].split(',')[0]
    for bibtex in recsys_papers_df['bibtex']
]

recsys_papers_df['bibtex_key'].head()

## Fix some venue naming inconsistencies
Address some of the inconsistency in venue names and simplify venue names for later plots.

In [None]:
def fix_venue(df, from_venue, to_venue):
    """Return the 'clean_venue' values of `df` with one venue name replaced.

    Every entry equal to `from_venue` becomes `to_venue`; all other entries
    are passed through unchanged. `df` itself is not modified.

    Returns:
        A NumPy array aligned with the rows of `df`.
    """
    venues = df['clean_venue']
    needs_swap = venues == from_venue

    return np.where(needs_swap, to_venue, venues)

# (from_venue, to_venue) pairs, applied in order to the 'clean_venue' column.
venue_swaps = [
    # CHI and its companion / extended-abstract variants
    ('chi extended abstract', 'chi'),
    ('chi conference companion', 'chi'),
    ('sigchi conference human factor computing system', 'chi'),
    ('extended abstract chi conference human factor computing system', 'chi'),
    ('chi extended abstract human factor computing system', 'chi'),
    ('chi conference human factor computing system', 'chi'),

    # Communications of the ACM
    ('communication acm', 'cacm'),
    ('acm cacm', 'cacm'),

    ('acm trans interact intell syst', 'transaction interactive intelligent system'),

    # NOTE(review): these two map to different names ('toism' vs 'tois') and
    # both names are labelled separately in the venue scatter plot -- confirm
    # that keeping them apart is intended.
    ('acm trans inf syst', 'toism'),
    ('transaction information system', 'tois'),

    ('acm trans internet techn', 'toit'),
    ('transaction internet technology', 'toit'),

    ('acm trans multim comput commun appl', 'tomm'),

    ('acm transaction web', 'tweb'),

    ('web information system engineering', 'wise'),

    ('siam data mining', 'sdm'),

    ('acm transaction knowledge discovery data', 'tkdd'),
    ('transaction knowledge discovery data', 'tkdd'),

    ('acm transaction recommender system', 'tors'),
    ('transaction recommender system', 'tors'),

    ('acm conference recommender system', 'acm recsys'),
    ('conference recommender system', 'acm recsys'),
]

# Rewrite the column once per pair.
for from_venue, to_venue in venue_swaps:
    recsys_papers_df['clean_venue'] = fix_venue(recsys_papers_df, from_venue, to_venue)

recsys_papers_df

# Universe Analysis
First we compare the papers/authors in the wider universe to the RS papers/authors and look at growth over time.

In [None]:
def _yearly_counts(df):
    """Return per-year (paper count, unique authors, unique venues) for 1990-2023."""
    years = slice(1990, 2023)  # label-based slice, both ends inclusive

    papers = df.groupby('year').size().loc[years]
    authors = (
        df.set_index('year')['authors']
        .explode()
        .dropna()
        .groupby('year')
        .nunique()
        .loc[years]
    )
    venues = df.groupby('year')['clean_venue'].nunique().loc[years]

    return papers, authors, venues


# RecSys papers only.
(
    num_recsys_papers_by_year,
    num_unique_recsys_authors_by_year,
    num_unique_recsys_venues_by_year,
) = _yearly_counts(recsys_papers_df)

# All papers in the wider universe.
(
    num_papers_by_year,
    num_unique_authors_by_year,
    num_unique_venues_by_year,
) = _yearly_counts(papers_df)


In [None]:

def plot_regplot(ax, data, order=4, c='k'):
    """Plot a series as hollow markers with a polynomial regression line.

    Args:
        ax: Matplotlib axes to draw on.
        data: pandas Series; its index supplies x and its values supply y.
        order: Polynomial order passed to `sns.regplot`.
        c: Colour used for both the markers and the fitted line.

    The y-axis lower limit is pinned to 0 and both axis labels are cleared so
    that callers can set their own.
    """
    # Draw once: the original issued this identical call twice, which painted
    # every marker and the trend line a second time on top of the first.
    sns.regplot(
        ax=ax, x=data.index, y=data.values, color=c, order=order,
        line_kws=dict(lw=1), scatter_kws=dict(s=15, fc='w'), ci=False,
    )

    ax.set_ylim(0)
    ax.set_xlabel('')
    ax.set_ylabel('')

    

    

# Two panels: (a) papers per year and (b) unique authors per year in the
# universe (left axes, blue), each with the RecSys fraction on a twin right
# axis (orange).
# NOTE(review): `num_papers_by_year` is re-bound to RecSys-only counts by the
# Version 1 bar-chart cell further down, so this cell only produces the
# intended figure when the notebook is run top to bottom.
fig, (ax, bx) = plt.subplots(figsize=(9, 2.5), ncols=2, sharex=True)

aax = ax.twinx()
bbx = bx.twinx()

plot_regplot(ax, num_papers_by_year, c='tab:blue')
plot_regplot(bx, num_unique_authors_by_year, c='tab:blue')

plot_regplot(aax, num_recsys_papers_by_year/num_papers_by_year, c='tab:orange')
plot_regplot(bbx, num_unique_recsys_authors_by_year/num_unique_authors_by_year, c='tab:orange')

# Show the left axes in thousands. A formatter derives each label from the
# tick's actual value, so labels stay correct if the automatic tick locator
# picks different ticks later (e.g. after tight_layout); the previous
# set_yticklabels(get_yticks()) froze the labels but not the tick positions.
for xx in [ax, bx]:
    xx.yaxis.set_major_formatter(
        mpl.ticker.FuncFormatter(lambda value, _pos: '{:.0f}'.format(value/1000))
    )

ax.set_ylabel("Papers ('000s)")
bx.set_ylabel("Authors ('000s)")

ax.set_title('(a) Papers x Year')
bx.set_title('(b) Unique Authors x Year')

for xx in [aax, bbx]:
    xx.set_ylim(0, .06)
    xx.set_yticks(np.arange(0, .07, .02))
    xx.set_ylabel('Frac RecSys')

# Proxy artists for the legends (blue = universe, orange = RecSys fraction).
custom_lines = [
    Line2D([0], [0], color='tab:blue', lw=1),
    Line2D([0], [0], color='tab:orange', lw=1)
]

ax.legend(custom_lines, ['$U_p$', '$R_p/U_p$'], ncol=1, loc='upper left', frameon=False)
# NOTE(review): panel (b) plots authors but reuses the paper-subscripted
# labels -- confirm whether '$U_a$' / '$R_a/U_a$' was intended.
bx.legend(custom_lines, ['$U_p$', '$R_p/U_p$'], ncol=1, loc='upper left', frameon=False)

fig.tight_layout()

fig.savefig('../graphs/3000_papers_authors_by_year_in_universe.png', dpi=300, bbox_inches='tight')


# The Rise of Recommender Systems
Next, we focus on the core RS papers and look at how these papers, authors, and venues have grown over time.

## The Papers/Authors/Venues Bar Chart
This is a central chart in the study, showing the growth of the RS field over time in terms of the number of papers produced, unique authors, and venues. We also highlight the top cited paper each year.

In [None]:
# Boolean masks used to pick candidate "top paper" rows.

# Paper has at least one author name. This must be a genuine boolean: the
# masks are combined with `&`, which is bitwise on integers, so storing the
# raw author count here (as before) turned every even count into False
# (e.g. 2 & True == 0) and silently dropped those papers.
with_authors = recsys_papers_df['author_names'].map(
    lambda names: len(names) > 0 if names is not None else False
)

# Paper has more than 100 citations.
with_citations = recsys_papers_df['citationCount'] > 100

# Title is present, longer than 15 characters, and does not start with
# 'recommend' or 'introduction' (case-insensitive).
with_title = recsys_papers_df['title'].map(
    lambda title: (
        title is not None
        and len(title) > 15
        and not title.lower().startswith(('recommend', 'introduction'))
    )
)

# Rank over all papers, 1 = most cited; ties are broken by row order.
recsys_papers_df['citation_rank'] = recsys_papers_df['citationCount'].rank(method='first', ascending=False)

def top_paper_label(paper):
    """Build the three-line annotation text for one year's top paper.

    Args:
        paper: A row (pandas Series) providing 'paper_type', 'author_names',
            'title', 'citationCount' and 'citation_rank'; the row's `name`
            is used as the year.

    Returns:
        '<marker> <surname> <et al.>' on the first line, the first 16 title
        characters followed by '...' on the second, and
        '(year, citations, rank)' on the third. The marker is '*' when
        paper_type is 'outside'; 'et al.' appears when there is more than
        one author.
    """
    names = paper['author_names']

    marker = '*' if paper['paper_type'] == 'outside' else ''
    et_al = 'et al.' if len(names) > 1 else ''
    surname = names[0].split(' ')[-1]  # last word of the first author's name

    template = '{} {} {}\n{}...\n({}, {:,}, {:,})'

    return template.format(
        marker,
        surname,
        et_al,
        paper['title'][:16],
        int(paper.name),
        paper['citationCount'],
        int(paper['citation_rank']),
    )

# One annotation label per year, taken from the most-cited candidate paper.
# Rows are sorted by citations (descending) before grouping, so the first
# entry in each year group is the top paper.
# NOTE(review): GroupBy.first() returns the first non-null value of each
# column, so a null field in the top row would be filled from a lower-ranked
# paper of the same year -- confirm this cannot affect the columns used here.
top_n_papers_by_year = (
    recsys_papers_df.loc[with_authors & with_citations & with_title]
    .sort_values('citationCount', ascending=False)
    .groupby('year')
    .first()
    .apply(top_paper_label, axis=1)
)

top_n_papers_by_year.head()

In [None]:
# Citation count of each year's most-cited candidate paper.
top_n_cites_by_year = (
    recsys_papers_df.loc[with_authors & with_citations & with_title]
    .sort_values('citationCount', ascending=False)
    .groupby('year')['citationCount']
    .first()
)

min_citations = top_n_cites_by_year.min()
max_citations = top_n_cites_by_year.max()

# Colour scale for the annotation boxes. The upper bound is three times the
# maximum, so only the lower third of the 'Purples' map is ever used.
norm = mpl.colors.Normalize(vmin=min_citations, vmax=max_citations*3)
cmap = plt.get_cmap('Purples')

Counting the number of venues is complicated because the data is noisy. Rather than attempting further cleanup, we use an estimate below by eliminating venues that appear fewer than a minimum number of times.

In [None]:
# Thresholds for keeping a venue. Both comparisons below are strict, so a
# venue needs MORE than 3 papers spread over MORE than 2 distinct years.
min_venue_size = 3
min_venue_years = 2

papers_by_venue = recsys_papers_df.groupby('clean_venue')
venue_sizes = papers_by_venue.size()
venue_years = papers_by_venue['year'].nunique()

big_enough = venue_sizes > min_venue_size
spans_enough_years = venue_years > min_venue_years
use_venues = venue_sizes[big_enough & spans_enough_years]

# Row mask: the paper's venue survived the filter. The trailing expression
# displays how many papers that is.
is_valid_venue = recsys_papers_df['clean_venue'].isin(use_venues.index)
is_valid_venue.sum()

### Version 1

In [None]:
fig, ax = plt.subplots(figsize=(12, 9))

# Per-year counts for the RecSys papers (venues restricted to valid venues).
# Note that this re-binds `num_papers_by_year`, which an earlier cell used for
# the universe-wide counts.
num_authors_by_year = recsys_papers_df.groupby('year')['authors'].apply(
    lambda year_authors: year_authors.explode().dropna().nunique()
)
num_papers_by_year = recsys_papers_df.groupby('year').size()
num_venues_by_year = recsys_papers_df[is_valid_venue].groupby('year')['clean_venue'].nunique()

# Bars are nested by width: authors (widest) behind papers on the left axis,
# venues (narrowest) on a twin right axis.
bar_style = dict(ec='k', lw=.5, zorder=1000)

ax.bar(num_authors_by_year.index, num_authors_by_year, width=.9, color='tab:olive', alpha=1, **bar_style)
ax.bar(num_papers_by_year.index, num_papers_by_year, width=.7, color='tab:cyan', alpha=.8, **bar_style)

bx = ax.twinx()
bx.bar(num_venues_by_year.index, num_venues_by_year, width=.5, color='tab:brown', alpha=.8, **bar_style)

# Shared styling for the top-paper annotation boxes; 'bbox' is overwritten
# per paper inside the loop below.
annotate_props = dict(
    ha='center', va='bottom',
    bbox=dict(lw=.5, ec='k', fc='whitesmoke'),
    fontsize=10, fontstyle='italic',
    color='darkslategrey',
    arrowprops=dict(arrowstyle='->', lw=.5, color='k'),
)

# Hand-placed label positions: year -> (y position of the text, zorder).
# A year that has a top paper but no entry here raises KeyError.
offsets = {
    1992: (3000, 5), 1994: (10000, 4), 1995: (17000, 3), 1997: (24000, 2),
    1998: (31000, 1), 1999: (38000, 0), 2000: (45000, -1),
    2001: (3500, 5), 2002: (10500, 4), 2003: (17500, 3), 2004: (24500, 2),
    2005: (31500, 1), 2006: (38500, 0), 2007: (45000, -1),
    2008: (5000, 5), 2009: (12000, 4), 2010: (19000, 3), 2011: (26000, 2),
    2012: (33000, 1), 2013: (40000, 0), 2014: (47000, -1),
    2015: (11500, 5), 2016: (18500, 4), 2017: (25500, 3), 2018: (32500, 2),
    2019: (39500, 1), 2020: (46500, 0),
    2021: (22000, 2), 2022: (29000, 1), 2023: (36000, 0),
}

for year, label in top_n_papers_by_year.items():
    label_height, layer = offsets[year]
    bar_top = num_authors_by_year.loc[year]  # arrow points at the authors bar

    # Box colour encodes the top paper's citation count.
    box_colour = cmap(norm(top_n_cites_by_year.loc[year]))
    annotate_props['bbox'] = dict(lw=0.5, ec='k', fc=box_colour)

    ax.annotate(label, xy=(year, bar_top), xytext=(year, label_height), zorder=layer, **annotate_props)

ax.grid(axis='y', ls='--')

ax.set_xlim(1989, 2024)
ax.set_xticks(range(1992, 2024, 2))

# The y-limits extend well past the last tick on both axes, leaving the upper
# part of the figure free for the annotation boxes.
ax.set_ylim(0, 64000)
ax.set_yticks(range(0, 21000, 5000))

bx.set_ylim(0, 19275)
bx.set_yticks(range(0, 6100, 1500))

ax.tick_params(length=0)

# Hide every spine on both the main and the twin axes.
for axes in (ax, bx):
    axes.spines[['left', 'right', 'top', 'bottom']].set_visible(False)

ax.set_xlabel('Year')     
ax.set_ylabel('Number of Papers/Authors                                                                                        ')
bx.set_ylabel('Number of Venues                                                                                          ')


# Legend proxies. Labels follow the bar colours used above: the olive bars are
# drawn from num_authors_by_year and the cyan bars from num_papers_by_year
# (the two labels were previously swapped).
legend_elements = [
    Patch(facecolor='tab:olive', ec='k', lw=.5, label='Authors'),
    Patch(facecolor='tab:cyan', ec='k', lw=.5, label='Papers'),
    Patch(facecolor='tab:brown', ec='k', lw=.5, label='Venues')
]

ax.legend(handles=legend_elements, ncol=1, loc='center left', frameon=False)



# the ACM RecSys Line
ax.axvline(2007, lw=1, c='k', zorder=-100)
pre_recsys = recsys_papers_df['year'] < 2007
# Everything not strictly before 2007, which also includes rows whose year is missing.
post_recsys = ~pre_recsys

# Classical vs Modern RecSys
recsys_papers_df['age'] = 2024 - recsys_papers_df['year']
recsys_papers_df['citations_per_year'] = recsys_papers_df['citationCount'] / recsys_papers_df['age']

# Shared styling for the two era summary boxes.
era_text_style = dict(
    bbox=dict(lw=.5, ec='k', fc='w'),
    ha='center', va='top', fontstyle='italic', fontsize=12,
)

# Pre-2007 summary.
classical_papers = recsys_papers_df[pre_recsys]
pre_recsys_papers = len(classical_papers)
pre_recsys_authors = classical_papers['authors'].explode().dropna().nunique()
pre_recsys_venues = recsys_papers_df[is_valid_venue & pre_recsys]['clean_venue'].nunique()
pre_recsys_citations = classical_papers['citationCount'].sum()
pre_recsys_citations_per_year = classical_papers['citations_per_year'].mean()

classical_text = (
    'Classical Recommender Systems\n'
    '(collaborative/content-based/hybrid/evaluation accuracy)\n'
    '{:,} papers, {:,} authors, {:,} venues'
).format(pre_recsys_papers, pre_recsys_authors, pre_recsys_venues)
ax.text(1998, 60000, classical_text, **era_text_style)

# 2007-onwards summary.
modern_papers = recsys_papers_df[post_recsys]
post_recsys_papers = len(modern_papers)
post_recsys_authors = modern_papers['authors'].explode().dropna().nunique()
post_recsys_venues = recsys_papers_df[is_valid_venue & post_recsys]['clean_venue'].nunique()
post_recsys_citations = modern_papers['citationCount'].sum()
post_recsys_citations_per_year = modern_papers['citations_per_year'].mean()

modern_text = (
    'Modern Recommender Systems\n'
    '(matrix factorization/beyond accuracy/deep learning/fairness)\n'
    '{:,} papers, {:,} authors, {:,} venues'
).format(post_recsys_papers, post_recsys_authors, post_recsys_venues)
ax.text(2016.8, 60000, modern_text, **era_text_style)

ax.text(
    2007, 64000, '1st ACM RecSys',
    bbox=dict(lw=.5, ec='k', fc='w'),
    ha='center', va='top', fontsize=12, fontstyle='italic', fontweight='bold',
)

# Overall totals for the figure title.
total_papers = len(recsys_papers_df)
total_authors = recsys_papers_df['authors'].explode().dropna().nunique()
total_venues = recsys_papers_df[is_valid_venue]['clean_venue'].nunique()

fig.suptitle(
    'Papers, Authors, and Most Cited RecSys Papers by Year\n(n = {:,} papers, {:,} authors, {:,} venues)'.format(
        total_papers, total_authors, total_venues
    )
)

fig.tight_layout()

fig.savefig('../graphs/3000_num_papers_authors_venues_by_year.png', dpi=300, bbox_inches='tight')

### Version 2

In [None]:
# Larger fonts and a wider canvas for this version of the chart (the output
# file name ends in '_ppt').
sns.set_context('paper', font_scale=1.5)

fig, ax = plt.subplots(figsize=(16, 9))

# Reuses the per-year counts computed by the Version 1 cell. Bars are nested
# by width: authors (widest) behind papers on the left axis, venues
# (narrowest) on a twin right axis.
bar_style = dict(ec='k', lw=.5, zorder=1000)

ax.bar(num_authors_by_year.index, num_authors_by_year, width=.9, color='tab:olive', alpha=1, **bar_style)
ax.bar(num_papers_by_year.index, num_papers_by_year, width=.7, color='tab:cyan', alpha=.8, **bar_style)

bx = ax.twinx()
bx.bar(num_venues_by_year.index, num_venues_by_year, width=.5, color='tab:brown', alpha=.8, **bar_style)

# Shared styling for the top-paper annotation boxes; 'bbox' is overwritten
# per paper inside the loop below.
annotate_props = dict(
    ha='center', va='bottom',
    bbox=dict(lw=.5, ec='k', fc='whitesmoke'),
    fontsize=10, fontstyle='italic',
    color='darkslategrey',
    arrowprops=dict(arrowstyle='->', lw=.5, color='k'),
)

# Hand-placed label positions: year -> (y position of the text, zorder).
# A year that has a top paper but no entry here raises KeyError.
offsets = {
    1992: (3000, 5), 1994: (10000, 4), 1995: (17000, 3), 1997: (24000, 2),
    1998: (31000, 1), 1999: (38000, 0), 2000: (45000, -1),
    2001: (3500, 5), 2002: (10500, 4), 2003: (17500, 3), 2004: (24500, 2),
    2005: (31500, 1), 2006: (38500, 0), 2007: (45000, -1),
    2008: (5000, 5), 2009: (12000, 4), 2010: (19000, 3), 2011: (26000, 2),
    2012: (33000, 1), 2013: (40000, 0), 2014: (47000, -1),
    2015: (11500, 5), 2016: (18500, 4), 2017: (25500, 3), 2018: (32500, 2),
    2019: (39500, 1), 2020: (46500, 0),
    2021: (22000, 2), 2022: (29000, 1), 2023: (36000, 0),
}

for year, label in top_n_papers_by_year.items():
    label_height, layer = offsets[year]
    bar_top = num_authors_by_year.loc[year]  # arrow points at the authors bar

    # Box colour encodes the top paper's citation count.
    box_colour = cmap(norm(top_n_cites_by_year.loc[year]))
    annotate_props['bbox'] = dict(lw=0.5, ec='k', fc=box_colour)

    ax.annotate(label, xy=(year, bar_top), xytext=(year, label_height), zorder=layer, **annotate_props)

ax.grid(axis='y', ls='--')

ax.set_xlim(1989, 2024)
ax.set_xticks(range(1992, 2024, 2))

# The y-limits extend well past the last tick on both axes, leaving the upper
# part of the figure free for the annotation boxes.
ax.set_ylim(0, 60000)
ax.set_yticks(range(0, 21000, 5000))

bx.set_ylim(0, 19275)
bx.set_yticks(range(0, 6100, 1500))

ax.tick_params(length=0)

# Hide every spine on both the main and the twin axes.
for axes in (ax, bx):
    axes.spines[['left', 'right', 'top', 'bottom']].set_visible(False)

ax.set_xlabel('Year')     
ax.set_ylabel('Number of Papers/Authors                                                                  ')
bx.set_ylabel('Number of Venues                                                                       ')


# Legend proxies. Labels follow the bar colours used above: the olive bars are
# drawn from num_authors_by_year and the cyan bars from num_papers_by_year
# (the two labels were previously swapped).
legend_elements = [
    Patch(facecolor='tab:olive', ec='k', lw=.5, label='Authors'),
    Patch(facecolor='tab:cyan', ec='k', lw=.5, label='Papers'),
    Patch(facecolor='tab:brown', ec='k', lw=.5, label='Venues')
]

ax.legend(handles=legend_elements, ncol=1, loc='center left', frameon=False)



# the ACM RecSys Line
ax.axvline(2007, lw=1, c='k', zorder=-100)
pre_recsys = recsys_papers_df['year'] < 2007
# Everything not strictly before 2007, which also includes rows whose year is missing.
post_recsys = ~pre_recsys

# Classical vs Modern RecSys
recsys_papers_df['age'] = 2024 - recsys_papers_df['year']
recsys_papers_df['citations_per_year'] = recsys_papers_df['citationCount'] / recsys_papers_df['age']

# Shared styling for the two era summary boxes.
era_text_style = dict(
    bbox=dict(lw=.5, ec='k', fc='w'),
    ha='center', va='top', fontstyle='italic', fontsize=12,
)

# Pre-2007 summary.
classical_papers = recsys_papers_df[pre_recsys]
pre_recsys_papers = len(classical_papers)
pre_recsys_authors = classical_papers['authors'].explode().dropna().nunique()
pre_recsys_venues = recsys_papers_df[is_valid_venue & pre_recsys]['clean_venue'].nunique()
pre_recsys_citations = classical_papers['citationCount'].sum()
pre_recsys_citations_per_year = classical_papers['citations_per_year'].mean()

classical_text = (
    'Classical Recommender Systems\n'
    '(collaborative/content-based/hybrid/evaluation accuracy)\n'
    '{:,} papers, {:,} authors, {:,} venues'
).format(pre_recsys_papers, pre_recsys_authors, pre_recsys_venues)
ax.text(1998, 57000, classical_text, **era_text_style)

# 2007-onwards summary.
modern_papers = recsys_papers_df[post_recsys]
post_recsys_papers = len(modern_papers)
post_recsys_authors = modern_papers['authors'].explode().dropna().nunique()
post_recsys_venues = recsys_papers_df[is_valid_venue & post_recsys]['clean_venue'].nunique()
post_recsys_citations = modern_papers['citationCount'].sum()
post_recsys_citations_per_year = modern_papers['citations_per_year'].mean()

modern_text = (
    'Modern Recommender Systems\n'
    '(matrix factorization/beyond accuracy/deep learning/fairness)\n'
    '{:,} papers, {:,} authors, {:,} venues'
).format(post_recsys_papers, post_recsys_authors, post_recsys_venues)
ax.text(2016.8, 57000, modern_text, **era_text_style)

ax.text(
    2007, 60000, '1st ACM RecSys',
    bbox=dict(lw=.5, ec='k', fc='w'),
    ha='center', va='top', fontsize=12, fontstyle='italic', fontweight='bold',
)

# Overall totals for the figure title.
total_papers = len(recsys_papers_df)
total_authors = recsys_papers_df['authors'].explode().dropna().nunique()
total_venues = recsys_papers_df[is_valid_venue]['clean_venue'].nunique()

fig.suptitle(
    'Papers, Authors, and Most Cited RecSys Papers by Year\n(n = {:,} papers, {:,} authors, {:,} venues)'.format(
        total_papers, total_authors, total_venues
    )
)

fig.tight_layout()

fig.savefig('../graphs/3000_num_papers_authors_venues_by_year_ppt.png', dpi=300, bbox_inches='tight')

# Restore the notebook's default plotting context.
sns.set_context('paper', font_scale=1.25)


### A table of the top papers and their BibTeX records

In [None]:
def top_papers_table(top_papers):
    """Format the per-year top papers as a compact, LaTeX-friendly table.

    Args:
        top_papers: DataFrame with 'year', 'title', 'author_names',
            'citationCount' and 'bibtex_key' columns. It is not modified.

    Returns:
        A new DataFrame with columns 'Year', 'Title', 'Authors', 'Cites' and
        a 1-based integer index. Titles are cut to 45 characters followed by
        ' ...' and a LaTeX cite command for the paper's key; authors are
        reduced to the first author's surname (plus ' et al.' when there are
        several); citation counts are rendered with thousands separators.
    """
    top_papers = top_papers[['year', 'title', 'author_names', 'citationCount', 'bibtex_key']].copy()

    top_papers['year'] = top_papers['year'].map(int)

    # Raw string: '\c' is not a valid escape sequence, so the non-raw literal
    # triggered an invalid-escape warning. The resulting text is unchanged.
    top_papers['title'] = (
        top_papers['title'].map(lambda title: title[:45]+' ...')
        + top_papers['bibtex_key'].map(lambda key: r'\cite{{{}}}'.format(key))
    )

    # Just take the surnames to save space.
    top_papers['author_names'] = top_papers['author_names'].map(
        lambda author_names: (
            author_names[0].split()[-1]+' et al.'
            if len(author_names) > 1
            else author_names[0].split()[-1]
        )
    )
    top_papers['citationCount'] = top_papers['citationCount'].map(lambda count: '{:,.0f}'.format(count))

    top_papers = top_papers[['year', 'title', 'author_names', 'citationCount']]
    top_papers.columns = ['Year', 'Title', 'Authors', 'Cites']
    top_papers.index = range(1, len(top_papers)+1)

    return top_papers
    
# Most-cited candidate paper of each year, one row per year with 'year' as a
# regular column.
# NOTE(review): GroupBy.first() takes the first non-null value of each column,
# so a missing field (e.g. 'bibtex') in a year's top row would be filled from
# a lower-ranked paper of that year -- confirm this is acceptable.
top_papers = (
    recsys_papers_df.loc[with_authors & with_citations & with_title]
    .sort_values('citationCount', ascending=False)
    .groupby('year')
    .first()
    .reset_index()
)

# Emit the formatted table as LaTeX source, indexed by year.
latex_table = top_papers_table(top_papers).set_index('Year').to_latex()
print(latex_table)

In [None]:
# Display the same formatted table in the notebook.
top_papers_table(top_papers)

In [None]:
# Print the BibTeX records of the top papers, one per line break.
# NOTE(review): str.join raises TypeError if any 'bibtex' entry is None.
print('\n'.join(top_papers['bibtex']))

# Top Venues Scatter Plot
This is another major chart in the paper, this time looking at the popular venues for RS papers.

In [None]:
# Share of a paper's citations that are influential. The denominator is the
# larger of the two available citation counts; undefined ratios (NaN) become 0.
# NOTE(review): a zero denominator with a positive numerator yields inf, which
# fillna does not replace -- confirm that this cannot occur in the data.
citation_totals = recsys_papers_df[['citationCount', 'updated_citation_count']].max(axis=1)
recsys_papers_df['frac_influential_citations'] = (
    recsys_papers_df['influentialCitationCount'].div(citation_totals).fillna(0)
)

# Percentile rank of each paper by citation count (higher = more cited).
recsys_papers_df['citation_count_pct_rank'] = recsys_papers_df['citationCount'].rank(pct=True, ascending=True)


In [None]:
venue_col = 'clean_venue'

# Papers with a non-negative citation count (comparisons with NaN are False,
# so this drops papers whose count is missing).
with_min_citations = recsys_papers_df['citationCount'] >= 0

# Group once per row selection instead of re-filtering for every statistic.
valid_by_venue = recsys_papers_df[is_valid_venue].groupby(venue_col)
cited_by_venue = recsys_papers_df[is_valid_venue & with_min_citations].groupby(venue_col)

# One row per valid venue, one column per statistic.
venues_df = pd.concat(
    [
        valid_by_venue['paperId'].count(),
        valid_by_venue['citationCount'].mean(),
        valid_by_venue['citation_count_pct_rank'].mean(),
        cited_by_venue['influentialCitationCount'].mean(),
        cited_by_venue['frac_influential_citations'].mean(),
        valid_by_venue['year'].mean(),
    ],
    axis=1,
)

venues_df.columns = [
    'num_recsys_papers',
    'mean_cites', 'mean_cite_rank',
    'mean_influential_cites', 'mean_frac_influential_cites',
    'mean_year',
]

# Percentile rank of each venue by its mean influential-citation fraction.
venues_df['rank_mean_frac_influential_cites'] = venues_df['mean_frac_influential_cites'].rank(pct=True)

# Venues need at least `min_paper_count` RecSys papers to be plotted. The
# comparison is inclusive so that it matches the variable name, the "≥10" in
# the scatter-plot titles, and the 9.5 lower x-limit of those plots (the
# previous strict `>` excluded venues with exactly 10 papers).
min_paper_count = 10
with_min_venue_count = venues_df['num_recsys_papers'] >= min_paper_count




## Version 1

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

# Venues that pass the minimum-paper filter.
plot_venues = venues_df[with_min_venue_count]

# Small horizontal jitter. A seeded generator keeps the saved figure
# identical between runs (the global, unseeded RNG was used before).
x_noise = np.random.default_rng(42).normal(0, 0.1, plot_venues['num_recsys_papers'].shape)

# Colour encodes the venue's mean publication year.
venue_cmap = plt.get_cmap('coolwarm_r')
venue_norm = mpl.colors.Normalize(
    vmin=math.floor(plot_venues['mean_year'].min()),
    vmax=math.floor(plot_venues['mean_year'].max()),
)

# Marker area is proportional to the mean influential citations per paper.
marker_scale = 80
ax.scatter(
    x=plot_venues['num_recsys_papers'] + x_noise,
    y=plot_venues['mean_cites'],
    s=plot_venues['mean_influential_cites'] * marker_scale,
    c=venue_cmap(venue_norm(plot_venues['mean_year'])),
    ec='k', lw=.5,
    alpha=.5,
)

# Median guides split the plane into quadrants.
ax.axvline(plot_venues['num_recsys_papers'].median(), c='k', lw=1, ls='--')
ax.axhline(plot_venues['mean_cites'].median(), c='k', lw=1, ls='--')



def label_venue(ax, df, label, text, xoffset=20, yoffset=20):
    """Annotate one venue on the scatter plot.

    Args:
        ax: Axes to draw on.
        df: Venue table indexed by venue name; must provide
            'num_recsys_papers', 'mean_cites', 'mean_influential_cites' and
            'mean_frac_influential_cites'.
        label: Index key of the venue in `df` (KeyError if absent).
        text: Display name for the venue.
        xoffset, yoffset: Additive data-coordinate offsets of the label
            relative to the venue's point.

    The label reads '<text>' and, on a second line,
    '(<mean influential cites>, <influential fraction as a percentage>%)'.
    A tiny black dot marks the venue's exact position.
    """
    venue = df.loc[label]

    point_x, point_y = venue['num_recsys_papers'], venue['mean_cites']
    stats_line = '\n({:.1f}, {:.1f}%)'.format(
        venue['mean_influential_cites'],
        venue['mean_frac_influential_cites']*100,
    )

    ax.plot(point_x, point_y, marker='.', c='k', markersize=1)

    ax.annotate(
        text + stats_line,
        xy=(point_x, point_y),
        xytext=(point_x + xoffset, point_y + yoffset),
        arrowprops=dict(arrowstyle='->', lw=.5),
        fontsize=8, ha='center', va='center',
    )

# Hand-tuned labels: (venue key, display text, x offset, y offset).
venue_labels = [
    ('arxivorg', 'ArXiv', -500, 5),
    ('acm recsys', 'ACM RecSys', -200, 30),
    ('sigir', 'SIGIR', 150, -40),
    ('umuai', 'UMUAI', 0, 80),
    ('iui', 'IUI', 55, -30),
    ('web', 'WWW', 50, 100),
    ('kdd', 'KDD', 10, 75),
    ('kbs', 'KBS', -5, -20),
    ('cacm', 'CACM', -1, 550),
    ('tois', 'TOIS', 10, 500),
    ('journal machine learning research', 'JMLR', 0, 200),
    ('applied science', 'Applied Science', -40, -2),
    ('ijcai', 'IJCAI', -45, 60),
    ('aaai', 'AAAI', 150, 0),
    ('icml', 'ICML', 25, -40),
    ('cikm', 'CIKM', -130, -15),
    ('wsdm', 'WSDM', 130, 6),
    ('tkde', 'TKDE', 0, 150),
    ('multimedia tools and applications', 'Multimedia Tools &\nApplications', 120, -4.5),
    ('icdm', 'ICDM', -50, -12),
    ('conference uncertainty artificial intelligence', 'UAI', 25, 150),
    ('recommender system handbook', 'RecSys Handbook', 0, 140),
    ('sigchi', 'CHI', 10, 120),
    ('umap', 'UMAP', -130, 1),
    ('ieee access', 'IEEE Access', 220, 5),
    ('toism', 'TOISM', 100, -5),
]

# Every key must be present among the plotted venues, otherwise label_venue
# raises KeyError.
labelled_venues = venues_df[with_min_venue_count]
for venue_key, venue_text, dx, dy in venue_labels:
    label_venue(ax, labelled_venues, venue_key, venue_text, dx, dy)

ax.set_xscale('log')
ax.set_yscale('log')

ax.set_xlabel('Number of RecSys Papers Published')
ax.set_ylabel('Mean Cites / RecSys Paper')

ax.set_xlim(9.5, 3000)
ax.set_ylim(2, 3000)

ax.set_title('Venues (n = {:,} venues with ≥10 RecSys Papers)'.format(with_min_venue_count.sum()))

# Colour bar for the mean publication year, placed in its own axes to the
# right of the plot.
sm = ScalarMappable(cmap=venue_cmap, norm=venue_norm)
sm.set_array([])
cax = fig.add_axes([1, .12, 0.02, .8])
cbar = plt.colorbar(sm, cax=cax, alpha=.5)
cbar.set_label('Mean Publication Year')
cbar.set_ticks(range(2005, 2023, 3))


def _decade_label(value, _pos=None):
    """Render a log-axis tick value as '.1', '1', '10', '100', '1,000', '10k', '100k'."""
    if value >= 10000:
        return '{:.0f}k'.format(value / 1000)
    if value >= 1:
        return '{:,.0f}'.format(value)
    return '{:g}'.format(value).lstrip('0')


# Derive each major-tick label from the tick's actual value. The previous
# fixed label lists only lined up if the log locator's first tick happened to
# be 0.1, and the y-axis list contained the typo '100l'.
ax.xaxis.set_major_formatter(mpl.ticker.FuncFormatter(_decade_label))
ax.yaxis.set_major_formatter(mpl.ticker.FuncFormatter(_decade_label))

fig.tight_layout()

fig.savefig('../graphs/3000_cites_by_papers_per_venue.png', dpi=300, bbox_inches='tight')

## Version 2

In [None]:
# Larger fonts for this version of the chart (the output file name ends in '_ppt').
sns.set_context('paper', font_scale=2.5)

fig, ax = plt.subplots(figsize=(14, 9))

# Venues that pass the minimum-paper filter.
plot_venues = venues_df[with_min_venue_count]

# Small horizontal jitter. A seeded generator keeps the saved figure
# identical between runs (the global, unseeded RNG was used before).
x_noise = np.random.default_rng(42).normal(0, 0.1, plot_venues['num_recsys_papers'].shape)

# Colour encodes the venue's mean publication year.
venue_cmap = plt.get_cmap('coolwarm_r')
venue_norm = mpl.colors.Normalize(
    vmin=math.floor(plot_venues['mean_year'].min()),
    vmax=math.floor(plot_venues['mean_year'].max()),
)

# Marker area is proportional to the mean influential citations per paper.
marker_scale = 80
ax.scatter(
    x=plot_venues['num_recsys_papers'] + x_noise,
    y=plot_venues['mean_cites'],
    s=plot_venues['mean_influential_cites'] * marker_scale,
    c=venue_cmap(venue_norm(plot_venues['mean_year'])),
    ec='k', lw=.5,
    alpha=.5,
)

# Median guides split the plane into quadrants.
ax.axvline(plot_venues['num_recsys_papers'].median(), c='k', lw=1, ls='--')
ax.axhline(plot_venues['mean_cites'].median(), c='k', lw=1, ls='--')



def label_venue(ax, df, label, text, xoffset=20, yoffset=20):
    """Annotate one venue on the scatter plot (large-font variant).

    Args:
        ax: Axes to draw on.
        df: Venue table indexed by venue name; must provide
            'num_recsys_papers', 'mean_cites', 'mean_influential_cites' and
            'mean_frac_influential_cites'.
        label: Index key of the venue in `df` (KeyError if absent).
        text: Display name for the venue.
        xoffset, yoffset: Additive data-coordinate offsets of the label
            relative to the venue's point.

    The label reads '<text>' and, on a second line,
    '(<mean influential cites>, <influential fraction as a percentage>%)'.
    A tiny black dot marks the venue's exact position.
    """
    venue = df.loc[label]

    point_x, point_y = venue['num_recsys_papers'], venue['mean_cites']
    stats_line = '\n({:.1f}, {:.1f}%)'.format(
        venue['mean_influential_cites'],
        venue['mean_frac_influential_cites']*100,
    )

    ax.plot(point_x, point_y, marker='.', c='k', markersize=1)

    ax.annotate(
        text + stats_line,
        xy=(point_x, point_y),
        xytext=(point_x + xoffset, point_y + yoffset),
        arrowprops=dict(arrowstyle='->', lw=.5),
        fontsize=14, ha='center', va='center',
    )

# Hand-tuned labels: (venue key, display text, x offset, y offset).
# Compared with Version 1, this chart leaves KBS, JMLR, Applied Science, WSDM,
# TKDE, Multimedia Tools & Applications, ICDM, IEEE Access and TOISM
# unlabelled.
venue_labels = [
    ('arxivorg', 'ArXiv', -500, 5),
    ('acm recsys', 'ACM RecSys', -200, 30),
    ('sigir', 'SIGIR', 150, -40),
    ('umuai', 'UMUAI', 40, 200),
    ('iui', 'IUI', 55, -30),
    ('web', 'WWW', 50, 100),
    ('kdd', 'KDD', 10, 150),
    ('cacm', 'CACM', -1, 1000),
    ('tois', 'TOIS', 10, 500),
    ('ijcai', 'IJCAI', -45, 60),
    ('aaai', 'AAAI', 150, 0),
    ('icml', 'ICML', 25, -50),
    ('cikm', 'CIKM', -130, -20),
    ('conference uncertainty artificial intelligence', 'UAI', 25, 150),
    ('recommender system handbook', 'RecSys Handbook', 30, 600),
    ('sigchi', 'CHI', 10, 120),
    ('umap', 'UMAP', -130, 1),
]

# Every key must be present among the plotted venues, otherwise label_venue
# raises KeyError.
labelled_venues = venues_df[with_min_venue_count]
for venue_key, venue_text, dx, dy in venue_labels:
    label_venue(ax, labelled_venues, venue_key, venue_text, dx, dy)

ax.set_xscale('log')
ax.set_yscale('log')

ax.set_xlabel('Number of RecSys Papers Published')
ax.set_ylabel('Mean Cites / RecSys Paper')

ax.set_xlim(9.5, 3000)
ax.set_ylim(2, 3000)

ax.set_title('Venues (n = {:,} venues with ≥10 RecSys Papers)'.format(with_min_venue_count.sum()))

# Colour bar for the mean publication year, placed in its own axes to the
# right of the plot.
sm = ScalarMappable(cmap=venue_cmap, norm=venue_norm)
sm.set_array([])
cax = fig.add_axes([1, .12, 0.02, .8])
cbar = plt.colorbar(sm, cax=cax, alpha=.5)
cbar.set_label('Mean Publication Year')
cbar.set_ticks(range(2005, 2023, 3))


def _decade_label(value, _pos=None):
    """Render a log-axis tick value as '.1', '1', '10', '100', '1,000', '10k', '100k'."""
    if value >= 10000:
        return '{:.0f}k'.format(value / 1000)
    if value >= 1:
        return '{:,.0f}'.format(value)
    return '{:g}'.format(value).lstrip('0')


# Derive each major-tick label from the tick's actual value. The previous
# fixed label lists only lined up if the log locator's first tick happened to
# be 0.1, and the y-axis list contained the typo '100l'.
ax.xaxis.set_major_formatter(mpl.ticker.FuncFormatter(_decade_label))
ax.yaxis.set_major_formatter(mpl.ticker.FuncFormatter(_decade_label))

fig.tight_layout()

fig.savefig('../graphs/3000_cites_by_papers_per_venue_ppt.png', dpi=300, bbox_inches='tight')

# Restore the notebook's default plotting context.
sns.set_context('paper', font_scale=1.25)


# Analysing ArXiv Papers

Some additional analysis of the ArXiv papers. SS has an ArXiv feature with the ArXiv id of a paper, if it has an ArXiv version. Here are all the RS papers with an ArXiv id.

In [None]:
# Every RecSys paper whose 'ArXiv' field is populated.
all_in_arxiv_papers = recsys_papers_df[recsys_papers_df['ArXiv'].notnull()]
all_in_arxiv_papers

But only some of the papers in RS are venued in ArXiv. Here they are...

In [None]:
# Papers whose venue name contains 'arxiv' (case-insensitive). A missing venue
# is treated as an empty string and therefore never matches.
venue_lower = recsys_papers_df['venue'].map(lambda v: '' if v is None else v.lower())
in_arxiv_venue_papers = recsys_papers_df[venue_lower.str.contains('arxiv')]
in_arxiv_venue_papers

The total number of ArXiv papers and the number of these that are published elsewhere.

In [None]:
# Number of distinct ArXiv ids among the RecSys papers.
all_in_arxiv_papers['ArXiv'].nunique()

In [None]:
# Distinct ArXiv ids overall minus the distinct ids of papers venued in ArXiv
# (a count, not a fraction).
(all_in_arxiv_papers['ArXiv'].nunique()-in_arxiv_venue_papers['ArXiv'].nunique())

The citation counts and mean publication year for these ArXiv papers.

In [None]:
# (mean citations, mean influential-citation fraction, mean year) for the ArXiv-venued papers.
in_arxiv_venue_papers['citationCount'].mean(), in_arxiv_venue_papers['frac_influential_citations'].mean(), in_arxiv_venue_papers['year'].mean()

And similarly for the 'republished' papers; that is, papers that are listed as publications in other venues and for which we have an ArXiv id.

In [None]:
# Row mask: the paper has an ArXiv id and that id also occurs among the
# ArXiv-venued papers.
# NOTE(review): the ArXiv-venued papers themselves satisfy this mask too --
# confirm whether they should be excluded from the "republished" set.
arxiv_venue_ids = in_arxiv_venue_papers['ArXiv'].unique()
has_arxiv_id = recsys_papers_df['ArXiv'].notnull()
republished_arxiv_papers = has_arxiv_id & recsys_papers_df['ArXiv'].isin(arxiv_venue_ids)

republished_arxiv_papers.sum()

In [None]:
# (mean citations, mean influential-citation fraction, mean year) for the papers selected by the mask above.
recsys_papers_df[republished_arxiv_papers]['citationCount'].mean(), recsys_papers_df[republished_arxiv_papers]['frac_influential_citations'].mean(), recsys_papers_df[republished_arxiv_papers]['year'].mean()