# Inside/Outside Papers
The main aim of this notebook is to classify RS papers as being from the inside community or the outside community, as defined in the ToRS paper.

In [None]:
import swifter

import os
import json
import random
import time
from datetime import datetime

import string 

import matplotlib.pyplot as plt

import Stemmer

import random
import requests
from itertools import chain
from more_itertools import sliced

import pandas as pd
from matplotlib.pylab import plt
from matplotlib_venn import venn2, venn3, venn2_circles, venn3_circles

import numpy as np

from glob import glob, iglob
from pathlib import Path
                         
from loguru import logger
from IPython.display import display, clear_output

from multiprocessing import Pool

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import nltk
from nltk.stem import WordNetLemmatizer

import seaborn as sns

from itables import init_notebook_mode, show, options
init_notebook_mode(all_interactive=False)


pd.__version__

In [None]:
# Apply seaborn's 'paper' plotting context, with fonts scaled up by 25%.
sns.set_context(context='paper', font_scale=1.25)


# Setup

## Load the Papers and Authors

In [None]:
# Read the cleaned papers table from its feather file and show (rows, columns).
papers_dataset = '../data/processed/2200_recsys_papers_cleaned.feather'
papers_df = pd.read_feather(path=papers_dataset)
papers_df.shape

In [None]:
# Read the cleaned authors table from its feather file and show (rows, columns).
authors_dataset = '../data/processed/2200_recsys_authors_cleaned.feather'
authors_df = pd.read_feather(path=authors_dataset)
authors_df.shape

## Get Core RecSys Papers

In [None]:
# Row mask over papers_df marking the core recsys papers
# (assumes 'is_core_recsys_paper' is a boolean column -- TODO confirm).
is_recsys_paper = papers_df['is_core_recsys_paper']

# Take an independent copy so columns added to it later do not touch papers_df.
recsys_papers_df = papers_df.loc[is_recsys_paper].copy()

# Flatten the per-paper author lists, drop missing entries and de-duplicate.
recsys_author_ids = set(pd.unique(recsys_papers_df['authors'].explode().dropna()))

len(recsys_author_ids), recsys_papers_df.shape, is_recsys_paper.sum()

## Define the Venue Papers
These are the subset of papers that are published in the main RS venues (ACM RecSys, ACM ToRS, and the various long-running ACM RecSys workshops).

In [None]:
# Normalised names of the venues that define the core RS community.
recsys_venues = ['acm recsys', 'intrsrecsys', 'recsys poster', 'rectourrecsys', 'recsys challenge']

# Match on the raw venue string. `.str.lower()` propagates missing venues as NaN
# (which `isin` treats as "not a member") instead of raising like a Python-level
# `venue.lower()` would. Not used by the flag below; kept for inspection.
in_recsys_venue = recsys_papers_df['venue'].str.lower().isin(recsys_venues)

# The two signals that make a paper a venue paper; computed once and reused in the summary.
has_recsys_key = recsys_papers_df['has_recsys_key']
in_clean_recsys_venue = recsys_papers_df['clean_venue'].isin(recsys_venues)

recsys_papers_df['is_venue_paper'] = has_recsys_key | in_clean_recsys_venue

# (venue papers, papers with a recsys key, papers whose cleaned venue is a recsys venue)
recsys_papers_df['is_venue_paper'].sum(), has_recsys_key.sum(), in_clean_recsys_venue.sum()

# The Papers/Authors Venn Diagrams
Produce some Venn diagrams to show how the different sets of papers/authors relate to each other.

## Get the groups of papers, cites, and author pubs
We distinguish between the core RS papers (Rp), the linked papers (Lp), which cite or are cited by Rp, and the author papers (Ap).

In [None]:
def get_paper_groups(recsys_papers_df, authors=None):
    """Build the paper-id sets (Up, Rp, Lp, Ap) for a set of recsys papers.

    Parameters
    ----------
    recsys_papers_df : pd.DataFrame
        The recsys papers. Must provide the columns 'paperId', 'authors',
        'updated_citations' and 'references' (the last three hold per-paper
        collections of ids).
    authors : pd.DataFrame, optional
        Author table with the columns 'authorId' and 'papers'. When omitted the
        notebook-level `authors_df` is used, which is what the original
        single-argument call relied on implicitly.

    Returns
    -------
    tuple of set
        ``(Up, Rp, Lp, Ap)`` where Rp is the recsys paper ids, Lp the ids that
        cite or are referenced by Rp, Ap the publications of Rp's authors
        (always including Rp itself) and Up the union of the three.
    """
    if authors is None:
        # Fall back to the notebook global so existing callers keep working.
        authors = authors_df

    def flattened_ids(column):
        # Flatten list-valued cells, drop missing entries and de-duplicate.
        return set(column.explode().dropna().unique())

    # The set of all recsys papers.
    Rp = set(recsys_papers_df['paperId'].unique())

    # The authors of these papers.
    recsys_author_ids = flattened_ids(recsys_papers_df['authors'])

    # Linked papers: citations of, and references from, the recsys papers;
    # these can/will include non-recsys papers.
    Lp = flattened_ids(recsys_papers_df['updated_citations']) | flattened_ids(recsys_papers_df['references'])

    # The publications of the authors of these papers. Authors absent from the
    # author table come back from reindex as missing rows and are dropped. Because
    # of minor data issues some recsys papers are missing from the authors'
    # publication lists, hence the union with Rp so that Ap always contains Rp.
    author_pubs = authors.set_index('authorId').reindex(list(recsys_author_ids))['papers']
    Ap = flattened_ids(author_pubs) | Rp

    # The universe is the union of all of these papers.
    Up = Rp | Lp | Ap

    return Up, Rp, Lp, Ap

In [None]:
# Paper-id sets for the core recsys papers, followed by their sizes.
Up, Rp, Lp, Ap = get_paper_groups(recsys_papers_df)
tuple(len(paper_ids) for paper_ids in (Up, Rp, Lp, Ap))

## The author groups
A similar approach for the authors.

In [None]:
def get_author_groups(papers_df, recsys_papers_df):
    """Build the author-id sets (Ua, Ra, La, Aa) that mirror the paper groups.

    Parameters
    ----------
    papers_df : pd.DataFrame
        The larger papers table, with columns 'paperId' and 'authors'. It is
        needed to look up the authors of linked papers and author publications
        that are not themselves recsys papers. 'paperId' must be unique for the
        reindex lookup to work.
    recsys_papers_df : pd.DataFrame
        The recsys papers, as accepted by `get_paper_groups`.

    Returns
    -------
    tuple of set
        ``(Ua, Ra, La, Aa)``: Ra the authors of the recsys papers, La the authors
        of the linked papers (Lp), Aa the authors of the author publications
        (Ap), and Ua the union of the three.
    """
    # Only the linked papers and the author publications need an author lookup.
    _, _, Lp, Ap = get_paper_groups(recsys_papers_df)

    # Index just the authors column by paper id: reindexing one Series is far
    # cheaper than reindexing the whole frame and then selecting the column.
    authors_by_paper_id = papers_df.set_index('paperId')['authors']

    def authors_of(paper_ids):
        # Ids with no row in papers_df come back as missing values and are dropped.
        # reindex is given a list, matching the other reindex calls in this notebook.
        return set(authors_by_paper_id.reindex(list(paper_ids)).explode().dropna().unique())

    # The authors of recsys papers.
    Ra = set(recsys_papers_df['authors'].explode().dropna().unique())

    # The authors of papers that cite or are referenced by recsys papers.
    La = authors_of(Lp)

    # The authors of every publication by a recsys author.
    Aa = authors_of(Ap)

    Ua = Ra | La | Aa

    return Ua, Ra, La, Aa


In [None]:
# Author-id sets matching the paper groups, followed by their sizes.
Ua, Ra, La, Aa = get_author_groups(papers_df, recsys_papers_df)
tuple(len(author_ids) for author_ids in (Ua, Ra, La, Aa))

## Draw the Papers and Authors Venns

In [None]:
# Look up every universe paper by id; ids in Up that have no row in papers_df
# come back as rows of missing values.
papers_by_id = papers_df.set_index('paperId')
Up_df = papers_by_id.reindex(index=list(Up)).reset_index()
Up_df.shape

In [None]:
# Ids of the universe papers themselves.
all_paper_ids = set(Up_df['paperId'].unique())

# Ids reached from the universe papers through their citation and reference lists.
all_citation_ids = set(pd.unique(Up_df['updated_citations'].explode().dropna()))
all_reference_ids = set(pd.unique(Up_df['references'].explode().dropna()))

# Every paper id listed against any author in the authors table.
all_author_pub_ids = set(pd.unique(authors_df['papers'].explode().dropna()))

# Sizes of each group, then of their overall union.
(
    len(all_paper_ids),
    len(all_citation_ids),
    len(all_reference_ids),
    len(all_author_pub_ids),
    len(set().union(all_paper_ids, all_citation_ids, all_reference_ids, all_author_pub_ids)),
)

The above reflects the total number of paperids that we have collected. There are 34.7M vs the 2.695M that are 1-step from a RS paper.

In [None]:
# Single-panel Venn diagram of the three paper groups (Rp, Lp, Ap), saved as a PNG.
fig, ax = plt.subplots(figsize=(6, 6))

# Filled regions; each set label carries the set size, each region label its count.
v1 = venn3(
        ax=ax,
        subsets=[Rp, Lp, Ap],
        set_labels=['$R_p$ ({:,})'.format(len(Rp)), '$L_p$ ({:,})'.format(len(Lp)), '$A_p$ ({:,})'.format(len(Ap))],
        set_colors=['tab:blue', 'tab:orange', 'tab:green'],
        subset_label_formatter=lambda v: '{:,}'.format(v),
        alpha=.5
) 
# Black outlines drawn over the same three sets.
c1=venn3_circles(ax=ax, subsets=[Rp, Lp, Ap], linestyle='-', linewidth=1, color="k")


# Scale based on radii of the largest circle.
# (2037307/3.1416)**.5/(2635519/3.1416)**.5

ax.set_xlim(-1, 1)
ax.set_ylim(-1, 1)



# Adjust the labelling for the first venn
# The offsets below are hand-tuned. Each label's text is copied into an arrowed
# annotation *before* the original label is blanked, so statement order matters.
# 'A' is the set label of the first set (Rp).
Rp_label = v1.get_label_by_id('A')
ax.annotate(Rp_label.get_text(), xy=Rp_label.get_position()+np.array([-.02, -.035]), xytext=Rp_label.get_position()+np.array([-25, 35]),
             ha='center', xycoords='data', textcoords='offset points',
             arrowprops=dict(arrowstyle='->',color='k'), fontsize=Rp_label.get_fontsize())
Rp_label.set_text('')

# '101' is the region in the first and third sets only (Rp and Ap, not Lp).
label_101 = v1.get_label_by_id('101')
ax.annotate(label_101.get_text(), xy=label_101.get_position()+np.array([.005, 0]), xytext=label_101.get_position()+np.array([-35, 0]),
             ha='right', va='center', xycoords='data', textcoords='offset points',
             arrowprops=dict(arrowstyle='->',color='k'), fontsize=label_101.get_fontsize())
label_101.set_text('')

# '111' is the region shared by all three sets.
label_111 = v1.get_label_by_id('111')
ax.annotate(label_111.get_text(), xy=label_111.get_position()+np.array([-.04, .02]), xytext=label_111.get_position()+np.array([-40, -40]),
             ha='center', va='center', xycoords='data', textcoords='offset points',
             arrowprops=dict(arrowstyle='->',color='k'), fontsize=label_111.get_fontsize())
label_111.set_text('')

# '011' (Lp and Ap only) and '010' (Lp only) are simply nudged, not re-annotated.
label_011 = v1.get_label_by_id('011')
label_011.set_position(label_011.get_position() + np.array([.03, -.02]))

label_010 = v1.get_label_by_id('010')
label_010.set_position(label_010.get_position() + np.array([-.05, -.04]))



# Title line showing the size of the paper universe.
ax.text(
    0, .91, 
    '$U_p$ (n = {:,})'.format(len(Up)),
    ha='center', va='center', fontsize=Rp_label.get_fontsize()
)

fig.tight_layout()

fig.savefig('../graphs/2300_papers_venn.png', dpi=300, bbox_inches='tight')

In [None]:
# Two stacked Venn diagrams: (a) the paper groups Rp/Lp/Ap on `ax` and
# (b) the matching author groups Ra/La/Aa on `bx`, saved together as one PNG.
fig, (ax, bx) = plt.subplots(figsize=(6, 12), nrows=2)

# Filled regions; each set label carries the set size, each region label its count.
v1 = venn3(
        ax=ax,
        subsets=[Rp, Lp, Ap],
        set_labels=['$R_p$ ({:,})'.format(len(Rp)), '$L_p$ ({:,})'.format(len(Lp)), '$A_p$ ({:,})'.format(len(Ap))],
        set_colors=['tab:blue', 'tab:orange', 'tab:green'],
        subset_label_formatter=lambda v: '{:,}'.format(v),
        alpha=.5
)
c1 = venn3_circles(ax=ax, subsets=[Rp, Lp, Ap], linestyle='-', linewidth=1, color="k")

v2 = venn3(
        ax=bx,
        subsets=[Ra, La, Aa],
        set_labels=['$R_a$ ({:,})'.format(len(Ra)), '$L_a$ ({:,})'.format(len(La)), '$A_a$ ({:,})'.format(len(Aa))],
        set_colors=['tab:blue', 'tab:orange', 'tab:green'],
        subset_label_formatter=lambda v: '{:,}'.format(v),
        alpha=.5
)
c2 = venn3_circles(ax=bx, subsets=[Ra, La, Aa], linestyle='-', linewidth=1, color="k")


# Scale based on radii of the largest circle.
# (2037307/3.1416)**.5/(2635519/3.1416)**.5

ax.set_xlim(-1, 1)
ax.set_ylim(-1, 1)
bx.set_xlim(-.87, .87)
bx.set_ylim(-.87, .87)


# Adjust the labelling for the first venn.
# The offsets are hand-tuned. Each label's text is copied into an arrowed
# annotation *before* the original label is blanked, so statement order matters.
# 'A' is the set label of the first set; '101', '111', '011' and '010' are region ids.
Rp_label = v1.get_label_by_id('A')
ax.annotate(Rp_label.get_text(), xy=Rp_label.get_position()+np.array([-.02, -.035]), xytext=Rp_label.get_position()+np.array([-25, 35]),
             ha='center', xycoords='data', textcoords='offset points',
             arrowprops=dict(arrowstyle='->',color='k'), fontsize=Rp_label.get_fontsize())
Rp_label.set_text('')

label_101 = v1.get_label_by_id('101')
ax.annotate(label_101.get_text(), xy=label_101.get_position()+np.array([.005, 0]), xytext=label_101.get_position()+np.array([-50, 0]),
             ha='right', va='center', xycoords='data', textcoords='offset points',
             arrowprops=dict(arrowstyle='->',color='k'), fontsize=label_101.get_fontsize())
label_101.set_text('')

label_111 = v1.get_label_by_id('111')
ax.annotate(label_111.get_text(), xy=label_111.get_position()+np.array([-.04, .02]), xytext=label_111.get_position()+np.array([-40, -40]),
             ha='center', va='center', xycoords='data', textcoords='offset points',
             arrowprops=dict(arrowstyle='->',color='k'), fontsize=label_111.get_fontsize())
label_111.set_text('')

label_011 = v1.get_label_by_id('011')
label_011.set_position(label_011.get_position() + np.array([.04, 0]))

label_010 = v1.get_label_by_id('010')
label_010.set_position(label_010.get_position() + np.array([-.025, -.04]))

# Adjust the labelling for the second venn (same scheme, its own offsets).
Ra_label = v2.get_label_by_id('A')
bx.annotate(Ra_label.get_text(), xy=Ra_label.get_position()+np.array([-.01, -.035]), xytext=Ra_label.get_position()+np.array([-25, 35]),
             ha='center', xycoords='data', textcoords='offset points',
             arrowprops=dict(arrowstyle='->',color='k'), fontsize=Ra_label.get_fontsize())
Ra_label.set_text('')

label_101 = v2.get_label_by_id('101')
bx.annotate(label_101.get_text(), xy=label_101.get_position()+np.array([.01, 0]), xytext=label_101.get_position()+np.array([-25, 0]),
             ha='right', va='center', xycoords='data', textcoords='offset points',
             arrowprops=dict(arrowstyle='->',color='k'), fontsize=label_101.get_fontsize())
label_101.set_text('')

label_111 = v2.get_label_by_id('111')
bx.annotate(label_111.get_text(), xy=label_111.get_position()+np.array([-.04, .02]), xytext=label_111.get_position()+np.array([-40, -40]),
             ha='center', va='center', xycoords='data', textcoords='offset points',
             arrowprops=dict(arrowstyle='->',color='k'), fontsize=label_111.get_fontsize())
label_111.set_text('')

label_010 = v2.get_label_by_id('010')
label_010.set_position(label_010.get_position() + np.array([-.025, -.04]))


# Panel titles. Each is drawn exactly once, sized to match the region labels;
# drawing them a second time at the same coordinates with a different font size
# would leave two overlapping strings on each panel.
ax.text(
    0, .91,
    '(a) All Papers (n = {:,})'.format(len(Rp.union(Lp).union(Ap))),
    ha='center', va='center', fontsize=label_111.get_fontsize()
)

bx.text(
    0, .8,
    '(b) All Authors (n = {:,})'.format(len(Ra.union(La).union(Aa))),
    ha='center', va='center', fontsize=label_111.get_fontsize()
)


fig.tight_layout()

fig.savefig('../graphs/2300_papers_authors_venn.png', dpi=300, bbox_inches='tight')

# Inside & Outside Communities

## The Core/Venue Papers
These are the papers that have been published at RecSys or one of its associated venues (workshops, TORS).

In [None]:
# Keep only the recsys papers flagged as venue papers, and collect their ids.
venue_papers_df = recsys_papers_df.loc[recsys_papers_df['is_venue_paper']]
venue_paper_ids = set(pd.unique(venue_papers_df['paperId']))

venue_papers_df.shape, len(venue_paper_ids)

In [None]:
# Everyone who appears in the author list of at least one venue paper.
venue_author_ids = set(pd.unique(venue_papers_df['authors'].explode().dropna()))
len(venue_author_ids)

## Identifying Inside/Outside Papers/Authors
This is where we define the inside and outside communities based on publication related to the main/core RS venues.

### Inside Papers & Authors
An inside paper is a recsys paper with an author who has published in the core venues.

In [None]:
# All the papers published by core authors/recsys venue authors.
has_venue_author = (
    papers_df['authors']
    .swifter
    .apply(lambda authors: len(set(authors).intersection(venue_author_ids))>0)
)

has_venue_author.sum()

In [None]:
# An inside paper is a recsys paper with a recsys author or co-author.
is_inside_paper = is_recsys_paper & has_venue_author

inside_papers = papers_df[is_inside_paper]

inside_paper_ids = set(inside_papers['paperId'].unique())

len(inside_paper_ids), inside_papers.shape

In [None]:
# The inside authors are all those who have authored inside papers.
inside_author_ids = set(inside_papers['authors'].explode().dropna().unique())

len(inside_author_ids)

### Outside Papers & Authors
Similarly, we need to define the outside papers/authors. An outside paper is an RS paper that is not an inside paper. An outside author is an author of an outside paper.

In [None]:
# Outside paper = core recsys paper that is not an inside paper.
outside_papers = papers_df.loc[is_recsys_paper & ~is_inside_paper]
outside_paper_ids = set(pd.unique(outside_papers['paperId']))

len(outside_paper_ids), outside_papers.shape

In [None]:
# Outside authors: anyone listed as an author of an outside paper.
outside_author_ids = set(pd.unique(outside_papers['authors'].explode().dropna()))
len(outside_author_ids)

In [None]:
# Paper ids that are in both the outside and the inside sets.
outside_paper_ids & inside_paper_ids

In [None]:
# Number of authors with both an outside and an inside paper.
len(outside_author_ids & inside_author_ids)

### Mark papers/authors with inside/outside indicator
We need to be careful with the ordering here -- there might be a better way to do this ... -- so that the papers that are both inside and outside are considered to be inside, as this is what is intended.

In [None]:
# Label each recsys paper. The inside test is the outer one, so a paper found in
# both id sets is labelled 'inside'; papers in neither set are left as None.
recsys_papers_df['paper_type'] = np.where(
    recsys_papers_df['paperId'].isin(inside_paper_ids),
    'inside',
    np.where(recsys_papers_df['paperId'].isin(outside_paper_ids), 'outside', None),
)

# Distinct labels, per-label counts, and the number of labelled papers.
paper_type_counts = recsys_papers_df.groupby('paper_type').size()
recsys_papers_df['paper_type'].unique(), paper_type_counts, paper_type_counts.sum()

In [None]:
is_inside_author = authors_df['authorId'].isin(inside_author_ids)
is_outside_author = authors_df['authorId'].isin(outside_author_ids)

# Authors in neither group first; this mask is disjoint from the other two.
authors_df.loc[~(is_outside_author | is_inside_author), 'author_type'] = 'non-inside_outside'

# 'inside' is written after 'outside' so that an author in both groups ends up 'inside'.
authors_df.loc[is_outside_author, 'author_type'] = 'outside'
authors_df.loc[is_inside_author, 'author_type'] = 'inside'

# Table shape, distinct labels, per-label counts, and the total number of labelled authors.
author_type_counts = authors_df.groupby('author_type').size()
authors_df.shape, authors_df['author_type'].unique(), author_type_counts, author_type_counts.sum()

In [None]:
# Also add author type to the papers DF.
# `isdisjoint` walks each author list directly and stops at the first match, instead of
# building a throwaway set and a full intersection for every row. A paper whose author
# list is missing (None) is treated as having no inside/outside author rather than raising.
has_inside_author = papers_df['authors'].swifter.apply(
    lambda authors: authors is not None and not inside_author_ids.isdisjoint(authors)
)
has_outside_author = papers_df['authors'].swifter.apply(
    lambda authors: authors is not None and not outside_author_ids.isdisjoint(authors)
)

# 'inside' is written after 'outside' so that a paper with both kinds of author ends up 'inside'.
papers_df.loc[has_outside_author, 'author_type'] = 'outside'
papers_df.loc[has_inside_author, 'author_type'] = 'inside'
papers_df.loc[(~has_outside_author) & (~has_inside_author), 'author_type'] = 'non-inside_outside'

# Distinct labels, per-label counts, and the total number of labelled papers.
papers_author_type_counts = papers_df.groupby('author_type').size()
papers_df['author_type'].unique(), papers_author_type_counts, papers_author_type_counts.sum()

## Fix some issues noticed along the way
A few manual fixes that are appropriate for a very small number of issues that have been noted during the analysis. Mostly these are due to some dodgy SS data records. Fortunately, there are very few examples, at least that I have found.

In [None]:
# Row labels (index values of recsys_papers_df) of the records patched by hand.
NAME_VARIANT_ROW = 1833975   # F. Ricci vs Francesco Ricci
GROUPLENS_ROW = 14164        # record overwritten with the GroupLens paper's details

# Assigning through .loc/.at with a label that is not in the index silently appends
# a new, mostly empty row. If the upstream data changes and these labels disappear,
# stop here rather than writing a junk row into the saved dataset.
missing_rows = [row for row in (NAME_VARIANT_ROW, GROUPLENS_ROW) if row not in recsys_papers_df.index]
if missing_rows:
    raise KeyError('Manual fixes target rows missing from recsys_papers_df: {}'.format(missing_rows))

recsys_papers_df.loc[NAME_VARIANT_ROW, 'paper_type'] = 'inside'

recsys_papers_df.at[GROUPLENS_ROW, 'venue'] = 'Computer Supported Collaborative Work'
recsys_papers_df.at[GROUPLENS_ROW, 'clean_venue'] = 'cscw'
recsys_papers_df.at[GROUPLENS_ROW, 'title'] = 'GroupLens: An open architecture for collaborative filtering of netnews'

# .at is used so that the whole list is stored in a single cell.
recsys_papers_df.at[GROUPLENS_ROW, 'author_names'] = ['Paul Resnick', 'Neophytos Iacovou', 'Mitesh Suchak', 'Peter Bergstrom', 'John Riedl']


## The Inside/Outside Venn Diagrams
Produce the Venn diagrams to show the relationships between the Inside and Outside sets.

In [None]:
# Two stacked Venn diagrams: (a) venue/inside/outside paper ids on `ax` and
# (b) venue/inside/outside author ids on `bx`, saved together as one PNG.
fig, (ax, bx) = plt.subplots(figsize=(7, 14), nrows=2)

# Filled regions; each set label carries the set size, each region label its count.
v1 = venn3(
    ax=ax,
    subsets=[venue_paper_ids, inside_paper_ids, outside_paper_ids],
    set_labels=['$Vp$ ({:,})'.format(len(venue_paper_ids)), '$I_p$ ({:,})'.format(len(inside_paper_ids)), '$O_p$ ({:,})'.format(len(outside_paper_ids))],
    set_colors=['tab:blue', 'tab:orange', 'tab:green'],
    subset_label_formatter=lambda v: '{:,}'.format(v),
    alpha=.5
      
)
# Black outlines drawn over the same three sets.
c1=venn3_circles(ax=ax, subsets=[venue_paper_ids, inside_paper_ids, outside_paper_ids], linestyle='-', linewidth=1, color="k")



v2 = venn3(
    ax=bx,
    subsets=[venue_author_ids, inside_author_ids, outside_author_ids],
    set_labels=['$Va$ ({:,})'.format(len(venue_author_ids)), '$I_a$ ({:,})'.format(len(inside_author_ids)), '$O_a$ ({:,})'.format(len(outside_author_ids))],
    set_colors=['tab:blue', 'tab:orange', 'tab:green'],
    subset_label_formatter=lambda v: '{:,}'.format(v),
    alpha=.5
      
)
c2=venn3_circles(ax=bx, subsets=[venue_author_ids, inside_author_ids, outside_author_ids], linestyle='-', linewidth=1, color="k")



# A bit of scaling to improve the correctness of the areas across the pair of venn.
# (31265/3.1416)**.5/(62426/3.1416)**.5 = 0.7
# 1/.7 = 1.42

ax.set_xlim(-1, 1)
ax.set_ylim(-1, 1)
bx.set_xlim(-.8, .8)
bx.set_ylim(-.8, .8)


# Adjust the labelling for the first venn
# The venue set label ('A') is replaced by an arrowed '$V_p$' annotation and then
# blanked, so the count formatted into the '$Vp$ (...)' set label is not shown.
# The offsets are hand-tuned.
Cp_label = v1.get_label_by_id('A')
ax.annotate('$V_p$', xy=Cp_label.get_position()+np.array([-0.05, .17]), xytext=Cp_label.get_position()+np.array([-25, 60]),
             ha='center', xycoords='data', textcoords='offset points',
             arrowprops=dict(arrowstyle='->',color='k'), fontsize=Cp_label.get_fontsize())
Cp_label.set_text('')

# Adjust the labelling for the second venn
# Same treatment for the venue-author set label; note the font size is taken
# from the first diagram's label (Cp_label).
Ca_label = v2.get_label_by_id('A')
bx.annotate('$V_a$', xy=Ca_label.get_position()+np.array([0, .18]), xytext=Ca_label.get_position()+np.array([10, 60]),
             ha='center', xycoords='data', textcoords='offset points',
             arrowprops=dict(arrowstyle='->',color='k'), fontsize=Cp_label.get_fontsize())

# Ca_label.set_position(Ca_label.get_position() + np.array([0, .2]))
Ca_label.set_text('')

# label_110 = v2.get_label_by_id('110')
# label_110.set_text('')

# '010' is the region belonging to the second set only (inside authors); nudge its count label.
label_010 = v2.get_label_by_id('010')
label_010.set_position(label_010.get_position() + np.array([-0.05, -.19]))



# Panel titles, each reporting the size of the inside/outside union.
ax.text(
    0, 1, 
    '(a) Inside & Outside Recsys Papers (n = {:,})'.format(len(inside_paper_ids.union(outside_paper_ids))),
    ha='center', va='center', fontsize=Cp_label.get_fontsize()
)

bx.text(
    0, .8, 
    '(b) Inside & Outside Recsys Authors (n = {:,})'.format(len(inside_author_ids.union(outside_author_ids))),
    ha='center', va='center', fontsize=Cp_label.get_fontsize()
)



fig.tight_layout()

fig.savefig('../graphs/2300_inside_outside_venn.png', dpi=300, bbox_inches='tight')


An inside paper is either a paper that is published in one of the core venues or a paper published by someone who has published in the core venues; in other words, inside papers are papers that have an authorship connection to the core venues. Then, inside authors are all of the authors of inside papers. Note that some inside authors will not have published in the core venues but they may have co-authored with someone who has.

This subset of the inside authors provides an important bridge to the outside papers/authors. By definition, an outside paper is a recsys paper that is not an inside paper and an outside author is an author of an outside paper. Most of these outside authors are entirely separate from the inside community. They have not published a core venue paper, nor have they co-authored any papers with an author who has published a core venue paper.

Above we see that 60k authors of recsys papers have no connection to the core recsys community, which is 4x the number of inside authors. Moreover, these outside authors have published more than 3x the number of recsys papers as the inside community.

The significance of this is that it points to a vibrant community of recsys researchers that exists beyond the core venues.



# Save papers with inside/outside indicators

## The main inside/outside dataset of RS papers

In [None]:
# Write the recsys papers, now carrying the inside/outside labels, to a feather file.
recsys_papers_df.to_feather(path='../data/processed/2300_inside_outside_papers.feather')
recsys_papers_df.shape

## The main RS universe dataset

In [None]:
# Every row of papers_df whose id belongs to the paper universe Up, as an independent copy.
in_universe = papers_df['paperId'].isin(Up)
recsys_universe_papers_df = papers_df.loc[in_universe].copy()
recsys_universe_papers_df.shape

In [None]:
# Write the universe papers to a feather file.
recsys_universe_papers_df.to_feather(path='../data/processed/2300_recsys_universe_papers.feather')
recsys_universe_papers_df.shape

## The inside/outside authors dataset

In [None]:
# Write the authors table, now carrying the author_type labels, to a feather file.
authors_df.to_feather(path='../data/processed/2300_inside_outside_authors.feather')
authors_df.shape