# Comparing the Inside and Outside Communities
In this notebook we compare a number of publication output/impact features of the inside and outside communities to produce several visualisations used in the main analysis.

In [None]:
import swifter

import os
import json
import random
import time
from datetime import datetime

import string 

import matplotlib.pyplot as plt

import Stemmer

import random
import requests
from itertools import chain
from more_itertools import sliced

import pandas as pd
import matplotlib as mpl
from matplotlib.pylab import plt
from matplotlib import colormaps
from matplotlib.lines import Line2D

import numpy as np

from glob import glob, iglob
from pathlib import Path
                         
from loguru import logger
from IPython.display import display, clear_output

from multiprocessing import Pool

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import nltk
from nltk.stem import WordNetLemmatizer

import seaborn as sns

from itables import init_notebook_mode, show, options
init_notebook_mode(all_interactive=False)


pd.__version__

In [None]:
# Select seaborn's 'paper' plotting context with fonts scaled by 1.25.
sns.set_context('paper', font_scale=1.25)

# Load Datasets

In [None]:
# Location of the feather file holding the inside/outside papers dataset.
recsys_papers_dataset = '../data/processed/2300_inside_outside_papers.feather'

# Load the dataset into a DataFrame and display its (rows, columns) shape.
recsys_df = pd.read_feather(recsys_papers_dataset)
recsys_df.shape

In [None]:
# Number of rows in the dataset for each paper_type value.
recsys_df.groupby('paper_type').size()

In [None]:
# Number of distinct authors per paper type: one row per (paper, author),
# missing authors dropped, then unique values counted within each type.
(
    recsys_df
    .set_index('paper_type')['authors']
    .explode()
    .dropna()
    .groupby('paper_type')
    .apply(lambda authors: authors.nunique())
)

# Papers, Authors, Citations
A series of visualisations comparing the papers, authors, and citations for the inside and outside communities.

In [None]:
# Every per-year table below is cut to this (label-inclusive) range of years.
_year_window = slice(1990, 2023)


def _unique_per_year_by_paper_type(column):
    """Count the distinct values of `column` for each (year, paper_type) pair.

    Returns a table indexed by year with one column per paper type; missing
    combinations are filled with 0.
    """
    exploded = (
        recsys_df
        .set_index(['year', 'paper_type'])[column]
        .explode().dropna()
    )
    counts = exploded.groupby(['year', 'paper_type']).nunique()
    return counts.unstack().fillna(0).loc[_year_window]


# Papers published per year, one column per paper type.
_papers_per_group = recsys_df.groupby(['year', 'paper_type']).size()
num_papers_per_year_by_paper_type = (
    _papers_per_group.unstack().fillna(0).loc[_year_window]
)

# Distinct authors and distinct venues per year, one column per paper type.
num_authors_per_year_by_paper_type = _unique_per_year_by_paper_type('authors')
num_venues_per_year_by_paper_type = _unique_per_year_by_paper_type('clean_venue')

# Citations per citation year: one row per entry of `citation_years`,
# counted by (citation year, paper type of the paper holding that entry).
_citation_rows = (
    recsys_df.set_index('paper_type')['citation_years']
    .explode().dropna().reset_index()
)
num_citations_per_year_by_paper_type = (
    _citation_rows
    .groupby(['citation_years', 'paper_type'])
    .size()
    .unstack()
    .fillna(0)
    .loc[_year_window]
)

# Citations in a year divided by the running total of papers (within the
# year window) up to that year, shown from 1999 onwards.
_cumulative_papers = num_papers_per_year_by_paper_type.cumsum()
mean_citations_per_paper_per_year_by_paper_type = (
    num_citations_per_year_by_paper_type
    .div(_cumulative_papers, axis=1)
    .fillna(0)
    .loc[1999:]
)


In [None]:
def plot_inside_outside_regplot(ax, data, order=4):
    """Draw inside vs. outside trends for one metric on a single axes.

    For each community the values are drawn as a scatter with a polynomial
    regression line (``sns.regplot``), followed by a two-entry legend.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    data : DataFrame whose index supplies the x values; the y values are the
        columns selected by ``filter(like='inside')`` and
        ``filter(like='outside')``.
    order : int, default 4
        Polynomial order passed to ``sns.regplot``.
    """

    # growth = data.loc[2000:].pct_change().agg(['mean', 'std'])

    for community in ('inside', 'outside'):
        sns.regplot(
            ax=ax,
            x=data.index,
            y=data.filter(like=community),
            order=order,
            line_kws=dict(lw=1),
            scatter_kws=dict(s=15, fc='w'),
            # ci=None is seaborn's documented way to disable the confidence
            # band. The previous ci=False is not None, so the bootstrap
            # resampling still ran on every call only to draw a zero-width
            # band.
            ci=None,
        )

    ax.set_ylim(0)
    ax.set_xlabel('')
    ax.set_ylabel('')

    # Proxy handles for the legend.
    # NOTE(review): the colours assume the two regplot calls pick up
    # tab:blue and tab:orange from the default colour cycle; confirm if the
    # palette is ever changed.
    custom_lines = [
        Line2D([0], [0], color='tab:blue', lw=1),
        Line2D([0], [0], color='tab:orange', lw=1),
    ]

    # ax.legend(
    #     custom_lines,
    #     ['Inside ({:.2f}±{:.2f})'.format(*growth['inside']), 'Outside ({:.2f}±{:.2f})'.format(*growth['outside'])],
    #     ncol=1, frameon=False
    # )

    ax.legend(custom_lines, ['Inside', 'Outside'], ncol=1, frameon=False)

# 2x2 overview figure: papers, unique authors, unique venues and citations
# per year, each panel comparing the inside and outside communities.
fig, ((ax, bx), (cx, dx)) = plt.subplots(figsize=(8, 5), ncols=2, nrows=2, sharex=True)

plot_inside_outside_regplot(ax, num_papers_per_year_by_paper_type)
plot_inside_outside_regplot(bx, num_authors_per_year_by_paper_type)
plot_inside_outside_regplot(cx, num_venues_per_year_by_paper_type)
plot_inside_outside_regplot(dx, num_citations_per_year_by_paper_type)
# plot_inside_outside_regplot(dx, mean_citations_per_paper_per_year_by_paper_type)

# Show y ticks in thousands on every panel whose label says ('000s).
# Panel (d) was previously left out of this loop, so its raw counts
# contradicted its "('000s)" label. A FuncFormatter is used instead of
# set_yticklabels so the labels are derived from whatever ticks are finally
# drawn, rather than frozen against the tick positions at this point.
for xx in [ax, bx, cx, dx]:
    xx.yaxis.set_major_formatter(
        mpl.ticker.FuncFormatter(lambda value, _pos: '{:.0f}'.format(value / 1000))
    )

ax.set_ylabel("Num Papers ('000s)")
bx.set_ylabel("Num Authors ('000s)")
cx.set_ylabel("Num Venues ('000s)")
dx.set_ylabel("Num Citations ('000s)")

ax.set_title('(a) Papers x Year')
bx.set_title('(b) Unique Authors x Year')
cx.set_title('(c) Unique Venues x Year')
dx.set_title('(d) Citations x Year')

fig.tight_layout()

fig.savefig('../graphs/3100_papers_authors_citations_by_year.png', dpi=300, bbox_inches='tight')

In [None]:
# Median row-to-row (i.e. year-to-year) fractional change in paper counts from 2000 onwards, per column.
num_papers_per_year_by_paper_type.loc[2000:].pct_change().median()

In [None]:
# Median row-to-row (i.e. year-to-year) fractional change in unique-author counts from 2000 onwards, per column.
num_authors_per_year_by_paper_type.loc[2000:].pct_change().median()

In [None]:
# Median row-to-row (i.e. year-to-year) fractional change in citation counts from 2000 onwards, per column.
num_citations_per_year_by_paper_type.loc[2000:].pct_change().median()

# Author Engagement
The author engagement graphs show various features of author engagement in the inside/outside communities, including the number of active, new, returning, and churning authors by year.

In [None]:
# Build a table indexed by year with one column per paper type, where each
# cell holds the array of unique authors across that group's papers.
authors_by_year = pd.DataFrame(
        recsys_df
        .groupby(['year', 'paper_type'])
        # Join the per-paper author lists of the group and de-duplicate them.
        .apply(lambda g: np.unique(np.concatenate(g['authors'].values)))
        , columns=['active_authors']
# unstack() moves paper_type into the columns; cells that are the np.nan
# object afterwards (presumably year/type pairs with no papers) become [].
).unstack().applymap(lambda v: [] if v is np.nan else v).add_suffix('_active')

# Keep only the second column level as the column names; later cells refer to
# these columns as 'inside_active' and 'outside_active'.
authors_by_year.columns = authors_by_year.columns.get_level_values(1)

authors_by_year.head()

## The known/future authors

In [None]:
def aggregate_author_lists(author_lists):
    """Return the running union of author lists, in row order.

    Parameters
    ----------
    author_lists : pd.Series
        One array/list of authors per index entry (e.g. per year).

    Returns
    -------
    pd.Series
        Same index as `author_lists`; each entry is the sorted array of
        unique authors found in that row and every row before it.
    """
    cumulative = []
    seen = None
    for authors in author_lists.values:
        # Extend the de-duplicated set carried over from the previous row
        # instead of re-concatenating every earlier raw list for each row.
        pieces = [authors] if seen is None else [seen, authors]
        seen = np.unique(np.concatenate(pieces))
        cumulative.append(seen)

    return pd.Series(cumulative, index=author_lists.index)



In [None]:
# "Known" authors: aggregate_author_lists is applied to each *_active column,
# giving for every year the unique authors seen in that year or any earlier row.
known_authors = (
    authors_by_year
    .apply(aggregate_author_lists, axis=0)
    .rename(columns={'inside_active': 'inside_known', 'outside_active':'outside_known'})
)

# Shift the known-author lists down one row so each year lines up with the
# previous row's cumulative list; these columns get a '_prev' suffix.
# NOTE(review): shift() presumably fills the vacated first row with NaN rather
# than None, in which case this `is None` replacement does nothing and the
# `is np.nan` pass below is what actually blanks that row — confirm.
prev_known_authors = known_authors.shift(1).applymap(lambda v: [] if v is None else v).add_suffix('_prev')

# Current and previous lists side by side; cells that are np.nan become [].
known_authors = pd.concat([known_authors, prev_known_authors], axis=1).applymap(lambda v: [] if v is np.nan else v)

known_authors.head()

In [None]:
# "Future" authors: the rows are reversed before aggregate_author_lists is
# applied, so the accumulation runs from the last row backwards; the trailing
# .loc[::-1] puts the rows back in their original order.
future_authors = (
    authors_by_year.loc[::-1]
    .apply(aggregate_author_lists, axis=0)
    .rename(columns={'inside_active': 'inside_future', 'outside_active':'outside_future'})
).loc[::-1]

# Shift the future-author lists up one row so each year lines up with the
# following row's list; these columns get a '_next' suffix.
# NOTE(review): shift() presumably fills the vacated last row with NaN rather
# than None, in which case this `is None` replacement does nothing and the
# `is np.nan` pass below is what actually blanks that row — confirm.
next_future_authors = future_authors.shift(-1).applymap(lambda v: [] if v is None else v).add_suffix('_next')

# Current and next lists side by side; cells that are np.nan become [].
future_authors = pd.concat([future_authors, next_future_authors], axis=1).applymap(lambda v: [] if v is np.nan else v)

future_authors.head()

## Combine active/known/future authors

In [None]:
# Join, column-wise, the active-author table, all known-author columns
# (current and '_prev'), and only those future-author columns whose name
# contains 'next'. Cells that are np.nan are replaced with [].
authors_by_year = pd.concat([
    authors_by_year,
    known_authors,
    future_authors.filter(like='next')
], axis=1).applymap(lambda v: [] if v is np.nan else v)

authors_by_year.head()

## Calculate new, returning, churning authors

In [None]:
_communities = ('inside', 'outside')


def _authors_only_in(keep_col, remove_col):
    """Per row, list the authors in `keep_col` that do not appear in `remove_col`."""
    return authors_by_year.apply(
        lambda row: list(set(row[keep_col]).difference(set(row[remove_col]))),
        axis=1,
    )


# New authors: active in the year but absent from the known authors of the previous years.
for side in _communities:
    authors_by_year[side + '_new'] = _authors_only_in(side + '_active', side + '_known_prev')

# Returning authors: active in the year and not new.
for side in _communities:
    authors_by_year[side + '_returning'] = _authors_only_in(side + '_active', side + '_new')

# Churning authors: active in the year and never appearing again in the future.
for side in _communities:
    authors_by_year[side + '_churning'] = _authors_only_in(side + '_active', side + '_future_next')

authors_by_year.head()

## The fraction of active, new, returning, churning authors

In [None]:
# Replace every author list with its length; columns are renamed to 'num_<original name>'.
author_counts = authors_by_year.applymap(len).add_prefix('num_')
# Show the last rows of the columns whose name contains 'outside'.
author_counts.filter(like='outside').tail()

In [None]:

# 2x2 figure of author-engagement fractions; all panels share both axes.
fig, axs = plt.subplots(figsize=(8, 5), nrows=2, ncols=2, sharey=True, sharex=True)

ax, bx, cx, dx = axs.flatten()

from_year, to_year = 2000, 2020


def _inside_outside_fraction(numerator, denominator):
    """Return num_<side>_<numerator> / num_<side>_<denominator> for both communities.

    The result has columns 'inside' and 'outside' and is restricted to the
    rows from `from_year` to `to_year`.
    """
    ratios = [
        author_counts['num_' + side + '_' + numerator] / author_counts['num_' + side + '_' + denominator]
        for side in ('inside', 'outside')
    ]
    combined = pd.concat(ratios, axis=1).rename(columns={0: 'inside', 1: 'outside'})
    return combined.loc[from_year:to_year]


# Active authors relative to all authors known so far; the other three are
# relative to the authors active in the same year.
frac_active_authors = _inside_outside_fraction('active', 'known')
frac_new_authors = _inside_outside_fraction('new', 'active')
frac_returning_authors = _inside_outside_fraction('returning', 'active')
frac_churning_authors = _inside_outside_fraction('churning', 'active')

_panels = (
    (ax, frac_active_authors),
    (bx, frac_new_authors),
    (cx, frac_returning_authors),
    (dx, frac_churning_authors),
)
for _panel_axis, _fractions in _panels:
    plot_inside_outside_regplot(_panel_axis, _fractions, order=2)

ax.set_ylim(0, 1)

ax.set_title('(a) Fraction of Active Authors')
bx.set_title('(b) Fraction of New Authors')
cx.set_title('(c) Fraction of Returning Authors')
dx.set_title('(d) Fraction of Churning Authors')

fig.tight_layout()

fig.savefig('../graphs/3100_active_authors_new_returning_churning_authors.png', dpi=300, bbox_inches='tight')

In [None]:
# For each fraction table: the change from the first to the last column
# (pct_change across columns), summarised over the years by mean and std.
tuple(
    fractions.pct_change(axis=1).iloc[:, -1].agg(['mean', 'std'])
    for fractions in (
        frac_active_authors,
        frac_new_authors,
        frac_returning_authors,
        frac_churning_authors,
    )
)

# Author Longevity & Maturity

In [None]:
# Publication lifespan: last minus first publication year for each
# (author, paper_type) pair.
_author_years = (
    recsys_df
    .set_index(['paper_type', 'year'])['authors']
    .explode()
    .reset_index()
)
_first_last_year = _author_years.groupby(['authors', 'paper_type'])['year'].agg(['min', 'max'])
publication_lifespan = _first_last_year.diff(axis=1)['max']

# Number of papers for each (author, paper_type) pair.
_author_papers = (
    recsys_df
    .set_index('paper_type')['authors']
    .explode().dropna()
    .reset_index()
)
num_inside_outside_papers_by_author = _author_papers.groupby(['authors', 'paper_type']).size()

# Mean citations per paper for each author, then averaged over the authors
# of each paper type.
_author_citations = (
    recsys_df.set_index(['paper_type', 'citationCount'])['authors']
    .explode().dropna().reset_index()
)
_citations_per_author = _author_citations.groupby(['authors', 'paper_type'])['citationCount'].mean()
mean_citations_per_author = _citations_per_author.groupby('paper_type').mean()


In [None]:
# 2x2 bar-chart figure summarising author longevity and maturity.
fig, axs = plt.subplots(figsize=(8, 5), nrows=2, ncols=2, sharex=True)
ax, bx, cx, dx = axs.flatten()


def _bar(series, axis):
    """Draw `series` as blue/orange bars with black edges on `axis`."""
    series.plot(ax=axis, kind='bar', color=['tab:blue', 'tab:orange'], ec='k', lw=1)


def _label_bars_with_share(axis, counts, shares):
    """Write each community's share (as a percentage) on top of its bar."""
    for position, side in enumerate(('inside', 'outside')):
        axis.text(position, counts[side], '{:.1f}%'.format(100*shares[side]), ha='center', va='bottom')


# (a) mean papers per author and (b) mean publication lifespan, by paper type.
_bar(num_inside_outside_papers_by_author.groupby('paper_type').mean(), ax)
_bar(publication_lifespan.groupby('paper_type').mean(), bx)

# (c) Once-off authors: exactly one paper.
num_once_off_authors = num_inside_outside_papers_by_author.groupby('paper_type').apply(lambda papers: (papers == 1).sum())
frac_once_off_authors = num_inside_outside_papers_by_author.groupby('paper_type').apply(lambda papers: (papers == 1).mean())

_bar(num_once_off_authors, cx)
_label_bars_with_share(cx, num_once_off_authors, frac_once_off_authors)
cx.set_ylim(0, 90000)

# (d) Mature authors: more than 20 papers.
num_mature_authors = num_inside_outside_papers_by_author.groupby('paper_type').apply(lambda papers: (papers > 20).sum())
frac_mature_authors = num_inside_outside_papers_by_author.groupby('paper_type').apply(lambda papers: (papers > 20).mean())

_bar(num_mature_authors, dx)
_label_bars_with_share(dx, num_mature_authors, frac_mature_authors)
dx.set_ylim(0, 450)

# mean_citations_per_author.plot(ax=ex, kind='bar', color=['tab:blue', 'tab:orange'], ec='k', lw=1)

for xx in (ax, bx, cx, dx):
    xx.set_xlabel('')
    xx.set_xticklabels(['Inside', 'Outside'], rotation=0)

_y_labels = ('Mean Papers', 'Num Years', 'Num Authors', 'Num Authors')
_titles = (
    '(a) Num Papers/Author',
    '(b) Num Years/Author',
    '(c) Num Once-Off Authors',
    '(d) Num Authors >20 Papers',
)
for _panel_axis, _y_label in zip((ax, bx, cx, dx), _y_labels):
    _panel_axis.set_ylabel(_y_label)
for _panel_axis, _title in zip((ax, bx, cx, dx), _titles):
    _panel_axis.set_title(_title)

fig.tight_layout()

fig.savefig('../graphs/3100_papers_citations_per_author.png', dpi=300, bbox_inches='tight')

# Citations by Paper

In [None]:
# Influential citations as a fraction of all citations, per paper.
recsys_df['frac_influentialCitationCount'] = recsys_df['influentialCitationCount']/recsys_df['citationCount']

# 2x3 grid of panels (a)-(e); the sixth axes (fx) is switched off below.
fig, axs = plt.subplots(figsize=(9, 5), nrows=2, ncols=3)
ax, bx, cx, dx, ex, fx = axs.flatten()


fx.axis("off")

# (a) Mean citationCount per paper type. The mean influential fraction is
# computed as well but only used by the commented-out twin axis.
num_citations_by_paper_type = recsys_df.groupby('paper_type')['citationCount'].mean()
frac_influential_citations_by_paper_type = recsys_df.groupby('paper_type')['frac_influentialCitationCount'].mean()

num_citations_by_paper_type.plot(ax=ax, kind='bar', color=['tab:blue', 'tab:orange'], ec='k', lw=1)

# aax = ax.twinx()
# frac_influential_citations_by_paper_type.plot(ax=aax, lw=1, marker='x', markersize=6, c='k')

ax.set_ylim(0, 55)
# aax.set_ylim(0, .06)
# aax.set_ylabel('Frac Influential')

# (b) Number of papers with citationCount == 0 per paper type; the share of
# such papers is written above each bar as a percentage.
num_zero_citations = recsys_df.groupby('paper_type')['citationCount'].apply(lambda g: (g==0).sum())
frac_zero_citations = recsys_df.groupby('paper_type')['citationCount'].apply(lambda g: (g==0).mean())
num_zero_citations.plot(ax=bx, kind='bar', color=['tab:blue', 'tab:orange'], ec='k', lw=1)
# bbx = bx.twinx()
# frac_zero_citations.plot(ax=bbx, c='k', lw=1, marker='x', markersize=6)
# bbx.set_ylim(0, .5)
# bbx.set_ylabel('Frac Papers')
bx.text(0, num_zero_citations['inside'], '{:.1f}%'.format(100*frac_zero_citations['inside']), ha='center', va='bottom')
bx.text(1, num_zero_citations['outside'], '{:.1f}%'.format(100*frac_zero_citations['outside']), ha='center', va='bottom')
bx.set_ylim(0, 16000)



# Years to first citation
# Drop the columns from any earlier run so the join below does not collide with them.
if 'min_citation_year' in recsys_df.columns:
    recsys_df = recsys_df.drop(columns=['min_citation_year', 'max_citation_year'])

# For rows whose citation_years is not None and non-empty, take its .min()
# and .max() as two new columns.
min_max_citation_year = (
    recsys_df[recsys_df['citation_years'].map(lambda y: len(y)>0 if y is not None else False)]['citation_years']
    .swifter
    .apply(lambda years: pd.Series([years.min(), years.max()], index=['min_citation_year', 'max_citation_year']))
)

# Index-aligned join; rows that were filtered out above get missing values.
recsys_df = recsys_df.join(min_max_citation_year)

recsys_df['years_to_first_citation'] = recsys_df['min_citation_year']-recsys_df['year']

# (c) Mean years from publication to the earliest citation year, per paper type.
years_to_first_cite = recsys_df.groupby('paper_type')['years_to_first_citation'].mean()
years_to_first_cite.plot(ax=cx, kind='bar', color=['tab:blue', 'tab:orange'], ec='k', lw=1)


# Citation half-life
# Sorted copy of each row's citation years ([] when citation_years is None).
recsys_df['sorted_citation_years'] = recsys_df['citation_years'].swifter.apply(lambda years: sorted(years) if years is not None else [])

# Only for rows with more than 5 citation years: the last entry in the first
# half of the sorted list. Other rows are left missing by the index-aligned assignment.
recsys_df['citation_half_life_year'] = (
    recsys_df[recsys_df['citation_years'].map(lambda y: len(y)>5 if y is not None else False)]['sorted_citation_years']
    .swifter
    .apply(lambda years: years[:len(years)//2][-1])
)

recsys_df['citation_half_life'] = recsys_df['citation_half_life_year']-recsys_df['year']

# (d) Mean half-life (half-life year minus publication year), per paper type.
citation_half_life = recsys_df.groupby('paper_type')['citation_half_life'].mean()
citation_half_life.plot(ax=dx, kind='bar', color=['tab:blue', 'tab:orange'], ec='k', lw=1)


# Sources of citations
# Sets of paperIds for each paper type, used for the membership tests below.
inside_paper_ids = set(recsys_df[recsys_df['paper_type']=='inside']['paperId'].unique())
outside_paper_ids = set(recsys_df[recsys_df['paper_type']=='outside']['paperId'].unique())

# For each paper, the entries of its `citations` that are inside / outside
# paper ids (an empty set when `citations` is None).
recsys_df['citations_from_inside'] = recsys_df['citations'].swifter.apply(lambda papers: set(papers).intersection(inside_paper_ids) if papers is not None else set())
recsys_df['citations_from_outside'] = recsys_df['citations'].swifter.apply(lambda papers: set(papers).intersection(outside_paper_ids)if papers is not None else set())

# Sizes of the two sets as num_citations_from_inside / num_citations_from_outside.
for col in ['citations_from_inside', 'citations_from_outside']:
    recsys_df['num_' + col] = recsys_df[col].map(len)

# (e) Mean number of citations from each source per paper type, drawn as stacked bars.
num_citations_by_source_by_paper_type = recsys_df.groupby('paper_type')[['num_citations_from_inside', 'num_citations_from_outside']].mean()

num_citations_by_source_by_paper_type.plot(ax=ex, kind='bar', stacked=True, lw=.5, ec='k', color=['tab:blue', 'tab:orange', 'w'])

ex.set_ylim(0, 32)

ex.set_ylabel('Num Citations/Paper')

ex.set_xticklabels(['To Inside', 'To Outside'], rotation=0)
ex.set_xlabel('')

ex.legend(['From Inside', 'From Outside'], ncol=1, loc='upper right', frameon=False)


# Common x-axis cosmetics for panels (a)-(d).
for xx in (ax, bx, cx, dx):
    xx.set_xlabel('')
    xx.set_xticklabels(['Inside', 'Outside'], rotation=0)

ax.set_ylabel('Num Citations')
bx.set_ylabel('Num Papers')
cx.set_ylabel('Years')
dx.set_ylabel('Years')

ax.set_title('(a) Mean Citations/Paper')
bx.set_title('(b) Num Papers with Zero Cites')
cx.set_title('(c) Mean Years to 1st Cite')
dx.set_title('(d) Mean Half-Life (Years)')
ex.set_title('(e) Citation Sources')


fig.tight_layout()

# Saving is disabled for this variant of the figure.
# fig.savefig('../graphs/3100_citations_per_paper.png', dpi=300, bbox_inches='tight')

In [None]:
# Influential citations as a fraction of all citations, per paper.
recsys_df['frac_influentialCitationCount'] = recsys_df['influentialCitationCount']/recsys_df['citationCount']

# 2x2 grid of panels (a)-(d); this is the variant that is saved to disk.
fig, axs = plt.subplots(figsize=(9, 5), nrows=2, ncols=2)
ax, bx, cx, dx = axs.flatten()

# (a) Mean citationCount per paper type. The mean influential fraction is
# computed as well but not plotted in this cell.
num_citations_by_paper_type = recsys_df.groupby('paper_type')['citationCount'].mean()
frac_influential_citations_by_paper_type = recsys_df.groupby('paper_type')['frac_influentialCitationCount'].mean()

num_citations_by_paper_type.plot(ax=ax, kind='bar', color=['tab:blue', 'tab:orange'], ec='k', lw=1)

ax.set_ylim(0, 55)

# (b) Number of papers with citationCount == 0 per paper type; the share of
# such papers is written above each bar as a percentage.
num_zero_citations = recsys_df.groupby('paper_type')['citationCount'].apply(lambda g: (g==0).sum())
frac_zero_citations = recsys_df.groupby('paper_type')['citationCount'].apply(lambda g: (g==0).mean())
num_zero_citations.plot(ax=bx, kind='bar', color=['tab:blue', 'tab:orange'], ec='k', lw=1)
bx.text(0, num_zero_citations['inside'], '{:.1f}%'.format(100*frac_zero_citations['inside']), ha='center', va='bottom')
bx.text(1, num_zero_citations['outside'], '{:.1f}%'.format(100*frac_zero_citations['outside']), ha='center', va='bottom')
bx.set_ylim(0, 16000)



# Years to first citation
# Drop the columns from any earlier run so the join below does not collide with them.
if 'min_citation_year' in recsys_df.columns:
    recsys_df = recsys_df.drop(columns=['min_citation_year', 'max_citation_year'])

# For rows whose citation_years is not None and non-empty, take its .min()
# and .max() as two new columns.
min_max_citation_year = (
    recsys_df[recsys_df['citation_years'].map(lambda y: len(y)>0 if y is not None else False)]['citation_years']
    .swifter
    .apply(lambda years: pd.Series([years.min(), years.max()], index=['min_citation_year', 'max_citation_year']))
)

# Index-aligned join; rows that were filtered out above get missing values.
recsys_df = recsys_df.join(min_max_citation_year)

recsys_df['years_to_first_citation'] = recsys_df['min_citation_year']-recsys_df['year']

# (c) Mean years from publication to the earliest citation year, per paper type.
years_to_first_cite = recsys_df.groupby('paper_type')['years_to_first_citation'].mean()
years_to_first_cite.plot(ax=cx, kind='bar', color=['tab:blue', 'tab:orange'], ec='k', lw=1)


# Citation half-life
# Sorted copy of each row's citation years ([] when citation_years is None).
recsys_df['sorted_citation_years'] = recsys_df['citation_years'].swifter.apply(lambda years: sorted(years) if years is not None else [])

# Only for rows with more than 5 citation years: the last entry in the first
# half of the sorted list. Other rows are left missing by the index-aligned assignment.
recsys_df['citation_half_life_year'] = (
    recsys_df[recsys_df['citation_years'].map(lambda y: len(y)>5 if y is not None else False)]['sorted_citation_years']
    .swifter
    .apply(lambda years: years[:len(years)//2][-1])
)

recsys_df['citation_half_life'] = recsys_df['citation_half_life_year']-recsys_df['year']

# (d) Mean half-life (half-life year minus publication year), per paper type.
citation_half_life = recsys_df.groupby('paper_type')['citation_half_life'].mean()
citation_half_life.plot(ax=dx, kind='bar', color=['tab:blue', 'tab:orange'], ec='k', lw=1)



# Common x-axis cosmetics for all four panels.
for xx in (ax, bx, cx, dx):
    xx.set_xlabel('')
    xx.set_xticklabels(['Inside', 'Outside'], rotation=0)

ax.set_ylabel('Num Citations')
bx.set_ylabel('Num Papers')
cx.set_ylabel('Years')
dx.set_ylabel('Years')

ax.set_title('(a) Mean Citations/Paper')
bx.set_title('(b) Num Papers with Zero Cites')
cx.set_title('(c) Mean Years to 1st Cite')
dx.set_title('(d) Mean Half-Life (Years)')


fig.tight_layout()

fig.savefig('../graphs/3100_citations_per_paper.png', dpi=300, bbox_inches='tight')

In [None]:
# Display the mean citationCount per paper type.
num_citations_by_paper_type

In [None]:
# Display the mean citationCount per paper type.
# NOTE(review): this repeats the previous cell — confirm whether a different table was intended.
num_citations_by_paper_type