# Analysis of the RS Topics
We perform a similar analysis of the RS topics, including producing several graphs and wordcloud grids. We also provide an in-depth analysis and visualisation of the momentum (publication and citation) for each topic over time.

In [None]:
import swifter
import Stemmer

import re

import os
# Should prevent "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. " warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"  

import string 

import nltk
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import words
# nltk.download('words')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
import matplotlib as mpl

from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from matplotlib.cm import ScalarMappable

import random
from itertools import chain

import pandas as pd
from matplotlib.pylab import plt

import numpy as np

from glob import glob, iglob
from pathlib import Path

from loguru import logger
from IPython.display import display, clear_output

import seaborn as sns

!pwd

In [None]:
# Use seaborn's 'paper' plotting context with fonts scaled by 1.25.
sns.set_context('paper', font_scale=1.25)

# Load Datasets

In [None]:
# Load the two processed paper tables (with topic columns) from feather files.
universe_df = pd.read_feather('../data/processed/2410_universe_papers_with_topics.feather')
recsys_df = pd.read_feather('../data/processed/2410_recsys_papers_with_topics.feather')

# Show (rows, columns) of each table as the cell output.
universe_df.shape, recsys_df.shape

In [None]:
# List the distinct values of the adjusted topic-name column.
recsys_df['recsys_adj_topic_name'].unique()

In [None]:
# Copies of both tables keyed by paperId, used for id-based lookups below.
universe_df_by_id, recsys_df_by_id = (
    frame.set_index('paperId') for frame in (universe_df, recsys_df)
)

# Row counts of the two id-indexed tables.
len(universe_df_by_id), len(recsys_df_by_id)

In [None]:
# Distinct paper ids present in each table.
universe_ids, recsys_ids = (
    set(frame.index) for frame in (universe_df_by_id, recsys_df_by_id)
)

# Number of distinct ids in each table.
len(universe_ids), len(recsys_ids)

In [None]:
def _paper_ids_of_type(paper_type):
    """Return the set of paperIds in recsys_df whose 'paper_type' equals ``paper_type``."""
    matching_ids = recsys_df.loc[recsys_df['paper_type'] == paper_type, 'paperId']
    return set(matching_ids.unique())


inside_recsys_paper_ids = _paper_ids_of_type('inside')
outside_recsys_paper_ids = _paper_ids_of_type('outside')

# Sizes of the two groups and their combined size.
num_inside_ids = len(inside_recsys_paper_ids)
num_outside_ids = len(outside_recsys_paper_ids)
num_inside_ids, num_outside_ids, num_inside_ids + num_outside_ids

In [None]:
# Count the rows that have a (non-null) recsys topic id.
recsys_df['recsys_topic_id'].notna().sum()

# RecSys Topics Wordclouds

In [None]:
def _is_wordcloud_token(token):
    """Return True for tokens that contain no digits and are not wordcloud stopwords."""
    if token.isdigit() or re.search(r'\d', token):
        return False
    return token not in STOPWORDS


def _unique_clean_tokens(group):
    """Return the distinct, filtered tokens over all 'reversed_text_tokens' of one topic group."""
    distinct_tokens = np.unique(np.concatenate(group['reversed_text_tokens'].values))
    return [token for token in distinct_tokens if _is_wordcloud_token(token)]


# One row per (topic, distinct token) pair.
unique_tokens_by_topic = (
    recsys_df
    .groupby('recsys_adj_topic_name')
    .apply(_unique_clean_tokens)
    .explode()
    .dropna()
)

# Because tokens are distinct within a topic, this counts the topics each token occurs in.
unique_tokens_by_topic_value_counts = unique_tokens_by_topic.value_counts()

# Keep tokens that occur in between 3 and 39 topics (inclusive).
_in_allowed_range = unique_tokens_by_topic_value_counts.between(3, 39)
allowed_tokens = set(unique_tokens_by_topic_value_counts[_in_allowed_range].index)

def draw_wordcloud(ax, papers, col):
    """Draw a word cloud of the text in ``papers[col]`` onto ``ax``.

    The text of all rows is lower-cased and split on whitespace; only words
    contained in the module-level ``allowed_tokens`` set are kept.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes that receives the word cloud image.
    papers : pandas.DataFrame
        Rows whose text should be visualised.
    col : str
        Name of the text column to read (e.g. ``'title'``).

    Notes
    -----
    Missing values in ``col`` are skipped. If no allowed word remains, the
    panel is left blank (but still formatted) instead of raising, because
    ``WordCloud`` refuses to render an empty vocabulary.
    """
    # NaN/None entries are not strings and would make ' '.join fail.
    documents = papers[col].dropna().astype(str)

    words = [
        word
        for word in ' '.join(documents.values).lower().split()
        if word in allowed_tokens
    ]

    if words:
        wc = WordCloud(
            width=500, height=500,
            min_font_size=10, max_font_size=96,
            background_color='white', colormap='twilight',
            relative_scaling=0  # Use ranks only for scaling
            ).generate_from_text(' '.join(words))

        ax.imshow(wc, interpolation="bilinear")

    # Leave a 20px margin around the 500x500 image; the y-axis is inverted
    # so the image keeps its orientation.
    ax.set_xlim(-20, 520)
    ax.set_ylim(520, -20)

    # Keep the frame but hide tick labels and tick marks.
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.tick_params('both', length=0)


## The Wordcloud Grids

### Grid 1

In [None]:
# Topics ranked by paper count, largest first.
num_papers_by_topic = recsys_df.groupby(['recsys_adj_topic_name']).size().sort_values(ascending=False)


ncols = 4
nrows = 6
s = 2.5  # Side length of each panel, in inches.

# This grid shows as many of the largest topics as it has panels (4 x 6 = 24).
grid_topics = num_papers_by_topic.index[:ncols*nrows]

fig, axs = plt.subplots(figsize=(ncols*s, nrows*s), nrows=nrows, ncols=ncols, gridspec_kw=dict(wspace=0, hspace=.3))

axs = axs.flatten()

# Array of paper ids for each topic.
all_papers_by_topic = recsys_df.groupby('recsys_adj_topic_name')['paperId'].apply(lambda g: g.values)

recsys_papers_by_id = recsys_df.set_index('paperId')

# Dataset-wide totals do not change per topic, so compute them once.
total_inside = (recsys_df['paper_type']=='inside').sum()
total_outside = (recsys_df['paper_type']=='outside').sum()

for ax, topic_name in zip(axs, grid_topics):
    papers = recsys_papers_by_id.loc[all_papers_by_topic.loc[topic_name]]

    # Share of all inside (resp. outside) papers that fall in this topic.
    frac_inside = (papers['paper_type']=='inside').sum()/total_inside
    frac_outside = (papers['paper_type']=='outside').sum()/total_outside
    inside_bias = frac_inside/(frac_inside+frac_outside)

    draw_wordcloud(ax, papers, 'title')

    # Title: first two topic terms, truncated to 18 characters.
    title = ', '.join(topic_name.split(', ')[:2])
    if len(title)>18:
        title = title[:18]+'...'

    # Second title line: (number of papers, total citations, inside bias).
    title += '\n({:,}, {:,}, {:.1f})'.format(len(papers), papers['citationCount'].sum(), inside_bias)

    ax.set_title(title, ha='center')

# Hide panels that received no topic (only happens with fewer topics than panels).
for unused_ax in axs[len(grid_topics):]:
    unused_ax.axis("off")

fig.tight_layout()

fig.savefig('../graphs/3410_recsys_topics_word_clouds_a.png', dpi=300, bbox_inches='tight')

### Grid 2

In [None]:


ncols = 4
s = 2.5  # Side length of each panel, in inches.

# Grid 1 draws the 24 largest topics (ranks 0-23), so this grid continues at
# rank 24. Starting at 25 would leave the 25th-largest topic out of both grids.
first_grid_size = 24
remaining_topics = num_papers_by_topic.index[first_grid_size:]

# Enough rows to fit every remaining topic (at least one row so the figure is valid).
nrows = max(1, int(np.ceil(len(remaining_topics)/ncols)))

fig, axs = plt.subplots(figsize=(ncols*s, nrows*s), nrows=nrows, ncols=ncols, gridspec_kw=dict(wspace=0, hspace=.3))

axs = axs.flatten()

# Array of paper ids for each topic.
all_papers_by_topic = recsys_df.groupby('recsys_adj_topic_name')['paperId'].apply(lambda g: g.values)

recsys_papers_by_id = recsys_df.set_index('paperId')

# Dataset-wide totals do not change per topic, so compute them once.
total_inside = (recsys_df['paper_type']=='inside').sum()
total_outside = (recsys_df['paper_type']=='outside').sum()

for ax, topic_name in zip(axs, remaining_topics):
    papers = recsys_papers_by_id.loc[all_papers_by_topic.loc[topic_name]]

    # Share of all inside (resp. outside) papers that fall in this topic.
    frac_inside = (papers['paper_type']=='inside').sum()/total_inside
    frac_outside = (papers['paper_type']=='outside').sum()/total_outside
    inside_bias = frac_inside/(frac_inside+frac_outside)

    draw_wordcloud(ax, papers, 'title')

    # Title: first two topic terms, truncated to 18 characters.
    title = ', '.join(topic_name.split(', ')[:2])
    if len(title)>18:
        title = title[:18]+'...'

    # Second title line: (number of papers, total citations, inside bias).
    title += '\n({:,}, {:,}, {:.1f})'.format(len(papers), papers['citationCount'].sum(), inside_bias)

    ax.set_title(title, ha='center')

# Hide the panels left over after the last topic.
for unused_ax in axs[len(remaining_topics):]:
    unused_ax.axis("off")

fig.tight_layout()

fig.savefig('../graphs/3410_recsys_topics_word_clouds_b.png', dpi=300, bbox_inches='tight')

## The Latex table

In [None]:
# Per-topic paper count, total citations and citations per paper.
_topic_groups = recsys_df.groupby('recsys_adj_topic_name')
_paper_counts = _topic_groups.size()
_citation_totals = _topic_groups['citationCount'].sum()
_citations_per_paper = _citation_totals/_paper_counts

# The unnamed paper-count series becomes column 0, which is the sort key.
_topic_table = pd.concat([_paper_counts, _citation_totals, _citations_per_paper], axis=1)
_topic_table = _topic_table.sort_values(by=0, ascending=False)

# Format every cell as a thousands-separated integer string, then emit LaTeX.
print(_topic_table.applymap('{:,.0f}'.format).to_latex())

# The RecSys Topic Analysis

## RecSys Topics Stats

In [None]:
def _universe_citation_counts(paper_ids):
    """Return the 'citationCount' values of the given paper ids from universe_df_by_id."""
    return universe_df_by_id.loc[paper_ids]['citationCount']


# One row per topic; 'papers' holds the array of paper ids of that topic.
papers_by_topic = all_papers_by_topic.to_frame(name='papers')

# Number of distinct entries over the 'authors' values of the topic's papers.
papers_by_topic['num_authors'] = papers_by_topic['papers'].swifter.apply(
    lambda paper_ids: len(np.unique(np.concatenate(recsys_df_by_id.loc[paper_ids]['authors'].values)))
)

# Split each topic's papers into the inside and the outside group.
for group, group_ids in (('inside', inside_recsys_paper_ids), ('outside', outside_recsys_paper_ids)):
    papers_by_topic[group + '_papers'] = papers_by_topic['papers'].swifter.apply(
        lambda paper_ids, group_ids=group_ids: list(set(paper_ids).intersection(group_ids))
    )

# Sizes of the three id collections.
for id_col in ('papers', 'inside_papers', 'outside_papers'):
    papers_by_topic['num_' + id_col] = papers_by_topic[id_col].map(len)

# Topic share of all inside (resp. outside) papers.
for group in ('inside', 'outside'):
    group_counts = papers_by_topic['num_' + group + '_papers']
    papers_by_topic['frac_' + group + '_papers'] = group_counts/group_counts.sum()

papers_by_topic['inside_bias'] = (
    papers_by_topic['frac_inside_papers']
    / (papers_by_topic['frac_inside_papers'] + papers_by_topic['frac_outside_papers'])
)

# Total citations over all, inside and outside papers of each topic.
for target_col, id_col in (
    ('num_citations', 'papers'),
    ('num_inside_citations', 'inside_papers'),
    ('num_outside_citations', 'outside_papers'),
):
    papers_by_topic[target_col] = papers_by_topic[id_col].swifter.apply(
        lambda paper_ids: _universe_citation_counts(paper_ids).sum()
    )

# Mean citations per inside/outside paper of each topic.
for group in ('inside', 'outside'):
    papers_by_topic['num_' + group + '_citations_per_paper'] = (
        papers_by_topic[group + '_papers']
        .swifter
        .apply(lambda paper_ids: _universe_citation_counts(paper_ids).values)
        .map(np.mean)
    )

papers_by_topic['inside_outside_citation_ratio'] = (
    papers_by_topic['num_inside_citations_per_paper']
    / papers_by_topic['num_outside_citations_per_paper']
)

# Largest topics first.
papers_by_topic = papers_by_topic.sort_values(by='num_papers', ascending=False)

papers_by_topic.head()

## Visualisation 1

In [None]:
# Four stacked bar charts sharing the topic x-axis:
# (a) paper counts, (b) paper fractions + inside bias,
# (c) total citations, (d) mean citations per paper.
fig, (ax, bx, cx, dx) = plt.subplots(figsize=(10, 13), nrows=4, sharex=True)

# (a) The number of inside/outside papers by topic.
papers_by_topic[['num_inside_papers', 'num_outside_papers']].plot(ax=ax, kind='bar', stacked=True, ec='k', lw=.5)

ax.set_ylabel('Total Papers')
ax.set_ylim(0, 5000)
ax.legend(['Inside', 'Outside'], ncol=2, frameon=False, loc='upper left')
ax.set_title('(a) Total Inside & Outside Papers per Topic')

# (b) The fraction of all inside/outside papers that fall in each topic.
papers_by_topic[['frac_inside_papers', 'frac_outside_papers']].plot(ax=bx, kind='bar', stacked=True, ec='k', lw=.5)

bx.set_ylabel('Frac Papers')
bx.set_ylim(0, .15)
bx.legend(['Inside', 'Outside'], ncol=2, frameon=False, loc='upper left')
bx.set_title('(b) Fraction of Inside & Outside Papers per Topic')

# Overlay the inside bias on a secondary y-axis; the dashed line marks 0.5.
bbx = bx.twinx()
bbx.scatter(papers_by_topic['inside_bias'].index, papers_by_topic['inside_bias'], lw=.5, c='w',ec='k',  marker='o')

bbx.set_ylabel('Inside Bias')
bbx.legend(['Inside Bias'], ncol=1, frameon=False, loc='upper right')
bbx.axhline(.5, c='k', lw=.5, ls='--')
bbx.set_ylim(0, 1)

# (c) Total citations of the inside/outside papers by topic.
papers_by_topic[['num_inside_citations', 'num_outside_citations']].plot(ax=cx, kind='bar', stacked=True, ec='k', lw=.5)

cx.set_ylim(0, 99000)
cx.set_xlabel('')
cx.set_ylabel('Total Citations')
cx.legend(['Inside', 'Outside'], ncol=2, frameon=False, loc='upper left')
# This panel shows totals, so the title must not say "per Paper" (that is panel (d)).
cx.set_title('(c) Total Inside & Outside Citations per Topic')

# (d) Mean citations per inside/outside paper by topic.
papers_by_topic[['num_inside_citations_per_paper', 'num_outside_citations_per_paper']].plot(ax=dx, kind='bar', stacked=True, ec='k', lw=.5)

dx.set_xlabel('')
dx.set_ylabel('Mean Citations')
dx.legend(['Inside', 'Outside'], ncol=2, frameon=False, loc='upper left')
dx.set_title('(d) Mean Inside & Outside Citations per Paper per Topic')

# ccx = cx.twinx()
# ccx.scatter(papers_by_topic['inside_outside_citation_ratio'].index, papers_by_topic['inside_outside_citation_ratio'], lw=.5, c='w',ec='k',  marker='o')
# ccx.axhline(1, c='k', lw=.5, ls='--')

# ccx.set_ylabel('Inside/Outside Ratio')
# ccx.legend(['Inside/Outside Ratio'], ncol=1, frameon=False, loc='upper right')

# Pad the x-range by one bar on each side and shorten the tick labels
# to the first two terms of each topic name.
dx.set_xlim(-1, len(papers_by_topic))
dx.set_xticklabels([', '.join(label.get_text().split(', ')[:2]) for label in dx.get_xticklabels()])

fig.tight_layout()

fig.savefig('../graphs/3410_inside_outside_papers_citations_by_topic_abcd.png', dpi=300, bbox_inches='tight')


## Visualisation 2

In [None]:
# Two stacked bar charts sharing the topic x-axis:
# (a) paper counts and (b) paper fractions + inside bias.
fig, (ax, bx) = plt.subplots(figsize=(10, 8), nrows=2, sharex=True)

# (a) The number of inside/outside papers by topic.
papers_by_topic[['num_inside_papers', 'num_outside_papers']].plot(ax=ax, kind='bar', stacked=True, ec='k', lw=.5)

ax.set_ylabel('Total Papers')
ax.set_ylim(0, 5000)
ax.legend(['Inside', 'Outside'], ncol=2, frameon=False, loc='upper left')
ax.set_title('(a) Total Inside & Outside Papers per Topic')

# (b) The fraction of all inside/outside papers that fall in each topic.
papers_by_topic[['frac_inside_papers', 'frac_outside_papers']].plot(ax=bx, kind='bar', stacked=True, ec='k', lw=.5)

bx.set_ylabel('Frac Papers')
bx.set_ylim(0, .15)
bx.legend(['Inside', 'Outside'], ncol=2, frameon=False, loc='upper left')
bx.set_title('(b) Fraction of Inside & Outside Papers per Topic')

# Overlay the inside bias on a secondary y-axis; the dashed line marks 0.5.
bbx = bx.twinx()
bbx.scatter(papers_by_topic['inside_bias'].index, papers_by_topic['inside_bias'], lw=.5, c='w',ec='k',  marker='o')

bbx.set_ylabel('Inside Bias')
bbx.legend(['Inside Bias'], ncol=1, frameon=False, loc='upper right')
bbx.axhline(.5, c='k', lw=.5, ls='--')
bbx.set_ylim(0, 1)

# Pad the x-range by one bar on each side and shorten the tick labels to the
# first two terms of each topic name. The labels are read from this figure's
# own bottom axis (bx), not from an axis of a previously drawn figure.
bx.set_xlim(-1, len(papers_by_topic))
bx.set_xticklabels([', '.join(label.get_text().split(', ')[:2]) for label in bx.get_xticklabels()])

bx.set_xlabel('')

fig.tight_layout()

fig.savefig('../graphs/3410_inside_outside_papers_by_topic.png', dpi=300, bbox_inches='tight')


# Topic Publication/Citation Profiles
Here we analyse the publication and citation profiles of each of the topics.

In [None]:
def _sorted_valid_years(years):
    """Return the non-NaN entries of ``years`` as an ascending list."""
    return sorted(year for year in years if not np.isnan(year))


def _citation_years(paper_ids):
    """Return the sorted, non-NaN years of the papers listed under 'updated_citations'."""
    # Every id listed in 'updated_citations' of the topic's papers, duplicates kept.
    citation_ids = np.concatenate(universe_df_by_id.loc[paper_ids]['updated_citations'].values)
    # reindex (rather than loc) yields NaN years for ids missing from the universe table.
    return _sorted_valid_years(universe_df_by_id.reindex(citation_ids)['year'].values)


# Publication years of each topic's papers.
papers_by_topic['paper_years'] = papers_by_topic['papers'].swifter.apply(
    lambda paper_ids: _sorted_valid_years(universe_df_by_id.loc[paper_ids]['year'].values)
)

# First and last publication year per topic.
for year_col, pick in (('start_year', min), ('end_year', max)):
    papers_by_topic[year_col] = papers_by_topic['paper_years'].map(pick)

papers_by_topic['citation_years'] = papers_by_topic['papers'].swifter.apply(_citation_years)

# Number of citation entries that have a known year.
papers_by_topic['total_citation_count'] = papers_by_topic['citation_years'].map(len)

papers_by_topic.head()

## Calculate Momentum (publication output and citation impact)

In [None]:
momentum_window = 4


def get_momentum(years, window=momentum_window):
    """Compute the momentum of a sequence of event years.

    The last ``2 * window`` years up to and including the latest year are
    split into an early and a late half of ``window`` years each, and
    momentum is the relative change ``(num_late - num_early) / num_early``.

    Example: window = 5 and a latest year of 2023 give start_year = 2014 and
    mid_year = 2019, i.e. 10 years including start and end.

    Parameters
    ----------
    years : iterable of numbers
        One entry per event (e.g. per paper or per citation). The input does
        not need to be sorted.
    window : int, optional
        Length of each half window in years.

    Returns
    -------
    pandas.Series
        Indexed by ``start_year``, ``mid_year``, ``end_year``, ``window``,
        ``num_early``, ``num_late``, ``momentum`` and ``threshold_year``.
        ``threshold_year`` is the entry at position ``len(years) // 10`` of
        the sorted years. ``momentum`` is ``inf`` when the early window is
        empty but the late one is not, and ``nan`` when both are empty.

    Raises
    ------
    ValueError
        If ``years`` is empty.
    """
    # Sort a copy so that threshold_year does not rely on the caller's order.
    years = sorted(years)
    if not years:
        raise ValueError('get_momentum() requires at least one year')

    end_year = years[-1]
    start_year = (end_year-(window*2))+1
    mid_year = start_year+window

    # The year at which the topic surpassed 10% of papers or citations
    threshold_year = years[len(years)//10]

    num_early = sum(1 for year in years if start_year<=year<mid_year)
    num_late = sum(1 for year in years if mid_year<=year<=end_year)

    if num_early:
        momentum = (num_late-num_early)/num_early
    else:
        # Growth from zero has no finite ratio; avoid a ZeroDivisionError.
        momentum = float('inf') if num_late else float('nan')

    return pd.Series([start_year, mid_year, end_year, window, num_early, num_late, momentum, threshold_year], 
                     index=['start_year', 'mid_year', 'end_year', 'window', 'num_early', 'num_late', 'momentum', 'threshold_year'])


def _momentum_columns(years_col, suffix):
    """Apply get_momentum to every topic's ``years_col`` and suffix the resulting columns."""
    return papers_by_topic[years_col].swifter.apply(get_momentum).add_suffix(suffix)


paper_momentum = _momentum_columns('paper_years', '_papers')
citation_momentum = _momentum_columns('citation_years', '_citations')

# Append both sets of momentum columns to the topic table.
papers_by_topic = pd.concat(
    [papers_by_topic, paper_momentum, citation_momentum],
    axis=1,
).copy()

# Inspect the last 35 fields of one example topic.
papers_by_topic.loc['Trust, Social, User, Reputation'].tail(35)

## Add momentum information

In [None]:
# Overall momentum: Euclidean length of the (citation, paper) momentum vector.
papers_by_topic['momentum'] = np.sqrt(
    papers_by_topic['momentum_citations']**2 + papers_by_topic['momentum_papers']**2
)

# Flag topics whose paper / citation momentum is positive.
for kind in ('papers', 'citations'):
    papers_by_topic['growing_' + kind] = papers_by_topic['momentum_' + kind]>0

# Topics growing on both measures, strongest momentum first,
# showing the columns from 'num_early_papers' onwards.
_growing_on_both = papers_by_topic['growing_papers'] & papers_by_topic['growing_citations']
(
    papers_by_topic[_growing_on_both]
    .sort_values(by='momentum', ascending=False)
    .loc[:, 'num_early_papers':]
)

## Main Visualisation

In [None]:
def get_momentum_label(momentum, threshold=0.05):
    """Return an arrow symbol describing the direction of ``momentum``.

    Parameters
    ----------
    momentum : float
        Momentum value to classify.
    threshold : float, optional
        Half-width of the band around zero that counts as "flat".

    Returns
    -------
    str
        ``'↑'`` if ``momentum > threshold``, ``'↓'`` if
        ``momentum < -threshold``, a left-right arrow (with a leading space)
        if it lies within ``[-threshold, threshold]``, and an empty string
        if ``momentum`` is NaN.
    """
    if momentum>threshold:
        return '↑'
    if momentum<-threshold:
        return '↓'
    if -threshold<=momentum<=threshold:
        return ' ↔︎'

    # Only NaN fails all three comparisons; return an empty label, not None.
    return ''
            
def plot_topic_publication_sparklines(ax, topics_df):
    """Draw one publication/citation sparkline row per topic on ``ax``.

    Row ``i`` of ``topics_df`` is drawn with its baseline at ``y = i``: the
    publication profile is a filled curve coloured by the topic's
    ``inside_bias`` and the citation profile is a thin black line. A colour
    bar for the inside bias is added to the figure that owns ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    topics_df : pandas.DataFrame
        One row per topic, indexed by topic name (a ', '-separated string).
        Columns read: ``start_year``, ``end_year``, ``paper_years``,
        ``citation_years``, ``inside_bias``, ``num_papers``,
        ``frac_inside_papers`` and ``frac_outside_papers``.
    """
    # Height of the tallest point of a profile, relative to the row spacing of 1.
    scale = 0.8

    # A profile starts at the first year where the running sum of the
    # peak-normalised yearly counts exceeds this value.
    # This avoids some issues with premature papers/cites.
    profile_threshold = 0.05

    # The colour of each graph is based on the inside bias (0..1).
    norm = mpl.colors.Normalize(vmin=0, vmax=1)
    cmap = plt.get_cmap('coolwarm_r')

    def relative_profile(years, start_year, end_year):
        """Yearly counts within [start_year, end_year], scaled so the peak equals ``scale``."""
        counts = pd.Series(years).value_counts().sort_index().loc[start_year:end_year]
        relative = counts/counts.max()
        running_total = relative.cumsum()
        return relative.loc[running_total[running_total>profile_threshold].index]*scale

    for i, (_, topic_data) in enumerate(topics_df.iterrows()):

        start_year, end_year = topic_data['start_year'], topic_data['end_year']

        rel_pub_profile = relative_profile(topic_data['paper_years'], start_year, end_year)
        rel_cite_profile = relative_profile(topic_data['citation_years'], start_year, end_year)

        c = cmap(norm(topic_data['inside_bias']))

        # Draw and fill the publication profile.
        ax.plot(rel_pub_profile.index, i+rel_pub_profile.values, lw=1.5, ls='-', c=c)
        ax.fill_between(rel_pub_profile.index, i, i+rel_pub_profile, color=c, alpha=.5)

        # Overlay the citation profile.
        ax.plot(rel_cite_profile.index, i+rel_cite_profile.values, lw=.6, ls='-', c='k')

        # Annotate the start of the profile with the topic's paper count.
        ax.text(rel_pub_profile.index[0], i, '{:,.0f} papers'.format(topic_data['num_papers']),
                ha='right', va='bottom')

    ax.set_ylim(0, len(topics_df))
    ax.set_yticks(range(0, len(topics_df)))

    # Row labels: first two topic terms plus the inside/outside paper shares in percent.
    y_labels = topics_df.apply(
        lambda topic: '{} ({:.1f}%, {:.1f}%)'.format(', '.join(topic.name.split(', ')[:2]), topic['frac_inside_papers']*100, topic['frac_outside_papers']*100), axis=1
    )
    ax.set_yticklabels(y_labels, va='bottom')

    ax.tick_params(axis='y', pad=-5)

    # Colour bar for the inside bias, placed on the figure that owns ``ax``
    # so the function does not depend on a global ``fig``.
    fig = ax.figure
    sm = ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])
    cax = fig.add_axes([.93, 0.03, 0.02, .25])
    cbar = fig.colorbar(sm, cax=cax, alpha=.5)
    cbar.set_label('Inside Bias')

    # Keep only the bottom spine and put the (tick-less) row labels on the right.
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.yaxis.tick_right()
    ax.yaxis.set_tick_params(which='major', right=False)

    ax.set_xlim(1990, 2023.2)
    ax.set_xticks(range(1990, 2023, 10))
    ax.set_xlabel('')

In [None]:
# Figure size in inches.
w, h = 10, 13

# Smallest topics first, so the largest topic ends up in the top row.
_topics_smallest_first = papers_by_topic.sort_values(by='num_papers')

fig, ax = plt.subplots(figsize=(w, h))
plot_topic_publication_sparklines(ax, _topics_smallest_first)
fig.tight_layout()

fig.savefig(
    '../graphs/3410_recsys_topic_profiles.png',
    dpi=300,
    bbox_inches='tight',
)


# Save Updated DF

In [None]:
# Persist the per-topic table (including the momentum columns) for later notebooks.
# NOTE(review): papers_by_topic is indexed by topic name; some pandas versions
# reportedly require a default index for to_feather - confirm against the
# installed pandas version.
papers_by_topic.to_feather('../data/processed/3410_papers_by_topic.feather')
# Show (rows, columns) of the saved table as the cell output.
papers_by_topic.shape