# Universe Topic Analysis
This notebook looks at the topic analysis of the universe of RS papers ($U_p$) and produces various analyses and visualisations used in the main study.

In [None]:
import swifter
import Stemmer

import re

import os
# Should prevent "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. " warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"  

import string 

import nltk
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import words
# nltk.download('words')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.patches import Patch


from mpl_toolkits.axes_grid1.inset_locator import inset_axes

import random
from itertools import chain

import pandas as pd
from matplotlib.pylab import plt

import numpy as np


from glob import glob, iglob
from pathlib import Path

from loguru import logger
from IPython.display import display, clear_output

import seaborn as sns
sns.set_context('paper', font_scale=1.25)

!pwd

# Load Datasets

In [None]:
# Read the two paper tables (feather files): the full universe of papers and
# the core RS papers, each with topic columns attached.
universe_df, recsys_df = (
    pd.read_feather(path)
    for path in (
        '../data/processed/2410_universe_papers_with_topics.feather',
        '../data/processed/2410_recsys_papers_with_topics.feather',
    )
)

# Show (rows, columns) for both tables as a quick sanity check.
universe_df.shape, recsys_df.shape

In [None]:
# Re-index both tables by paperId so that papers can be selected by id via .loc.
universe_df_by_id, recsys_df_by_id = (
    frame.set_index('paperId') for frame in (universe_df, recsys_df)
)

len(universe_df_by_id), len(recsys_df_by_id)

In [None]:
# Sets of paper ids, used below for set intersections/differences.
universe_ids, recsys_ids = (
    set(frame.index) for frame in (universe_df_by_id, recsys_df_by_id)
)

len(universe_ids), len(recsys_ids)

# The Universe of Topics

In [None]:
# Mask of the papers that have a (non-null) universe topic id.
has_universe_topic = universe_df['universe_topic_id'].notna()

# Work on an independent copy of the papers that have a topic.
topic_universe_df = universe_df.loc[has_universe_topic].copy()
topic_universe_df.shape

In [None]:
def _paper_id_set(frame):
    """Return the distinct paperId values of *frame* as a set."""
    return set(frame['paperId'].unique())


# Number of papers per topic name, for the topic universe and for the RS table.
num_all_papers_per_topic = topic_universe_df.groupby('universe_adj_topic_name').size()
num_recsys_papers_per_topic = recsys_df.groupby('recsys_adj_topic_name').size()

# NOTE(review): the subtraction aligns the two series on topic name, so a topic
# that is missing from either side comes out as NaN -- confirm that the
# 'universe_adj_topic_name' and 'recsys_adj_topic_name' labels are comparable.
num_non_recsys_papers_per_topic = num_all_papers_per_topic.sub(num_recsys_papers_per_topic)

topic_universe_paper_ids = _paper_id_set(topic_universe_df)
recsys_paper_ids = _paper_id_set(recsys_df)

# Split the RS papers by their 'paper_type' label.
paper_type = recsys_df['paper_type']
inside_recsys_paper_ids = _paper_id_set(recsys_df[paper_type == 'inside'])
outside_recsys_paper_ids = _paper_id_set(recsys_df[paper_type == 'outside'])

len(topic_universe_paper_ids), len(recsys_paper_ids), len(inside_recsys_paper_ids), len(outside_recsys_paper_ids)

In [None]:
def _select_ids(combine, id_pool):
    """Build a per-topic function that applies a set operation against *id_pool*.

    ``combine`` is ``set.intersection`` or ``set.difference``. The returned
    function takes a collection of paper ids and returns the distinct ids that
    survive the operation, as a list.
    """
    def select(papers):
        return list(combine(set(papers), id_pool))
    return select


def _gather(column):
    """Build a per-topic function that concatenates *column* over a topic's papers.

    The returned function looks the papers up in ``universe_df_by_id`` and joins
    their per-paper arrays into one flat array (duplicates are kept).
    """
    def collect(papers):
        return np.concatenate(universe_df_by_id.loc[papers][column].values)
    return collect


topic_groups = topic_universe_df.groupby('universe_adj_topic_name')

# Topics ordered from most to fewest papers; this fixes the row order of the table.
sorted_topics = topic_groups.size().sort_values(ascending=False)

# One row per topic, with the paper ids split by the is_core_recsys_paper flag.
papers_by_topic = (
    topic_universe_df
    .groupby(['universe_adj_topic_name', 'is_core_recsys_paper'])
    .apply(lambda g: g['paperId'].values)
    .unstack()
    # A topic with no papers on one side is NaN after unstack; use an empty list instead.
    .applymap(lambda v: v if isinstance(v, np.ndarray) else [])
    .rename(columns={False:'non_recsys_papers', True:'recsys_papers'})
    .loc[sorted_topics.index]
)

# All paper ids of each topic, regardless of the flag.
papers_by_topic['papers'] = topic_groups['paperId'].apply(lambda p: p.values)

for col in ['papers', 'non_recsys_papers', 'recsys_papers']:
    papers_by_topic['num_'+col] = papers_by_topic[col].map(len)

papers_by_topic['total_papers'] = papers_by_topic['num_non_recsys_papers']+papers_by_topic['num_recsys_papers']

# Split each topic's RS papers into the 'inside' and 'outside' paper types.
for target, id_pool in [
    ('inside_recsys_papers', inside_recsys_paper_ids),
    ('outside_recsys_papers', outside_recsys_paper_ids),
]:
    papers_by_topic[target] = papers_by_topic['recsys_papers'].swifter.apply(
        _select_ids(set.intersection, id_pool)
    )

for col in ['inside_recsys_papers', 'outside_recsys_papers']:
    papers_by_topic['num_'+col] = papers_by_topic[col].map(len)

# Citation and reference ids of every paper in the topic, concatenated.
for target, source in [('citations', 'updated_citations'), ('references', 'references')]:
    papers_by_topic[target] = papers_by_topic['papers'].swifter.apply(_gather(source))

# Distinct linked ids that are / are not in recsys_ids, for both link directions.
for target, source, combine in [
    ('recsys_citations', 'citations', set.intersection),
    ('non_recsys_citations', 'citations', set.difference),
    ('recsys_references', 'references', set.intersection),
    ('non_recsys_references', 'references', set.difference),
]:
    papers_by_topic[target] = papers_by_topic[source].swifter.apply(
        _select_ids(combine, recsys_ids)
    )

# NOTE: 'num_citations'/'num_references' count the concatenated arrays (duplicates
# included), while the four recsys/non_recsys columns count distinct ids.
for col in ['citations', 'references', 'recsys_citations', 'non_recsys_citations', 'recsys_references', 'non_recsys_references']:
    papers_by_topic['num_'+col] = papers_by_topic[col].map(len)

papers_by_topic.head()

## Main Bar Chart

In [None]:
# Name of the topic whose counts are zeroed in panels (b) and (c)
# (presumably so that it does not dominate the y-scale of the other topics).
RECSYS_TOPIC = 'Recommendation, User, Filtering, Items'

fig, (ax, bx, cx) = plt.subplots(figsize=(12, 13), nrows=3, sharex=True)


# (a) The number of non-RS and RS papers by topic.
paper_counts = papers_by_topic[['num_non_recsys_papers', 'num_recsys_papers']]
paper_counts.plot(ax=ax, kind='bar', stacked=True, ec='k', lw=.5)

# Write the number of RecSys papers just above the top of each stacked bar.
for x, (_, data) in enumerate(paper_counts.iterrows()):
    y = data.sum()
    ax.text(x, y, '  {:,}'.format(data['num_recsys_papers']), ha='center', va='bottom', rotation=90, fontsize=10)

ax.set_ylim(0, 250000)
ax.legend(['Non $R_p$ Papers', '$R_p$ Papers'], ncol=2, frameon=False, loc='upper left')
ax.set_title('(a) Number of Papers per Topic')

# (b) The number of RS papers by topic, with the RS topic itself set to zero.
papers_by_topic_without_recsys = papers_by_topic.copy()
papers_by_topic_without_recsys.loc[
    RECSYS_TOPIC,
    ['num_inside_recsys_papers', 'num_outside_recsys_papers', 'num_recsys_papers']
] = 0

papers_by_topic_without_recsys['num_recsys_papers'].plot(ax=bx, kind='bar', stacked=True, ec='k', lw=.5, color='tab:orange')

bx.set_ylim(0, 3500)
bx.legend(['$R_p$ Papers'], ncol=1, frameon=False, loc='upper left')
bx.set_title('(b) Number of Core RS Papers ($R_p$) per Topic')

# (c) The number of citations to RS papers from a topic and to a topic from RS papers.
citations_by_topic_without_recsys = papers_by_topic[['num_recsys_citations', 'num_recsys_references']].copy()
citations_by_topic_without_recsys.loc[RECSYS_TOPIC, ['num_recsys_citations', 'num_recsys_references']] = 0

citations_by_topic_without_recsys[['num_recsys_citations', 'num_recsys_references']].plot(ax=cx, kind='bar', stacked=True, ec='k', lw=.5, color=['tab:olive', 'tab:cyan'])

cx.set_xlabel('')

cx.set_ylim(0, 32000)

cx.set_xticklabels(papers_by_topic.index, rotation=90)
cx.legend(['Citations From $R_p$', 'Citations To $R_p$'], ncol=2, frameon=False, loc='upper left')
cx.set_title('(c) Number of Citations To/From Core RS Papers ($R_p$) per Topic.')

cx.set_xlim(-1, len(papers_by_topic))

ax.set_ylabel("Num Papers ('000s)")
bx.set_ylabel("Num RS Papers ('000s)")
cx.set_ylabel("Num Citations ('000s)")

bx.set_yticks(range(0, 3001, 1000))

# Show the y ticks in thousands. A formatter works from the tick *value*, so it
# does not depend on the tick label text already being populated or being in
# plain (non-offset) notation, and it stays correct if the tick positions change.
for xx in [ax, bx, cx]:
    xx.yaxis.set_major_formatter(
        mpl.ticker.FuncFormatter(lambda value, pos: '{:d}'.format(int(value // 1000)))
    )


fig.tight_layout()

fig.savefig('../graphs/3400_papers_citations_by_topic_abc.png', dpi=300, bbox_inches='tight')


# Topic Wordclouds

## The wordcloud grids

In [None]:
_has_digit = re.compile(r'\d').search


def _topic_vocabulary(group):
    """Return the distinct tokens of one topic, minus numeric tokens and stopwords.

    The per-paper 'reversed_text_tokens' arrays of the group are concatenated and
    de-duplicated; tokens that are digits, contain a digit, or are in the
    wordcloud STOPWORDS set are dropped.
    """
    vocabulary = np.unique(np.concatenate(group['reversed_text_tokens'].values))
    return [
        token
        for token in vocabulary
        if not token.isdigit() and not _has_digit(token) and token not in STOPWORDS
    ]


# One row per (topic, distinct token); topics left with no tokens are dropped.
unique_tokens_by_topic = (
    universe_df
    .groupby('universe_adj_topic_name')
    .apply(_topic_vocabulary)
    .explode()
    .dropna()
)


# Tokens are distinct within a topic, so each count is the number of topics
# in which the token occurs.
unique_tokens_by_topic_value_counts = unique_tokens_by_topic.value_counts()

# Keep the tokens that occur in between 3 and 39 topics (both inclusive).
in_topic_range = unique_tokens_by_topic_value_counts.between(3, 39)
allowed_tokens = set(unique_tokens_by_topic_value_counts[in_topic_range].index)

def draw_wordcloud(ax, papers, col):
    """Draw a word cloud of the text in ``papers[col]`` onto ``ax``.

    The values of the column are joined, lower-cased and split on whitespace;
    only words found in the module-level ``allowed_tokens`` set are passed to
    the word cloud. Missing values in the column are skipped.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    papers : DataFrame holding the papers of one topic.
    col : name of the text column to use (e.g. ``'title'``).
    """
    # Drop missing entries first: str.join raises TypeError on a None/NaN value.
    texts = papers[col].dropna().astype(str)

    words = ' '.join(texts).lower().split()
    text = ' '.join(word for word in words if word in allowed_tokens)

    wc = WordCloud(
        width=500, height=500,
        min_font_size=10, max_font_size=96,
        background_color='white', colormap='twilight',
        relative_scaling=0  # Use ranks only for scaling
        ).generate_from_text(text)

    ax.imshow(wc, interpolation="bilinear")

    # Leave a 20px margin around the 500x500 image; the y-limits are given
    # top-to-bottom so the image keeps its orientation.
    ax.set_xlim(-20, 520)
    ax.set_ylim(520, -20)

    # Keep the axes frame but hide tick labels and tick marks.
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.tick_params('both', length=0)


### Grid 1 

In [None]:
# Topics ordered from most to fewest papers.
num_papers_by_topic = universe_df.groupby(['universe_adj_topic_name']).size().sort_values(ascending=False)

# The grid is fixed at 4 x 6 = 24 panels, each s inches square.
ncols = 4
nrows = 6
s = 2.5

fig, axs = plt.subplots(figsize=(ncols*s, nrows*s), nrows=nrows, ncols=ncols, gridspec_kw=dict(wspace=0, hspace=.3))
axs = axs.flatten()

# Paper ids per topic, and the papers indexed by id for lookup.
universe_papers_by_topic = universe_df.groupby('universe_adj_topic_name')['paperId'].apply(lambda g: g.values)

universe_papers_by_id = universe_df.set_index('paperId')

# This grid shows the largest topics, one per panel.
grid_topics = num_papers_by_topic.index[:ncols*nrows]

for ax, topic_name in zip(axs, grid_topics):
    papers = universe_papers_by_id.loc[universe_papers_by_topic.loc[topic_name]]

    draw_wordcloud(ax, papers, 'title')

    # Panel title: the first two terms of the topic name, then
    # (number of papers, total citationCount) on a second line.
    title = ', '.join(topic_name.split(', ')[:2])

    title += '\n({:,}, {:,})'.format(len(papers), papers['citationCount'].sum())

    ax.set_title(title, ha='center')

# Hide the panels that did not receive a topic. This is done once, after the
# loop, and is based on the number of topics actually drawn.
for unused_ax in axs[len(grid_topics):]:
    unused_ax.axis("off")

fig.tight_layout()

fig.savefig('../graphs/3400_universe_topics_word_clouds_a.png', dpi=300, bbox_inches='tight')

### Grid 2

In [None]:
# The grid is fixed at 4 x 6 = 24 panels, each s inches square.
ncols = 4
nrows = 6
s = 2.5

fig, axs = plt.subplots(figsize=(ncols*s, nrows*s), nrows=nrows, ncols=ncols, gridspec_kw=dict(wspace=0, hspace=.3))
axs = axs.flatten()

# Paper ids per topic, and the papers indexed by id for lookup.
universe_papers_by_topic = universe_df.groupby('universe_adj_topic_name')['paperId'].apply(lambda g: g.values)

universe_papers_by_id = universe_df.set_index('paperId')

# This grid shows the 23 smallest topics (num_papers_by_topic is sorted from
# most to fewest papers).
# NOTE(review): 23 is hard-coded; presumably it is the remainder after the 24
# topics of the first grid -- re-check if the number of topics changes.
grid_topics = num_papers_by_topic.index[-23:]

for ax, topic_name in zip(axs, grid_topics):
    papers = universe_papers_by_id.loc[universe_papers_by_topic.loc[topic_name]]

    draw_wordcloud(ax, papers, 'title')

    # Panel title: the first two terms of the topic name, then
    # (number of papers, total citationCount) on a second line.
    title = ', '.join(topic_name.split(', ')[:2])

    title += '\n({:,}, {:,})'.format(len(papers), papers['citationCount'].sum())

    ax.set_title(title, ha='center')

# Hide the panels that did not receive a topic. This is done once, after the
# loop, rather than on every iteration.
for unused_ax in axs[len(grid_topics):]:
    unused_ax.axis("off")

fig.tight_layout()

fig.savefig('../graphs/3400_universe_topics_word_clouds_b.png', dpi=300, bbox_inches='tight')

## Latex Table

In [None]:
# Per-topic summary: number of papers, total citationCount, and citations per paper.
papers_grouped_by_topic = universe_df.groupby('universe_adj_topic_name')

papers_per_topic = papers_grouped_by_topic.size()
citations_per_topic = papers_grouped_by_topic['citationCount'].sum()
citations_per_paper = citations_per_topic/papers_per_topic

# The unnamed series get the column labels 0 and 1 on concat, so column 0 is
# the paper count; sort by it, largest topic first.
topic_summary = (
    pd.concat([papers_per_topic, citations_per_topic, citations_per_paper], axis=1)
    .sort_values(by=0, ascending=False)
)

# Format every cell with thousands separators and no decimals, then emit LaTeX.
print(topic_summary.applymap('{:,.0f}'.format).to_latex())