# Emerging Topics
In this notebook we perform an emerging topic analysis to identify those recent topics that exhibit the greatest momentum in terms of publication output and impact.

In [None]:
import swifter

import os
import json
import random
import time
import re
from datetime import datetime
from collections import defaultdict

import string 

import matplotlib.pyplot as plt

import Stemmer

import random
import requests
from itertools import chain
from more_itertools import sliced

import pandas as pd
from matplotlib.pylab import plt
from matplotlib_venn import venn2, venn3
import matplotlib as mpl
from matplotlib.cm import ScalarMappable
from matplotlib import pyplot as plt, patches


import numpy as np

from glob import glob, iglob
from pathlib import Path
                         
from loguru import logger
from IPython.display import display, clear_output

from multiprocessing import Pool

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


import nltk
nltk.download('punkt')

from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')

import seaborn as sns

from itables import init_notebook_mode, show, options
init_notebook_mode(all_interactive=False)

import networkx as nx


import seaborn as sns
sns.set_context('paper', font_scale=1.25)

pd.__version__

# Load the RecSys Papers

In [None]:
# Load the paper-level table from the processed feather file.
recsys_papers_df = pd.read_feather('../data/processed/3500_recsys_papers_with_influence_ranks.feather')

# Displayed as the cell output: (number of rows, number of columns).
recsys_papers_df.shape

In [None]:
def _bibtex_key(bibtex):
    """Return the citation key of a BibTeX entry, or '' when it cannot be extracted.

    The key is the text between the first '{' and the following ','.
    Missing values (None/NaN), non-string values and entries without a '{'
    all map to '' instead of raising.
    """
    if not isinstance(bibtex, str) or '{' not in bibtex:
        return ''
    return bibtex.split('{')[1].split(',')[0]


# Citation key per paper; used later to emit \cite{...} commands in the LaTeX table.
recsys_papers_df['bibtex_key'] = recsys_papers_df['bibtex'].map(_bibtex_key)

In [None]:
# Load the topic-level table from the processed feather file.
papers_by_topic_df = pd.read_feather('../data/processed/3410_papers_by_topic.feather')

# Preview the topics that are growing in both papers and citations, strongest momentum first.
# NOTE(review): 'growing_papers', 'growing_citations' and 'momentum' are also (re)computed in
# later cells; this preview assumes the feather file already contains them - confirm.
(
    papers_by_topic_df
    .loc[lambda topics: topics['growing_papers'] & topics['growing_citations']]
    .sort_values(by='momentum', ascending=False)
)

# Emerging Topics Analysis
Here we focus on recent topics with growing papers and citations.

In [None]:
# Boolean quality masks over recsys_papers_df. Every mask falls back to False for a
# missing value so that each one is a clean boolean Series.
with_titles = recsys_papers_df['title'].map(lambda title: len(title.split())>2 if title is not None else False)
with_authors = recsys_papers_df['author_names'].map(lambda a: len(a)>0 if a is not None else False)
with_venues = recsys_papers_df['venue'].map(lambda v: len(v)>3 if v is not None else False)
with_topic = recsys_papers_df['recsys_adj_topic_name'].notnull()


# Based on the last 4 years (2020-2023 inclusive) to match up with how we calculated momentum in topics.
is_emerging = recsys_papers_df['year'] >= 2020

# Number of papers published in the emerging window (displayed as the cell output).
is_emerging.sum()

## Main Scatter Plot

In [None]:
# A topic is "growing" on an axis when its momentum on that axis is strictly positive.
growing_papers = papers_by_topic_df['growing_papers'] = papers_by_topic_df['momentum_papers']>0
growing_citations = papers_by_topic_df['growing_citations'] = papers_by_topic_df['momentum_citations']>0

# Topic age in years, counted back from 2024 to the topic's paper threshold year.
papers_by_topic_df['topic_age'] = 2024-papers_by_topic_df['threshold_year_papers']

# Name the two momentum columns explicitly (x = papers, y = citations) so the plot never
# depends on the column order of the frame and always agrees with the axis labels.
momentum_cols = ['momentum_papers', 'momentum_citations']

fig, ax = plt.subplots(figsize=(8, 8))

# The colour scale is shared by all topics, so build it once (it is also needed by the colorbar).
norm = mpl.colors.Normalize(vmin=papers_by_topic_df['topic_age'].min(), vmax=papers_by_topic_df['topic_age'].max())
cmap = plt.get_cmap('coolwarm_r')

# One marker per topic: position = momentum, area grows with the number of papers, colour = topic age.
size_scale = 40
ax.scatter(
    papers_by_topic_df['momentum_papers'],
    papers_by_topic_df['momentum_citations'],
    s=(papers_by_topic_df['num_papers']**1.3)/size_scale,
    c=papers_by_topic_df['topic_age'], cmap=cmap, norm=norm,
    marker='o', ec='k', lw=.5, alpha=.5,
)

ax.axhline(0, lw=.5, ls='--', c='k', zorder=-100)
ax.axvline(0, lw=.5, ls='--', c='k', zorder=-100)

ax.set_xlabel('Publication Momentum')
ax.set_ylabel('Citation Momentum')


# Manual annotations: (topic index label, label template, x offset, y offset, horizontal alignment).
# The template receives the topic's paper threshold year.
topic_annotations = [
    ('Graph, Knowledge, Items, Network', 'Graph, Knowledge ({:.0f})', -.3, 0, 'right'),
    ('Fairness, Bias, Unfairness, Gender', '  Fairness, Bias ({:.0f})', 0, 0, 'left'),
    ('Crop, Soil, Agricultural, Farmers', '  Crop, Soil ({:.0f})', 0, 0, 'left'),
    ('Session, Sequential, Items, Model', 'Session, Sequential ({:.0f})', -.3, -.01, 'right'),
    ('Conversational, Explanations, User, Dialogue', 'Conversational, Explanation ({:.0f})', .3, 0, 'left'),
    ('Learning, Reinforcement, Bandit, User', 'Learning, Reinforcement ({:.0f})', -.3, 0, 'right'),
    ('Deep, Neural, Learning, Model', 'Deep, Neural ({:.0f})', .3, 0, 'left'),
]

for topic, template, dx, dy, ha in topic_annotations:
    # The topic names are hard-coded; skip (loudly) any that are absent from the current topic table.
    if topic not in papers_by_topic_df.index:
        logger.warning('Annotation skipped; topic not found: {}', topic)
        continue

    x, y, year = papers_by_topic_df.loc[topic, momentum_cols+['threshold_year_papers']]
    ax.text(x+dx, y+dy, template.format(year), ha=ha, va='center')

# Colorbar for topic age, placed in its own axes to the right of the main plot.
sm = ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cax = fig.add_axes([1, .12, 0.03, .85])
cbar = plt.colorbar(sm, cax=cax, alpha=.5)
cbar.set_label('Topic Age')


# Per-quadrant summary boxes: number of topics, then mean±std of paper momentum and of
# citation momentum. Entries are (mask, x, y, horizontal alignment, template).
summary_template = '{:.0f} topics ({:.1f}±{:.1f}, {:.1f}±{:.1f})'
quadrant_summaries = [
    (growing_citations & ~growing_papers, -1.5, 6, 'left', summary_template),
    (growing_citations & growing_papers, 6.25, 6, 'right', summary_template),
    (~growing_citations & growing_papers, 6.25, -1.6, 'right', summary_template),
    (~growing_citations & ~growing_papers, -1.8, -1.6, 'left', '{:.0f} topics\n({:.1f}±{:.1f}, {:.1f}±{:.1f})'),
]

for quadrant, x, y, ha, template in quadrant_summaries:
    if quadrant.sum()>0:
        # Flattened as: papers mean, papers std, citations mean, citations std.
        stats = papers_by_topic_df.loc[quadrant, momentum_cols].agg(['mean', 'std']).T.values.flatten()
        ax.text(
            x, y, template.format(quadrant.sum(), *stats),
            ha=ha, fontstyle='italic', bbox=dict(boxstyle='round', lw=.5, facecolor='w', alpha=1))


ax.set_xlim(-1.99, 6.5)
ax.set_ylim(-1.99, 6.5)


# Shade the quadrant where both momenta are positive and the one where both are negative.
growth_rect = patches.Rectangle((0, 0), 6.5, 6.5, facecolor="whitesmoke", linewidth=0, zorder=-100, alpha=.5)
ax.add_patch(growth_rect)

contract_rect = patches.Rectangle((-2, -2), 2, 2, facecolor="whitesmoke", linewidth=0, zorder=-100, alpha=.5)
ax.add_patch(contract_rect)

# Dotted box marking the region where both momenta lie within [-1, 1].
stable_rect = patches.Rectangle((-1, -1), 2, 2, facecolor='none', ec='k', ls=':', linewidth=.5)
ax.add_patch(stable_rect)

# Diagonal along which paper momentum equals citation momentum.
ax.plot(ax.get_xlim(), ax.get_ylim(), lw=.5, ls='--', c='k', zorder=-100)


fig.tight_layout()

fig.savefig('../graphs/3600_recsys_emerging_topics.png', dpi=300, bbox_inches='tight')

## The Emerging Topics & Papers

In [None]:
# Overall momentum: Euclidean length of the (paper momentum, citation momentum) vector.
papers_by_topic_df['momentum'] = np.sqrt(
    papers_by_topic_df['momentum_citations']**2
    + papers_by_topic_df['momentum_papers']**2
)

# Emerging topics are those whose overall momentum exceeds sqrt(2).
emerging_topics = papers_by_topic_df['momentum'].gt(2**.5)

sorted_emerging_topics_df = (
    papers_by_topic_df
    .loc[emerging_topics]
    .sort_values('momentum', ascending=False)
)

# Mean and standard deviation of every momentum column over the emerging topics.
sorted_emerging_topics_df.filter(like='momentum').agg(['mean', 'std'])

In [None]:
# Index the papers by id so that they can be looked up from the per-topic paper lists.
recsys_papers_by_id = recsys_papers_df.set_index('paperId')

# Flatten the 'papers' entries of the emerging topics into one list of ids and fetch those rows.
emerging_papers_df = recsys_papers_by_id.loc[
    sorted_emerging_topics_df['papers'].explode().dropna().tolist()
]

# Displayed as the cell output: (number of rows, number of columns).
emerging_papers_df.shape

In [None]:
# True for papers published in or after the paper threshold year of their own topic.
# The threshold is looked up for all papers at once by mapping each paper's topic name
# through the topic table, instead of one .loc lookup per row.
after_threshold_year = (
    emerging_papers_df['year']
    >= emerging_papers_df['recsys_adj_topic_name'].map(papers_by_topic_df['threshold_year_papers'])
)

# Fraction of the emerging-topic papers that are at or after their topic's threshold year.
after_threshold_year.mean()

## Top Papers for each Emerging Topic

In [None]:
def top_papers_table(top_papers, max_title_len=65):
    """Format a frame of papers as a compact summary table with LaTeX citations.

    Parameters
    ----------
    top_papers : pandas.DataFrame
        Must contain the columns 'recsys_adj_topic_name', 'year', 'title',
        'author_names', 'citationCount' and 'bibtex_key'. The input is not modified.
    max_title_len : int, default 65
        Titles longer than this are cut to this many characters and suffixed
        with ' ...'; shorter titles are kept as they are.

    Returns
    -------
    pandas.DataFrame
        Columns 'Topic', 'Year', 'Title', 'Authors', 'Cites', indexed 1..n.
        'Topic' keeps the first two terms of the topic name, 'Title' ends with
        a \\cite{<bibtex_key>} command, 'Authors' is the first author's surname
        (plus ' et al.' when there are several authors) and 'Cites' is the
        citation count formatted with thousands separators.
    """

    def short_topic(topic):
        # The first two terms of the comma-separated topic name.
        return ', '.join(topic.split(', ')[:2])

    def short_title(title):
        if not isinstance(title, str):
            return ''
        # Only mark the title as truncated when something was actually cut off.
        if len(title) <= max_title_len:
            return title
        return title[:max_title_len]+' ...'

    def lead_author(author_names):
        # Just take the surnames to save space; tolerate missing or empty author lists.
        if author_names is None or isinstance(author_names, float) or len(author_names) == 0:
            return ''
        name_parts = str(author_names[0]).split()
        surname = name_parts[-1] if name_parts else ''
        return surname+' et al.' if len(author_names)>1 else surname

    top_papers = top_papers[['recsys_adj_topic_name', 'year', 'title', 'author_names', 'citationCount', 'bibtex_key']].copy()

    top_papers['recsys_adj_topic_name'] = top_papers['recsys_adj_topic_name'].map(short_topic)
    top_papers['year'] = top_papers['year'].map(int)

    # Raw string: '\c' is not a valid escape sequence in a normal string literal.
    top_papers['title'] = (
        top_papers['title'].map(short_title)
        + top_papers['bibtex_key'].map(r'\cite{{{}}}'.format)
    )

    top_papers['author_names'] = top_papers['author_names'].map(lead_author)
    top_papers['citationCount'] = top_papers['citationCount'].map(lambda count: '{:,.0f}'.format(count))

    top_papers = top_papers[['recsys_adj_topic_name', 'year', 'title', 'author_names', 'citationCount']]
    top_papers.columns = ['Topic', 'Year', 'Title', 'Authors', 'Cites']
    top_papers.index = range(1, len(top_papers)+1)

    return top_papers
    
    
# The three most-cited papers (2020 onwards) of every emerging topic, most cited first overall.
top_emerging_paper_by_topic_df = (
    emerging_papers_df
    .loc[emerging_papers_df['year'].ge(2020)]
    .sort_values('citationCount', ascending=False)
    .groupby('recsys_adj_topic_name')
    .apply(lambda topic_papers: topic_papers.iloc[:3])
    .sort_values('citationCount', ascending=False)
)

# Compact presentation table, indexed by publication year.
top_emerging_papers_table = top_papers_table(top_emerging_paper_by_topic_df).set_index('Year')
top_emerging_papers_table

In [None]:
# Emit the table as LaTeX. Escaping is switched off explicitly so that the \cite{...}
# commands embedded in the titles are written verbatim, whatever the default of the
# installed pandas version is.
# NOTE(review): with escaping off, LaTeX special characters in titles (e.g. & or %) are
# not escaped either - check the generated table.
print(top_emerging_papers_table.to_latex(escape=False))

In [None]:
# Emit the BibTeX entries of the tabulated papers, one after another.
# Papers without a BibTeX entry (None/NaN) are skipped; str.join would raise on them.
print('\n'.join(top_emerging_paper_by_topic_df['bibtex'].dropna()))

In [None]:
# How many emerging-topic papers are at/after their topic's threshold year, how many papers
# exist overall from the earliest such year onwards, and the ratio between the two.
num_recent_emerging_papers = len(emerging_papers_df[after_threshold_year])
earliest_recent_year = emerging_papers_df.loc[after_threshold_year, 'year'].min()
num_papers_since_earliest = len(recsys_papers_df[recsys_papers_df['year']>=earliest_recent_year])

(
    num_recent_emerging_papers,
    num_papers_since_earliest,
    num_recent_emerging_papers/num_papers_since_earliest
)

# Emerging Topic Wordclouds

In [None]:
# Extra stopwords for the word clouds (this mutates the STOPWORDS set in place).
STOPWORDS.update({'facctrec', 'dialog'})


# For every topic: the distinct tokens of its papers, leaving out tokens that are numeric,
# contain a digit, or are stopwords. Exploding yields one row per (topic, token).
unique_tokens_by_topic = (
    emerging_papers_df
    .groupby('recsys_adj_topic_name')
    .apply(lambda topic_papers: [
        token
        for token in np.unique(np.concatenate(topic_papers['reversed_text_tokens'].values))
        if not token.isdigit()
        and re.search(r'\d', token) is None
        and token not in STOPWORDS
    ])
    .explode()
    .dropna()
)


# Tokens are unique within a topic, so each count is the number of topics using the token.
unique_tokens_by_topic_value_counts = unique_tokens_by_topic.value_counts()

# Only tokens that occur in at most 6 topics are allowed into the word clouds.
is_allowed = unique_tokens_by_topic_value_counts.between(0, 6)
allowed_tokens = set(unique_tokens_by_topic_value_counts[is_allowed].index)
len(allowed_tokens)

In [None]:
def draw_wordcloud(ax, papers, col, allowed=None):
    """Draw a word cloud of the text in ``papers[col]`` onto ``ax``.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on; its limits are set and its tick labels and tick marks are hidden.
    papers : pandas.DataFrame
        Frame holding the text column. Missing values in the column are ignored.
    col : str
        Name of the text column (e.g. 'title').
    allowed : collection of str, optional
        Lower-cased words that may appear in the cloud. Defaults to the
        notebook-level ``allowed_tokens``.

    Notes
    -----
    When no word survives the filter, nothing is drawn and the panel is left
    blank (but still formatted); WordCloud cannot be generated from empty text.
    """
    if allowed is None:
        allowed = allowed_tokens

    words = ' '.join(papers[col].dropna().astype(str)).lower().split()
    text = ' '.join(word for word in words if word in allowed)

    if text:
        wc = WordCloud(
            width=500, height=500,
            prefer_horizontal=0.33,
            min_font_size=12, max_font_size=96,
            background_color='white', colormap='twilight',
            relative_scaling=0  # Use ranks only for scaling
            ).generate_from_text(text)

        ax.imshow(wc, interpolation="bilinear")

    # Leave a 20px margin around the 500x500 image; the y axis is inverted (image coordinates).
    ax.set_xlim(-20, 520)
    ax.set_ylim(520, -20)

    # Keep the frame but hide tick labels and tick marks.
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.tick_params('both', length=0)


In [None]:
# List the distinct topic names present among the emerging papers (displayed as the cell output).
emerging_papers_df['recsys_adj_topic_name'].unique()

In [None]:
# Ids of the papers at/after the threshold year, grouped by emerging topic.
papers_by_emerging_topic = emerging_papers_df[after_threshold_year].reset_index().groupby('recsys_adj_topic_name')['paperId'].apply(lambda g: g.values)

# Fixed 2 x 4 grid of square panels, each s inches wide.
ncols = 4
nrows = 2
s = 2.5

fig, axs = plt.subplots(figsize=(ncols*s, nrows*s), nrows=nrows, ncols=ncols, gridspec_kw=dict(wspace=0, hspace=.3))

axs = axs.flatten()

recsys_papers_by_id = recsys_papers_df.set_index('paperId')

# Totals over all recent emerging papers; they are the same for every topic, so compute them once.
recent_paper_types = emerging_papers_df[after_threshold_year]['paper_type']
num_recent_inside = (recent_paper_types=='inside').sum()
num_recent_outside = (recent_paper_types=='outside').sum()

# Topics ordered by their total number of emerging papers (largest first). Only topics
# that have at least one recent paper can be drawn.
topic_names = [
    topic_name
    for topic_name in emerging_papers_df.groupby('recsys_adj_topic_name').size().sort_values(ascending=False).index
    if topic_name in papers_by_emerging_topic.index
]

for ax, topic_name in zip(axs, topic_names):
    papers = recsys_papers_by_id.loc[papers_by_emerging_topic.loc[topic_name]]

    # Share of all recent 'inside' / 'outside' papers that fall into this topic, and the
    # resulting inside bias (inside share relative to the sum of both shares).
    recent_frac_inside = (papers['paper_type']=='inside').sum()/num_recent_inside
    recent_frac_outside = (papers['paper_type']=='outside').sum()/num_recent_outside
    recent_inside_bias = recent_frac_inside/(recent_frac_inside+recent_frac_outside)

    draw_wordcloud(ax, papers, 'title')

    # Panel title: the first two topic terms (shortened if long), then
    # (number of papers, total citations, inside bias).
    title = ', '.join(topic_name.split(', ')[:2])
    if len(title)>18:
        title = title[:18]+'...'

    title += '\n({:,}, {:,}, {:.1f})'.format(len(papers), papers['citationCount'].sum(), recent_inside_bias)

    ax.set_title(title, fontsize=10, ha='center')

# Remove the empty graphs (panels left over after the last topic).
for unused_ax in axs[len(topic_names):]:
    unused_ax.axis("off")

fig.tight_layout()

fig.savefig('../graphs/3600_emerging_topic_word_clouds.png', dpi=300, bbox_inches='tight')