# Combining Topics & Papers
The purpose of this notebook is to combine the topic analysis with the main datasets. We have experimented with various settings to produce several variations of the topic analyses (e.g. based on different minimum topic sizes). Here we use the best-performing models. The modules themselves are available in this repo in the models/ folder.

In [None]:
import swifter
import Stemmer

import os
# Should prevent "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. " warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"  

import string 

import nltk
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import words
# nltk.download('words')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from wordcloud import WordCloud

import matplotlib.pyplot as plt
import matplotlib as mpl

from mpl_toolkits.axes_grid1.inset_locator import inset_axes

import random
from itertools import chain

import pandas as pd
from matplotlib.pylab import plt

import numpy as np

from glob import glob, iglob
from pathlib import Path

from loguru import logger
from IPython.display import display, clear_output

!pwd

# Load Datasets

## The Papers

### The Universe Papers

In [None]:
# Location of the "universe" papers feather file, relative to this notebook.
all_papers_dataset = '../data/processed/2300_recsys_universe_papers.feather'
all_papers_df = pd.read_feather(all_papers_dataset)
# Show (rows, columns) as a quick check on what was loaded.
all_papers_df.shape

In [None]:
# Re-index the universe papers by `paperId`; this index is what the topic
# paper lists are filtered against further down.
all_papers_by_id = all_papers_df.set_index('paperId')
all_papers_by_id.head()

### The RecSys Papers

In [None]:
# Load the RecSys papers dataset (feather file, path relative to this notebook).
recsys_papers_df = pd.read_feather('../data/processed/2300_inside_outside_papers.feather')
recsys_papers_df.head()

In [None]:
# Re-index the RecSys papers by `paperId`; this index is what the RecSys
# topic paper lists are filtered against further down.
recsys_papers_by_id = recsys_papers_df.set_index('paperId')
recsys_papers_by_id.head()

## The Topic Models
The topic models below already exist in this repo and do not have to be regenerated. In case there are issues with the .pkl versions, .csv files have also been included.

### The Universe Topics (Best Model)

In [None]:
# Pickled topic model table for the universe papers.
# NOTE: unpickling executes code embedded in the file, so only load pickles
# that come from a trusted source.
best_all_topic_model_df_dataset = '../data/models/2400_best_topic_model_df_all_with_recsys_47_5000.pkl'
best_all_topic_model_df = pd.read_pickle(best_all_topic_model_df_dataset)
# Show (rows, columns) as a quick check on what was loaded.
best_all_topic_model_df.shape

In [None]:
# Restrict each topic's paper list to papers present in the universe dataset,
# then recompute how many papers each topic holds.

def _keep_universe_papers(papers):
    """Return the entries of `papers` that appear in the universe paper index."""
    return [paper for paper in papers if paper in all_papers_by_id.index]


# Work on a copy so the loaded best model stays as it was read from disk.
all_topic_model_df = best_all_topic_model_df.copy()

all_topic_model_df['papers'] = (
    all_topic_model_df['papers'].swifter.apply(_keep_universe_papers)
)
all_topic_model_df['topic_count'] = all_topic_model_df['papers'].map(len)

# Total papers assigned to topics alongside the number of universe papers.
all_topic_model_df['topic_count'].sum(), len(all_papers_by_id)

### The RecSys Topics (Best Model)

In [None]:
# Pickled topic model table for the RecSys-only papers.
# NOTE: unpickling executes code embedded in the file, so only load pickles
# that come from a trusted source.
best_recsys_topic_model_df_dataset = '../data/models/2400_best_topic_model_df_recsys_only_42_200.pkl'
best_recsys_topic_model_df = pd.read_pickle(best_recsys_topic_model_df_dataset)
# Show (rows, columns) as a quick check on what was loaded.
best_recsys_topic_model_df.shape
                   

In [None]:
# Render the loaded RecSys topic model table for visual inspection.
best_recsys_topic_model_df

In [None]:
# Restrict each topic's paper list to papers present in the RecSys papers
# dataset, then recompute how many papers each topic holds.

def _keep_recsys_papers(papers):
    """Return the entries of `papers` that appear in the RecSys paper index."""
    return [paper for paper in papers if paper in recsys_papers_by_id.index]


# Work on a copy so the loaded best model stays as it was read from disk.
recsys_topic_model_df = best_recsys_topic_model_df.copy()

recsys_topic_model_df['papers'] = (
    recsys_topic_model_df['papers'].swifter.apply(_keep_recsys_papers)
)
recsys_topic_model_df['topic_count'] = recsys_topic_model_df['papers'].map(len)

# Total papers assigned to topics alongside the number of RecSys papers.
recsys_topic_model_df['topic_count'].sum(), len(recsys_papers_by_id)

## Improve RecSys Topic Names
We drop things like 'recommender' from the topic representations as they are not especially useful.

In [None]:
class Tokenizer:
    """Map words to root tokens and back to their most common surface form.

    Each call to `word_to_token` records which original word produced the
    returned token. `token_to_word` then returns the word that was seen most
    often for a token, which gives a readable label for a stemmed token.
    """

    def __init__(self, stemmer=None, lemmatizer=None):
        """Create a tokenizer with an empty token map.

        Args:
            stemmer: Object exposing `stem(word)`. A new `PorterStemmer` is
                created when omitted.
            lemmatizer: Object exposing `lemmatize(word)`. A new
                `WordNetLemmatizer` is created when omitted.
        """
        # Defaults are built here rather than in the signature: default
        # argument values are evaluated once, when the function is defined, so
        # every Tokenizer would otherwise share the same two objects.
        self.token_map = {}
        self.stemmer = PorterStemmer() if stemmer is None else stemmer
        self.lemmatizer = WordNetLemmatizer() if lemmatizer is None else lemmatizer

    def reset(self):
        """Forget every token/word association recorded so far."""
        self.token_map = {}

    def word_to_token(self, word):
        """Return the root token for `word` and record the association.

        The word is lemmatized first and the result is stemmed. The count of
        `word` under the resulting token is incremented in `token_map`.
        """
        token = self.stemmer.stem(self.lemmatizer.lemmatize(word))

        # token_map: token -> {original word: times it produced this token}
        word_counts = self.token_map.setdefault(token, {})
        word_counts[word] = word_counts.get(word, 0) + 1

        return token

    def token_to_word(self, token):
        """Return the word most often recorded for `token`.

        When several words share the highest count, the one recorded first is
        returned.

        Raises:
            KeyError: If `token` has never been produced by `word_to_token`.
        """
        word_counts = self.token_map[token]

        # max() returns the first maximal key in insertion order, which matches
        # a stable descending sort followed by taking element 0.
        return max(word_counts, key=word_counts.get)

    def words_to_tokens(self, word_list):
        """Apply `word_to_token` to every word and return the tokens as a list."""
        return [self.word_to_token(word) for word in word_list]

    def tokens_to_words(self, token_list):
        """Apply `token_to_word` to every token and return the words as a list."""
        return [self.token_to_word(token) for token in token_list]


# Sanity check with the Lancaster stemmer: the tokens produced for a few
# domain terms, and the words recovered from those tokens. display() is needed
# because a cell only renders its final expression automatically, and this
# check is not the last statement of the cell.
tokenizer = Tokenizer(stemmer=LancasterStemmer())
display((
    tokenizer.words_to_tokens(['bias', 'biases', 'moba', 'crs']),
    tokenizer.tokens_to_words(tokenizer.words_to_tokens(['bias', 'biases', 'moba', 'crs']))
))

# Fresh Porter-based tokenizer for the real data, so the sanity check above
# does not contribute to the recorded word counts.
tokenizer = Tokenizer(stemmer=PorterStemmer())

recsys_topic_model_df['adj_topic_representation'] = (
    recsys_topic_model_df['topic_representation']
    # Pass 1: tokenize every topic before converting anything back, so the
    # word counts used in pass 2 cover the whole column.
    .map(tokenizer.words_to_tokens)
    # Drop repeated tokens while keeping first-occurrence order.
    .map(lambda tokens: list(dict.fromkeys(tokens)))
    # Pass 2: replace each token with the word most often recorded for it.
    .map(tokenizer.tokens_to_words)
)



In [None]:
# Terms removed from topic names by default.
_DEFAULT_DROP_TERMS = frozenset({
    'tourist', 'recommend', 'recommender', 'recommendation', 'check',
    'systems', 'study', 'based', 'contextual', 'tv', 'twitter', 'iptv',
    'crs', 'effectiveness', 'explainable',
})


def improve_topic_name(terms, n=4, drop_terms=None):
    """Build a short display name from a topic's terms.

    Terms found in `drop_terms` are removed, the first `n` remaining terms are
    title-cased, and the result is joined with ', '.

    Args:
        terms: Iterable of term strings, in the order they should appear.
        n: Maximum number of terms to keep in the name.
        drop_terms: Optional iterable of terms to exclude (exact,
            case-sensitive match). The built-in default list is used when
            omitted.

    Returns:
        The topic name, or an empty string when no term survives.
    """
    if drop_terms is None:
        excluded = _DEFAULT_DROP_TERMS
    else:
        # A set makes each membership test constant-time.
        excluded = frozenset(drop_terms)

    kept = [term.title() for term in terms if term not in excluded]

    return ', '.join(kept[:n])

# Derive a display name for each RecSys topic from its adjusted representation.
recsys_topic_model_df['adj_topic_name'] = recsys_topic_model_df['adj_topic_representation'].map(improve_topic_name)

# List the distinct topic names for inspection.
recsys_topic_model_df['adj_topic_name'].unique()

# Add topics to papers
Next we add the relevant topic information to the main papers dataframes. This means adding topic identifiers and names to the corresponding paper records.

In [None]:
# Lookup of topic id by paper id: exploding the per-topic `papers` lists gives
# one row per (topic, paper) pair, which is then keyed by the paper id. Topics
# left with an empty list explode to NaN and are dropped.
universe_topic_id_by_paper_id = (
    all_topic_model_df['papers']
    .explode()
    .dropna()
    .reset_index()
    .set_index('papers')
    .add_prefix('universe_')
)

# Same lookup, carrying the topic name instead of the topic id.
# NOTE(review): this needs an `adj_topic_name` column on `all_topic_model_df`;
# the cells visible here only create that column for the RecSys model, so it
# presumably ships with the universe pickle - confirm.
universe_topic_name_by_paper_id = (
    all_topic_model_df
    .set_index('adj_topic_name')['papers']
    .explode()
    .dropna()
    .reset_index()
    .set_index('papers')
    .add_prefix('universe_')
)

# Left joins keep every paper, including those without a topic.
universe_papers_df_with_topics = (
    all_papers_df
    .set_index('paperId')
    .join(universe_topic_id_by_paper_id, how='left')
    .join(universe_topic_name_by_paper_id, how='left')
    .reset_index()
)

universe_papers_df_with_topics

In [None]:
# Lookup of topic id by paper id: exploding the per-topic `papers` lists gives
# one row per (topic, paper) pair, which is then keyed by the paper id. Topics
# left with an empty list explode to NaN and are dropped.
recsys_topic_id_by_paper_id = (
    recsys_topic_model_df['papers']
    .explode()
    .dropna()
    .reset_index()
    .set_index('papers')
    .add_prefix('recsys_')
)

# Same lookup, carrying the adjusted topic name instead of the topic id.
recsys_topic_name_by_paper_id = (
    recsys_topic_model_df
    .set_index('adj_topic_name')['papers']
    .explode()
    .dropna()
    .reset_index()
    .set_index('papers')
    .add_prefix('recsys_')
)

# Left joins keep every paper, including those without a topic.
recsys_papers_df_with_topics = (
    recsys_papers_df
    .set_index('paperId')
    .join(recsys_topic_id_by_paper_id, how='left')
    .join(recsys_topic_name_by_paper_id, how='left')
    .reset_index()
)

# Distinct topic names that ended up attached to papers.
recsys_papers_df_with_topics['recsys_adj_topic_name'].unique()

# Save the Updated Datasets
These updated datasets will be used in the topic analysis later.

In [None]:
# Write both topic-annotated paper tables to feather files (paths relative to
# this notebook).
recsys_papers_df_with_topics.to_feather('../data/processed/2410_recsys_papers_with_topics.feather')
universe_papers_df_with_topics.to_feather('../data/processed/2410_universe_papers_with_topics.feather')

# Show the shapes of the two saved tables and of the loaded RecSys topic model.
universe_papers_df_with_topics.shape, recsys_papers_df_with_topics.shape, best_recsys_topic_model_df.shape