# Topic Modelling
In this notebook we generate BERTopic topic models for our RS papers and the wider universe of papers. This is a time-consuming and labour-intensive process. I tried various topic modelling settings (e.g. min topic size) before landing on a setting which produced an acceptable topic model for both the universe and the RS papers.

To save time and effort, the best models for the universe and RS papers are included in:

* Universe Model - ../data/models/2400_best_topic_model_df_all_with_recsys_47_5000.pkl (.csv also)
* RS Model - ../data/models/2400_best_topic_model_df_recsys_only_42_200.pkl (.csv also)

The above were used in the remaining analysis. Rather than re-running an expensive topic modelling process to reproduce similar models, the interested researcher can instead skip this notebook and use the 2410_ notebook to incorporate the above models into the main papers datasets.

In [None]:
import swifter
import Stemmer

import os
# Should prevent "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. " warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"  

import string 

import nltk
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import words
# nltk.download('words')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from wordcloud import WordCloud

import matplotlib.pyplot as plt
import plotly.express as px

import random
from itertools import chain

import pandas as pd
from matplotlib.pylab import plt

import numpy as np

from glob import glob, iglob
from pathlib import Path

from loguru import logger
from IPython.display import display, clear_output

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired

from sklearn.decomposition import PCA

from sklearn.feature_extraction.text import CountVectorizer

from umap import UMAP


from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True) 

from IPython.utils import io
with io.capture_output() as captured:
    !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz

# Import NLP libraries and the spaCy package to preprocess the abstract text
import spacy
from spacy.lang.en.stop_words import STOP_WORDS #import commen list of stopword
# import en_core_sci_lg  # import downlaoded model
import string

from minisom import MiniSom  
from sklearn.cluster import SpectralClustering 
import scipy.cluster.hierarchy as sch
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score


import plotly.io as pio
import plotly.express as px

pio.renderers.default = 'iframe'

!pwd

# Load Datasets

In [None]:
# Feather file read into papers_df, the "universe" frame used for the universe topic model
# (per the file name it also contains the RecSys papers — confirm).
papers_dataset = '../data/processed/2300_recsys_universe_papers.feather'
# Alternative input path kept for reference:
# '../data/processed/2200_recsys_papers_cleaned.feather'

papers_df = pd.read_feather(papers_dataset)

# (rows, columns) as a quick sanity check on the load.
papers_df.shape

In [None]:
# Feather file read into recsys_papers_df, the frame the RecSys-only topic model is built from.
recsys_dataset = '../data/processed/2300_inside_outside_papers.feather'

recsys_papers_df = pd.read_feather(recsys_dataset)

# (rows, columns) as a quick sanity check on the load.
recsys_papers_df.shape

# Process for Clustering

## Clean up the title/abstract texts
Some abstracts seem to contain just conference names and regions. Note that we are not removing stop words here, as per the advice on the BERTopic project page.

* From https://maartengr.github.io/BERTopic/faq.html#how-do-i-reduce-topic-outliers: ``At times, stop words might end up in our topic representations. This is something we typically want to avoid as they contribute little to the interpretation of the topics. However, removing stop words as a preprocessing step is not advised as the transformer-based embedding models that we use need the full context to create accurate embeddings.``

## Normalise the titles and text columns

In [None]:
# Lower-cased copy of each title. The vectorised .str accessor replaces a per-row
# Python lambda and propagates missing titles as NaN instead of raising AttributeError.
papers_df['clean_title'] = papers_df['title'].str.lower()
recsys_papers_df['clean_title'] = recsys_papers_df['title'].str.lower()

In [None]:
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Characters are removed outright (not replaced by a space), so ``"co-op"``
    becomes ``"coop"``. Only the ASCII punctuation listed in
    ``string.punctuation`` is affected; any other character is left untouched.

    Parameters
    ----------
    text : str
        The string to strip.

    Returns
    -------
    str
        ``text`` without ASCII punctuation.
    """
    # A translation table whose values are None tells str.translate to drop the key.
    deletions = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletions)


# Venue / publisher tokens stripped from the text before embedding.
# NOTE(review): 'nationa' looks like a truncated 'national' and can never match a
# whole token; left unchanged because altering the set changes the corpus the saved
# models were built from — confirm intent before fixing.
drop_words = set(['conference', 'international', 'journal', 'acm', 'ieee', 'springer', 'elsevier', 'transactions', 'nationa', 'symposium', 'workshop'])


def _clean_document(text):
    """Normalise one raw document string for embedding.

    Hyphens are turned into spaces (so hyphenated terms split into separate
    words), the remaining punctuation is deleted by ``remove_punctuation``, the
    text is lower-cased, and whitespace-separated tokens found in ``drop_words``
    are removed. Stop words are deliberately kept.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    tokens = remove_punctuation(text.replace('-', ' ')).lower().split()
    return ' '.join([word for word in tokens if word not in drop_words])


# One shared helper keeps the universe and RecSys frames cleaned identically.
papers_df['clean_text'] = papers_df['text'].swifter.apply(_clean_document)
recsys_papers_df['clean_text'] = recsys_papers_df['text'].swifter.apply(_clean_document)

recsys_papers_df['clean_text'].head()

## Focus on papers with English titles and something in the abstract

In [None]:
# Per-paper language flags read from existing columns of papers_df.
paper_has_english_title = papers_df['has_english_title']
paper_has_english_abstract = papers_df['has_english_abstract']

# A paper counts as having an abstract when its cleaned text is more than
# 25 characters longer than its raw title.
clean_text_len = papers_df['clean_text'].map(len)
title_len = papers_df['title'].map(len)
papers_with_abstracts = clean_text_len > (title_len + 25)

use_english_papers = paper_has_english_abstract & paper_has_english_title

# Keep papers passing both the language test and the abstract-length test;
# .copy() detaches the result from papers_df.
keep_mask = use_english_papers & papers_with_abstracts
english_papers_df = papers_df.loc[keep_mask].copy()

# Shape of the retained subset and the fraction of all papers it represents.
english_papers_df.shape, len(english_papers_df) / len(papers_df)

# Topic Modeling for the RecSys Papers

## Get the documents

In [None]:
# Per-paper flag column, used as a row mask in the following cells.
recsys_papers_with_english_title = recsys_papers_df['has_english_title']
# mean() is the share of flagged papers, sum() their count (assuming a boolean/0-1 column).
recsys_papers_with_english_title.mean(), recsys_papers_with_english_title.sum()

In [None]:
# Restrict to RecSys papers flagged as having an English title, filtering once.
# NOTE(review): unlike the universe subset, no abstract-language or abstract-length
# filter is applied here — confirm that is intended.
recsys_english_df = recsys_papers_df[recsys_papers_with_english_title]

recsys_paper_ids = set(recsys_english_df['paperId'].unique())

# Cleaned text and cleaned title, both keyed by paperId.
recsys_english_by_id = recsys_english_df.set_index('paperId')
recsys_docs = recsys_english_by_id['clean_text']
recsys_titles = recsys_english_by_id['clean_title']

len(recsys_paper_ids), recsys_docs

In [None]:
# Re-key the filtered universe papers by paperId so the document Series built from
# this frame carry the paper id as their index.
english_papers_df_by_id = english_papers_df.set_index('paperId')
english_papers_df_by_id

In [None]:
# Documents (cleaned text) and cleaned titles for the universe model, indexed by paperId.
all_english_with_recsys_docs = english_papers_df_by_id['clean_text']
all_english_with_recsys_titles = english_papers_df_by_id['clean_title']

# Number of documents that will be fed to the universe topic model.
all_english_with_recsys_docs.shape

## Build the topic model

In [None]:
# Default seed for gen_topic_model, which forwards it to UMAP.
random_state = 42
# Sentence-embedding model: used by build_topics to encode the documents and passed
# to BERTopic as its embedding_model (default argument of gen_topic_model).
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

def gen_topic_model(docs, embeddings, min_topic_size, nr_topics, sentence_model=sentence_model, random_state=random_state):
    """Fit a BERTopic model on pre-embedded documents and reassign its outliers.

    Steps: fit BERTopic (UMAP reduction, English stop words removed only in the
    CountVectorizer used for topic representations), reassign outlier documents
    with the "embeddings" strategy, refresh the topic representations with
    Maximal Marginal Relevance, and optionally merge topics.

    Parameters
    ----------
    docs : sequence of str
        Documents to model.
    embeddings : array-like
        Pre-computed embeddings for ``docs``, in the same order.
    min_topic_size : int
        Forwarded to ``BERTopic(min_topic_size=...)``.
    nr_topics : int, str or falsy
        When truthy, passed to ``BERTopic.reduce_topics`` after outlier
        reduction; when falsy, no topic reduction is performed.
    sentence_model : optional
        Embedding model handed to BERTopic; defaults to the notebook-level model.
    random_state : int, optional
        Seed for UMAP; defaults to the notebook-level seed.

    Returns
    -------
    tuple
        ``(topics, topic_model)`` where ``topics`` holds the final topic id of
        each document (after outlier reduction and, if requested, topic
        reduction) and ``topic_model`` is the fitted BERTopic instance.
    """
    top_n_words = 100

    dim_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=random_state)

    vectorizer_model = CountVectorizer(stop_words="english")
    representation_model = MaximalMarginalRelevance(top_n_words=top_n_words, diversity=0.33)

    logger.info('Building model...')
    topic_model = BERTopic(
        embedding_model=sentence_model,
        umap_model=dim_model,
        vectorizer_model=vectorizer_model,
        calculate_probabilities=False,
        min_topic_size=min_topic_size,
        top_n_words=top_n_words,
        low_memory=True,
        verbose=True
    )

    topics, _ = topic_model.fit_transform(docs, embeddings)
    logger.info('Model has {} topics'.format(len(set(topics))))

    logger.info('Reducing outliers...')
    # Hand over the embeddings we already have so the outlier documents are not
    # encoded a second time.
    new_topics = topic_model.reduce_outliers(docs, topics, strategy="embeddings", embeddings=embeddings)
    topic_model.update_topics(docs, topics=new_topics, vectorizer_model=vectorizer_model, representation_model=representation_model, top_n_words=top_n_words)

    if nr_topics:
        topic_model.reduce_topics(docs, nr_topics=nr_topics)
        # reduce_topics merges and renumbers topics, so the pre-reduction
        # assignments are stale; return the model's updated ones instead.
        new_topics = list(topic_model.topics_)

    return new_topics, topic_model



def gen_adj_topic_name(word_list, n=4):
    """Build a short, title-cased label from a topic's representation words.

    The words are converted to tokens by a ``Tokenizer`` configured with a
    Lancaster stemmer, tokens in a fixed ignore-list are dropped, duplicates are
    removed while keeping first-seen order, the tokens are mapped back to words,
    and the first ``n`` words are joined with ", ".

    NOTE(review): ``Tokenizer`` is not imported in the visible import cell —
    confirm where it comes from, otherwise this raises NameError on a fresh kernel.

    Parameters
    ----------
    word_list : sequence of str
        Representation words of one topic.
    n : int, optional
        Maximum number of words in the label (default 4).

    Returns
    -------
    str
        Comma-separated, title-cased label.
    """
    ignored_tokens = {'cr', 'cdr', 'iptv', 'mob', 'wkh', 'artic', 'xvhu', 'dqg', 'acm', 'bm3d', 'la', 'bas', 'method'}

    stem_tokenizer = Tokenizer(stemmer=LancasterStemmer())

    # A dict doubles as an insertion-ordered set: the first occurrence of a token wins.
    ordered_unique = {}
    for token in stem_tokenizer.words_to_tokens(word_list):
        if token in ignored_tokens:
            continue
        ordered_unique.setdefault(token, None)

    label_words = stem_tokenizer.tokens_to_words(list(ordered_unique))

    return ', '.join(label_words[:n]).title()



def build_topics(docs, min_topic_size, nr_topics):
    """Embed the documents, fit a topic model and summarise it per topic.

    Parameters
    ----------
    docs : pandas.Series
        Document texts; the index supplies the paper ids recorded per topic.
    min_topic_size : int
        Forwarded to ``gen_topic_model``.
    nr_topics : int, str or falsy
        Forwarded to ``gen_topic_model`` (truthy values trigger topic reduction).

    Returns
    -------
    tuple
        ``(topic_model_df, topic_model, embeddings)``. ``topic_model_df`` is
        indexed by ``topic_id`` and has the columns ``topic_count``,
        ``topic_name``, ``topic_representation``, ``topic_representative_docs``,
        ``top_n_words``, ``papers`` (array of paper ids assigned to the topic)
        and ``adj_topic_name``.
    """
    doc_text = docs.to_numpy()

    embeddings = sentence_model.encode(doc_text, show_progress_bar=True)

    _, topic_model = gen_topic_model(doc_text, embeddings, min_topic_size=min_topic_size, nr_topics=nr_topics)

    # Group papers by the model's own final assignments. These always match the
    # topic ids reported by get_topic_info(), including after reduce_topics has
    # merged and renumbered topics.
    topics = topic_model.topics_

    topic_model_df = topic_model.get_topic_info()
    # Positional rename: relies on get_topic_info() returning exactly these five columns in this order.
    topic_model_df.columns = ['topic_id', 'topic_count', 'topic_name', 'topic_representation', 'topic_representative_docs']

    topic_model_df['top_n_words'] = topic_model_df['topic_id'].map(topic_model.get_topic)

    # One row per topic holding a flat array of the paper ids assigned to it.
    papers_by_topic = (
        pd
        .DataFrame({'topic_id': topics, 'paper_id': docs.index})
        .groupby('topic_id')
        .apply(lambda g: np.concatenate(g.values), include_groups=False)
    )

    # Index by topic_id so the per-topic paper arrays align on assignment.
    topic_model_df = topic_model_df.set_index('topic_id')
    topic_model_df['papers'] = papers_by_topic

    topic_model_df['adj_topic_name'] = topic_model_df['topic_representation'].map(gen_adj_topic_name)

    return topic_model_df, topic_model, embeddings


def build_and_save_model(docs, min_topic_size, nr_topics, label):
    """Build a topic model with ``build_topics`` and persist it under ../data/models.

    Three files are written, each named from ``label``, the number of rows in the
    topic table and ``min_topic_size``: the pickled BERTopic model, and the topic
    table as both CSV and pickle.

    Parameters
    ----------
    docs : pandas.Series
        Documents to model (see ``build_topics``).
    min_topic_size : int
        Forwarded to ``build_topics``; also part of the file names.
    nr_topics : int, str or falsy
        Forwarded to ``build_topics``.
    label : str
        Tag identifying the corpus in the file names.

    Returns
    -------
    tuple
        ``(topics_df, topic_model, embeddings)`` exactly as returned by ``build_topics``.
    """
    # Create the output directory up front: a missing directory would otherwise
    # only surface after the expensive modelling run, losing the fitted model.
    os.makedirs("../data/models", exist_ok=True)

    # Build the model
    topics_df, topic_model, embeddings = build_topics(docs, min_topic_size=min_topic_size, nr_topics=nr_topics)

    # Shared file-name components, computed once.
    name_parts = (label, len(topics_df), min_topic_size)

    # Save the model and the model df
    # NOTE: pickle files must only ever be loaded from trusted sources.
    topic_model.save("../data/models/3400_topic_model_{}_{}_{}.pkl".format(*name_parts), serialization="pickle")
    topics_df.to_csv("../data/models/3400_topic_model_df_{}_{}_{}.csv".format(*name_parts), index=True)
    topics_df.to_pickle("../data/models/3400_topic_model_df_{}_{}_{}.pkl".format(*name_parts))

    return topics_df, topic_model, embeddings


# Build the Universe Topic Model

In [None]:
# Universe model: minimum topic size of 5000 documents and no topic merging
# (nr_topics=False skips the reduce_topics step). Results are also saved to disk.
all_with_recsys_topics_df, all_with_recsys_topic_model, all_with_recsys_embeddings = build_and_save_model(
    all_english_with_recsys_docs, min_topic_size=5000, nr_topics=False, label='all_with_recsys'
)

# (number of topics, number of summary columns)
all_with_recsys_topics_df.shape

# Build the RecSys Topic Model

In [None]:
# RecSys-only model: minimum topic size of 200 documents. nr_topics is a required
# argument of build_and_save_model (omitting it raises TypeError); False skips
# topic merging, matching the universe model call.
recsys_topics_df, recsys_topic_model, recsys_embeddings = build_and_save_model(
    recsys_docs, min_topic_size=200, nr_topics=False, label='recsys_only'
)
len(recsys_docs), recsys_topics_df.shape