# Recurse

1. Load the trained LDA model
2. Load the train data.
3. Assign topics to documents in the train dataset
4. For each set of documents, train an LDA model
5. Save the trained models


### Load the trained model
The model was trained in notebook 01_Train_model

In [1]:
import pickle

# Load the previously trained LDA model from its pickle file.
# (An earlier variant of this cell used a different loader and file:
#  ldaModel = LdaModel.load('../models/lda_nlp2.model'))
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files that were produced by this project.
with open('../models/lda_nlp_train.pickle', 'rb') as handle:
    ldamodel = pickle.load(handle)

## Load the train data

In [2]:
import gensim
import pandas as pd
import statistics
from matplotlib import pyplot as plt 

%matplotlib inline

In [3]:
%%time

# Read the zipped training CSV; its first column becomes the DataFrame index.
arxiv_nlp_train = pd.read_csv('../data/arxiv_nlp_train.csv.zip', index_col=0)

CPU times: user 281 ms, sys: 44.1 ms, total: 325 ms
Wall time: 324 ms


In [4]:
# Sanity check: report how many rows the training frame holds.
print(f"There are {len(arxiv_nlp_train)} in the train dataset.")

There are 27275 in the train dataset.


In [5]:
import tok

# `tok` is a project-local helper module whose source is not shown in this notebook.
# Presumably `clean` tokenises/normalises each abstract and `dict_corpus` builds the
# dictionary and bag-of-words corpus that the LDA model consumes — TODO confirm in tok.
texts = tok.clean(arxiv_nlp_train['abstract'])
dictionary, corpus = tok.dict_corpus(texts)

## Assign topics to the data

Aggregate topic information in a dataframe (see: https://campus.datacamp.com/courses/fraud-detection-in-python/fraud-detection-using-text?ex=11)

In [6]:
def get_topic_details(ldamodel, corpus):
    """Find the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel
        Trained topic model. Only ``ldamodel[corpus]`` is used; it must yield,
        for each document, an iterable of ``(topic_num, proportion)`` pairs.
    corpus
        The documents, in whatever representation ``ldamodel`` accepts.

    Returns
    -------
    pandas.DataFrame
        Exactly one row per document, in corpus order, with the columns

        * ``'Dominant_Topic'`` - topic with the highest proportion,
        * ``'% Score'``        - that topic's proportion,
        * ``'Topics'``         - all ``(topic_num, proportion)`` pairs of the
          document, sorted by descending proportion.

        A document for which the model reports no topics gets ``NaN`` in the
        first two columns and an empty list in ``'Topics'``. An empty corpus
        yields an empty frame that still carries the three columns.
    """
    columns = ['Dominant_Topic', '% Score', 'Topics']
    topic_details_list = []
    for doc_topics in ldamodel[corpus]:
        ranked = sorted(doc_topics, key=lambda pair: pair[1], reverse=True)
        if ranked:
            topic_num, prop_topic = ranked[0]
        else:
            # Keep a placeholder row: callers join this frame to the documents
            # by position, so dropping a row would shift every later document
            # onto the wrong topic.
            topic_num = prop_topic = float('nan')
        topic_details_list.append([topic_num, prop_topic, ranked])
    # Passing the column names to the constructor (rather than assigning
    # `.columns` afterwards) also works when the list is empty.
    return pd.DataFrame(topic_details_list, columns=columns)

In [7]:
def assign_topics(ldamodel, corpus, df):
    """Combine document metadata with the per-document topic assignment.

    The ``id``, ``year`` and ``month`` columns of ``df`` are copied as plain
    lists (so the index of ``df`` is discarded and rows are matched purely by
    position) and placed next to the output of
    ``get_topic_details(ldamodel, corpus)``.

    Returns a DataFrame with the columns ``id``, ``year``, ``month``,
    ``Dominant Topic``, ``% Score`` and ``Topics``.
    """
    combined = pd.DataFrame()
    details = get_topic_details(ldamodel, corpus)

    # Metadata taken over from the source frame, position by position.
    for column in ('id', 'year', 'month'):
        combined[column] = list(df[column])

    # Topic columns; note the dominant-topic column is renamed on the way
    # (underscore in the details frame, space in the result).
    renames = (
        ('Dominant Topic', 'Dominant_Topic'),
        ('% Score', '% Score'),
        ('Topics', 'Topics'),
    )
    for target, source in renames:
        combined[target] = details[source]

    return combined

In [8]:
# Label every training document with its dominant topic under the loaded model.
topics_nlp_train = assign_topics(ldamodel, corpus, arxiv_nlp_train)

## For each set of documents, train an LDA model
Fit the model on the training set, for different values of k.

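Validate the model by computing gensim's `log_perplexity` score (a per-word log-likelihood bound, not the perplexity itself) for different values of k. These scores are negative, so the best value for k is the one whose score is closest to 0, i.e. the highest. Note that the score is computed on the same corpus the model was fitted on, not on held-out documents.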

Since LDA has a random component, the best k might vary between runs. Therefore the model is fitted several times and the best value is compiled from all runs (mode of best k for all runs).

see: https://radimrehurek.com/gensim/models/ldamodel.html

In [9]:
max_runs = 5   # how many times the whole sweep over k is repeated
min_k = 2      # smallest number of topics tried
max_k = 12     # largest number of topics tried (inclusive)

def fit_lda(dictionary, corpus):
    """Fit LDA models for every k in ``min_k..max_k``, repeated ``max_runs`` times.

    Parameters
    ----------
    dictionary
        Passed to gensim's ``LdaModel`` as ``id2word``.
    corpus
        Corpus the models are trained on; the same corpus is then used to
        score each model with ``log_perplexity``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per run with the columns ``'k'`` (number of topics) and
        ``'val'`` (the ``log_perplexity`` score of the model fitted with that k).

    Notes
    -----
    Reads the module-level ``max_runs``, ``min_k`` and ``max_k``. No random
    seed is passed to the models, so results differ between runs.
    """
    # Inclusive sweep min_k..max_k. The previous upper bound, `max_k + min_k`,
    # only ended at max_k when min_k == 1; with min_k = 2 it also fitted
    # k = max_k + 1.
    candidate_ks = list(range(min_k, max_k + 1))

    perplexity = []
    for run in range(max_runs):
        print(f"Run {run + 1} / {max_runs}")
        px = []
        for k in candidate_ks:
            # Define the LDA model
            ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=k, id2word=dictionary, passes=15)
            px.append(ldamodel.log_perplexity(corpus))
        perplexity.append(pd.DataFrame.from_dict({'k': candidate_ks, 'val': px}))
    return perplexity

In [10]:
def best_k_from_perplexity(perplexity):
    """Pick the most frequently winning k across several runs.

    Parameters
    ----------
    perplexity : list of pandas.DataFrame
        One frame per run, each with a ``'k'`` column and a ``'val'`` column
        holding the score obtained for that k (the shape ``fit_lda`` returns).

    Returns
    -------
    int
        For every run the k with the highest ``'val'`` is taken; the mode of
        those per-run winners is returned. When several k win equally often,
        the result is whatever ``statistics.mode`` does on the running Python
        version.

    Raises
    ------
    ValueError
        If ``perplexity`` is empty.
    """
    if not perplexity:
        raise ValueError("perplexity must contain at least one run")

    best_k_runs = []
    # Iterate over the runs actually supplied instead of a global run count,
    # so the function works for any number of runs.
    for run_scores in perplexity:
        top_row = run_scores.sort_values('val', ascending=False).iloc[0]
        # Extracting a row mixes the int 'k' with the float 'val' and upcasts
        # both to float; convert back so callers get a real topic count.
        best_k_runs.append(int(top_row['k']))
    return statistics.mode(best_k_runs)

In [11]:
%%time

topics = set(topics_nlp_train['Dominant Topic'])
topic_model = {}
for topic in topics:
    idx = topics_nlp_train['Dominant Topic'] == topic
    if (idx.sum() == 0):
        topic_model[topic] = None
        continue
    df_topic = arxiv_nlp_train.reset_index()[idx]
    texts = tok.clean(df_topic['abstract'])
    dictionary, corpus = tok.dict_corpus(texts)
    if not any(corpus):
        topic_model[topic] = None
        continue
    perplexity = fit_lda(dictionary, corpus)
    best_k = best_k_from_perplexity(perplexity)
    topic_model[topic] = gensim.models.ldamodel.LdaModel(corpus, num_topics=best_k, id2word=dictionary, passes=15)

Run 1 / 5
Run 2 / 5
Run 3 / 5
Run 4 / 5
Run 5 / 5
Run 1 / 5
Run 2 / 5
Run 3 / 5
Run 4 / 5
Run 5 / 5
Run 1 / 5
Run 2 / 5
Run 3 / 5
Run 4 / 5
Run 5 / 5
Run 1 / 5
Run 2 / 5
Run 3 / 5
Run 4 / 5
Run 5 / 5
Run 1 / 5
Run 2 / 5
Run 3 / 5
Run 4 / 5
Run 5 / 5
Run 1 / 5
Run 2 / 5
Run 3 / 5
Run 4 / 5
Run 5 / 5
Run 1 / 5
Run 2 / 5
Run 3 / 5
Run 4 / 5
Run 5 / 5
Run 1 / 5
Run 2 / 5
Run 3 / 5
Run 4 / 5
Run 5 / 5
CPU times: user 1h 44min 56s, sys: 420 ms, total: 1h 44min 56s
Wall time: 1h 44min 56s


In [13]:
# Show the five strongest words of every sub-topic, grouped by parent topic.
for topic in topics:
    print(f"Topic {topic}")
    submodel = topic_model[topic]
    if submodel:
        for summary in submodel.print_topics(num_words=5):
            print(summary)
        print()
    else:
        # No sub-model was stored for this parent topic.
        print("Empty topic list\n")

Topic 0
(0, '0.053*"word" + 0.032*"embed" + 0.018*"represent" + 0.017*"task" + 0.016*"method"')
(1, '0.016*"semant" + 0.011*"structur" + 0.010*"process" + 0.010*"syntact" + 0.010*"natur"')
(2, '0.013*"concept" + 0.011*"topic" + 0.011*"semant" + 0.010*"embed" + 0.010*"similar"')
(3, '0.024*"word" + 0.015*"text" + 0.010*"base" + 0.009*"analysi" + 0.008*"us"')
(4, '0.016*"gener" + 0.015*"base" + 0.012*"pars" + 0.012*"algorithm" + 0.009*"problem"')

Topic 1
(0, '0.043*"llm" + 0.015*"larg" + 0.015*"prompt" + 0.013*"gpt" + 0.011*"reason"')
(1, '0.017*"code" + 0.015*"learn" + 0.014*"problem" + 0.012*"natur" + 0.012*"program"')
(2, '0.020*"explan" + 0.013*"human" + 0.012*"predict" + 0.012*"method" + 0.010*"learn"')
(3, '0.051*"text" + 0.020*"video" + 0.012*"sentenc" + 0.011*"stori" + 0.010*"base"')
(4, '0.042*"evalu" + 0.024*"metric" + 0.023*"human" + 0.021*"summar" + 0.017*"summari"')
(5, '0.046*"imag" + 0.031*"visual" + 0.019*"caption" + 0.018*"modal" + 0.015*"train"')
(6, '0.067*"question" 

## Save the models
Save a dictionary of models, one for each dominant topic (the value is `None` where no model could be trained).

In [14]:
import pickle

# Persist the whole {dominant topic -> sub-model or None} dictionary in one pickle file.
with open('../models/topic_model.pickle', 'wb') as handle:
    pickle.dump(topic_model, handle, protocol=pickle.HIGHEST_PROTOCOL)