# 6 - Latent Dirichlet Allocation

## Import Libraries

In [None]:
from tqdm import tqdm

import joblib

import pandas as pd

# Candidate topic counts (passed to the models as num_topics): 5, 10, ..., 50.
K = list(range(5, 55, 5))

# Candidate alpha numerators; each value is divided by the topic count before
# being passed to LDA as alpha.
alpha_list = [1, 10, 50, 100, 250]

# Candidate values passed to LDA as eta.
eta_list = [0.001, 0.01, 0.1, 0.5, 1.0]

## Import Data

In [None]:
from octis.dataset.dataset import Dataset

# Create an empty OCTIS Dataset, then populate it from the files stored in the
# "inputs/dataset" folder. `dataset` is reused by every later cell.
dataset = Dataset()

dataset.load_custom_dataset_from_folder("inputs/dataset")

## Train and Export Model

In [None]:
from octis.models.LDA import LDA

# Grid search: train one LDA model per (num_topics, alpha, eta) combination.
# The progress bar advances once per topic count.
for num_topics in tqdm(K):
    for a in alpha_list:
        for e in eta_list:
            model = LDA(num_topics = num_topics,
                        alpha = a / num_topics,  # alpha_list values are scaled by the topic count
                        eta = e,
                        iterations = 1000,
                        random_state = 0)  # fixed seed so reruns are comparable
            # top_words = 30 matches the largest top-k cutoff used in the coherence evaluation.
            output = model.train_model(dataset, top_words = 30)
            # NOTE: persisting is currently disabled, so `output` is overwritten on every
            # iteration and only the last model survives the loop. Re-enable the line
            # below to (re)generate the files read by the cell that loads the models.
            # joblib.dump(output, "outputs/models/lda/lda_k_{0}_a_{1}_e_{2}.sav".format(num_topics, a, e))

## Load Models

In [None]:
# Reload every saved LDA output in grid order: topic count (outermost), then
# alpha, then eta. The metrics cells rely on this ordering to line each result
# up with its alpha/eta values. The progress bar advances once per topic count.
outputs = [
    joblib.load("outputs/models/lda/lda_k_{0}_a_{1}_e_{2}.sav".format(num_topics, a, e))
    for num_topics in tqdm(K)
    for a in alpha_list
    for e in eta_list
]

## OCTIS

In [None]:
# output = outputs[0]

In [None]:
# First five vocabulary entries, in the order OCTIS returns them.
dataset.get_vocabulary()[:5] # the vocabulary of the dataset after being imported into OCTIS

In [None]:
# First five entries of the sorted vocabulary, for comparison with the unsorted order.
sorted(dataset.get_vocabulary())[:5] # the vocabulary of the dataset before being imported into OCTIS (author's note; TODO confirm)

In [None]:
from octis.models.ETM import ETM

# Train a 15-topic ETM (all other settings left at their defaults) on the same
# dataset. This rebinds `model` and `output`, replacing whatever earlier cells
# stored in them.
model = ETM(num_topics = 15)

output = model.train_model(dataset, top_words = 30)

In [None]:
# Peek at the first five entries of the first topic in the ETM output.
output["topics"][0][:5]

In [None]:
# Label the topic-word matrix columns with the vocabulary in the order OCTIS
# returns it, transpose so each row is a word, and rank the words by column 0
# (the first row of the matrix) in descending order.
df = (
    pd.DataFrame(data = output["topic-word-matrix"],
                 columns = dataset.get_vocabulary())
    .T
    .sort_values(by = 0, ascending = False)
)
df.head()

In [None]:
# Same ranking as the unsorted variant, but with the columns labelled by the
# *sorted* vocabulary; presumably used to see which labelling agrees with the
# model's reported top words (TODO confirm).
df = (
    pd.DataFrame(data = output["topic-word-matrix"],
                 columns = sorted(dataset.get_vocabulary()))
    .T
    .sort_values(by = 0, ascending = False)
)
df.head()

In [None]:
from octis.models.NMF import NMF

# Train a 15-topic NMF model (all other settings left at their defaults) on the
# same dataset. This rebinds `model` and `output`, replacing the previous model's results.
model = NMF(num_topics = 15)

output = model.train_model(dataset, top_words = 30)

In [None]:
# Peek at the first five entries of the first topic in the NMF output.
output["topics"][0][:5]

In [None]:
# Label the topic-word matrix columns with the vocabulary in the order OCTIS
# returns it, transpose so each row is a word, and rank the words by column 0
# (the first row of the matrix) in descending order.
df = (
    pd.DataFrame(data = output["topic-word-matrix"],
                 columns = dataset.get_vocabulary())
    .T
    .sort_values(by = 0, ascending = False)
)
df.head()

In [None]:
# Same ranking as the unsorted variant, but with the columns labelled by the
# *sorted* vocabulary; presumably used to see which labelling agrees with the
# model's reported top words (TODO confirm).
df = (
    pd.DataFrame(data = output["topic-word-matrix"],
                 columns = sorted(dataset.get_vocabulary()))
    .T
    .sort_values(by = 0, ascending = False)
)
df.head()

## Metrics

In [None]:
from itertools import product

# Build the alpha/eta columns of the metrics table. Each (topic count, alpha,
# eta) model is evaluated at three top-k cutoffs, so every alpha/eta value is
# repeated once per cutoff. product() varies its rightmost argument fastest,
# which reproduces the K -> alpha -> eta -> cutoff nesting order.
top_k_values = [10, 20, 30]

alpha = [a for _, a, _, _ in product(K, alpha_list, eta_list, top_k_values)]

eta = [e for _, _, e, _ in product(K, alpha_list, eta_list, top_k_values)]

In [None]:
from octis.evaluation_metrics.coherence_metrics import Coherence

# Top-k cutoffs and coherence measures evaluated for every loaded model output.
top_k_values = [10, 20, 30]
measures = ["u_mass", "c_v", "c_uci", "c_npmi"]

# A scorer is configured only by (top-k, measure) and the corpus, never by the
# model being scored, so build the 12 scorers once up front instead of
# constructing four new ones (and re-fetching the corpus) on every iteration.
corpus = dataset.get_corpus()
scorers = {(top_k, measure): Coherence(texts = corpus, topk = top_k, measure = measure)
           for top_k in top_k_values
           for measure in measures}

k, m, u_mass, c_v, c_uci, c_npmi = list(), list(), list(), list(), list(), list()

# Route each measure's score to its result column.
scores = {"u_mass" : u_mass,
          "c_v" : c_v,
          "c_uci" : c_uci,
          "c_npmi" : c_npmi}

for model_output in tqdm(outputs):
    for top_k in top_k_values:
        k.append(len(model_output["topics"]))  # number of topics in this model
        m.append(top_k)
        for measure in measures:
            scores[measure].append(scorers[top_k, measure].score(model_output))

# `alpha` and `eta` are built in a separate cell; they must have one entry per
# (model output, top-k) pair, in the same order as `outputs`, or the DataFrame
# constructor raises on mismatched column lengths.
metrics = pd.DataFrame(data = {"k" : k,
                               "m" : m,
                               "u_mass" : u_mass,
                               "c_v" : c_v,
                               "c_uci" : c_uci,
                               "c_npmi" : c_npmi,
                               "alpha" : alpha,
                               "eta" : eta})

# metrics.to_csv("outputs/metrics/lda.csv", index = False)