# 6 - Biterm Topic Model

## Import Libraries and Define Hyperparameter Grid

In [1]:
import pandas as pd

import bitermplus as btm

from tqdm import tqdm

import joblib

import numpy as np

# Hyperparameter grid for the BTM sweep.

# Number of topics: 5 to 50 in steps of 5.
K = list(range(5, 51, 5))

# Alpha numerators; each model is trained with alpha = value / number of topics.
alpha_list = [1, 10, 50, 100, 250]

# Values passed to the model as its `beta` argument.
eta_list = [0.001, 0.01, 0.1, 0.5, 1.0]

## Import Data

In [None]:
# Load the tweet sample and take the "content_10" column as the corpus.
df = pd.read_csv("inputs/tweets_sample.csv")

# One whitespace-stripped string per row.
tweet_column = df["content_10"]
texts = tweet_column.str.strip().tolist()

# X and vocabulary are the inputs the BTM constructor receives further down.
word_freqs = btm.get_words_freqs(texts)
X, vocabulary, vocab_dict = word_freqs

# Documents encoded against the vocabulary; reused when exporting results.
docs_vec = btm.get_vectorized_docs(texts, vocabulary)

# Biterms extracted from the vectorized documents; this is what the model is fitted on.
biterms = btm.get_biterms(docs_vec)

## Train and Export Model

In [None]:
# Grid search: fit one BTM for every (number of topics, alpha numerator, eta)
# combination, iterating in K -> alpha -> eta order.
# NOTE: the joblib.dump call below is disabled, so fitted models are not
# persisted by this cell; only the last one stays bound to `model`.
for n_topics in K:
    for alpha_numerator in alpha_list:
        for word_prior in eta_list:
            hyperparameters = {
                "T": n_topics,
                # alpha is scaled by the number of topics.
                "alpha": alpha_numerator / n_topics,
                "beta": word_prior,
                "seed": 0,
                "has_background": True,
            }
            model = btm.BTM(X, vocabulary, **hyperparameters)
            model.fit(biterms, iterations=1000)
            # joblib.dump(model, "outputs/models/btm/btm_k_{0}_a_{1}_e_{2}.sav".format(n_topics, alpha_numerator, word_prior))

## Import Model

In [None]:
# Reload every saved model, in K -> alpha -> eta order (progress bar over K).
models = []
for n_topics in tqdm(K):
    for alpha_numerator in alpha_list:
        for word_prior in eta_list:
            model_path = "outputs/models/btm/btm_k_{0}_a_{1}_e_{2}.sav".format(n_topics, alpha_numerator, word_prior)
            models.append(joblib.load(model_path))

## Export Results

In [None]:
# Rebuild the topic-word and topic-document tables for every loaded model.
# The to_csv calls are disabled; enable them to (re)write the CSV files.
for fitted in models:
    n_topics = len(fitted.theta_)
    # Training set alpha = numerator / n_topics. Multiplying back in floating
    # point can land just below the integer (e.g. 9.999...), so round before
    # converting; plain int() would truncate to the wrong numerator and the
    # file name would no longer match the grid value used when reading it back.
    a = int(round(fitted.alpha_ * n_topics))
    e = fitted.beta_
    # Rows are topics, columns are vocabulary words.
    p_wz = pd.DataFrame(data=fitted.matrix_topics_words_, columns=fitted.vocabulary_)
    # p_wz.to_csv("outputs/btm/p_wz_k_{0}_a_{1}_e_{2}.csv".format(n_topics, a, e))
    # transform() output is transposed before export.
    p_zd = pd.DataFrame(data=fitted.transform(docs_vec)).T
    # p_zd.to_csv("outputs/btm/p_zd_k_{0}_a_{1}_e_{2}.csv".format(n_topics, a, e))

## Metrics

In [None]:
# Build the alpha and eta columns of the metrics table: one entry per
# (K, alpha, eta) combination, repeated three times (once for each of the
# top-word counts 10, 20 and 30), in K -> alpha -> eta order.
alpha = [
    alpha_value
    for _ in K
    for alpha_value in alpha_list
    for _ in eta_list
    for _ in (10, 20, 30)
]

eta = [
    eta_value
    for _ in K
    for _ in alpha_list
    for eta_value in eta_list
    for _ in (10, 20, 30)
]

In [None]:
# Assemble one OCTIS-style model-output dict per exported (K, alpha, eta) run,
# in K -> alpha -> eta order.
outputs = []

for n_topics in tqdm(K):
    for alpha_numerator in alpha_list:
        for word_prior in eta_list:

            p_wz = pd.read_csv("outputs/btm/p_wz_k_{0}_a_{1}_e_{2}.csv".format(n_topics, alpha_numerator, word_prior), index_col=0)

            p_zd = pd.read_csv("outputs/btm/p_zd_k_{0}_a_{1}_e_{2}.csv".format(n_topics, alpha_numerator, word_prior), index_col=0)

            # Words as rows, topics as columns, so each topic can be ranked by column.
            words_by_topic = p_wz.T

            # For every topic keep the 30 highest-weighted words, best first.
            top_words = []
            for topic in range(n_topics):
                ranked = words_by_topic.sort_values(by=topic, ascending=False)
                top_words.append(list(ranked.index[:30]))

            output = {
                "topic-word-matrix": p_wz.to_numpy(),
                "topics": top_words,
                "topic-document-matrix": p_zd.to_numpy(),
                "test-topic-document-matrix": np.array([]),
            }

            outputs.append(output)

In [None]:
# OCTIS dataset container; its corpus is read later for the coherence metrics.
from octis.dataset.dataset import Dataset

dataset = Dataset()

# Fill the dataset from the files under inputs/dataset
# (presumably the pre-processed tweet corpus; verify against that folder).
dataset.load_custom_dataset_from_folder("inputs/dataset")

In [None]:
from octis.evaluation_metrics.coherence_metrics import Coherence

# Coherence scores for every model output at three top-word counts.
# A Coherence scorer depends only on the corpus, the top-word count and the
# measure, so the 12 scorers are built once here instead of being rebuilt
# for every model output inside the loop.
corpus = dataset.get_corpus()
top_word_counts = [10, 20, 30]

k, m, u_mass, c_v, c_uci, c_npmi = [], [], [], [], [], []

# Measure name -> result column, in the order the scores are computed.
score_columns = {"u_mass": u_mass, "c_v": c_v, "c_uci": c_uci, "c_npmi": c_npmi}

scorers = {
    (topk, measure): Coherence(texts=corpus, topk=topk, measure=measure)
    for topk in top_word_counts
    for measure in score_columns
}

for model_output in tqdm(outputs):
    for topk in top_word_counts:
        k.append(len(model_output["topics"]))
        m.append(topk)
        for measure, column in score_columns.items():
            column.append(scorers[(topk, measure)].score(model_output))

# `alpha` and `eta` come from the earlier cell; they must hold one entry per
# (model output, top-word count) pair in the same order as the rows built
# above, otherwise the DataFrame constructor raises on mismatched lengths.
metrics = pd.DataFrame(data={"k": k,
                             "m": m,
                             "u_mass": u_mass,
                             "c_v": c_v,
                             "c_uci": c_uci,
                             "c_npmi": c_npmi,
                             "alpha": alpha,
                             "eta": eta})

# metrics.to_csv("outputs/metrics/btm.csv", index = False)