In [1]:
import numpy as np
import yaml
import pandas as pd
from glob import glob
from pathlib import Path
from topic_modeling_metrics.maut import Maut
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Run identifier: selects which config file is read (config_<method_topics>.yml) and
# names the sub-folder under save_path where the results are written.
method_topics = "tm"

In [3]:
# Read the run configuration. safe_load is used instead of yaml.load(..., FullLoader):
# it only constructs standard YAML types and never resolves Python-specific tags.
# NOTE(review): if any config file relies on Python-specific YAML tags this now raises
# a yaml.YAMLError instead of constructing the object — confirm none do.
with open(f"config_{method_topics}.yml", "r") as f:
    config = yaml.safe_load(f)

In [4]:
# Validate the configuration up front so a misconfigured run fails with a single
# message naming every absent key, instead of stopping at the first missing one.
required_keys = (
    "methods",
    "dataset",
    "datapath",
    "topics_documents_path",
    "topic_words_path",
    "embedding_word_path",
    "embedding_doc_name_path",
    "n_words",
    "save_path",
)
missing_keys = [key for key in required_keys if key not in config]
if missing_keys:
    raise KeyError(
        f"config_{method_topics}.yml is missing required keys: {missing_keys}"
    )

methods = config["methods"]  # iterated as the names of the topic-modeling methods
dataset = config["dataset"]  # used as a file/folder name component in the paths below
datapath = config["datapath"]  # folder holding <dataset>.csv
topics_documents_path = config["topics_documents_path"]  # root of the Topicos_Dominantes.csv files
topic_words_path = config["topic_words_path"]  # root of the per-method words.csv files
embedding_word_path = config["embedding_word_path"]  # forwarded to Maut unchanged
embedding_doc_name_path = config["embedding_doc_name_path"]  # forwarded to Maut unchanged
n_words = config["n_words"]  # forwarded to Maut unchanged
save_path = config["save_path"]  # root folder for the result CSVs

In [5]:
# Load the corpus; "id" is the key shared with the per-method dominant-topic files.
df = pd.read_csv(f"{datapath}/{dataset}.csv")
df["id"] = df["id"].astype(int)
topic_words = {}
topic_labels = {}
for method in methods:
    dominant_topics = pd.read_csv(
        f"{topics_documents_path}/{dataset}/{method}/Topicos_Dominantes.csv",
        sep="|",
        dtype={"id": int},
    )
    # An inner merge keeps the row order of the LEFT frame. The corpus goes on the left
    # so the labels come out in the same order as df["pp"], which is what Maut receives
    # as `docs` (presumably matched to the labels by position — confirm).
    labels = df.merge(dominant_topics, on="id")["dominant_topic"].values
    # A different count means documents without a label or duplicated ids; the labels
    # could then no longer line up one-to-one with the documents.
    if len(labels) != len(df):
        raise ValueError(
            f"Method '{method}': {len(labels)} labels for {len(df)} documents; "
            "check for missing or duplicated ids in Topicos_Dominantes.csv"
        )
    topic_labels[method] = labels

In [8]:
# Sanity check: show the label-array shape obtained for each method.
{
    method_name: topic_labels[method_name].shape
    for method_name in methods
}

{'gpt': (11967,),
 'llama': (11967,),
 'cluwords': (11967,),
 'berttopic': (11967,),
 'nmf': (11967,),
 'lda': (11967,),
 'class': (11967,)}

In [9]:
# Only needed for LLM-generated topics, whose words may fall outside the corpus vocabulary.
# The documents are tokenized on whitespace. token_pattern=None states explicitly that the
# default regex is not used, which avoids scikit-learn's warning about an ignored
# token_pattern when a custom tokenizer is supplied.
vec = CountVectorizer(tokenizer=str.split, token_pattern=None)
vec.fit(df["pp"].values.tolist())
vocab = vec.get_feature_names_out()



In [10]:
# For every method, keep only the topic words that exist in the corpus vocabulary and
# record the length of its shortest topic after filtering.
# A set gives constant-time membership tests; `word in vocab` on the NumPy array would
# scan the whole vocabulary for every single word.
vocab_lookup = set(vocab)
# The (misspelt) name is kept because a later cell displays `min_lentgh`.
min_lentgh = {}
for method in methods:
    df_words = pd.read_csv(f"{topic_words_path}/{dataset}/{method}/tfidf/words.csv")
    # Each row of words.csv is treated as one topic, keyed by its row index.
    topics = {
        topic_id: [word for word in row.values() if word in vocab_lookup]
        for topic_id, row in df_words.to_dict(orient="index").items()
    }
    min_lentgh[method] = np.min([len(words) for words in topics.values()])
    topic_words[method] = topics

In [12]:
# Display the shortest topic length per method after the vocabulary filter; a value
# below the others means some topic lost words that are not in the corpus vocabulary.
min_lentgh

{'gpt': 10,
 'llama': 10,
 'cluwords': 10,
 'berttopic': 10,
 'nmf': 10,
 'lda': 10,
 'class': 10}

In [13]:
# Instantiate the evaluator with the corpus texts, the per-method topic words and
# labels built above, and the embedding paths and n_words setting from the config.
documents = df["pp"].values.tolist()
maut = Maut(
    docs=documents,
    topic_words=topic_words,
    topic_labels=topic_labels,
    embedding_word_path=embedding_word_path,
    embedding_doc_name_path=embedding_doc_name_path,
    n_words=n_words,
)

In [14]:
# Compute the traditional metrics for every method (the library spells the method
# name "tradicional"). In the recorded run the log timestamps are roughly three
# minutes apart per method.
# NOTE(review): the recorded log repeats each message once more with every method,
# which looks like logging handlers being added repeatedly inside the library — confirm.
maut.get_tradicional_metrics()

2024-06-11 15:18:02,030 - traditional_metrics - INFO - 1.0 de palavras em tópicos mapeadas...
2024-06-11 15:18:02,050 - maut - INFO - Traditional metrics for gpt calculated.
2024-06-11 15:21:03,216 - traditional_metrics - INFO - 1.0 de palavras em tópicos mapeadas...
2024-06-11 15:21:03,216 - traditional_metrics - INFO - 1.0 de palavras em tópicos mapeadas...
2024-06-11 15:21:03,239 - maut - INFO - Traditional metrics for llama calculated.
2024-06-11 15:24:07,456 - traditional_metrics - INFO - 0.996969696969697 de palavras em tópicos mapeadas...
2024-06-11 15:24:07,456 - traditional_metrics - INFO - 0.996969696969697 de palavras em tópicos mapeadas...
2024-06-11 15:24:07,456 - traditional_metrics - INFO - 0.996969696969697 de palavras em tópicos mapeadas...
2024-06-11 15:24:07,479 - maut - INFO - Traditional metrics for cluwords calculated.
2024-06-11 15:27:07,215 - traditional_metrics - INFO - 0.9787878787878788 de palavras em tópicos mapeadas...
2024-06-11 15:27:07,215 - traditional_

In [16]:
# Output folder for this run: <save_path>/<method_topics>/<dataset>.
topic_result_path = Path(f"{save_path}/{method_topics}/{dataset}")

In [17]:
# Compute the MAUT result; the returned table is written to maut.csv in a later cell.
df_result = maut.get_maut()

In [19]:
# Create the output folder, including missing parents; no error if it already exists.
topic_result_path.mkdir(parents=True, exist_ok=True)

In [20]:
# Write the MAUT result and the metrics table (its index turned back into columns)
# as CSV files inside the run's output folder.
df_result.to_csv(topic_result_path / "maut.csv", index=False)
metrics_table = maut.metrics_dataframe.reset_index()
metrics_table.to_csv(topic_result_path / "maut_metrics.csv", index=False)

In [21]:
# Stack the per-topic metrics of every algorithm into one long table, tagging each row
# with the algorithm it belongs to, and write it to result_by_topic.csv.
# The per-algorithm frames are built inside a comprehension so the corpus DataFrame `df`
# read by the earlier cells is not overwritten (re-running those cells stays valid).
algorithms = maut.metrics_by_topic.keys()
dfs = [
    pd.DataFrame(maut.metrics_by_topic[algorithm]).assign(algorithm=algorithm)
    for algorithm in algorithms
]
pd.concat(dfs).to_csv(f"{topic_result_path}/result_by_topic.csv", index=False)