In [1]:
import os
# Move the kernel's working directory one level up and then into the sibling
# "src" directory. This changes process-wide state that every later cell sees.
# NOTE(review): presumably done so the project-local imports in a later cell
# (data_cleaning, utils) resolve from src/ — confirm against the repo layout.
os.chdir("..")
os.chdir("src")

In [2]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import json

# Saved optimisation results to reuse. Later cells read the keys
# "nlp_normalization_method", "filter" and "params" from this JSON document.
# NOTE(review): this absolute, user-specific path only resolves on the original
# author's machine — consider deriving it from a configurable project root.
params_path = "/home/bruno/mestrado-ufrgs/mestrado-ufrgs-cmp617-tce/src/opt_outputs/results_stemmer_with_filter_500.json"

# State the encoding explicitly so that decoding the JSON text does not depend on
# the platform's locale default.
with open(params_path, "r", encoding="utf-8") as json_file:
    params_dict = json.load(json_file)

In [3]:
import json
from typing import Union, List, Dict
from logging import Logger
import optuna
from tqdm import tqdm
import nltk
from gensim.models import CoherenceModel
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary
from data_cleaning import DataCleaning
from utils.utils import logger
from utils.nlp import preprocess, remove_stop_words, stemmer_pt, lemma_pt
import spacy
# Step the working directory back up one level now that the imports above have run.
# NOTE(review): an earlier cell changed into "src"; presumably this returns to the
# project root so that later relative paths such as 'models/...' resolve there — confirm.
os.chdir("..")

In [4]:
# Run the project's data-cleaning pipeline and keep its result as `df`.
# The log lines printed below this cell show the stages it reports (load, null
# cleaning, dtype assertion). Later cells read the "DS_OBJETO" column from `df`.
cleaning_pipeline = DataCleaning()
df = cleaning_pipeline.run()

2024-04-03 20:19:35 - INFO - Data loaded!
2024-04-03 20:19:35 - INFO - Null values cleaned!
2024-04-03 20:19:35 - INFO - Data types asserted!
2024-04-03 20:19:35 - INFO - Full data cleaned!


In [5]:
# Add a DS_OBJETO_NLP column holding one list of cleaned tokens per DS_OBJETO text.
# Each .apply below is a separate pass over the whole column, in this order.
df = df.assign(
    DS_OBJETO_NLP=df["DS_OBJETO"]
    .apply(
        lambda x: nltk.word_tokenize(x.lower(), language="portuguese")
    )  # Lower-case the text, then tokenize it with NLTK's Portuguese tokenizer
    .apply(lambda x: [preprocess(word) for word in x])  # Project-specific per-token preprocessing
    .apply(lambda x: list(filter(None, x)))  # Drop falsy results (None, empty strings)
    .apply(remove_stop_words)  # Removes stop words
    .apply(
        lambda x: [word for word in x if "rs" not in word]
    )  # Drop every token containing the substring "rs" (intended to remove city names)
    # NOTE(review): this is a substring test, so ordinary words that merely contain
    # "rs" (e.g. "curso", "diversos") are dropped as well — confirm this is intended.
)

In [6]:
# Register tqdm's pandas integration so that Series.progress_apply is available.
tqdm.pandas()
# Replace each token list with the result of the project's stemmer_pt helper,
# overwriting the DS_OBJETO_NLP column in place (progress bar shown while running).
df["DS_OBJETO_NLP"] = df["DS_OBJETO_NLP"].progress_apply(
    stemmer_pt
)  # Applies stemming

100%|██████████| 128753/128753 [00:46<00:00, 2748.36it/s]


In [7]:
# Plain Python list with one entry per row of DS_OBJETO_NLP (the per-document
# token lists); this is what the gensim Dictionary/corpus cells below consume.
vec = df["DS_OBJETO_NLP"].values.tolist()

In [15]:
# Pull the preprocessing settings out of the loaded optimisation results.
nlp_normalization_method = params_dict["nlp_normalization_method"]
# Threshold used below: tokens whose document frequency is under n_filter are removed.
n_filter = params_dict["filter"]


In [9]:
# Build the gensim vocabulary (token <-> integer id) over all documents.
id2word = Dictionary(vec)
# id2word.dfs maps token id -> number of documents containing that token, so the
# loop variable "token" is actually an id. Collect ids seen in fewer than n_filter
# documents ("tokens_a_remover" = "tokens to remove").
tokens_a_remover = [
    token for token, freq in id2word.dfs.items() if freq < n_filter
]
# Remove those rare tokens from the vocabulary.
id2word.filter_tokens(bad_ids=tokens_a_remover)
# Reassign ids so the remaining vocabulary has no gaps.
id2word.compactify()
# Bag-of-words form of every document, built against the filtered vocabulary.
corpus = [id2word.doc2bow(text) for text in vec]


In [10]:
# Model keyword arguments stored in the optimisation results. This is a reference
# to the dict inside params_dict, not a copy.
model_params = params_dict["params"]

In [11]:
# Add the training inputs and fixed settings to the loaded keyword arguments; the
# whole dict is later unpacked into LdaMulticore(**model_params).
# NOTE(review): model_params is the same object as params_dict["params"], so these
# assignments also modify params_dict.
model_params["corpus"] = corpus
model_params["id2word"] = id2word
model_params["per_word_topics"] = True
model_params["random_state"] = 42  # fixed seed

In [12]:
# Train the LDA model, passing every entry of model_params as a keyword argument
# (corpus, id2word, per_word_topics, random_state plus the loaded parameters).
lda_model = LdaMulticore(**model_params)


In [38]:
import pickle

In [49]:
# for i in range(0, 60):

#     print(f"Running iteration number {i}")

#     lda_model = LdaMulticore(**model_params)
#     coherence_model_lda = CoherenceModel(
#         model=lda_model, texts=vec, dictionary=id2word, coherence="c_v"
#     )
#     coherence_lda = coherence_model_lda.get_coherence()

#     pickle.dump(lda_model, open(f'models/model_{str(coherence_lda).lstrip("0.")}.pkl', 'wb'))


In [50]:
# lda_model = pickle.load(open('models/lda/model_5348854098362079.pkl', 'rb'))

In [51]:
# Score the trained model with gensim's "c_v" coherence measure, computed from the
# tokenized texts (vec) and the filtered dictionary used for training.
coherence_model_lda = CoherenceModel(
    model=lda_model, texts=vec, dictionary=id2word, coherence="c_v"
)
# Single float coherence value for the whole model.
coherence_lda = coherence_model_lda.get_coherence()


In [32]:
import pickle

In [37]:
# Save the trained model to disk. The context manager guarantees the file handle
# is flushed and closed, even if pickling raises part-way through.
with open('models/model_novo_05347.pkl', 'wb') as model_file:
    pickle.dump(lda_model, model_file)

In [45]:
# Digits after the decimal point of the coherence score (used to name model files).
# str.lstrip("0.") strips a *set of characters*, not the prefix "0.", so it would
# also remove zeros that follow the decimal point (0.05 -> "5"). Splitting on the
# decimal point keeps those zeros.
str(coherence_lda).partition(".")[2]

'5347081193510318'

## SEGUIR CONSTRUINDO O CÓDIGO DO SCRIPT

In [31]:
# Display the highest-weighted terms of the model's topics as
# (topic id, "weight*term + ..." string) pairs, as shown in the output below.
lda_model.show_topics()

[(0,
  '0.176*"veicul" + 0.077*"maquin" + 0.057*"frot" + 0.037*"ole" + 0.035*"combusti" + 0.029*"plac" + 0.029*"corre" + 0.026*"empr" + 0.025*"diesel" + 0.024*"abastec"'),
 (1,
  '0.064*"eletr" + 0.039*"cont" + 0.036*"port" + 0.030*"equip" + 0.030*"iluminaca" + 0.027*"proteca" + 0.026*"min" + 0.026*"caminha" + 0.025*"abert" + 0.024*"agricol"'),
 (2,
  '0.100*"serv" + 0.048*"transport" + 0.043*"tecn" + 0.040*"agu" + 0.037*"instalaca" + 0.023*"informa" + 0.022*"integr" + 0.018*"estabelec" + 0.017*"sol" + 0.017*"descrit"'),
 (3,
  '0.152*"saud" + 0.098*"medic" + 0.059*"famil" + 0.051*"eletron" + 0.038*"basic" + 0.025*"hospital" + 0.025*"hospit" + 0.024*"odontolog" + 0.020*"paci" + 0.018*"modal"'),
 (4,
  '0.169*"escol" + 0.075*"alimentici" + 0.072*"educaca" + 0.056*"agricult" + 0.048*"ensin" + 0.045*"famili" + 0.041*"alimentaca" + 0.035*"cult" + 0.033*"rural" + 0.032*"infantil"'),
 (5,
  '0.075*"limp" + 0.065*"obr" + 0.047*"pneu" + 0.045*"hidraul" + 0.041*"predi" + 0.033*"movel" + 0.033*"

In [None]:
class TopicModellingLDA:
    """
    Prepares text data for a Latent Dirichlet Allocation (LDA) topic model.

    The methods defined here tokenize and normalize the ``DS_OBJETO`` column of
    the cleaned dataset and convert the resulting token lists into a gensim
    dictionary and bag-of-words corpus.

    Attributes
    ----------
    nlp_normalization_method : str
        NLP normalization method to choose: either 'stemmer' or 'lemmatization'
    n_filter : int
        Minimum document frequency to retain a token in the dictionary.
    n_trials : int
        Number of trials in optimization. Stored on the instance; none of the
        methods defined here reads it.
    logger : Logger, defaults to logger
        Logger used for progress and error messages.
    id2word : Dictionary or None
        Filtered dictionary built by the most recent ``create_corpus`` call
        (``None`` until then).
    """

    # Values of nlp_normalization_method that nlp_preprocessing knows how to handle.
    _NORMALIZATION_METHODS = ("stemmer", "lemmatization")

    def __init__(
        self,
        nlp_normalization_method: str,
        n_filter: int,
        n_trials: int,
        logger: Logger = logger,
    ):
        """Store the configuration; no data is loaded and nothing is validated here."""
        self.nlp_normalization_method = nlp_normalization_method
        self.n_filter = n_filter
        self.n_trials = n_trials
        self.logger = logger
        # Populated by create_corpus so later steps can reuse the same vocabulary.
        self.id2word = None

    def nlp_preprocessing(self) -> List[List[str]]:
        """
        Load the cleaned dataset and turn each ``DS_OBJETO`` text into normalized tokens.

        Returns
        -------
        List[List[str]]
            One entry per dataset row: the tokens left after tokenization,
            preprocessing, stop-word removal, the "rs" filter and the chosen
            normalization (stemming or lemmatization).

        Raises
        ------
        TypeError
            If ``nlp_normalization_method`` is neither 'stemmer' nor
            'lemmatization'. Raised before any data is loaded.
        """
        # Fail fast: reject a bad configuration before running the cleaning pipeline
        # and tokenizing the whole dataset. TypeError (rather than the more
        # conventional ValueError) is kept so existing callers still catch it.
        if self.nlp_normalization_method not in self._NORMALIZATION_METHODS:
            self.logger.error("TypeError")
            raise TypeError(
                "Please choose either 'stemmer' or 'lemmatization' as the nlp_normalization_method"
            )

        cleaning_pipeline = DataCleaning()
        df = cleaning_pipeline.run()

        self.logger.info("Running NLP treatment")

        df = df.assign(
            DS_OBJETO_NLP=df["DS_OBJETO"]
            .apply(
                lambda x: nltk.word_tokenize(x.lower(), language="portuguese")
            )  # Lower-case, then tokenize with NLTK's Portuguese tokenizer
            .apply(lambda x: [preprocess(word) for word in x])  # Project-specific per-token preprocessing
            .apply(lambda x: list(filter(None, x)))  # Drop falsy results (None, empty strings)
            .apply(remove_stop_words)  # Removes stop words
            .apply(
                lambda x: [word for word in x if "rs" not in word]
            )  # Drop every token containing the substring "rs" (intended to remove city names)
            # NOTE(review): substring test — ordinary words that merely contain "rs"
            # (e.g. "curso", "diversos") are dropped too; confirm this is intended.
        )

        if self.nlp_normalization_method == "stemmer":
            self.logger.info("Running stemmer")

            tqdm.pandas()
            df["DS_OBJETO_NLP"] = df["DS_OBJETO_NLP"].progress_apply(
                stemmer_pt
            )  # Applies stemming

        else:  # "lemmatization" — the only other value accepted by the check above
            self.logger.info("Running lemmatization")

            # Load the Portuguese spaCy pipeline once and reuse it for every row.
            nlp = spacy.load("pt_core_news_md", disable=["parser", "tagger", "ner"])

            tqdm.pandas()
            df["DS_OBJETO_NLP"] = df["DS_OBJETO_NLP"].progress_apply(
                lambda x: lemma_pt(nlp, x)
            )  # Applies lemmatization

        return df["DS_OBJETO_NLP"].values.tolist()

    def create_corpus(self, vec: List[List[str]]) -> List[List[tuple]]:
        """
        Build the filtered dictionary and the bag-of-words corpus for ``vec``.

        Tokens that occur in fewer than ``n_filter`` documents are removed from
        the dictionary before the documents are converted. The filtered
        dictionary is kept on ``self.id2word`` so that later steps can reuse
        exactly the vocabulary the corpus was built with.

        Parameters
        ----------
        vec : List[List[str]]
            Tokenized documents, one list of tokens per document.

        Returns
        -------
        List[List[tuple]]
            One bag-of-words list per document, as produced by
            ``Dictionary.doc2bow``.
        """
        id2word = Dictionary(vec)

        # Dictionary.dfs maps token id -> number of documents containing the token.
        rare_token_ids = [
            token_id
            for token_id, doc_freq in id2word.dfs.items()
            if doc_freq < self.n_filter
        ]
        id2word.filter_tokens(bad_ids=rare_token_ids)
        id2word.compactify()  # close the id gaps left by the removed tokens

        self.id2word = id2word
        return [id2word.doc2bow(text) for text in vec]

    def fit(self, vec: List[List[str]]) -> List[List[tuple]]:
        """
        Build the corpus for ``vec``.

        This currently delegates to ``create_corpus`` and returns its result; no
        LDA model is trained here yet.

        Parameters
        ----------
        vec : List[List[str]]
            Tokenized documents, one list of tokens per document.

        Returns
        -------
        List[List[tuple]]
            The bag-of-words corpus returned by ``create_corpus``.
        """
        # Single source of truth for dictionary filtering and corpus construction.
        return self.create_corpus(vec)
    