In [6]:
import nltk
from logging import Logger
from utils.utils import logger
from gensim.models import CoherenceModel, ldamulticore
from gensim.corpora import Dictionary
import json
from typing import Union, List, Dict
import optuna
# from data_cleaning import DataCleaning
from utils.nlp import preprocess, remove_stop_words, stemmer_pt, lemma_pt


class LDAOptimization:
    """
    Performs Bayesian Optimization on Latent Dirichlet Allocation (LDA) model.

    Attributes
    ----------
    nlp_normalization_method : str
        NLP normalization method to choose: either 'stemmer' or 'lemmatization'
    n_filter : int
        Minimum document frequency to retain a token in the dictionary.
    n_trials : int
        Number of trials in optimization.
    logger : Logger, defaults to logger
        logger.
    """

    # Accepted values for ``nlp_normalization_method``.
    _NORMALIZATION_METHODS = ("stemmer", "lemmatization")

    def __init__(
        self,
        nlp_normalization_method: str,
        n_filter: int,
        n_trials: int,
        logger: Logger = logger,
    ):

        self.nlp_normalization_method = nlp_normalization_method
        self.n_filter = n_filter
        self.n_trials = n_trials
        self.logger = logger

    def nlp_preprocessing(self) -> List[List[str]]:
        """
        Loads the cleaned data and turns each ``DS_OBJETO`` text into a list of
        normalized tokens.

        Returns
        -------
        List[List[str]]
            One token list per row of the (filtered) cleaned data.

        Raises
        ------
        TypeError
            If ``nlp_normalization_method`` is neither 'stemmer' nor
            'lemmatization'.
        """

        # Validate the configuration first so that a typo fails before any
        # data is loaded and cleaned.
        if self.nlp_normalization_method not in self._NORMALIZATION_METHODS:
            self.logger.error("TypeError")
            raise TypeError(
                "Please choose either 'stemmer' or 'lemmatization' as the nlp_normalization_method"
            )

        # The two methods share the whole pipeline and differ only in the
        # final normalization step.
        if self.nlp_normalization_method == "stemmer":
            normalizer = stemmer_pt
        else:
            normalizer = lemma_pt

        # NOTE(review): DataCleaning is not imported in this cell (the import
        # above is commented out); it must already be defined in the session.
        cleaning_pipeline = DataCleaning()
        df = cleaning_pipeline.run()

        # TODO: remove this filter later
        df = df[df["ANO_LICITACAO"] >= 2021]

        tokens = (
            df["DS_OBJETO"]
            .apply(
                lambda x: nltk.word_tokenize(x.lower(), language="portuguese")
            )  # Tokenize
            .apply(
                lambda x: [preprocess(word) for word in x]
            )  # Other preprocessing
            .apply(lambda x: list(filter(None, x)))  # Removes items with none
            .apply(remove_stop_words)  # Removes stop words
            # Remove tokens containing "rs" (which are cities).
            # NOTE(review): this is a substring test, so any token that merely
            # contains "rs" is dropped as well - confirm this is intended.
            .apply(lambda x: [word for word in x if "rs" not in word])
            .apply(normalizer)  # Applies stemming or lemmatization
        )

        vec = tokens.values.tolist()

        self.logger.info(f"Vector with {self.nlp_normalization_method} created")
        return vec

    def bayesian_opt_objective(
        self,
        trial: optuna.Trial,
        id2word: Dictionary,
        corpus: List,
        vec: List[List[str]],
    ) -> float:
        """
        Objective function for Bayesian Optimization to tune parameters for LDA (Latent Dirichlet Allocation) model,
        returning the c_v coherence score.

        Parameters
        ----------
        trial : optuna.Trial
            A single trial of an optimization experiment. The objective function uses this to suggest new parameters.
        id2word : gensim.corpora.Dictionary
            Gensim dictionary mapping of word IDs to words.
        corpus : List
            List of Bag-of-Words corpus.
        vec : List[List[str]]
            Tokenized texts used for the coherence calculation.

        Returns
        -------
        float
            c_v coherence score of the LDA model trained with the suggested parameters.
        """

        # Search space explored by the optimizer.
        num_topics = trial.suggest_int("num_topics", 5, 7, step=1)
        chunksize = trial.suggest_int("chunksize", 80, 180, step=10)
        passes = trial.suggest_int("passes", 5, 20, step=1)
        alpha = trial.suggest_float("alpha", 0.01, 1, step=0.01)
        eta = trial.suggest_float("eta", 0.01, 0.91, step=0.01)
        decay = trial.suggest_float("decay", 0.5, 1, step=0.1)

        lda_model = ldamulticore.LdaMulticore(
            corpus=corpus,
            id2word=id2word,
            num_topics=num_topics,
            chunksize=chunksize,
            passes=passes,
            alpha=alpha,
            eta=eta,
            decay=decay,
            random_state=42,
            per_word_topics=True,
        )

        # Coherence Score
        coherence_model_lda = CoherenceModel(
            model=lda_model, texts=vec, dictionary=id2word, coherence="c_v"
        )
        return coherence_model_lda.get_coherence()

    def get_opt(
        self, vec: List[List[str]]
    ) -> Dict[str, Union[int, str, float, Dict[str, Union[int, float]]]]:
        """
        Perform optimization of LDA (Latent Dirichlet Allocation) model parameters using Bayesian Optimization.

        Parameters
        ----------
        vec : List[List[str]]
            Tokenized texts for model optimization.

        Returns
        -------
        Dict[str, Union[int, str, float, Dict[str, Union[int, float]]]]
            Dict with the keys ``filter``, ``nlp_normalization_method``,
            ``best_score`` and ``params`` (parameters of the best trial).
        """

        # Create corpus, dropping tokens whose frequency in ``id2word.dfs``
        # is below ``n_filter``.
        id2word = Dictionary(vec)
        rare_token_ids = [
            token_id
            for token_id, freq in id2word.dfs.items()
            if freq < self.n_filter
        ]
        id2word.filter_tokens(bad_ids=rare_token_ids)
        id2word.compactify()
        corpus = [id2word.doc2bow(text) for text in vec]

        # Optimizer
        study = optuna.create_study(direction="maximize")
        study.optimize(
            lambda trial: self.bayesian_opt_objective(trial, id2word, corpus, vec),
            n_trials=self.n_trials,
        )
        trial = study.best_trial

        # Store results
        results = {
            "filter": self.n_filter,
            "nlp_normalization_method": self.nlp_normalization_method,
            "best_score": trial.value,
            "params": trial.params,
        }

        self.logger.info("Results stored")
        return results

    def save_results(
        self, results: Dict[str, Union[int, str, float, Dict[str, Union[int, float]]]]
    ):
        """
        Save results to JSON.

        The output directory is created when missing so that the results are
        not lost to a ``FileNotFoundError`` at the very last step.

        Parameters
        ----------
        results: Dict[str, Union[int, str, float, Dict[str, Union[int, float]]]]
            Dict with results.
        """

        from pathlib import Path  # local import: only needed in this method

        # The path is relative to the current working directory.
        output_path = Path(
            f"src/opt_outputs/results_{self.nlp_normalization_method}_with_filter_{self.n_filter}.json"
        )
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with output_path.open("w", encoding="utf-8") as json_file:
            json.dump(results, json_file)

        self.logger.info("Results saved to JSON")

    def run(self):
        """
        Runs the optimizer: preprocessing, optimization and saving of results.
        """
        vec = self.nlp_preprocessing()
        results = self.get_opt(vec)
        self.save_results(results)

        self.logger.info("Optimizer completed!")

In [9]:
pwd

'/home/bruno/mestrado-ufrgs/mestrado-ufrgs-cmp617-tce/src'

In [7]:
# Configure the optimizer: lemmatization as the normalization method,
# a minimum token frequency of 250 and 50 optimization trials.
optimizer = LDAOptimization(
    nlp_normalization_method="lemmatization", n_filter=250, n_trials=50
)

In [8]:
# Run only the preprocessing step; the remaining steps are disabled below.
# The recorded output shows this call failing with FileNotFoundError for the
# relative path 'src/utils/stop_words.txt' - presumably because the working
# directory is already 'src'; verify before re-running.
vec = optimizer.nlp_preprocessing()
# results = self.get_opt(vec)
# self.save_results(results)

2024-04-02 18:34:02 - INFO - Data loaded!
2024-04-02 18:34:02 - INFO - Null values cleaned!
2024-04-02 18:34:02 - INFO - Data types asserted!
2024-04-02 18:34:02 - INFO - Full data cleaned!


FileNotFoundError: [Errno 2] No such file or directory: 'src/utils/stop_words.txt'

In [5]:
from utils.schemas import raw_dtypes
import pandas as pd
from pandas.core.frame import DataFrame
from pathlib import Path
from logging import Logger
from utils.utils import logger


class DataCleaning:
    """
    Cleans TCE-RS data.

    Attributes
    ----------
    data_dir: Path
        Location of the CSV file. The default is an absolute path to
        ``data/tce_licitations.csv`` on the author's machine (given as a
        string); pass an explicit path when running elsewhere.
    logger: Logger, defaults to logger
        Logger.
    """

    def __init__(
        self,
        data_dir: Path = "/home/bruno/mestrado-ufrgs/mestrado-ufrgs-cmp617-tce/data/tce_licitations.csv",
        logger: Logger = logger,
    ):
        self.data_dir = data_dir
        self.logger = logger

    def load(self) -> DataFrame:
        """
        Loads data.

        Returns
        -------
        df: DataFrame
            TCE data, read with the dtypes declared in ``raw_dtypes``.
        """

        df = pd.read_csv(self.data_dir, dtype=raw_dtypes)
        self.logger.info("Data loaded!")

        return df

    def clean_nan(self, df: DataFrame) -> DataFrame:
        """
        Selects only necessary columns and replaces null values.

        Missing ``VL_HOMOLOGADO`` values are filled with ``VL_LICITACAO`` and
        rows without ``ANO_LICITACAO`` are dropped.

        Parameters
        ----------
        df: DataFrame
            TCE data.

        Returns
        -------
        df_cleaned_nan: DataFrame
            TCE data with cleaned null values.
        """

        cols_to_keep = [
            "CD_ORGAO",
            "NM_ORGAO",
            "ANO_LICITACAO",
            "DS_OBJETO",
            "VL_LICITACAO",
            "DT_HOMOLOGACAO",
            "VL_HOMOLOGADO",
        ]
        # Work on a copy so the caller's frame is never modified.
        df_cleaned_nan = df[cols_to_keep].copy()

        df_cleaned_nan.loc[:, "VL_HOMOLOGADO"] = df_cleaned_nan["VL_HOMOLOGADO"].fillna(
            df_cleaned_nan["VL_LICITACAO"]
        )

        df_cleaned_nan = df_cleaned_nan.dropna(subset=["ANO_LICITACAO"])

        self.logger.info("Null values cleaned!")
        return df_cleaned_nan

    def asserts_data_types(self, df_cleaned_nan: DataFrame) -> DataFrame:
        """
        Asserting the correct data types.

        Rows with the non-numeric markers listed below are removed, then the
        columns are converted to their final types.

        Parameters
        ----------
        df_cleaned_nan: DataFrame
            TCE data with cleaned null values.

        Returns
        ----------
        df_final: DataFrame
            Cleaned DataFrame.
        """

        # Rows that cannot be converted: non-numeric year markers and
        # homologated values made only of '#' characters.
        invalid_year = df_cleaned_nan["ANO_LICITACAO"].isin(["PRD", "PDE"])
        invalid_value = df_cleaned_nan["VL_HOMOLOGADO"].isin(
            ["###############", "#################"]
        )

        # Take an explicit copy of the filtered rows: assigning columns on a
        # boolean-indexed slice is chained assignment and triggers pandas'
        # SettingWithCopyWarning.
        df_final = df_cleaned_nan[~invalid_year & ~invalid_value].copy()

        # Replaces a few values
        df_final["ANO_LICITACAO"] = df_final["ANO_LICITACAO"].replace(
            {"2023.0": "2023", "2024.0": "2024"}
        )

        # Assert data types
        df_final["CD_ORGAO"] = df_final["CD_ORGAO"].astype(int)
        df_final["ANO_LICITACAO"] = df_final["ANO_LICITACAO"].astype(int)
        df_final["VL_LICITACAO"] = df_final["VL_LICITACAO"].astype(float)
        df_final["DT_HOMOLOGACAO"] = pd.to_datetime(df_final["DT_HOMOLOGACAO"])
        df_final["VL_HOMOLOGADO"] = df_final["VL_HOMOLOGADO"].astype(float)

        self.logger.info("Data types asserted!")
        return df_final

    def run(self) -> DataFrame:
        """
        Executes the full process: load, null cleaning and type conversion.

        Returns
        ----------
        df_final: DataFrame
            Cleaned DataFrame.
        """

        df = self.load()
        df_cleaned_nan = self.clean_nan(df)
        df_final = self.asserts_data_types(df_cleaned_nan)

        self.logger.info("Full data cleaned!")

        return df_final


In [10]:
# Load and clean the data with the default CSV location.
cleaning_pipeline = DataCleaning()
df = cleaning_pipeline.run()


2024-04-02 18:35:43 - INFO - Data loaded!
2024-04-02 18:35:43 - INFO - Null values cleaned!
2024-04-02 18:35:43 - INFO - Data types asserted!
2024-04-02 18:35:43 - INFO - Full data cleaned!


In [11]:
# Keep only rows whose ANO_LICITACAO is 2021 or later.
df = df[df["ANO_LICITACAO"] >= 2021]

In [19]:
def nlp_preprocessing(
    df, nlp_normalization_method: str = "lemmatization"
) -> List[List[str]]:
    """
    Turns each ``DS_OBJETO`` text of ``df`` into a list of normalized tokens.

    Parameters
    ----------
    df : DataFrame
        Cleaned data with the columns ``ANO_LICITACAO`` and ``DS_OBJETO``.
    nlp_normalization_method : str, defaults to 'lemmatization'
        Either 'stemmer' or 'lemmatization'.

    Returns
    -------
    List[List[str]]
        One token list per row with ``ANO_LICITACAO`` >= 2021.

    Raises
    ------
    TypeError
        If ``nlp_normalization_method`` is neither 'stemmer' nor
        'lemmatization'.
    """

    # Validate up front so an invalid method fails before any work is done.
    if nlp_normalization_method not in ("stemmer", "lemmatization"):
        raise TypeError(
            "Please choose either 'stemmer' or 'lemmatization' as the nlp_normalization_method"
        )

    # TODO: remove this filter later
    df = df[df["ANO_LICITACAO"] >= 2021]

    print(nlp_normalization_method)

    # Steps shared by both normalization methods.
    tokens = (
        df["DS_OBJETO"]
        .apply(
            lambda x: nltk.word_tokenize(x.lower(), language="portuguese")
        )  # Tokenize
        .apply(lambda x: [preprocess(word) for word in x])  # Other preprocessing
        .apply(lambda x: list(filter(None, x)))  # Removes items with none
    )

    if nlp_normalization_method == "stemmer":
        tokens = tokens.apply(remove_stop_words)  # Removes stop words
        normalizer = stemmer_pt
    else:
        # NOTE(review): stop words are NOT removed on the lemmatization path,
        # unlike the stemmer path - confirm whether this is intentional.
        normalizer = lemma_pt

    # Remove tokens containing "rs" (which are cities).
    # NOTE(review): this is a substring test, so any token that merely
    # contains "rs" is dropped as well - confirm this is intended.
    tokens = tokens.apply(lambda x: [word for word in x if "rs" not in word])

    # Applies stemming or lemmatization
    tokens = tokens.apply(normalizer)

    return tokens.values.tolist()

In [20]:
# Build the token lists with the standalone function defined above.
vec =  nlp_preprocessing(df)

lemmatization


: 

In [None]:
# Script entry point: run the full pipeline (preprocessing, optimization and
# saving of results) with lemmatization, n_filter=250 and 50 trials.
if __name__ == "__main__":
    optimizer = LDAOptimization(
        nlp_normalization_method="lemmatization", n_filter=250, n_trials=50
    )
    optimizer.run()