# LeanDL - 2025 - Testes

Esse é um arquivo de script, focado em processar uma base de dados para o LeanDL 2025.

Criado por Augusto Dalal (projeto OESNPG)

Instalação de Dependências

In [None]:
%pip install --upgrade pandas
%pip install --upgrade pyarrow
%pip install numpy
%pip install nltk
%pip install sentence_transformers
%pip install sklearn
%pip install gc
%pip install torch
%pip install codecarbon

Módulo BERT

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import gc
import torch

# Fetch the NLTK tokenizer resources up front (presumably the data that
# sent_tokenize relies on); quiet=True suppresses the download output.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource, quiet=True)

class BERTSimilarity:
    """
    Computes sentence-level similarity between user content and strategic themes using multilingual BERT embeddings.

    The intended call order is ``preprocess_metadata`` -> ``compute_embeddings`` ->
    ``match_content_to_strategic_themes``; ``run_pipeline`` runs the three in sequence.

    Attributes:
        df_person (pd.DataFrame): DataFrame containing user content. It is replaced by the
            sentence-level frame once ``preprocess_metadata`` has run.
        df_strategic_themes (pd.DataFrame): DataFrame containing strategic themes and associated keywords.
            ``compute_embeddings`` adds two embedding columns to it in place.
        id_col (str): Column name used as unique identifier in df_person.
        uf_col (str): Column name representing the region/state.
        summary_col (str): Column name in df_person containing the summary text.
        theme_col (str): Column name in df_strategic_themes containing the theme.
        keyword_col (str): Column name in df_strategic_themes containing keywords.
        additional_text_cols (list[str]): Optional list of extra text columns in df_person to consider.
        min_length (int): Minimum length (in characters) of a summary sentence to be considered.
            Values from additional_text_cols are kept whole, regardless of length.
        model (SentenceTransformer): Pretrained multilingual BERT model (None after ``unload_model``).
        df_matches (pd.DataFrame | None): Output of the last matching step; None until it has run.
    """

    def __init__(
        self,
        df_person,
        df_strategic_themes,
        id_col: str = "HASH_ID",
        uf_col: str = "UF",
        summary_col: str = "DS_RESUMO",
        theme_col: str = "TEMA",
        keyword_col: str = "PALAVRAS-CHAVE",
        additional_text_cols: list[str] = None,
        embeddings_model: str = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
        min_length: int = 120,
    ):
        """
        Stores the configuration, validates the input columns and loads the embedding model.

        Raises:
            ValueError: If any expected column is missing in df_person or df_strategic_themes.
        """
        self.df_person = df_person
        self.df_strategic_themes = df_strategic_themes
        self.id_col = id_col
        self.uf_col = uf_col
        self.summary_col = summary_col
        self.theme_col = theme_col
        self.keyword_col = keyword_col
        self.additional_text_cols = list(additional_text_cols or [])
        self.min_length = min_length
        self.df_matches = None

        # Validate before loading the model so a misconfigured call fails
        # immediately instead of after the (slow) model load.
        self._validate_columns()
        self.model = SentenceTransformer(embeddings_model)

    def _validate_columns(self):
        """
        Validates if all required columns exist in the provided DataFrames.

        Raises:
            ValueError: If any expected column is missing in df_person or df_strategic_themes.
        """
        person_required = {self.id_col, self.summary_col}.union(set(self.additional_text_cols))
        themes_required = {self.uf_col, self.theme_col, self.keyword_col}

        missing_person = person_required - set(self.df_person.columns)
        missing_themes = themes_required - set(self.df_strategic_themes.columns)

        if missing_person:
            raise ValueError(f"Missing columns in df_person: {missing_person}")
        if missing_themes:
            raise ValueError(f"Missing columns in df_strategic_themes: {missing_themes}")

    @staticmethod
    def _normalize_keywords(keywords):
        """
        Normalizes one keyword cell into a list of strings.

        Accepts a list, tuple, set, frozenset or NumPy array of keywords, or a single
        non-blank string (treated as one keyword). Non-string entries (None, NaN, ...)
        are dropped. Anything else yields an empty list.
        """
        if isinstance(keywords, str):
            return [keywords] if keywords.strip() else []
        if isinstance(keywords, (list, tuple, set, frozenset, np.ndarray)):
            return [kw for kw in keywords if isinstance(kw, str)]
        return []

    def preprocess_metadata(self):
        """
        Tokenizes and filters content from df_person, extracting sentences with sufficient length.

        The summary column is split into sentences and only sentences with at least
        ``min_length`` characters are kept. Each non-blank string found in
        ``additional_text_cols`` is kept as a single entry.

        Note:
            ``self.df_person`` is replaced by the processed frame, so this method is
            meant to be called once per instance.

        Returns:
            pd.DataFrame: Processed DataFrame with columns [id_col, 'SOURCE', 'CONTENT'].
                The columns are present even when no content qualifies.
        """
        output = []

        for _, row in self.df_person.iterrows():
            item_id = row.get(self.id_col)

            # Missing summaries show up as None/NaN; only real, non-blank strings are tokenized.
            summary = row.get(self.summary_col, '')
            if isinstance(summary, str) and summary.strip():
                for sentence in sent_tokenize(summary.strip(), language='portuguese'):
                    sentence = sentence.strip()
                    if len(sentence) >= self.min_length:
                        output.append({
                            self.id_col: item_id,
                            'SOURCE': self.summary_col,
                            'CONTENT': sentence
                        })

            for col in self.additional_text_cols:
                value = row.get(col)
                if isinstance(value, str) and value.strip():
                    output.append({
                        self.id_col: item_id,
                        'SOURCE': col,
                        'CONTENT': value.strip()
                    })

        # Passing the columns explicitly keeps the schema stable for empty results.
        self.df_person = pd.DataFrame(output, columns=[self.id_col, 'SOURCE', 'CONTENT'])
        return self.df_person

    def compute_embeddings(self):
        """
        Computes sentence embeddings for both themes and keywords using the multilingual model.

        The keyword embedding of a theme is the mean of the embeddings of its keywords.
        A blank/non-string theme, or a theme without usable keywords, gets a zero vector.

        Returns:
            pd.DataFrame: The df_strategic_themes DataFrame enriched with the columns
                '<theme_col>_EMBEDDING' and '<keyword_col>_EMBEDDING'.
        """
        dim = self.model.get_sentence_embedding_dimension()
        theme_embeddings = []
        keyword_embeddings = []

        for _, row in self.df_strategic_themes.iterrows():
            theme = row.get(self.theme_col, '')
            if isinstance(theme, str) and theme.strip():
                theme_embeddings.append(self.model.encode(theme))
            else:
                theme_embeddings.append(np.zeros(dim))

            keywords = self._normalize_keywords(row.get(self.keyword_col, ''))
            if keywords:
                # atleast_2d guards the mean against a 1-D result for a single keyword.
                keyword_embs = np.atleast_2d(self.model.encode(keywords))
                keyword_embeddings.append(np.mean(keyword_embs, axis=0))
            else:
                keyword_embeddings.append(np.zeros(dim))

        self.df_strategic_themes[f'{self.theme_col}_EMBEDDING'] = theme_embeddings
        self.df_strategic_themes[f'{self.keyword_col}_EMBEDDING'] = keyword_embeddings

        return self.df_strategic_themes

    def match_content_to_strategic_themes(self):
        """
        Computes cosine similarity between each sentence in df_person and each theme/keyword embedding.

        All sentences are encoded in one batched call and compared against the stacked
        theme/keyword embeddings, producing one output row per (sentence, theme) pair,
        ordered by sentence and then by theme.

        Returns:
            pd.DataFrame: A DataFrame with all matches and their similarity scores, with columns
                [id_col, 'SOURCE', 'CONTENT', 'UF', theme_col, '<theme_col>_COSINE',
                '<keyword_col>_COSINE']. The region is always emitted as 'UF', whatever
                ``uf_col`` is named. The result is also stored in ``self.df_matches``.

        Raises:
            ValueError: If df_person has rows but has not been preprocessed, or if the
                embedding columns are missing from df_strategic_themes.
        """
        theme_cos_col = f'{self.theme_col}_COSINE'
        keyword_cos_col = f'{self.keyword_col}_COSINE'
        # dict.fromkeys removes accidental duplicates while keeping the order.
        out_columns = list(dict.fromkeys([
            self.id_col, 'SOURCE', 'CONTENT', 'UF', self.theme_col, theme_cos_col, keyword_cos_col
        ]))

        persons = self.df_person
        themes = self.df_strategic_themes

        # Nothing to compare: return an empty frame that still has the expected schema.
        if persons.empty or themes.empty:
            self.df_matches = pd.DataFrame(columns=out_columns)
            return self.df_matches

        missing_person = {self.id_col, 'SOURCE', 'CONTENT'} - set(persons.columns)
        if missing_person:
            raise ValueError(
                f"Missing columns in df_person: {missing_person}. Call preprocess_metadata() first."
            )

        theme_emb_col = f'{self.theme_col}_EMBEDDING'
        keyword_emb_col = f'{self.keyword_col}_EMBEDDING'
        missing_themes = {theme_emb_col, keyword_emb_col} - set(themes.columns)
        if missing_themes:
            raise ValueError(
                f"Missing columns in df_strategic_themes: {missing_themes}. Call compute_embeddings() first."
            )

        item_ids = persons[self.id_col].tolist()
        sources = persons['SOURCE'].tolist()
        contents = persons['CONTENT'].tolist()
        ufs = themes[self.uf_col].tolist()
        theme_names = themes[self.theme_col].tolist()

        # One batched encode instead of one model call per sentence.
        content_embs = np.atleast_2d(self.model.encode(contents))
        theme_matrix = np.vstack(themes[theme_emb_col].tolist())
        keyword_matrix = np.vstack(themes[keyword_emb_col].tolist())

        # Shape (n_sentences, n_themes) for both similarity matrices.
        theme_sims = cosine_similarity(content_embs, theme_matrix)
        keyword_sims = cosine_similarity(content_embs, keyword_matrix)

        result_rows = []
        for p_idx, (item_id, source, content) in enumerate(zip(item_ids, sources, contents)):
            for t_idx, (uf, theme) in enumerate(zip(ufs, theme_names)):
                result_rows.append({
                    self.id_col: item_id,
                    'SOURCE': source,
                    'CONTENT': content,
                    'UF': uf,
                    self.theme_col: theme,
                    theme_cos_col: float(theme_sims[p_idx, t_idx]),
                    keyword_cos_col: float(keyword_sims[p_idx, t_idx])
                })

        self.df_matches = pd.DataFrame(result_rows, columns=out_columns)
        return self.df_matches

    def run_pipeline(self):
        """
        Runs the full similarity pipeline: preprocess → embed → match.

        Returns:
            pd.DataFrame: Final DataFrame with matched content and similarity scores.
        """
        self.preprocess_metadata()
        self.compute_embeddings()
        return self.match_content_to_strategic_themes()

    def unload_model(self):
        """
        Drops the reference to the model so its memory can be reclaimed.

        Safe to call more than once. After this call the instance can no longer
        compute embeddings.
        """
        self.model = None

        # Collect first so objects kept alive only by reference cycles are freed
        # before the CUDA allocator is asked to release its cached blocks.
        gc.collect()

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print("BERT model successfully unloaded.")


Baixar Base de Dados

In [None]:
# Install gdown, the command-line downloader invoked by the two lines below.
%pip install gdown
# Download two files by ID into the current directory.
# NOTE(review): presumably these are the dictionary and data parquet files
# read by the processing script below - confirm the IDs and resulting file names.
!gdown 12H957uf6mK-1X_ztT9hgFS1slpN2j-Wh
!gdown 1-QXkqH8HzLcV2JCA4Nm9G5rQhorYKJVe

Script

In [None]:

## Start carbon-emissions tracking for the whole run
from codecarbon import EmissionsTracker
tracker = EmissionsTracker()
tracker.start()

### Logging ---------------------------------------------------------------------------------
import logging
from datetime import datetime
import platform
import psutil
import os
print("PID: ")
print(os.getpid())

# Logger configuration
bert_logger = logging.getLogger("bert_logger")
bert_logger.setLevel(logging.INFO)

# logging.getLogger returns the same logger object on every call, so re-running
# this cell would attach a second pair of handlers and duplicate every message.
# Remove (and close) handlers left over from a previous run before adding new ones.
for _stale_handler in list(bert_logger.handlers):
    bert_logger.removeHandler(_stale_handler)
    _stale_handler.close()

# File handler: timestamped records
file_handler = logging.FileHandler("bert_similarity_LeanDL2025_log.txt")
file_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(message)s"))
bert_logger.addHandler(file_handler)

# Console handler: level and message only
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
bert_logger.addHandler(console_handler)

def log_memory(prefix=""):
    """Log the resident memory (RSS) of the current process and return it.

    Args:
        prefix: Text placed in front of the log message.

    Returns:
        The RSS of this process in MB (bytes divided by 1024 ** 2).
    """
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    mem = rss_bytes / (1024 ** 2)
    bert_logger.info(f"{prefix} Memória usada: {mem:.2f} MB")
    return mem

# Host information: processor name, logical core count and clock frequencies
cpu_name = platform.processor()
cpu_count = psutil.cpu_count(logical=True)
cpu_freq = psutil.cpu_freq()

for _info_line in (
    f"Processador: {cpu_name}",
    f"Núcleos lógicos: {cpu_count}",
):
    bert_logger.info(_info_line)

# Frequency details are logged only when psutil returned a truthy value
if cpu_freq:
    bert_logger.info(f"Frequência máxima: {cpu_freq.max:.2f} MHz | Min: {cpu_freq.min:.2f} MHz | Atual: {cpu_freq.current:.2f} MHz")

for _info_line in (
    f"Sistema operacional: {platform.system()} {platform.release()}",
    f"Versão do Python: {platform.python_version()}",
):
    bert_logger.info(_info_line)
log_memory("Uso inicial")

# Start timestamp, used at the end of the script to report the total duration
start_time = datetime.now()
bert_logger.info(f"Início do processamento: {start_time}")

### ---------------------------------------------------------------------

### Script --------------------------------------------------------------
import pandas as pd
import  numpy as np
#from filter_cv_bert.bert_similarity import BERTSimilarity
import gc

# Input locations; adjust the paths if the files live elsewhere
path_dict = "./leandl_oesnpg_dicionario.parquet"
path_data = "./leandl_oesnpg_dados.parquet"

# Load both parquet files with pandas (requires pyarrow or fastparquet),
# dictionary first and data second
dicionario_df, dados_df = (
    pd.read_parquet(parquet_path) for parquet_path in (path_dict, path_data)
)

#separa temas
def to_fset(x):
    """Convert a cell value into a frozenset so it can be hashed and compared.

    Lists, tuples and NumPy arrays become the frozenset of their elements;
    a missing scalar (per ``pd.isna``) becomes an empty frozenset; any other
    value becomes a one-element frozenset.
    """
    is_collection = isinstance(x, (list, tuple, np.ndarray))
    if not is_collection:
        return frozenset() if pd.isna(x) else frozenset([x])
    return frozenset(x)

# Themes: one row per distinct (tema_id, tema, uf_tema_info, palavras_chave).
# Keywords are turned into frozensets so drop_duplicates can hash and compare
# them, then converted back to lists afterwards.
theme_columns = ["tema_id", "tema", "uf_tema_info", "palavras_chave"]
df_temas = dados_df[theme_columns].assign(
    palavras_chave=dados_df["palavras_chave"].apply(to_fset)
).drop_duplicates()
df_temas = df_temas.assign(palavras_chave=df_temas["palavras_chave"].apply(list))

# For each theme id, the list of distinct profile ids associated with it
df_perfis_id_tema = dados_df.groupby("tema_id")["hash_id"].agg(lambda ids: list(set(ids)))
df_perfis_id_tema = df_perfis_id_tema.reset_index()
df_perfis_id_tema = df_perfis_id_tema.rename(columns={"hash_id": "perfis_ids"})

df_temas = df_temas.merge(df_perfis_id_tema, on="tema_id")

# Profiles: one row per hash_id, without the theme-related columns
theme_related_columns = [
    "tema_id",
    "tema",
    "palavras_chave",
    "uf_tema_info",
    "modelo_nivel",
    "modelo_explicacao",
]
df_perfis = dados_df.drop_duplicates(subset=["hash_id"])
df_perfis = df_perfis.drop(columns=theme_related_columns)

# Release the source dataframes, which are not used past this point
del dicionario_df, dados_df

# Process one theme at a time and write the results to disk every
# `batch_size` themes, so the accumulated results never stay in RAM for long.
rows_temas = len(df_temas)

results = []
batch_size = 10

# Every profile column except the id and the summary is passed as extra text.
# The column set is the same for every theme, so it is computed once.
extra_text_cols = sorted(set(df_perfis.columns) - {"hash_id", "descricao_resumo"})

# k is the 1-based position of the theme, used for logging and batch numbering.
for k, (_, tema) in enumerate(df_temas.iterrows(), start=1):

    # Only the profiles linked to the current theme are scored against it.
    df_perfis_tema = df_perfis[df_perfis["hash_id"].isin(tema["perfis_ids"])].copy()

    bert_logger.info(f"Tema {k}/{rows_temas}: {tema['tema']} (id {tema['tema_id']})")
    log_memory(f"Antes do processamento do tema {k}")

    # NOTE(review): a new BERTSimilarity (and therefore a new model load) is
    # created for every theme and unloaded right after; this looks like a
    # deliberate memory trade-off - confirm before changing.
    bertAnalyzer = BERTSimilarity(df_person=df_perfis_tema,
                                  df_strategic_themes=tema.to_frame().T,
                                  id_col="hash_id",
                                  uf_col="uf_tema_info",
                                  summary_col="descricao_resumo",
                                  theme_col="tema",
                                  keyword_col="palavras_chave",
                                  additional_text_cols=extra_text_cols
                                  )

    df_resultado = bertAnalyzer.run_pipeline()
    bertAnalyzer.unload_model()

    df_resultado["tema_id"] = tema["tema_id"]

    results.append(df_resultado)

    log_memory(f"Depois do processamento do tema {k}")

    if k % batch_size == 0:
        # Save the accumulated results as one numbered part file
        df_batch = pd.concat(results, ignore_index=True)
        df_batch.to_parquet(f"./bert_similarity_results_batch_{k//batch_size}.parquet", index=False)
        results = []  # drop the saved results from RAM
        del df_batch, df_resultado, df_perfis_tema
        gc.collect()

# Leftover themes that did not fill a complete batch. `results` is non-empty
# only when rows_temas is not a multiple of batch_size, so the next unused
# index is rows_temas // batch_size + 1. (Deriving it from a counter already
# incremented past the last theme skips an index when rows_temas % batch_size
# is batch_size - 1.)
if results:
    df_batch = pd.concat(results, ignore_index=True)
    df_batch.to_parquet(f"./bert_similarity_results_batch_{rows_temas // batch_size + 1}.parquet", index=False)
    results = []
    del df_batch
    gc.collect()


# Stop the emissions tracker and log the value it returns
emissions = tracker.stop()
bert_logger.info(f"Total de emissões (kg CO2): {emissions}")

# End timestamp and total wall-clock duration since start_time
end_time = datetime.now()
elapsed = end_time - start_time
bert_logger.info(f"Fim do processamento: {end_time}")
bert_logger.info(f"Duração total: {elapsed}")
log_memory("Uso final")





Concatenação Resultados

In [None]:
import glob
import os
import re

import pandas as pd

# Discover the part files written by the processing loop instead of assuming
# a fixed number of batches, so the cell works for any number of themes.
# NOTE(review): stale part files from an earlier run in the same directory
# would be picked up too - clean the directory between runs.
batch_name = re.compile(r"bert_similarity_results_batch_(\d+)\.parquet")
numbered_files = []
for candidate in glob.glob("./bert_similarity_results_batch_*.parquet"):
    hit = batch_name.fullmatch(os.path.basename(candidate))
    if hit:
        numbered_files.append((int(hit.group(1)), candidate))

if not numbered_files:
    raise FileNotFoundError(
        "No bert_similarity_results_batch_<n>.parquet files found in the current directory."
    )

# Sort by batch number (a plain string sort would put batch_10 before batch_2).
parquet_files = [path for _, path in sorted(numbered_files)]
df_final = pd.concat([pd.read_parquet(f) for f in parquet_files], ignore_index=True)
df_final.to_parquet("./bert_similarity_results_LeanDL_2025.parquet", index=False)
