## BERT ANALYSIS

LIBRARIES LOADING

In [1]:
import pandas as pd
import numpy as np
import logging
import torch
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, DistilBertTokenizer
from torch.utils.data import DataLoader
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

  from .autonotebook import tqdm as notebook_tqdm


Truncating the NAICS codes to a configurable number of digits is useful not only for approaching the classification step by step, but also for choosing the trade-off between accuracy and the number of digits returned.

DATA PREPROCESSING

In [2]:
# Read the source spreadsheet and discard every row that contains a missing value.
data_path = '../data/coverwallet.xlsx'
df = pd.read_excel(data_path).dropna()
def truncate_naics_and_prepare_data(df, column_name, num_digits, num_folds=3, test_size=0.15, random_state=42):
    """
    Truncate NAICS codes, encode them as integer labels and build the data splits.

    The codes in ``column_name`` are cut (or right-padded with '0') to
    ``num_digits`` characters. Every distinct truncated code is mapped to an
    integer id stored in a new ``label`` column, ids being assigned in order of
    first appearance. A hold-out set is then split off and the remaining rows
    are partitioned into K folds.

    :param df: pandas DataFrame containing the NAICS codes (it is not modified).
    :param column_name: the name of the column with the NAICS codes.
    :param num_digits: the number of digits to truncate to (positive integer).
    :param num_folds: number of cross-validation folds built from the training part.
    :param test_size: fraction of the rows held out as the final validation set.
    :param random_state: seed used for the hold-out split and for the K-fold shuffle.
    :return: tuple ``(df_copy, kfold_datasets, dataset_train, dataset_final_val)``.
        ``df_copy`` is the processed DataFrame, ``kfold_datasets`` is a list with
        one ``{'train': Dataset, 'validation': Dataset}`` dict per fold, and the
        last two items are the training part and the hold-out part converted
        with ``Dataset.from_pandas``.
    :raises ValueError: if ``num_digits`` is not a positive integer.
    """
    # Validate the number of digits
    if not isinstance(num_digits, int) or num_digits <= 0:
        logging.error("Number of digits must be a positive integer")
        raise ValueError("Number of digits must be a positive integer")

    # Make a copy of the DataFrame to avoid modifying the original
    df_copy = df.copy()

    def truncate_code(code):
        """
        Truncate or right-pad one NAICS code to ``num_digits`` characters.

        :param code: the NAICS code to be truncated.
        :return: the truncated/padded code as a string, or the original value
            if it could not be processed.
        """
        try:
            # If the column was read as floats (e.g. 722511.0), drop the
            # fractional part of integral values so ".0" never leaks into the code.
            if isinstance(code, float) and code.is_integer():
                code = int(code)
            code_str = str(code)
            return code_str[:num_digits].ljust(num_digits, '0')
        except Exception:
            # Best effort: log the failure and keep the original value.
            logging.exception("Error truncating code: {}".format(code))
            return code

    # Apply the truncation function to the specified column
    df_copy[column_name] = df_copy[column_name].apply(truncate_code)
    # Try to convert the truncated column to integers
    try:
        df_copy[column_name] = df_copy[column_name].astype(int)
    except ValueError as e:
        # Keep the column as strings if conversion fails
        logging.warning("Could not convert truncated codes to integers: {}".format(e))

    # Build the label encoding from the column that was actually truncated
    # (previously hard-coded to 'NAICS', which ignored `column_name`).
    # The inverse mapping can be rebuilt from the returned frame's
    # `column_name` / 'label' columns.
    labels = df_copy[column_name].unique().tolist()
    label2id = {label: idx for idx, label in enumerate(labels)}
    df_copy['label'] = df_copy[column_name].map(label2id)
    logging.info("NAICS codes processed successfully. Here's the head of the processed DataFrame:")
    logging.info("\n%s", df_copy.head())

    # Hold out a final validation set that never takes part in cross-validation.
    df_copy_train, df_copy_final_val = train_test_split(
        df_copy, test_size=test_size, shuffle=True, random_state=random_state
    )

    dataset_train = Dataset.from_pandas(df_copy_train)
    dataset_final_val = Dataset.from_pandas(df_copy_final_val)

    # K-fold configuration
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)
    kfold_datasets = []

    for fold, (train_indices, val_indices) in enumerate(kf.split(dataset_train)):
        # `select` already yields datasets with every column, so no extra
        # identity `map` pass over the rows is needed afterwards.
        kfold_datasets.append({
            'train': dataset_train.select(train_indices),
            'validation': dataset_train.select(val_indices),
        })
        logging.info(f"Processed fold {fold + 1}")

    for i, dataset_dict in enumerate(kfold_datasets):
        logging.info(f"DatasetDict for Fold {i + 1}:")
        for split, dataset in dataset_dict.items():
            logging.info(f"  {split} split: {dataset}")

    logging.info("NAICS codes truncated successfully. Here's the head of the truncated DataFrame:")
    logging.info("\n%s", df_copy.head())
    logging.info("Number of unique NAICS labels: %d", len(labels))

    return df_copy, kfold_datasets, dataset_train, dataset_final_val

In [3]:
# Build the 2-digit NAICS artefacts: processed frame, CV folds, training set and hold-out set.
(
    df_2_digits,
    kfold_2_digits,
    dataset_train_2_digits,
    dataset_final_val_2_digits,
) = truncate_naics_and_prepare_data(df, column_name='NAICS', num_digits=2)


INFO: NAICS codes processed successfully. Here's the head of the processed DataFrame:
INFO: 
   NAICS                               BUSINESS_DESCRIPTION  label
0     72  Zenyai Viet Cajun & Pho Restaurant is dedicate...      0
1     54  Kilduff Underground Engineering, Inc. (KUE) is...      1
2     45  024™ is a premium home fragrance brand that de...      2
3     56  Our Services include Office Cleaning Carpet cl...      3
4     62                    NYS Licensed Home Health Agency      4
INFO: Processed fold 1
INFO: Processed fold 2
INFO: Processed fold 3
Map: 100%|██████████| 8032/8032 [00:02<00:00, 3054.00 examples/s]
Map: 100%|██████████| 4016/4016 [00:01<00:00, 2656.07 examples/s]
INFO: DatasetDict for Fold 1:
INFO:   train split: Dataset({
    features: ['NAICS', 'BUSINESS_DESCRIPTION', 'label', '__index_level_0__'],
    num_rows: 8032
})
INFO:   validation split: Dataset({
    features: ['NAICS', 'BUSINESS_DESCRIPTION', 'label', '__index_level_0__'],
    num_rows: 4016
})
Map: 

MODEL

In [4]:
# Hyperparameters and tokenizer shared by the training cells below.
MAX_LEN = 128            # token sequence length passed to the tokenizer
TRAIN_BATCH_SIZE = 4     # batch size of the training DataLoader
VALID_BATCH_SIZE = 4     # batch size of the validation DataLoader
EPOCHS = 2
LEARNING_RATE = 1e-5
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased',
    truncation=True,
    do_lower_case=True,
)

In [5]:
class NAICSDataset(torch.utils.data.Dataset):
    """
    Map-style PyTorch dataset that tokenizes business descriptions on access.

    The base class is ``torch.utils.data.Dataset``. The bare name ``Dataset`` in
    this notebook refers to the Hugging Face ``datasets.Dataset``, which is not
    meant to be subclassed this way (its own initialiser would be bypassed).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """
        Initialise the dataset.

        :param dataframe: pandas DataFrame or Hugging Face Dataset holding a
            'BUSINESS_DESCRIPTION' column and a 'labels' (or 'label') column.
        :param tokenizer: transformers tokenizer used to process the text.
        :param max_len: maximum length of the token sequence.
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        # Materialise both columns as plain lists so __getitem__ is purely
        # positional; a pandas Series would do label-based lookup and fail on
        # the shuffled, non-contiguous index left by train_test_split.
        self.text = list(dataframe['BUSINESS_DESCRIPTION'])
        # The preprocessing step stores class ids under 'label'; 'labels' is
        # still preferred when present to stay compatible with older callers.
        available_columns = getattr(dataframe, 'column_names', None)
        if available_columns is None:
            available_columns = dataframe.columns
        label_column = 'labels' if 'labels' in available_columns else 'label'
        self.targets = list(dataframe[label_column])
        self.max_len = max_len

    def __len__(self):
        """
        Return the number of items in the dataset.
        """
        return len(self.text)

    def __getitem__(self, index):
        """
        Return the tokenized item at the given position, ready for the model.

        :param index: integer position of the example.
        :return: dict with 'ids' and 'mask' (long tensors) and 'targets'
            (float tensor holding the class id).
        """
        # Extract text and collapse any run of whitespace into single spaces
        text = str(self.text[index])
        text = " ".join(text.split())

        # Encode the text, padded/truncated to exactly max_len tokens
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False  # DistilBERT does not use token_type_ids
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            # NOTE(review): class ids are returned as float; losses such as
            # CrossEntropyLoss expect long targets - confirm the training loop
            # casts them before computing the loss.
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [10]:
def create_data_loaders_from_datasets(train_dataset, valid_dataset, tokenizer, max_len, train_batch_size, valid_batch_size):
    """
    Wrap the training and validation datasets in PyTorch DataLoaders.

    :param train_dataset: dataset object used for training.
    :param valid_dataset: dataset object used for validation.
    :param tokenizer: accepted for signature compatibility; not used here.
    :param max_len: accepted for signature compatibility; not used here.
    :param train_batch_size: batch size of the training loader.
    :param valid_batch_size: batch size of the validation loader.
    :return: tuple ``(train_loader, valid_loader)``.
    """
    # Only the training loader reshuffles; validation keeps the dataset order.
    return (
        DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size),
        DataLoader(valid_dataset, shuffle=False, batch_size=valid_batch_size),
    )

def prepare_dataloaders(kfold_datasets, tokenizer, max_len, train_batch_size, valid_batch_size):
    """
    Build one training and one validation DataLoader for every fold.

    :param kfold_datasets: iterable of folds; each fold is a mapping with a
        'train' and a 'validation' entry that NAICSDataset can consume.
    :param tokenizer: transformers tokenizer forwarded to NAICSDataset.
    :param max_len: maximum token sequence length forwarded to NAICSDataset.
    :param train_batch_size: batch size of the training loaders.
    :param valid_batch_size: batch size of the validation loaders.
    :return: tuple ``(train_dataloaders, valid_dataloaders)``, two lists
        aligned by fold.
    """
    def _as_naics_dataset(split):
        # Wrap one split so that it is tokenized lazily, item by item.
        return NAICSDataset(dataframe=split, tokenizer=tokenizer, max_len=max_len)

    fold_splits = ((fold['train'], fold['validation']) for fold in kfold_datasets)
    loader_pairs = [
        create_data_loaders_from_datasets(
            _as_naics_dataset(train_split),
            _as_naics_dataset(valid_split),
            tokenizer,
            max_len,
            train_batch_size,
            valid_batch_size,
        )
        for train_split, valid_split in fold_splits
    ]

    train_dataloaders = [train_loader for train_loader, _ in loader_pairs]
    valid_dataloaders = [valid_loader for _, valid_loader in loader_pairs]
    return train_dataloaders, valid_dataloaders


In [11]:
from transformers import DistilBertModel
import torch

class DistilBERTClass(torch.nn.Module):
    """
    DistilBERT encoder followed by a small classification head.

    The head is: Linear(768 -> 768) -> Tanh -> Dropout -> Linear(768 -> num_labels),
    applied to the hidden state of the first token of every sequence.
    """

    def __init__(self, num_labels=24, dropout=0.3):
        """
        Build the encoder and the classification head.

        :param num_labels: number of output classes. It should match the number
            of distinct labels produced by the preprocessing step; the default
            of 24 preserves the previously hard-coded value.
        :param dropout: dropout probability applied before the final layer.
        """
        super().__init__()
        # Load pretrained DistilBERT
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        # "Pre-classifier" layer: keeps the 768-dimensional size and only adds
        # a trainable projection before the non-linearity.
        self.pre_classifier = torch.nn.Linear(768, 768)
        # Dropout layer for regularisation, helping to prevent overfitting
        self.dropout = torch.nn.Dropout(dropout)
        # Final classifier mapping the 768 features to one logit per class
        self.classifier = torch.nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask):
        """
        Forward pass of the model.

        :param input_ids: token ids of the input.
        :param attention_mask: attention mask that stops the model from
            attending to padding tokens.
        :return: output of the final linear layer (one value per class).
        """
        # DistilBERT does not use token_type_ids, so none are passed.
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]  # full sequence output of the encoder
        pooler = hidden_state[:, 0]  # first position only (the [CLS] token)
        pooler = self.pre_classifier(pooler)
        # Functional tanh avoids building a new Tanh module on every call.
        pooler = torch.tanh(pooler)
        pooler = self.dropout(pooler)
        return self.classifier(pooler)

# Model initialisation
# Select the device in this cell so that it runs on a fresh kernel instead of
# relying on a `device` variable left over from another cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DistilBERTClass()
model.to(device)
