In [19]:
import pandas as pd
import numpy as np
import logging
import torch
from datasets import Dataset, DatasetDict
from transformers import DistilBertModel
from transformers import AutoTokenizer, DistilBertTokenizer
from torch.utils.data import Dataset, DataLoader
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

In [20]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

## Data loading

In [21]:
data_path = '../../data/coverwallet/coverwallet.xlsx'
df = pd.read_excel(data_path)
df = df.dropna()
def truncate_naics_and_prepare_data(df, column_name, num_digits):
    """
    Truncates the NAICS codes in the specified column to the desired number of digits.

    :param df: pandas DataFrame containing the NAICS codes.
    :param column_name: the name of the column with the NAICS codes.
    :param num_digits: the number of digits to truncate to.
    :return: A copy of the DataFrame with the NAICS codes truncated.
    """
    # Validate the number of digits
    if not isinstance(num_digits, int) or num_digits <= 0:
        logging.error("Number of digits must be a positive integer")
        raise ValueError("Number of digits must be a positive integer")
    
    # Make a copy of the DataFrame to avoid modifying the original
    df_copy = df.copy()
    def truncate_code(code):
        """
        Truncates or pads the NAICS code to the specified number of digits.
        :param code: the NAICS code to be truncated.
        :return: The truncated or original NAICS code as a string.
        """
        try:
            # Ensure the code is a string
            code_str = str(code)
            # Truncate the code if it's longer than num_digits
            return code_str[:num_digits].ljust(num_digits, '0')
        except Exception as e:
            logging.exception("Error truncating code: {}".format(code))
            return code
        
    # Apply the truncation function to the specified column
    df_copy[column_name] = df_copy[column_name].apply(truncate_code)
    # Try to convert the truncated column to integers
    try:
        df_copy[column_name] = df_copy[column_name].astype(int)
    except ValueError as e:
        logging.warning("Could not convert truncated codes to integers: {}".format(e))
        # Keep the column as strings if conversion fails
        pass
    
    labels = df_copy['NAICS'].unique().tolist()
    id2label = {idx: label for idx, label in enumerate(labels)}
    label2id = {label: idx for idx, label in enumerate(labels)}
    df_copy['label'] = df_copy['NAICS'].map(label2id)
    logging.info("NAICS codes processed successfully. Here's the head of the processed DataFrame:")
    logging.info("\n%s", df_copy.head())
    df_copy_train, df_copy_final_val = train_test_split(df_copy, test_size=0.15, shuffle=True, random_state=42)
    
    dataset_train = Dataset.from_pandas(df_copy_train)
    dataset_final_val = Dataset.from_pandas(df_copy_final_val)
    
    return df_copy,  dataset_train, dataset_final_val
'''
# Configuration k-fold
    num_folds = 3
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    kfold_datasets = []

    for fold, (train_indices, val_indices) in enumerate(kf.split(dataset_train)):
        train_dataset = dataset_train.select(train_indices)
        val_dataset = dataset_train.select(val_indices)
        
        dataset_dict = {
            'train': train_dataset,
            'validation': val_dataset
        }

        features_dict = {
            "NAICS": dataset_train["NAICS"],
            "BUSINESS_DESCRIPTION": dataset_train["BUSINESS_DESCRIPTION"],
        }
    
        kfold_datasets.append(dataset_dict)
        logging.info(f"Processed fold {fold + 1}")

    for i, dataset_dict in enumerate(kfold_datasets):
        for split in dataset_dict.keys():
            dataset_dict[split] = dataset_dict[split].map(lambda example: {key: example[key] for key in features_dict.keys()})

        logging.info(f"DatasetDict for Fold {i + 1}:")
        for split, dataset in dataset_dict.items():
            logging.info(f"  {split} split: {dataset}")
            
    logging.info("NAICS codes truncated successfully. Here's the head of the truncated DataFrame:")
    logging.info("\n%s", df_copy.head())
    logging.info("Number of unique NAICS labels: %d", len(labels))
    '''
    #return df_copy, kfold_datasets, dataset_train, dataset_final_val

    #df_2_digits, kfold_2_digits, dataset_train_2_digits, dataset_final_val_2_digits = truncate_naics_and_prepare_data(df, 'NAICS', 2)
df_2_digits, dataset_train_2_digits, dataset_final_val_2_digits = truncate_naics_and_prepare_data(df, 'NAICS', 2)

INFO: NAICS codes processed successfully. Here's the head of the processed DataFrame:
INFO: 
   NAICS                               BUSINESS_DESCRIPTION  label
0     72  Zenyai Viet Cajun & Pho Restaurant is dedicate...      0
1     54  Kilduff Underground Engineering, Inc. (KUE) is...      1
2     45  024™ is a premium home fragrance brand that de...      2
3     56  Our Services include Office Cleaning Carpet cl...      3
4     62                    NYS Licensed Home Health Agency      4


**HYPERPARAMETERS**

In [22]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 3
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)
print(df_2_digits.columns)
print(sorted(df_2_digits['label'].unique()))
df_2_digits['BUSINESS_DESCRIPTION'] = df_2_digits['BUSINESS_DESCRIPTION'].apply(lambda x: ' '.join(x) if isinstance(x, np.ndarray) else x)

Index(['NAICS', 'BUSINESS_DESCRIPTION', 'label'], dtype='object')
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]


In [23]:
train_size = 0.8
train_data=df_2_digits.sample(frac=train_size,random_state=200)
val_data=df_2_digits.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(df_2_digits.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("VALIDATION Dataset: {}".format(val_data.shape))

FULL Dataset: (14175, 3)
TRAIN Dataset: (11340, 3)
VALIDATION Dataset: (2835, 3)


## Data loaders generation

In [24]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.texts = np.array(dataframe['BUSINESS_DESCRIPTION'].astype(str))
        self.targets = np.array(dataframe['label'])  
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        texts = (self.texts[idx])
        target = self.targets[idx]
        inputs_ids= []
        attention_masks= []
        targets= []
        for text in texts:  # Iterar sobre cada texto en el array.
            inputs = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=self.max_len,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
            input_ids = inputs['input_ids'].squeeze()  # Squeeze para remover dimensiones extras
            attention_mask = inputs['attention_mask'].squeeze()
            targets= torch.tensor(target, dtype=torch.long).squeeze().unsqueeze(1)
            inputs_ids.append(input_ids)
            attention_masks.append(attention_mask)
            #targets.append(target)

        return {
            
            'input_ids': inputs_ids,
            'attention_mask': attention_masks,
            'labels': targets
        }

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, new_data):
        """
        Setter para el atributo 'data'. Este método permite cambiar el valor de 'data'.
        """
        self._data = new_data
        # Actualizar 'text' y 'targets' si es necesario
        #self.text = new_data['BUSINESS_DESCRIPTION']
        #self.targets = list(new_data['label'])

**Creación de los loaders**

In [25]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

training_set = CustomDataset(train_data, tokenizer, MAX_LEN)
val_set = CustomDataset(val_data, tokenizer, MAX_LEN)
set = CustomDataset(df_2_digits, tokenizer, MAX_LEN)

training_loader = DataLoader(dataset=training_set, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(dataset=val_set, batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)
set_loader = DataLoader(dataset=set, batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

**Comprobación de que funcionan los loaders**

In [26]:
print(type(val_loader))
print(dir(val_loader))
print("Cantidad de lotes en training_loader:", len(val_loader))

for i, batch in enumerate(training_loader):
    
    print(f"Input ids: {len(batch['input_ids'])}")
    print(f"Attention mask: {len(batch['attention_mask'])}")
    print(f"labels: {batch['labels']}")

    break

<class 'torch.utils.data.dataloader.DataLoader'>
['_DataLoader__initialized', '_DataLoader__multiprocessing_context', '_IterableDataset_len_called', '__annotations__', '__class__', '__class_getitem__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__orig_bases__', '__parameters__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_auto_collation', '_dataset_kind', '_get_iterator', '_index_sampler', '_is_protocol', '_iterator', 'batch_sampler', 'batch_size', 'check_worker_number_rationality', 'collate_fn', 'dataset', 'drop_last', 'generator', 'multiprocessing_context', 'num_workers', 'persistent_workers', 'pin_memory', 'pin_memory_device', 'prefetch_factor', 'sampler', 'timeout', 'worker_init_fn']
Cant

## Creating the model

In [27]:
class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 24)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output
    
model = DistilBERTClass()
model.to(device)

DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

In [28]:
def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [29]:
def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [30]:
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        targets = data['labels'].to(device, dtype = torch.long).squeeze(1)

        outputs = model(ids, mask)
        loss = loss_fn(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [31]:
for epoch in range(EPOCHS):
    train(epoch)

Training Loss per 5000 steps: 3.157454013824463
Training Accuracy per 5000 steps: 0.0


KeyboardInterrupt: 

**Validation of the model**

In [None]:
def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            targets = data['labels'].to(device, dtype = torch.long).squeeze(1)
            outputs = model(ids, mask).squeeze()
            loss = loss_fn(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu


In [None]:
print('This is the validation section to print the accuracy and see how it performs')
print('Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch')

acc = valid(model, val_loader)
print("Accuracy on val data = %0.2f%%" % acc)

**Saving the model**