In [1]:
import pandas as pd
import numpy as np
import logging
import torch
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, DistilBertTokenizer
from torch.utils.data import DataLoader
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_path = '../data/coverwallet.xlsx'
df = pd.read_excel(data_path)
df = df.dropna()
def truncate_naics_and_prepare_data(df, column_name, num_digits):
    """
    Truncates the NAICS codes in the specified column to the desired number of digits.

    :param df: pandas DataFrame containing the NAICS codes.
    :param column_name: the name of the column with the NAICS codes.
    :param num_digits: the number of digits to truncate to.
    :return: A copy of the DataFrame with the NAICS codes truncated.
    """
    # Validate the number of digits
    if not isinstance(num_digits, int) or num_digits <= 0:
        logging.error("Number of digits must be a positive integer")
        raise ValueError("Number of digits must be a positive integer")
    
    # Make a copy of the DataFrame to avoid modifying the original
    df_copy = df.copy()
    def truncate_code(code):
        """
        Truncates or pads the NAICS code to the specified number of digits.
        :param code: the NAICS code to be truncated.
        :return: The truncated or original NAICS code as a string.
        """
        try:
            # Ensure the code is a string
            code_str = str(code)
            # Truncate the code if it's longer than num_digits
            return code_str[:num_digits].ljust(num_digits, '0')
        except Exception as e:
            logging.exception("Error truncating code: {}".format(code))
            return code
        
    # Apply the truncation function to the specified column
    df_copy[column_name] = df_copy[column_name].apply(truncate_code)
    # Try to convert the truncated column to integers
    try:
        df_copy[column_name] = df_copy[column_name].astype(int)
    except ValueError as e:
        logging.warning("Could not convert truncated codes to integers: {}".format(e))
        # Keep the column as strings if conversion fails
        pass
    
    labels = df_copy['NAICS'].unique().tolist()
    id2label = {idx: label for idx, label in enumerate(labels)}
    label2id = {label: idx for idx, label in enumerate(labels)}
    df_copy['label'] = df_copy['NAICS'].map(label2id)
    logging.info("NAICS codes processed successfully. Here's the head of the processed DataFrame:")
    logging.info("\n%s", df_copy.head())
    df_copy_train, df_copy_final_val = train_test_split(df_copy, test_size=0.15, shuffle=True, random_state=42)
    
    dataset_train = Dataset.from_pandas(df_copy_train)
    dataset_final_val = Dataset.from_pandas(df_copy_final_val)

# Configuration k-fold
    num_folds = 3
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    kfold_datasets = []

    for fold, (train_indices, val_indices) in enumerate(kf.split(dataset_train)):
        train_dataset = dataset_train.select(train_indices)
        val_dataset = dataset_train.select(val_indices)
        
        dataset_dict = {
            'train': train_dataset,
            'validation': val_dataset
        }

        features_dict = {
            "NAICS": dataset_train["NAICS"],
            "BUSINESS_DESCRIPTION": dataset_train["BUSINESS_DESCRIPTION"],
        }
    
        kfold_datasets.append(dataset_dict)
        logging.info(f"Processed fold {fold + 1}")

    for i, dataset_dict in enumerate(kfold_datasets):
        for split in dataset_dict.keys():
            dataset_dict[split] = dataset_dict[split].map(lambda example: {key: example[key] for key in features_dict.keys()})

        logging.info(f"DatasetDict for Fold {i + 1}:")
        for split, dataset in dataset_dict.items():
            logging.info(f"  {split} split: {dataset}")
            
    logging.info("NAICS codes truncated successfully. Here's the head of the truncated DataFrame:")
    logging.info("\n%s", df_copy.head())
    logging.info("Number of unique NAICS labels: %d", len(labels))

    return df_copy, kfold_datasets, dataset_train, dataset_final_val



In [3]:
df_2_digits, kfold_2_digits, dataset_train_2_digits, dataset_final_val_2_digits = truncate_naics_and_prepare_data(df, 'NAICS', 2)
# Defining some key variables that will be used later on in the training
MAX_LEN = 128 
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 2
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

INFO: NAICS codes processed successfully. Here's the head of the processed DataFrame:
INFO: 
   NAICS                               BUSINESS_DESCRIPTION  label
0     72  Zenyai Viet Cajun & Pho Restaurant is dedicate...      0
1     54  Kilduff Underground Engineering, Inc. (KUE) is...      1
2     45  024™ is a premium home fragrance brand that de...      2
3     56  Our Services include Office Cleaning Carpet cl...      3
4     62                    NYS Licensed Home Health Agency      4
INFO: Processed fold 1
INFO: Processed fold 2
INFO: Processed fold 3
Map: 100%|██████████| 8032/8032 [00:00<00:00, 8226.70 examples/s]
Map: 100%|██████████| 4016/4016 [00:00<00:00, 6286.37 examples/s]
INFO: DatasetDict for Fold 1:
INFO:   train split: Dataset({
    features: ['NAICS', 'BUSINESS_DESCRIPTION', 'label', '__index_level_0__'],
    num_rows: 8032
})
INFO:   validation split: Dataset({
    features: ['NAICS', 'BUSINESS_DESCRIPTION', 'label', '__index_level_0__'],
    num_rows: 4016
})
Map: 

CREACION CLASE DATASET

In [4]:
from torch.utils.data import Dataset
import torch

class MultiLabelDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = dataframe['BUSINESS_DESCRIPTION']
        self.targets = dataframe['label']
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        text = self.text[idx]
        inputs = self.tokenizer.encode_plus(
            text, None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(self.targets[idx], dtype=torch.float)
        }


DATALOADERS

In [5]:
from torch.utils.data import DataLoader

def create_dataloaders(kfold_datasets, tokenizer, max_len, batch_size=4):
    dataloaders = []
    for fold in kfold_datasets:
        train_dataset = MultiLabelDataset(fold['train'], tokenizer, max_len)
        valid_dataset = MultiLabelDataset(fold['validation'], tokenizer, max_len)
        
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
        
        dataloaders.append((train_loader, valid_loader))
    
    return dataloaders


MODEL DEFINITION

In [9]:
from transformers import DistilBertModel
import torch.nn as nn

class DistilBERTClass(nn.Module):
    def __init__(self, num_labels):
        super(DistilBERTClass, self).__init__()
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.pre_classifier = nn.Linear(768, 768)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask):
        distilbert_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = distilbert_output[0]
        pooled_output = hidden_state[:, 0]
        pooled_output = self.pre_classifier(pooled_output)
        pooled_output = nn.ReLU()(pooled_output)
        pooled_output = self.dropout(pooled_output)
        output = self.classifier(pooled_output)
        return output



In [10]:
from torch.optim import Adam
from tqdm import tqdm
import torch
import torch.nn as nn

# Definición de la función de pérdida
loss_fn = nn.BCEWithLogitsLoss()
def train_model(model, data_loader, loss_fn, optimizer, device, epochs):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(data_loader, desc=f"Epoch {epoch + 1}/{epochs}", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs, labels)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        
        avg_loss = total_loss / len(data_loader)
        print(f"Epoch {epoch + 1} finished. Average loss: {avg_loss:.4f}")

def validate_model(model, data_loader, loss_fn, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validating", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs, labels)
            total_loss += loss.item()
    
    avg_loss = total_loss / len(data_loader)
    print(f"Validation finished. Average loss: {avg_loss:.4f}")



In [11]:
import torch
from transformers import AdamW
from tqdm.auto import tqdm

# Asumiendo que model es tu instancia de DistilBERTClass
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBERTClass(num_labels=24)
model.to(device)

# Definición de la función de pérdida y el optimizador
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=5e-5)

train_dataloader, valid_dataloader = create_dataloaders(kfold_2_digits[0], tokenizer, max_len=128, batch_size=16)  # Asume que kfold_datasets[0] contiene tu primer fold.

train_model(model, train_dataloader, loss_fn, optimizer, device, EPOCHS)
validate_model(model, valid_dataloader, loss_fn, device)




TypeError: string indices must be integers, not 'str'