## BERT ANALYSIS

LIBRARIES LOADING

In [1]:
import pandas as pd
import numpy as np
import logging
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, DistilBertTokenizer

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Configure the root logger once for the whole notebook: emit records at INFO
# level and above, formatted as "LEVEL: message".
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


This approach is useful not only for tackling the classification step by step, but also for choosing the trade-off between accuracy and the number of NAICS digits returned.

DATA PREPROCESSING

In [7]:
# Read the source workbook and discard every row that has a missing value.
data_path = '../data/coverwallet.xlsx'
df = pd.read_excel(data_path).dropna()
def truncate_naics_and_prepare_data(df, column_name, num_digits, num_folds=3, test_size=0.15, random_state=42):
    """
    Truncates the codes in ``column_name`` to ``num_digits`` digits, derives an
    integer ``label`` column from them, and builds a hold-out split plus k-fold
    train/validation datasets.

    :param df: pandas DataFrame containing the codes; it is not modified.
    :param column_name: the name of the column with the codes.
    :param num_digits: the number of digits to truncate to (positive integer).
    :param num_folds: number of folds for the k-fold split of the training part.
    :param test_size: fraction of rows held out as the final validation set.
    :param random_state: seed used for both the hold-out split and the k-fold shuffle.
    :return: A tuple ``(df_copy, kfold_datasets, dataset_train, dataset_final_val)``:
        the processed copy of ``df`` (truncated codes plus a ``label`` column),
        a list with one ``{'train': Dataset, 'validation': Dataset}`` dict per fold,
        the training Dataset the folds were drawn from, and the held-out Dataset.
    :raises ValueError: if ``num_digits`` is not a positive integer.
    """
    # Validate the number of digits
    if not isinstance(num_digits, int) or num_digits <= 0:
        logging.error("Number of digits must be a positive integer")
        raise ValueError("Number of digits must be a positive integer")

    # Make a copy of the DataFrame to avoid modifying the original
    df_copy = df.copy()

    def truncate_code(code):
        """
        Truncates the code to ``num_digits`` characters, right-padding with '0'
        when the code is shorter.
        :param code: the code to be truncated.
        :return: The truncated/padded code as a string.
        """
        # A column read as float would stringify 722511.0 as "722511.0"; drop the
        # spurious fractional part so the '.' can never end up inside the prefix.
        if isinstance(code, float) and code.is_integer():
            code = int(code)
        return str(code)[:num_digits].ljust(num_digits, '0')

    # Apply the truncation function to the specified column
    df_copy[column_name] = df_copy[column_name].apply(truncate_code)

    # Try to convert the truncated column to integers
    try:
        df_copy[column_name] = df_copy[column_name].astype(int)
    except ValueError as e:
        # Keep the column as strings if conversion fails
        logging.warning("Could not convert truncated codes to integers: {}".format(e))

    # Assign label ids in order of first appearance of each truncated code.
    # The mapping is built from `column_name`, not a hard-coded column.
    labels = df_copy[column_name].unique().tolist()
    label2id = {label: idx for idx, label in enumerate(labels)}
    df_copy['label'] = df_copy[column_name].map(label2id)

    logging.info("NAICS codes processed successfully. Here's the head of the processed DataFrame:")
    logging.info("\n%s", df_copy.head())

    # Hold out a final validation set that the k-fold procedure never sees
    df_copy_train, df_copy_final_val = train_test_split(
        df_copy, test_size=test_size, shuffle=True, random_state=random_state
    )

    dataset_train = Dataset.from_pandas(df_copy_train)
    dataset_final_val = Dataset.from_pandas(df_copy_final_val)

    # Configuration k-fold
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)
    kfold_datasets = []

    for fold, (train_indices, val_indices) in enumerate(kf.split(dataset_train)):
        # Each fold keeps every column of the training dataset
        dataset_dict = {
            'train': dataset_train.select(train_indices),
            'validation': dataset_train.select(val_indices),
        }
        kfold_datasets.append(dataset_dict)
        logging.info(f"Processed fold {fold + 1}")

    for i, dataset_dict in enumerate(kfold_datasets):
        logging.info(f"DatasetDict for Fold {i + 1}:")
        for split, dataset in dataset_dict.items():
            logging.info(f"  {split} split: {dataset}")

    logging.info("Number of unique NAICS labels: %d", len(labels))

    return df_copy, kfold_datasets, dataset_train, dataset_final_val

In [8]:
# Build the 2-digit variant: processed frame, per-fold datasets, training
# dataset and held-out validation dataset.
(
    df_2_digits,
    kfold_2_digits,
    dataset_train_2_digits,
    dataset_final_val_2_digits,
) = truncate_naics_and_prepare_data(df, column_name='NAICS', num_digits=2)


INFO: NAICS codes processed successfully. Here's the head of the processed DataFrame:
INFO: 
   NAICS                               BUSINESS_DESCRIPTION  label
0     72  Zenyai Viet Cajun & Pho Restaurant is dedicate...      0
1     54  Kilduff Underground Engineering, Inc. (KUE) is...      1
2     45  024™ is a premium home fragrance brand that de...      2
3     56  Our Services include Office Cleaning Carpet cl...      3
4     62                    NYS Licensed Home Health Agency      4


INFO: Processed fold 1
INFO: Processed fold 2
INFO: Processed fold 3
Map: 100%|██████████| 8032/8032 [00:00<00:00, 9408.72 examples/s]
Map: 100%|██████████| 4016/4016 [00:00<00:00, 10147.58 examples/s]
INFO: DatasetDict for Fold 1:
INFO:   train split: Dataset({
    features: ['NAICS', 'BUSINESS_DESCRIPTION', 'label', '__index_level_0__'],
    num_rows: 8032
})
INFO:   validation split: Dataset({
    features: ['NAICS', 'BUSINESS_DESCRIPTION', 'label', '__index_level_0__'],
    num_rows: 4016
})
Map: 100%|██████████| 8032/8032 [00:00<00:00, 9520.51 examples/s] 
Map: 100%|██████████| 4016/4016 [00:00<00:00, 8953.38 examples/s] 
INFO: DatasetDict for Fold 2:
INFO:   train split: Dataset({
    features: ['NAICS', 'BUSINESS_DESCRIPTION', 'label', '__index_level_0__'],
    num_rows: 8032
})
INFO:   validation split: Dataset({
    features: ['NAICS', 'BUSINESS_DESCRIPTION', 'label', '__index_level_0__'],
    num_rows: 4016
})
Map: 100%|██████████| 8032/8032 [00:00<00:00, 8838.67 examples/s]


DATA PROCESSING

In [6]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 128  # presumably the maximum token length per example — confirm against the tokenization cell
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05

# `truncation` is a call-time option, e.g.
#     tokenizer(text, truncation=True, max_length=MAX_LEN)
# Passing it to from_pretrained() does not enable truncation, so it is not
# passed here; request it where the text is actually tokenized.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)