# Furniture Classification Model

This notebook demonstrates the process of loading, preprocessing, and training a machine learning model for classifying furniture items. The steps include data loading, tokenization, model training, and evaluation.

## Import libraries

In [1]:
%load_ext tensorboard

import torch
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import pandas as pd
import random
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

print("Using torch backend")

Using torch backend


## Seed RNGs

In [2]:
SEED = 450

np.random.seed(SEED)
torch.manual_seed(SEED)
random.seed(SEED)

## Data Loading

In [3]:
furniture_dataset = pd.read_csv("../data/model_input/dataset.csv")
flipkart_dataset = pd.read_csv("../data/model_input/flipkart.csv")

In [4]:
# Extract unique furniture names from the furniture dataset.
furniture_names = list(set(furniture_dataset["name"]))

# Extract unique product names from the flipkart dataset that are not categorized as furniture.
other_product_names = list(flipkart_dataset[~flipkart_dataset["product_category_tree"].str.startswith("[\"Furniture")]["product_name"].unique())

# Shuffle the list of non-furniture product names to randomize their order
np.random.shuffle(other_product_names)

# Truncate the list of non-furniture product names to the same length as the list of furniture names
other_product_names = other_product_names[:len(furniture_names)]

# Print the number of furniture and non-furniture names for comparison.
print(f"# of furniture names: {len(furniture_names)}")
print(f"# of non-furniture names: {len(other_product_names)}")

# of furniture names: 5605
# of non-furniture names: 5605


In [5]:
class FurnitureDataset(torch.utils.data.Dataset):
  """
  A PyTorch Dataset class for furniture and non-furniture items.

  This class is designed to handle datasets containing tokenized names of furniture and non-furniture products,
  allowing it to be used with PyTorch's DataLoader for efficient batching during training or evaluation.

  Attributes:
  tokenized_furniture_names (dict): Tokenized names of furniture products.
  tokenized_non_furniture_names (dict): Tokenized names of non-furniture products.
  furniture_count (int): Number of furniture items in the dataset.
  non_furniture_count (int): Number of non-furniture items in the dataset.
  total_count (int): Total number of items in the dataset.
  """
  def __init__(self, tokenized_furniture_names, tokenized_non_furniture_names):
    """
    Initialize the dataset with tokenized furniture and non-furniture names.

    Args:
    tokenized_furniture_names (dict): A dictionary containing the tokenized names of furniture products.
    tokenized_non_furniture_names (dict): A dictionary containing the tokenized names of non-furniture products.
    """
    super().__init__()
    
    self.tokenized_furniture_names = tokenized_furniture_names
    self.tokenized_non_furniture_names = tokenized_non_furniture_names

    # Count of furniture and non-furniture items in the dataset
    self.furniture_count = len(self.tokenized_furniture_names["input_ids"])
    self.non_furniture_count = len(self.tokenized_non_furniture_names["input_ids"])
    self.total_count = self.furniture_count + self.non_furniture_count
  
  def __len__(self):
    """
    Return the total count of items in the dataset.
    """
    return self.total_count

  def __getitem__(self, index):
    """
    Retrieve an item from the dataset at the specified index.

    Args:
    index (int): Index of the item to be retrieved.

    Returns:
    dict: A dictionary containing input_ids, attention_mask, and label for the item.
    """
    if index < self.furniture_count:
        input_ids = self.tokenized_furniture_names["input_ids"][index]
        attention_mask = self.tokenized_furniture_names["attention_mask"][index]
        label = 1
    else:
        index -= self.furniture_count
        input_ids = self.tokenized_non_furniture_names["input_ids"][index]
        attention_mask = self.tokenized_non_furniture_names["attention_mask"][index]
        label = 0

    return {
      "input_ids": torch.tensor(input_ids, dtype=torch.long),
      "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
      "label": torch.tensor(label, dtype=torch.long),
    }

## Tokenization

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
tokenized_furniture_names = tokenizer(furniture_names)
tokenized_other_product_names = tokenizer(other_product_names)

In [9]:
dataset = FurnitureDataset(
  tokenized_furniture_names,
  tokenized_other_product_names
)

In [10]:
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1

train_size = int(TRAIN_RATIO * len(dataset))
val_size = int(VAL_RATIO * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, test_size])

In [14]:
device = "cuda" if torch.cuda.is_available() else "cpu"  # Device configuration

## Model Training

In [15]:
def experiment():
    """
    Conducts an experiment by training, validating, and testing a DistilBERT model for sequence classification.

    The function performs the following steps:
    1. Initializes data loaders for training, validation, and testing datasets.
    2. Sets up the DistilBERT model, optimizer, and loss function.
    3. Runs training and validation for a specified number of epochs.
    4. Evaluates the model on the test dataset.
    5. Saves the model

    No parameters are taken; instead, the function uses pre-defined settings.
    """

    # Data loading and model configuration parameters
    batch_size = 64
    batch_log_count = 25
    num_epochs = 5
    best_vloss = 1_000_000.  # Initial best validation loss for comparison

    # Initializing data loaders for training, validation, and testing
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, collate_fn=data_collator, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, collate_fn=data_collator, shuffle=False)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, collate_fn=data_collator, shuffle=False)

    # Initializing model and moving it to the appropriate device
    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
    model.to(device)

    # Setting up the optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)
    loss_fn = torch.nn.CrossEntropyLoss()

    # TensorBoard for logging
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    writer = SummaryWriter('../experiment_logs/furniture_trainer_{}'.format(timestamp))

    # Training and validation loop
    epoch_number = 0
    for epoch in range(num_epochs):
        print(f"EPOCH {epoch_number}:")

        # Training phase
        model.train(True)
        # Initialize metrics for training
        avg_loss, avg_accuracy, avg_precision, avg_recall, avg_f1 = 0, 0, 0, 0, 0
        for i, data in enumerate(train_dataloader):
            current_step = epoch * len(train_dataloader) + i + 1
            optimizer.zero_grad()

            # Prepare data and perform a forward pass
            input_ids = data["input_ids"].to(device)
            attention_mask = data["attention_mask"].to(device)
            labels = data["labels"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
            loss.backward()  # Backpropagation
            optimizer.step()  # Update model parameters

            # Update training metrics
            avg_loss += loss.item()
            avg_accuracy += (outputs.logits.argmax(axis=1) == labels).sum().item() / len(labels)
            avg_precision += ((outputs.logits.argmax(axis=1) == labels) & (labels == 1)).sum().item() / (outputs.logits.argmax(axis=1) == 1).sum().item()
            avg_recall += ((outputs.logits.argmax(axis=1) == labels) & (labels == 1)).sum().item() / (labels == 1).sum().item()
            avg_f1 = 2 * avg_precision * avg_recall / (avg_precision + avg_recall)

            # Log metrics every `batch_log_count` batches
            if i % batch_log_count == batch_log_count - 1:
                avg_loss /= batch_log_count
                avg_accuracy /= batch_log_count
                avg_precision /= batch_log_count
                avg_recall /= batch_log_count
                avg_f1 /= batch_log_count

                print(f"  batch {i + 1} loss: {avg_loss}, accuracy: {avg_accuracy}, recall: {avg_recall}, precision: {avg_precision}, f1: {avg_f1}")

                # Writing metrics to TensorBoard
                writer.add_scalar('Loss/train', avg_loss, current_step)
                writer.add_scalar('Accuracy/train', avg_accuracy, current_step)
                writer.add_scalar('Precision/train', avg_precision, current_step)
                writer.add_scalar('Recall/train', avg_recall, current_step)
                writer.add_scalar('F1/train', avg_f1, current_step)

                # Reset metrics after logging
                avg_loss, avg_accuracy, avg_precision, avg_recall, avg_f1 = 0, 0, 0, 0, 0

        # Validation phase
        running_vloss = 0
        val_true_positives, val_false_positives, val_true_negatives, val_false_negatives = 0, 0, 0, 0
        model.eval()  # Set the model to evaluation mode
        with torch.no_grad():  # Disable gradient calculation
            for i, vdata in enumerate(val_dataloader):
                # Prepare validation data
                input_ids = vdata["input_ids"].to(device)
                attention_mask = vdata["attention_mask"].to(device)
                labels = vdata["labels"].to(device)

                # Forward pass
                voutputs = model(input_ids=input_ids, attention_mask=attention_mask)
                vloss = loss_fn(voutputs.logits, labels)

                # Update validation metrics
                val_true_positives += ((voutputs.logits.argmax(axis=1) == labels) & (labels == 1)).sum().item()
                val_false_positives += ((voutputs.logits.argmax(axis=1) == 1) & (labels == 0)).sum().item()
                val_true_negatives += ((voutputs.logits.argmax(axis=1) == labels) & (labels == 0)).sum().item()
                val_false_negatives += ((voutputs.logits.argmax(axis=1) == 0) & (labels == 1)).sum().item()

                running_vloss += vloss.item()

        # Calculate average validation loss and metrics
        avg_vloss = running_vloss / (i + 1)
        val_accuracy = (val_true_positives + val_true_negatives) / (val_true_positives + val_true_negatives + val_false_positives + val_false_negatives)
        val_precision = val_true_positives / (val_true_positives + val_false_positives)
        val_recall = val_true_positives / (val_true_positives + val_false_negatives)
        val_f1 = 2 * val_precision * val_recall / (val_precision + val_recall)

        print(f"LOSS train {avg_loss} valid {avg_vloss} accuracy {val_accuracy} precision {val_precision} recall {val_recall} f1 {val_f1}")

        # Writing validation metrics to TensorBoard
        writer.add_scalar('Loss/valid', avg_vloss, epoch_number)
        writer.add_scalar('Accuracy/valid', val_accuracy, epoch_number)
        writer.add_scalar('Precision/valid', val_precision, epoch_number)
        writer.add_scalar('Recall/valid', val_recall, epoch_number)
        writer.add_scalar('F1/valid', val_f1, epoch_number)
        writer.flush()

        # Model checkpointing based on validation loss
        if avg_vloss < best_vloss:
            best_vloss = avg_vloss
            model_path = f"../models/model_{timestamp}_{epoch_number}"
            model.save_pretrained(model_path)  # Save the model

        epoch_number += 1
    
    # Test phase
    test_true_positives, test_false_positives, test_true_negatives, test_false_negatives = 0, 0, 0, 0
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient calculation
        for i, tdata in enumerate(test_dataloader):
            # Prepare test data
            input_ids = tdata["input_ids"].to(device)
            attention_mask = tdata["attention_mask"].to(device)
            labels = tdata["labels"].to(device)

            # Forward pass
            toutputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Update test metrics
            test_true_positives += ((toutputs.logits.argmax(axis=1) == labels) & (labels == 1)).sum().item()
            test_false_positives += ((toutputs.logits.argmax(axis=1) == 1) & (labels == 0)).sum().item()
            test_true_negatives += ((toutputs.logits.argmax(axis=1) == labels) & (labels == 0)).sum().item()
            test_false_negatives += ((toutputs.logits.argmax(axis=1) == 0) & (labels == 1)).sum().item()

    # Calculate and print test metrics
    test_accuracy = (test_true_positives + test_true_negatives) / (test_true_positives + test_true_negatives + test_false_positives + test_false_negatives)
    test_precision = test_true_positives / (test_true_positives + test_false_positives)
    test_recall = test_true_positives / (test_true_positives + test_false_negatives)
    test_f1 = 2 * test_precision * test_recall / (test_precision + test_recall)
    print(f"TEST accuracy {test_accuracy} precision {test_precision} recall {test_recall} f1 {test_f1}")

    return model


## Model Training Results

In [12]:
experiment()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


EPOCH 0:
  batch 25 loss: 0.35609730005264284, accuracy: 0.886875, recall: 0.9612036121765698, precision: 0.8902863505458195, f1: 0.9243868162891546
  batch 50 loss: 0.03394752856343985, accuracy: 0.993125, recall: 0.9941143321762516, precision: 0.9923325785159399, f1: 0.9932226562695916
  batch 75 loss: 0.03331039322540164, accuracy: 0.9925, recall: 0.9949285714285715, precision: 0.9910720289667658, f1: 0.9929965557577735
  batch 100 loss: 0.014766490627080202, accuracy: 0.9975, recall: 0.9978235294117647, precision: 0.9974901960784313, f1: 0.9976568349020803
  batch 125 loss: 0.008649311624467373, accuracy: 0.99875, recall: 0.9987096774193549, precision: 0.9987096774193549, f1: 0.998709677419355
LOSS train 0.21713808667846024 valid 0.003950648203802605 accuracy 0.9991079393398751 precision 0.9982332155477032 recall 1.0 f1 0.9991158267020336
EPOCH 1:
  batch 25 loss: 0.008238663766533136, accuracy: 0.99875, recall: 1.0, precision: 0.9975555555555554, f1: 0.9987762821225944
  batch 50 

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [16]:
def predict(string, model):
    model.eval()
    with torch.no_grad():
        input_ids = torch.tensor(tokenizer(string)["input_ids"]).unsqueeze(0).to(device)
        attention_mask = torch.tensor(tokenizer(string)["attention_mask"]).unsqueeze(0).to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        is_furniture = outputs.logits.argmax(axis=1).item()
        print(f"Is furniture: {'yes' if is_furniture else 'no'}")

In [17]:
model = AutoModelForSequenceClassification.from_pretrained("../models/model_20231204_195024_4/").to(device)


In [18]:
predict("MANHATTAN | BLACK PVC DINING CHAIRS | SET OF 4", model)

Is furniture: yes


In [None]:
%tensorboard