### Importing libraries

In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import torch

import warnings
warnings.filterwarnings('ignore')

### Loading data

In [2]:
# Path is relative to the notebook's working directory.
data_path = 'data/Retail_Sales_Data.csv'
df = pd.read_csv(data_path)
# Preview the first rows only; the frame is wide, so the rendered output elides columns.
df.head()

Unnamed: 0,retailer,store_id,week,product,description,regular_price,competition_1_regular_price,competition_2_regular_price,competition_3_regular_price,competition_4_regular_price,...,competition_4_promo_price,competition_5_promo_price,competition_6_promo_price,competition_7_promo_price,competition_8_promo_price,competition_9_promo_price,competition_10_promo_price,value,volume,quantity
0,retail1,1,1,MintyFresh Mint 18g,MintyFresh refresh mint 18grams,10.308645,10.624815,10.013451,10.378847,9.910385,...,8.500521,7.139537,7.606831,8.339536,7.184384,5.75962,7.376553,85.36831,3.953882,9.707949
1,retail1,1,1,ChocoDelight Dark 200g,200g ChocoDelight smooth,7.947956,7.748726,7.589198,8.085262,7.602363,...,5.766135,4.267236,5.912718,5.156558,6.976287,5.884786,5.335849,82.924731,2.991418,14.922586
2,retail1,1,1,ChocoDelight White 350g,350g ChocoDelight delicious,13.439368,13.751508,13.183621,13.87544,12.982707,...,8.195118,8.214548,8.995387,10.266603,10.940801,8.727031,7.915147,52.527762,6.5136,12.889607
3,retail1,1,1,NuttyCream Hazelnuts 80g,delightful Hazelnuts 80g NuttyCream,5.112777,5.279713,5.222118,5.117789,4.877549,...,,,,,,,,48.373782,5.489587,14.374834
4,retail1,1,1,DarkDream Dark 60g,A decadent DarkDream dark chocolate ice cream ...,14.085425,14.153721,13.860002,14.339698,14.365517,...,,,,,,,,47.718252,9.15205,9.193818


#### Creating a dataframe with product name and description

In [3]:
# Keep one row per distinct (product, description) pair; the original row
# labels are preserved, so the index is not contiguous afterwards.
product_text_columns = ['product', 'description']
df_product = df.loc[:, product_text_columns].drop_duplicates()
df_product

Unnamed: 0,product,description
0,MintyFresh Mint 18g,MintyFresh refresh mint 18grams
1,ChocoDelight Dark 200g,200g ChocoDelight smooth
2,ChocoDelight White 350g,350g ChocoDelight delicious
3,NuttyCream Hazelnuts 80g,delightful Hazelnuts 80g NuttyCream
4,DarkDream Dark 60g,A decadent DarkDream dark chocolate ice cream ...
6,ChocoDelight Dark 200g,A rich and smooth ChocoDelight dark chocolate ...
7,ChocoDelight White 350g,ChocoDelight White 350g creamy
8,NuttyCream Hazelnuts 80g,Hazelnuts NuttyCream 80g delightful
10,MintyFresh Mint 18g,18g MintyFresh ref
12,ChocoDelight White 350g,creamy ChocoDelight 350g


### Using TF-IDF and Logistic Regression

In [4]:
# Text preprocessing and feature extraction
# The raw descriptions are split BEFORE vectorizing so that the TF-IDF
# vocabulary and IDF weights are learned from the training rows only.
# Fitting the vectorizer on every description (as was done previously) leaks
# information from the held-out rows into the features.
descriptions = df_product['description']

# Target variable
y = df_product['product']

# Train-test split (same test_size / random_state as before; the split depends
# only on the number of rows and the seed, so the same rows are held out)
desc_train, desc_test, y_train, y_test = train_test_split(
    descriptions, y, test_size=0.2, random_state=42)

# Fit on training text only; the test text is transformed with that vocabulary.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(desc_train)
X_test = vectorizer.transform(desc_test)

# Kept so any later cell that reads `X` still finds it; note that it is now
# encoded with the training-only vocabulary.
X = vectorizer.transform(descriptions)

# Model training
model = LogisticRegression()
model.fit(X_train, y_train)

# Model prediction and evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 1.00


In [5]:
# Original values: true product names of the held-out rows from the TF-IDF split
# (the index shows the original row labels of df_product)
y_test

0          MintyFresh Mint 18g
48    NuttyCream Hazelnuts 80g
34          DarkDream Dark 60g
1       ChocoDelight Dark 200g
Name: product, dtype: object

In [6]:
# Predicted values: model output for the same held-out rows, converted to a
# plain list for display
y_pred.tolist()

['MintyFresh Mint 18g',
 'NuttyCream Hazelnuts 80g',
 'DarkDream Dark 60g',
 'ChocoDelight Dark 200g']

In [7]:
# Side-by-side view of true vs. predicted product names for the held-out rows.
# np.array(...) drops the Series index so both columns align by position.
tfidf_comparison = {'Actual': np.array(y_test), 'Predicted': y_pred}
pred_df = pd.DataFrame(tfidf_comparison)
pred_df

Unnamed: 0,Actual,Predicted
0,MintyFresh Mint 18g,MintyFresh Mint 18g
1,NuttyCream Hazelnuts 80g,NuttyCream Hazelnuts 80g
2,DarkDream Dark 60g,DarkDream Dark 60g
3,ChocoDelight Dark 200g,ChocoDelight Dark 200g


In [8]:
# Per-class precision / recall / F1 for the TF-IDF + logistic regression model.
tfidf_report = classification_report(y_test, y_pred)
print(tfidf_report)

                          precision    recall  f1-score   support

  ChocoDelight Dark 200g       1.00      1.00      1.00         1
      DarkDream Dark 60g       1.00      1.00      1.00         1
     MintyFresh Mint 18g       1.00      1.00      1.00         1
NuttyCream Hazelnuts 80g       1.00      1.00      1.00         1

                accuracy                           1.00         4
               macro avg       1.00      1.00      1.00         4
            weighted avg       1.00      1.00      1.00         4



### Using BERT

In [9]:
# Work on a copy so that columns added for the BERT pipeline (e.g. the encoded
# label column) do not modify df_product.
data = df_product.copy()

In [10]:
# Learn the product-name -> integer id mapping, then store the ids alongside
# the names. The fitted encoder is reused later to map predictions back.
label_encoder = LabelEncoder().fit(data['product'])
data['product_encoded'] = label_encoder.transform(data['product'])

In [11]:
# Hold out 20% of the (description, encoded label) pairs for evaluation.
# Same test_size and random_state as the TF-IDF section's split.
bert_texts = data['description']
bert_targets = data['product_encoded']
train_data, test_data, train_labels, test_labels = train_test_split(
    bert_texts,
    bert_targets,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Dataset class
class ProductDataset(Dataset):
    """Map-style dataset that tokenizes one product description per item.

    Parameters
    ----------
    descriptions : pandas.Series
        Text to tokenize. Accessed positionally via ``.iloc``, so the index
        does not need to be contiguous.
    labels : pandas.Series
        Integer class ids, positionally aligned with ``descriptions``.
    tokenizer : callable
        Called as ``tokenizer(text, truncation=True, padding="max_length",
        max_length=...)`` and expected to return a mapping of field name to a
        sequence of ints.
    max_length : int, default 128
        Fixed sequence length every item is padded/truncated to.

    Raises
    ------
    ValueError
        If ``descriptions`` and ``labels`` differ in length.
    """

    def __init__(self, descriptions, labels, tokenizer, max_length=128):
        if len(descriptions) != len(labels):
            raise ValueError(
                f"descriptions and labels must have the same length "
                f"(got {len(descriptions)} and {len(labels)})"
            )
        self.descriptions = descriptions
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (description, label) pairs."""
        return len(self.descriptions)

    def __getitem__(self, idx):
        """Return a dict of tensors for the item at position ``idx``.

        Every tokenizer field is converted to a tensor so that both the
        Trainer's collator and a plain ``DataLoader`` stack items into
        ``(batch, seq_len)`` tensors. Returning Python lists makes the default
        DataLoader collation group values per token position instead.
        """
        encoding = self.tokenizer(
            self.descriptions.iloc[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
        )
        item = {key: torch.tensor(value) for key, value in encoding.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        return item

In [13]:
# Load the pretrained tokenizer and a BERT sequence classifier whose output
# layer has one logit per encoded product class.
# NOTE: this rebinds `model`, which held the LogisticRegression estimator in
# the TF-IDF section.
bert_checkpoint = 'bert-base-uncased'
num_product_classes = len(label_encoder.classes_)
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertForSequenceClassification.from_pretrained(
    bert_checkpoint,
    num_labels=num_product_classes,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Training dataset: tokenizes the training descriptions on access
train_dataset = ProductDataset(train_data, train_labels, tokenizer)
# NOTE(review): train_dataloader is not referenced by the later cells visible
# here; the Trainer is given train_dataset directly. Confirm whether it is
# still needed.
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)

In [15]:
# Testing dataset: the held-out descriptions, later passed to
# trainer.evaluate / trainer.predict
test_dataset = ProductDataset(test_data, test_labels, tokenizer)
# NOTE(review): test_dataloader is not referenced by the later cells visible
# here. Confirm whether it is still needed.
test_dataloader = DataLoader(test_dataset, batch_size=2)

In [16]:
# Training arguments, collected in one dict so each setting is easy to scan.
training_kwargs = dict(
    output_dir='./results',          # where checkpoints / logs are written
    num_train_epochs=10,
    per_device_train_batch_size=2,
    # The recorded run below finished at global_step=80, far below this
    # interval, so the step-based checkpoint interval was not reached in it.
    save_steps=10_000,
    save_total_limit=2,
)
training_args = TrainingArguments(**training_kwargs)

In [17]:
# Trainer: bundles the BERT classifier, the training arguments and the
# training dataset. No eval dataset or metric function is supplied here; the
# test set is passed explicitly to evaluate()/predict() later.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)


In [18]:
# Fine-tune the classifier. As the cell's last expression, the returned
# TrainOutput (global step, mean training loss, runtime metrics) is displayed.
trainer.train()

Step,Training Loss


TrainOutput(global_step=80, training_loss=0.8676412582397461, metrics={'train_runtime': 203.6695, 'train_samples_per_second': 0.786, 'train_steps_per_second': 0.393, 'total_flos': 10524725698560.0, 'train_loss': 0.8676412582397461, 'epoch': 10.0})

In [19]:
# Evaluate on the held-out set. The Trainer was built without a metric
# function, so the recorded output contains loss and timing values only.
eval_results = trainer.evaluate(test_dataset)
print(f"Evaluation results: {eval_results}")

Evaluation results: {'eval_loss': 0.28931036591529846, 'eval_runtime': 1.0374, 'eval_samples_per_second': 3.856, 'eval_steps_per_second': 0.964, 'epoch': 10.0}


In [20]:
# Run inference on the held-out set and take the highest-scoring class id
# along the last axis of the returned prediction scores.
predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=-1)

In [21]:
# Mapping predictions back to product names using the encoder that produced
# the integer class ids
predicted_product_names = label_encoder.inverse_transform(predicted_labels)
print(f"Predicted product names: {predicted_product_names}")

Predicted product names: ['MintyFresh Mint 18g' 'NuttyCream Hazelnuts 80g' 'DarkDream Dark 60g'
 'ChocoDelight Dark 200g']


In [22]:
# Decode the held-out labels back to product names.
# NOTE(review): this rebinds `y_test`, which held a pandas Series from the
# TF-IDF split earlier in the notebook; from here on it is a plain list.
# Re-running the TF-IDF display cells after this cell would use the list.
y_test = label_encoder.inverse_transform(test_labels).tolist()
print(f"Original product names: {y_test}")

Original product names: ['MintyFresh Mint 18g', 'NuttyCream Hazelnuts 80g', 'DarkDream Dark 60g', 'ChocoDelight Dark 200g']


In [23]:
# Accuracy computed on the encoded class ids (true vs. predicted).
# Rebinds `accuracy`, which previously held the TF-IDF model's score.
accuracy = accuracy_score(test_labels, predicted_labels)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 1.00


In [25]:
# Side-by-side view of true vs. predicted product names for the BERT model.
# Rebinds `pred_df` from the TF-IDF section.
bert_comparison = {'Actual': np.array(y_test), 'Predicted': predicted_product_names}
pred_df = pd.DataFrame(bert_comparison)
pred_df

Unnamed: 0,Actual,Predicted
0,MintyFresh Mint 18g,MintyFresh Mint 18g
1,NuttyCream Hazelnuts 80g,NuttyCream Hazelnuts 80g
2,DarkDream Dark 60g,DarkDream Dark 60g
3,ChocoDelight Dark 200g,ChocoDelight Dark 200g


In [26]:
# Per-class report keyed by encoded class id; only ids present in the
# held-out labels or predictions appear as rows.
bert_report_encoded = classification_report(test_labels, predicted_labels)
print(bert_report_encoded)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         1

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4



In [27]:
# Same report as above, keyed by decoded product name for readability.
bert_report_named = classification_report(y_test, predicted_product_names)
print(bert_report_named)

                          precision    recall  f1-score   support

  ChocoDelight Dark 200g       1.00      1.00      1.00         1
      DarkDream Dark 60g       1.00      1.00      1.00         1
     MintyFresh Mint 18g       1.00      1.00      1.00         1
NuttyCream Hazelnuts 80g       1.00      1.00      1.00         1

                accuracy                           1.00         4
               macro avg       1.00      1.00      1.00         4
            weighted avg       1.00      1.00      1.00         4

