In [0]:
import pandas as pd
import numpy as np

# Location of the labelled hair-care product export used throughout this notebook.
HAIR_CARE_CSV = "/Workspace/Users/anmol@i-genie.ai/MDL_Coding/Hair_Care_Corrected.csv"

# Load the raw file and normalise missing values in one non-mutating chain:
# NaN cells and empty strings both become the literal placeholder 'None', so the
# string concatenation below never produces NaN for a partially labelled row.
df = (
    pd.read_csv(HAIR_CARE_CSV)
    .fillna('None')
    .replace('', 'None')
)

# Single training target: the three hierarchy levels joined with a '|||' separator.
df['Combined_Labels'] = df['Category'] + '|||' + df['Segment'] + '|||' + df['Sub-Segment']

print(df[['Product', 'Product Category', 'Combined_Labels']])


                                                 Product  ...                                 Combined_Labels
0      Honey Treasures Leave-In Miracle Nectar Treatment  ...  Hair Care|||Hair Treat and Protect|||Treatment
1         Eva NYC Therapy Session Hair Mask - 16.9 fl oz  ...       Hair Care|||Hair Treat and Protect|||Mask
2             Eva NYC Therapy Session Hair Mask, 16.9 OZ  ...       Hair Care|||Hair Treat and Protect|||Mask
3      Kitsch Rice Water Protein Strengthening Shampo...  ...                   Hair Care|||Shampoo|||Shampoo
4      One Signature Conditioner - Moisturizes, Smoot...  ...           Hair Care|||Conditioner|||Conditioner
...                                                  ...  ...                                             ...
12655  Cremo Hair Sculpting Clay, High Hold, Matte Fi...  ...     Hair Care|||Hair Styling|||Styling Products
12656      ($24 Value) Aquage Transforming Paste, 4.6 oz  ...     Hair Care|||Hair Styling|||Styling Products
12657     

In [0]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import os, pickle
import torch
torch.cuda.empty_cache()

from torch.nn import DataParallel
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import matplotlib.pyplot as plt
import plotly.graph_objects as go

from tqdm import tqdm

class RoBERTaClassifier:
    """Fine-tune, evaluate, persist and apply a RoBERTa sequence classifier.

    The text fed to the model is built by joining the string form of several
    dataframe columns with spaces; the target is a single label column that is
    integer-encoded with a scikit-learn ``LabelEncoder``.

    Attributes set by ``__init__``:
        model_name: Hugging Face model id used for both tokenizer and model.
        tokenizer: ``RobertaTokenizer`` loaded from ``model_name``.
        max_len: every input is padded/truncated to this many tokens.
        device: CUDA when available, otherwise CPU.
        product_model_path: ``os.path.join(model_path, category)``; directory
            holding ``roberta_model.pt`` and ``label_encoder.pkl``.
        model / label_encoder: ``None`` until training or checkpoint loading.
    """

    # Report rows that are aggregates rather than individual classes.
    _SUMMARY_ROWS = ('accuracy', 'micro avg', 'macro avg', 'weighted avg')

    def __init__(self, model_name='roberta-base', max_len=128, model_path="/dbfs/mnt/igenie-blob01/Anmol_AI_dir/MDP/", category="product_type"):
        """Load the tokenizer and resolve the checkpoint directory.

        Args:
            model_name: pretrained model id; used for the tokenizer and for the
                classification model created later.
            max_len: token length every input is padded/truncated to.
            model_path: root directory for saved artifacts.
            category: sub-directory of ``model_path`` for this classifier.
        """
        self.model_name = model_name
        self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.product_model_path = os.path.join(model_path, category)
        # Populated by train_and_evaluate() or load_checkpoint()/predict().
        self.model = None
        self.label_encoder = None

    class TextClassificationDataset(Dataset):
        """Torch dataset that tokenizes one (text, label) pair per item."""

        def __init__(self, texts, labels, tokenizer, max_len):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_len = max_len

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, idx):
            """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for item ``idx``."""
            text = self.texts[idx]
            label = self.labels[idx]
            encoding = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=self.max_len,
                return_attention_mask=True,
                padding='max_length',
                truncation=True,
                return_tensors='pt',
            )
            # return_tensors='pt' adds a leading batch dimension; flatten it away
            # so the DataLoader can stack items itself.
            return {
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'labels': torch.tensor(label, dtype=torch.long)
            }

    @staticmethod
    def _build_input_text(df, input_columns):
        """Join the string form of ``input_columns`` with spaces, one value per row."""
        return df[input_columns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

    @staticmethod
    def _strip_dataparallel_prefix(state_dict):
        """Return ``state_dict`` with the ``module.`` prefix removed from its keys.

        ``DataParallel`` prefixes every parameter name with ``module.``. The
        prefix is stripped only when every key carries it; otherwise the
        mapping is returned unchanged.
        """
        prefix = 'module.'
        if state_dict and all(key.startswith(prefix) for key in state_dict):
            return {key[len(prefix):]: value for key, value in state_dict.items()}
        return state_dict

    def prepare_data(self, df, input_columns, label_column):
        """Build model inputs and integer labels from ``df``.

        Side effects: adds/overwrites ``df['input_text']`` on the caller's
        dataframe and replaces ``self.label_encoder`` with a freshly fitted one.

        Returns:
            ``(texts, encoded_labels, num_classes)`` where ``texts`` is a list
            of strings and ``encoded_labels`` the encoder's integer codes.
        """
        df['input_text'] = self._build_input_text(df, input_columns)
        texts = df['input_text'].tolist()
        labels = df[label_column].tolist()

        self.label_encoder = LabelEncoder()
        encoded_labels = self.label_encoder.fit_transform(labels)
        num_classes = len(self.label_encoder.classes_)

        return texts, encoded_labels, num_classes

    def evaluate_model(self, val_loader):
        """Run ``self.model`` over ``val_loader`` without gradient tracking.

        Returns:
            dict with ``avg_val_loss`` (mean loss per batch),
            ``classification_report`` (see ``enhancedClassificationReport``)
            and ``accuracy``.
        """
        self.model.eval()
        total_eval_loss = 0.0
        all_predictions = []
        all_labels = []

        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(self.device) for k, v in batch.items()}
                outputs = self.model(input_ids=batch['input_ids'],
                                     attention_mask=batch['attention_mask'],
                                     labels=batch['labels'])
                # Under multi-GPU DataParallel the loss comes back as one value
                # per replica; mean() reduces it and is a no-op for a scalar.
                total_eval_loss += outputs.loss.mean().item()

                predictions = torch.argmax(outputs.logits, dim=1)
                all_predictions.extend(predictions.cpu().tolist())
                all_labels.extend(batch['labels'].cpu().tolist())

        avg_val_loss = total_eval_loss / len(val_loader)
        overall_accuracy = accuracy_score(all_labels, all_predictions)
        metrics_report = self.enhancedClassificationReport(all_labels, all_predictions)

        return {
            'avg_val_loss': avg_val_loss,
            'classification_report': metrics_report,
            'accuracy': overall_accuracy
        }

    def enhancedClassificationReport(self, true_labels, predicted_labels):
        """Return scikit-learn's report dict with TP/FP/FN added per class.

        Both arguments are integer-encoded labels. The full list of class ids
        known to ``self.label_encoder`` is passed explicitly so the confusion
        matrix and the report always have one entry per class, in encoder
        order, even when a class is missing from this particular split.
        """
        class_ids = list(range(len(self.label_encoder.classes_)))
        report = classification_report(true_labels, predicted_labels, labels=class_ids,
                                       output_dict=True, zero_division=0)
        cm = confusion_matrix(true_labels, predicted_labels, labels=class_ids)
        TP = cm.diagonal()
        FP = cm.sum(axis=0) - TP  # predicted as the class but actually another
        FN = cm.sum(axis=1) - TP  # actually the class but predicted as another

        for i in class_ids:
            entry = report[str(i)]
            entry['TP'] = int(TP[i])
            entry['FP'] = int(FP[i])
            entry['FN'] = int(FN[i])
        return report

    def get_label_name(self, label):
        """Map an encoded label (``3`` or ``'3'``) to its original class name.

        Anything that is not a valid class id, such as ``'macro avg'`` or an
        out-of-range number, is returned unchanged.
        """
        try:
            return self.label_encoder.inverse_transform([int(label)])[0]
        except (ValueError, TypeError):
            # Not an integer, or an id the encoder has never seen.
            return label

    def comparativePivot(self, metrics_df):
        """Pivot the long metrics frame into one row per label.

        Columns become a (metric, Phase, Epoch) hierarchy. Class rows are
        ordered by the latest ``test-set`` F1 score, highest first, when a
        test-set phase exists; aggregate rows are appended at the bottom.
        F1/precision/recall are rendered as percentage strings.
        """
        # Preferred left-to-right order of the metric groups.
        metrics_values = ['F1-Score', 'Precision', 'Recall', 'Support', 'TP', 'FP', 'FN']
        percent_metrics = ('F1-Score', 'Precision', 'Recall')

        pivot_df = metrics_df.pivot_table(
            index=['Label', 'Label Name'],
            columns=['Phase', 'Epoch'],
            values=metrics_values,
            aggfunc='first'
        ).reset_index()

        # Ensure the DataFrame's columns are lexsorted
        df = pivot_df.sort_index(axis=1)

        is_summary = df['Label Name'].isin(self._SUMMARY_ROWS)
        special_rows = df[is_summary]
        df_rest = df[~is_summary]

        # Sort on numeric values, before any percentage formatting happens.
        test_epochs = [col[2] for col in df_rest.columns
                       if col[0] == 'F1-Score' and col[1] == 'test-set']
        if test_epochs:
            df_rest = df_rest.sort_values(by=('F1-Score', 'test-set', max(test_epochs)), ascending=False)

        pivot_df = pd.concat([df_rest, special_rows])
        pivot_df = pivot_df.set_index(['Label', 'Label Name'])

        sorted_columns = sorted(pivot_df.columns, key=lambda x: (metrics_values.index(x[0]), x[1], x[2]))
        pivot_df = pivot_df[sorted_columns].copy()

        # Replace whole columns instead of writing strings into float columns.
        for column in list(pivot_df.columns):
            if column[0] in percent_metrics:
                pivot_df[column] = (pivot_df[column] * 100).round(2).astype(str) + '%'

        return pivot_df

    def _report_rows(self, report, phase, epoch):
        """Flatten one classification-report dict into long-format row dicts."""
        rows = []
        for label, scores in report.items():
            if not isinstance(scores, dict):
                continue  # e.g. the scalar 'accuracy' entry
            rows.append({
                'Phase': phase,
                'Epoch': epoch,
                'Label': label,
                'Label Name': self.get_label_name(label),
                'Precision': scores.get('precision'),
                'Recall': scores.get('recall'),
                'F1-Score': scores.get('f1-score'),
                'Support': scores.get('support'),
                'TP': scores.get('TP'),
                'FP': scores.get('FP'),
                'FN': scores.get('FN')
            })
        return rows

    def build_metrics_dataframe(self, all_epoch_metrics, test_metrics=None, phase_name='test-set'):
        """Combine per-epoch and test metrics into long and pivoted frames.

        Args:
            all_epoch_metrics: list of ``evaluate_model`` results, one per epoch.
            test_metrics: optional ``evaluate_model`` result for the held-out
                test split; recorded as phase ``'test-set'`` at epoch
                ``len(all_epoch_metrics) + 1``.
            phase_name: phase label given to the per-epoch rows.

        Returns:
            ``(metrics_df, pivot_df)``.
        """
        epochs = len(all_epoch_metrics)
        data = []

        for epoch, metrics in enumerate(all_epoch_metrics, start=1):
            data.extend(self._report_rows(metrics['classification_report'], phase_name, epoch))

        if test_metrics:
            # Uses the epoch count rather than the loop variable, which is
            # undefined when all_epoch_metrics is empty.
            data.extend(self._report_rows(test_metrics['classification_report'], 'test-set', epochs + 1))

        metrics_df = pd.DataFrame(data)
        return metrics_df, self.comparativePivot(metrics_df)

    def plotMetrics(self, metrics_df):
        """Show one Plotly figure per metric with one trace per label over epochs."""
        metrics = ['F1-Score', 'Precision', 'Recall']

        for metric in metrics:
            fig = go.Figure()

            for (label, label_name), group in metrics_df.groupby(['Label', 'Label Name']):
                # Zero-pad numeric labels so legend entries line up.
                formatted_label = f'{int(label):02}' if str(label).isdigit() else label
                trace_name = f'{formatted_label} - {label_name}'

                fig.add_trace(go.Scatter(
                    x=group['Epoch'],
                    y=group[metric],
                    mode='lines+markers',
                    name=trace_name
                ))

            fig.update_layout(
                title=f'{metric} Over Epochs',
                xaxis_title='Epochs',
                yaxis_title=metric,
                legend_title='Label',
                hovermode='x unified'
            )

            fig.show()

    def save_checkpoint(self, model, label_encoder):
        """Write the model weights and the pickled label encoder.

        The state dict is saved exactly as ``model`` exposes it, so a
        ``DataParallel``-wrapped model keeps its ``module.`` key prefix;
        ``load_checkpoint`` accepts both forms.
        """
        os.makedirs(self.product_model_path, exist_ok=True)
        torch.save(model.state_dict(), os.path.join(self.product_model_path, 'roberta_model.pt'))
        with open(os.path.join(self.product_model_path, 'label_encoder.pkl'), 'wb') as f:
            pickle.dump(label_encoder, f)

    def load_checkpoint(self):
        """Load the saved label encoder and model weights.

        Returns:
            ``(model, label_encoder)`` with the unwrapped model on
            ``self.device`` in eval mode.
        """
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load encoders from a trusted checkpoint directory.
        with open(os.path.join(self.product_model_path, 'label_encoder.pkl'), 'rb') as f:
            label_encoder = pickle.load(f)

        model = RobertaForSequenceClassification.from_pretrained(
            self.model_name, num_labels=len(label_encoder.classes_))

        # map_location lets a checkpoint written on GPU load on a CPU-only host.
        state_dict = torch.load(os.path.join(self.product_model_path, 'roberta_model.pt'),
                                map_location=self.device)
        model.load_state_dict(self._strip_dataparallel_prefix(state_dict))
        model = model.to(self.device)
        model.eval()

        return model, label_encoder

    def train_and_evaluate(self, df, input_columns, label_column, batch_size=64, epochs=3, learning_rate=2e-5):
        """Fine-tune on ``df``, validate each epoch, test once, plot and save.

        The data is split 90/10 into train+validation and test, then the
        former 90/10 into train and validation, both stratified on the label
        with ``random_state=42``. The model state after the final epoch is
        what gets evaluated on the test split and saved.

        Returns:
            ``(comparative_metrics_analysis_df, pivot_metrics_df)`` from
            ``build_metrics_dataframe``.
        """
        texts, labels, num_classes = self.prepare_data(df, input_columns, label_column)
        self.model = RobertaForSequenceClassification.from_pretrained(self.model_name, num_labels=num_classes)
        self.model = DataParallel(self.model)
        self.model.to(self.device)

        # First, split into training+validation and test sets
        train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
            texts, labels, test_size=0.1, random_state=42, stratify=labels)

        # Then, split the remaining data into training and validation sets
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            train_val_texts, train_val_labels, test_size=0.1, random_state=42, stratify=train_val_labels)

        train_dataset = self.TextClassificationDataset(train_texts, train_labels, self.tokenizer, self.max_len)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_dataset = self.TextClassificationDataset(val_texts, val_labels, self.tokenizer, self.max_len)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        optimizer = AdamW(self.model.parameters(), lr=learning_rate)
        all_epoch_metrics = []

        for epoch in range(epochs):
            self.model.train()
            train_progress = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}, Training')
            total_train_loss = 0
            for batch in train_progress:
                optimizer.zero_grad()
                outputs = self.model(input_ids=batch['input_ids'].to(self.device),
                                     attention_mask=batch['attention_mask'].to(self.device),
                                     labels=batch['labels'].to(self.device))
                # One loss per replica under multi-GPU DataParallel; backward()
                # needs a scalar, and mean() is a no-op when it already is one.
                loss = outputs.loss.mean()
                loss.backward()
                optimizer.step()

                total_train_loss += loss.item()
                train_progress.set_postfix(loss=loss.item())

            avg_train_loss = total_train_loss / len(train_loader)
            epoch_metrics = self.evaluate_model(val_loader)
            all_epoch_metrics.append(epoch_metrics)

            print(f"\nEpoch {epoch + 1} complete. Training Loss: {avg_train_loss}, Validation Loss: {epoch_metrics.get('avg_val_loss')}, Validation Accuracy: {epoch_metrics.get('accuracy')}")

        # Evaluate on the test set
        test_dataset = self.TextClassificationDataset(test_texts, test_labels, self.tokenizer, self.max_len)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
        test_metrics = self.evaluate_model(test_loader)

        # Comparative Analysis & Metrics Visualization
        comparative_metrics_analysis_df, pivot_metrics_df = self.build_metrics_dataframe(all_epoch_metrics, test_metrics, phase_name='dev-set')
        self.plotMetrics(comparative_metrics_analysis_df)

        # No best-epoch selection happens; the weights saved are those left
        # after the last epoch, and the message says so.
        self.save_checkpoint(self.model, self.label_encoder)
        print("Saved model checkpoint (final epoch).")
        return comparative_metrics_analysis_df, pivot_metrics_df

    def _format_predict_report(self, enhanced_report):
        """Turn an enhanced report dict into the table returned by ``predict``.

        Class rows are sorted by numeric F1 (descending) with aggregate rows
        appended afterwards; only then are F1/precision/recall converted to
        percentage strings, so ``100.0%`` cannot sort below ``99.7%``.
        """
        report_df = pd.DataFrame(enhanced_report).T
        label_names = [self.get_label_name(row_label) for row_label in report_df.index]
        report_df.insert(loc=0, column='label_name', value=label_names)

        is_summary = report_df.index.isin(self._SUMMARY_ROWS)
        class_rows = report_df[~is_summary].sort_values(by='f1-score', ascending=False)
        report_df = pd.concat([class_rows, report_df[is_summary]])

        for column in ('f1-score', 'precision', 'recall'):
            report_df[column] = (report_df[column] * 100).round(2).astype(str) + '%'
        return report_df

    def predict(self, df, input_columns, label_column=None, use_checkpoint=True, batch_size=32):
        """Predict a label for every row of ``df``.

        Side effects: adds/overwrites ``df['input_text']`` and
        ``df['Predicted']`` on the caller's dataframe; with
        ``use_checkpoint=True`` also replaces ``self.model`` and
        ``self.label_encoder`` with the saved ones.

        Args:
            df: dataframe containing ``input_columns``.
            input_columns: columns joined with spaces to form the model input.
            label_column: optional column of true labels; when given, a
                per-class metrics table is produced. Every true label must be
                known to the label encoder.
            use_checkpoint: load model and encoder from disk first.
            batch_size: number of texts per forward pass.

        Returns:
            ``(df, predict_report)``; ``predict_report`` is an empty dataframe
            when ``label_column`` is ``None``.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
            RuntimeError: if ``use_checkpoint`` is false and nothing has been
                trained or loaded yet.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        if use_checkpoint:
            self.model, self.label_encoder = self.load_checkpoint()
        elif getattr(self, 'model', None) is None or getattr(self, 'label_encoder', None) is None:
            raise RuntimeError("No model available: train first or call predict(use_checkpoint=True).")

        self.model.eval()

        # Prepare input text from the dataframe using specified input columns
        df['input_text'] = self._build_input_text(df, input_columns)
        texts = df['input_text'].tolist()

        predicted_ids = []
        batch_starts = range(0, len(texts), batch_size)
        for start in tqdm(batch_starts, desc='Predicting'):
            encoded_input = self.tokenizer(
                texts[start:start + batch_size],
                add_special_tokens=True,
                max_length=self.max_len,
                return_attention_mask=True,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            with torch.no_grad():
                outputs = self.model(
                    input_ids=encoded_input['input_ids'].to(self.device),
                    attention_mask=encoded_input['attention_mask'].to(self.device)
                )
            predicted_ids.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())

        predictions = list(self.label_encoder.inverse_transform(predicted_ids))

        # Add predictions to DataFrame
        df['Predicted'] = predictions
        predict_report = pd.DataFrame()

        # If a label column for evaluation is provided, calculate additional metrics
        if label_column is not None:
            encoded_true_labels = self.label_encoder.transform(df[label_column].tolist())
            enhanced_report = self.enhancedClassificationReport(encoded_true_labels, predicted_ids)
            predict_report = self._format_predict_report(enhanced_report)
        return df, predict_report


In [0]:
# Build the classifier for the hair-care category. Model artifacts are read from
# and written to os.path.join(model_path, category).
bert_classifier = RoBERTaClassifier(
    model_path="/dbfs/mnt/igenie-blob01/Anmol_AI_dir/MDP/",
    category="hair_care",
)

# Training is currently disabled; uncomment the call below to retrain.
# It is the only place where comparative_metrics_analysis_df and pivot_metrics_df
# are assigned.
# comparative_metrics_analysis_df, pivot_metrics_df = bert_classifier.train_and_evaluate(
#     df,
#     input_columns=['Product', 'Product Category'],
#     label_column='Combined_Labels',
#     batch_size=128,
#     epochs=2,
#     learning_rate=2e-5,
# )

In [0]:
# Label every row of df. predict() defaults to use_checkpoint=True, so the model
# and label encoder are loaded from the saved checkpoint. Passing label_column
# also yields a per-class metrics table. predict_df is df itself, with the
# 'input_text' and 'Predicted' columns added.
predict_df, predict_report = bert_classifier.predict(
    df,
    input_columns=['Product', 'Product Category'],
    label_column='Combined_Labels',
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should pr

In [0]:
# NOTE: pivot_metrics_df is only assigned by the train_and_evaluate(...) call, which
# is commented out in the classifier cell above. On a fresh kernel this cell raises
# NameError unless that call is re-enabled and run first.
pivot_metrics_df

Unnamed: 0_level_0,Unnamed: 1_level_0,F1-Score,F1-Score,F1-Score,Precision,Precision,Precision,Recall,Recall,Recall,Support,Support,Support,TP,TP,TP,FP,FP,FP,FN,FN,FN
Unnamed: 0_level_1,Phase,dev-set,dev-set,test-set,dev-set,dev-set,test-set,dev-set,dev-set,test-set,dev-set,dev-set,test-set,dev-set,dev-set,test-set,dev-set,dev-set,test-set,dev-set,dev-set,test-set
Unnamed: 0_level_2,Epoch,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3
Label,Label Name,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3
0,Hair Care|||Combo Packs|||Combo Packs,92.71%,99.45%,100.0%,87.25%,98.9%,100.0%,98.89%,100.0%,100.0%,90,90,100,89.0,90.0,100.0,13.0,1.0,0.0,1.0,0.0,0.0
11,Hair Care|||None|||None,0.0%,100.0%,100.0%,0.0%,100.0%,100.0%,0.0%,100.0%,100.0%,9,9,10,0.0,9.0,10.0,0.0,0.0,0.0,9.0,0.0,0.0
1,Hair Care|||Conditioner|||Conditioner,95.96%,99.72%,99.75%,92.71%,100.0%,100.0%,99.44%,99.44%,99.5%,179,179,199,178.0,178.0,198.0,14.0,0.0,0.0,1.0,1.0,1.0
4,Hair Care|||Hair Styling|||Styling Products,98.56%,99.76%,99.14%,99.03%,100.0%,99.14%,98.09%,99.52%,99.14%,209,209,232,205.0,208.0,230.0,2.0,0.0,2.0,4.0,1.0,2.0
7,Hair Care|||Hair Treat and Protect|||Mask,95.12%,100.0%,98.88%,90.7%,100.0%,97.78%,100.0%,100.0%,100.0%,39,39,44,39.0,39.0,44.0,4.0,0.0,1.0,0.0,0.0,0.0
13,Hair Care|||Shampoo|||Shampoo,92.11%,99.84%,98.84%,85.37%,100.0%,100.0%,100.0%,99.68%,97.71%,315,315,350,315.0,314.0,342.0,54.0,0.0,0.0,0.0,1.0,8.0
14,Hair Care|||Shampoo|||Therapeutic,0.0%,98.63%,98.77%,0.0%,100.0%,100.0%,0.0%,97.3%,97.56%,37,37,41,0.0,36.0,40.0,0.0,0.0,0.0,37.0,1.0,1.0
6,Hair Care|||Hair Treat and Protect|||Leave In Conditioner,68.18%,98.25%,98.41%,93.75%,96.55%,96.88%,53.57%,100.0%,100.0%,28,28,31,15.0,28.0,31.0,1.0,1.0,1.0,13.0,0.0,0.0
8,Hair Care|||Hair Treat and Protect|||Oil,91.18%,100.0%,97.81%,83.78%,100.0%,98.53%,100.0%,100.0%,97.1%,62,62,69,62.0,62.0,67.0,12.0,0.0,1.0,0.0,0.0,2.0
2,Hair Care|||Hair Regrowth|||Hair Regrowth,97.14%,100.0%,95.0%,100.0%,100.0%,95.0%,94.44%,100.0%,95.0%,18,18,20,17.0,18.0,19.0,0.0,0.0,1.0,1.0,0.0,1.0


In [0]:
# Per-label metrics table returned by bert_classifier.predict() in the cell above.
predict_report

Unnamed: 0,label_name,precision,recall,f1-score,support,TP,FP,FN
4,Hair Care|||Hair Styling|||Styling Products,99.74%,99.66%,99.7%,2319.0,2311.0,6.0,8.0
1,Hair Care|||Conditioner|||Conditioner,99.7%,99.6%,99.65%,1991.0,1983.0,6.0,8.0
7,Hair Care|||Hair Treat and Protect|||Mask,99.32%,99.77%,99.54%,438.0,437.0,3.0,1.0
11,Hair Care|||None|||None,100.0%,98.98%,99.49%,98.0,97.0,0.0,1.0
8,Hair Care|||Hair Treat and Protect|||Oil,99.42%,99.42%,99.42%,688.0,684.0,4.0,4.0
13,Hair Care|||Shampoo|||Shampoo,99.83%,99.0%,99.41%,3499.0,3464.0,6.0,35.0
0,Hair Care|||Combo Packs|||Combo Packs,99.5%,99.3%,99.4%,997.0,990.0,5.0,7.0
14,Hair Care|||Shampoo|||Therapeutic,99.02%,98.54%,98.78%,411.0,405.0,4.0,6.0
6,Hair Care|||Hair Treat and Protect|||Leave In ...,96.88%,99.04%,97.95%,313.0,310.0,10.0,3.0
15,Hair Care|||Vitamins & Supplements|||Vitamins ...,97.56%,97.56%,97.56%,41.0,40.0,1.0,1.0
