<a href="https://colab.research.google.com/github/benny-liang0623/maboo/blob/main/Brand/bert_brand_classifier_with_sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MABOO Brand Classifier with Multilingual BERT (originally XLM-RoBERTa)

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
!pip install -q pytorch-lightning
!pip install -q sentencepiece
!pip install -q transformers

[K     |████████████████████████████████| 584 kB 7.4 MB/s 
[K     |████████████████████████████████| 409 kB 55.6 MB/s 
[K     |████████████████████████████████| 596 kB 47.9 MB/s 
[K     |████████████████████████████████| 140 kB 74.9 MB/s 
[K     |████████████████████████████████| 1.1 MB 52.8 MB/s 
[K     |████████████████████████████████| 271 kB 55.4 MB/s 
[K     |████████████████████████████████| 144 kB 66.0 MB/s 
[K     |████████████████████████████████| 94 kB 4.1 MB/s 
[K     |████████████████████████████████| 1.2 MB 8.1 MB/s 
[K     |████████████████████████████████| 4.2 MB 8.3 MB/s 
[K     |████████████████████████████████| 86 kB 5.6 MB/s 
[K     |████████████████████████████████| 6.6 MB 52.3 MB/s 
[?25h

In [6]:
# Import all libraries
import pandas as pd
import numpy as np
import re

# Huggingface transformers
import transformers
from transformers import BertModel, BertTokenizer, XLMRobertaConfig, AdamW, get_linear_schedule_with_warmup
# from transformers import XLMRobertaModel, XLMRobertaTokenizer, XLMRobertaConfig

import torch
from torch import nn ,cuda
from torch.utils.data import DataLoader,Dataset,RandomSampler, SequentialSampler

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
%matplotlib inline

# Seed NumPy's and PyTorch's global RNGs; RANDOM_SEED is also passed to
# train_test_split further down so the data splits are repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
_device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

## Data preprocessing

In [7]:
import pandas as pd
# Read the brand dataset (columns used later: `name`, `brand`) from the shared drive.
# The preview shows leftover `Unnamed: 0*` index columns in the CSV; they are discarded
# further down when the frame is narrowed to `name` and `brand`.
train_path = "/content/drive/Shareddrives/MABOO/brand/data/over_under_brand.csv"
train_data = pd.read_csv(train_path, encoding="utf-8")
train_data.head(1)

Unnamed: 0.1,Unnamed: 0,name,brand
0,0,元山熱水瓶YS-5401A,元山


In [8]:
train_data.shape

(141891, 3)

In [9]:
def to_string(name_list):
  """Concatenate the string fragments in ``name_list`` with no separator.

  Args:
    name_list: Iterable of ``str`` fragments (for example the list of matches
      returned by ``re.findall``).

  Returns:
    The fragments joined in order; ``""`` when ``name_list`` is empty.
  """
  # str.join builds the result in one pass instead of re-allocating on every `+=`.
  return "".join(name_list)

In [10]:
# Drop every row that has a missing value in any column.
# `axis` is passed by keyword: the positional form `dropna(0)` is deprecated (it is what
# triggered the warning in this cell's recorded output) and is rejected by newer pandas.
train_data = train_data.dropna(axis=0)
train_data.shape

  """Entry point for launching an IPython kernel.


(141891, 3)

In [11]:
# Keep only runs of ASCII letters, digits and CJK ideographs (U+4E00-U+9FFF) from each
# product name, then glue the runs back together so punctuation and spaces disappear.
name_fragments = train_data["name"].map(
    lambda raw_name: re.findall(r'[a-zA-Z0-9\u4e00-\u9fff]+', raw_name)
)
train_data["name"] = name_fragments.map(to_string)

# From here on only the cleaned name and its brand label are needed.
train_data = train_data[["name","brand"]]
train_data.head(1)

Unnamed: 0,name,brand
0,元山熱水瓶YS5401A,元山


In [12]:
train = train_data
train.shape

(141891, 2)

Encode y label

In [13]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

# Model inputs: the cleaned product names as a plain Python list.
x = train["name"].tolist()
# Targets: brand strings mapped to integer class ids. `le` is kept so that predictions
# can be mapped back to brand names with `le.inverse_transform` later on.
le = LabelEncoder()
yt = le.fit_transform(train["brand"])

In [14]:
CLASS_NUM = len(le.classes_)
CLASS_NUM

4068

In [15]:
questions = x

Split the dataset into training, validation and test set.

In [16]:
from sklearn.model_selection import train_test_split
# Both splits hold out 20% and stratify on the label, using the same seed and shuffling.
split_options = dict(test_size=0.2, random_state=RANDOM_SEED, shuffle=True)

# Stage 1: hold out 20% of all rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(x, yt, stratify=yt, **split_options)

# Stage 2: carve 20% of the remainder off as the validation set
# (overall roughly 64% train / 16% validation / 20% test).
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, stratify=y_train, **split_options)

In [17]:
len(x_tr) ,len(x_val), len(x_test)

(90809, 22703, 28379)

## Preparing the Dataset and DataModule  
First, create the `QTagDataset` class (a subclass of PyTorch's `Dataset`) that tokenizes each product name into the tensors the BERT model expects.

In [18]:
class QTagDataset (Dataset):
    """Map-style dataset pairing each text with its integer class label.

    Items are tokenized lazily, on access, with ``tokenizer.encode_plus`` and are
    padded or truncated to ``max_len`` tokens.
    """

    def __init__(self,quest,tags, tokenizer, max_len):
        """Store the inputs; no tokenization happens here.

        Args:
            quest: Indexable collection of input texts.
            tags: Indexable collection of labels aligned with ``quest``.
            tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer
                in this notebook).
            max_len: Value forwarded to the tokenizer as ``max_length``.
        """
        self.tokenizer = tokenizer
        self.text = quest
        self.labels = tags
        self.max_len = max_len

    def __len__(self):
        """Number of examples, taken from the text collection."""
        return len(self.text)

    def __getitem__(self, item_idx):
        """Tokenize one example and return it as a dict of tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (both flattened to
            1-D) and ``'label'`` (a ``torch.long`` tensor).
        """
        encoding = self.tokenizer.encode_plus(
            self.text[item_idx],
            None,                         # no second segment
            add_special_tokens=True,      # add [CLS] / [SEP]
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,   # marks real tokens vs. padding
            truncation=True,              # cut anything beyond max_length
            return_tensors='pt',
        )
        label = torch.tensor(self.labels[item_idx], dtype=torch.long)

        # The tokenizer returns batched tensors; flatten them to one row per item.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': label,
        }

Since we use PyTorch Lightning for training, we set up a `QTagDataModule` class derived from `LightningDataModule`.

In [19]:
class QTagDataModule (pl.LightningDataModule):
    """LightningDataModule wrapping the train/validation/test splits in ``QTagDataset``s.

    Args:
        x_tr, y_tr: Training texts and labels.
        x_val, y_val: Validation texts and labels.
        x_test, y_test: Test texts and labels.
        tokenizer: Tokenizer handed to every ``QTagDataset``.
        batch_size: Batch size used by all three dataloaders.
        max_token_len: Sequence length each example is padded/truncated to.
    """

    def __init__(self,x_tr,y_tr,x_val,y_val,x_test,y_test,tokenizer,batch_size=16,max_token_len=200):
        super().__init__()
        self.tr_text = x_tr
        self.tr_label = y_tr
        self.val_text = x_val
        self.val_label = y_val
        self.test_text = x_test
        self.test_label = y_test
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len

    def _build_dataset(self, texts, labels):
        """Wrap one split in a ``QTagDataset`` using the shared tokenizer and length."""
        return QTagDataset(quest=texts, tags=labels, tokenizer=self.tokenizer, max_len=self.max_token_len)

    def setup(self,stage=None):
        """Create the three datasets; ``stage`` is accepted for Lightning but not used."""
        self.train_dataset = self._build_dataset(self.tr_text, self.tr_label)
        self.val_dataset = self._build_dataset(self.val_text, self.val_label)
        self.test_dataset = self._build_dataset(self.test_text, self.test_label)

    def train_dataloader(self):
        """Shuffled training loader with two worker processes."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=2)

    def val_dataloader(self):
        """Validation loader in dataset order.

        Uses the configured ``batch_size`` (previously hard-coded to 16, which silently
        ignored the value passed to the constructor).
        """
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        """Test loader in dataset order, using the configured ``batch_size``."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

In [20]:
# Initialize the XLMR tokenizer
# ROBERTA_MODEL_NAME = "xlm-roberta-large" 
# tokenizer = transformers.XLMRobertaTokenizerFast.from_pretrained(ROBERTA_MODEL_NAME)

In [21]:
# Load the WordPiece tokenizer that matches the multilingual (uncased) BERT checkpoint
# used as the classifier backbone.
BERT_MODEL_NAME = "bert-base-multilingual-uncased" 
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

Downloading:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [22]:
# Hyperparameters used for training.
# Upper bound on training epochs (passed to the Trainer as max_epochs).
N_EPOCHS = 100
# Training batch size handed to the data module.
BATCH_SIZE = 64
# Token length every product name is padded/truncated to.
MAX_LEN = 64
# Base learning rate given to the optimizer (the scheduler scales it).
LR = 0.0012

In [23]:
# Instantiate and set up the data_module
QTdata_module = QTagDataModule(x_tr,y_tr,x_val,y_val,x_test,y_test,tokenizer,BATCH_SIZE,MAX_LEN)
QTdata_module.setup()

## Train the Model
Set up the classifier model, derived from `LightningModule` (Lightning's counterpart to PyTorch's `nn.Module`).

In [24]:
class QTagClassifier(pl.LightningModule):
    """Multilingual-BERT encoder followed by a two-layer classification head.

    BERT's pooled output is projected to 1024 units, passed through ReLU and mapped
    to ``n_classes`` logits. All three stages (train/val/test) use cross-entropy loss.
    """

    def __init__(self, n_classes=CLASS_NUM, steps_per_epoch=None, n_epochs=N_EPOCHS, lr=LR):
        """Build the backbone, the head and the loss.

        Args:
            n_classes: Number of output classes (size of the final linear layer).
            steps_per_epoch: Training steps per epoch; required by
                ``configure_optimizers`` (it is integer-divided there).
            n_epochs: Planned number of epochs, used to size the LR schedule.
            lr: Base learning rate for the optimizer.
        """
        super().__init__()

        self.num_labels = 1  # not read anywhere inside this class
        # Alternative backbone kept for reference:
        #   self.roberta = XLMRobertaModel.from_pretrained(ROBERTA_MODEL_NAME)
        #   self.linear_1 = nn.Linear(self.roberta.config.hidden_size, 1024)
        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME)
        self.linear_1 = nn.Linear(self.bert.config.hidden_size, 1024)
        self.linear_2 = nn.Linear(1024, n_classes)
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.lr = lr
        self.criterion = nn.CrossEntropyLoss()

    def forward(self,input_ids, attn_mask):
        """Return raw class logits for a batch of token ids and attention masks."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attn_mask)
        hidden = torch.relu(self.linear_1(encoded.pooler_output))
        return self.linear_2(hidden)

    def _run_batch(self, batch, loss_name):
        """Forward one batch, compute the loss and log it under ``loss_name``.

        Returns:
            Tuple ``(loss, logits, labels)``.
        """
        labels = batch['label']
        logits = self(batch['input_ids'], batch['attention_mask'])
        loss = self.criterion(logits, labels)
        self.log(loss_name, loss, prog_bar=True, logger=True)
        return loss, logits, labels

    def training_step(self,batch,batch_idx):
        """Training step: logs ``train_loss`` and returns loss, logits and labels."""
        loss, outputs, labels = self._run_batch(batch, 'train_loss')
        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self,batch,batch_idx):
        """Validation step: logs ``val_loss`` and returns the loss."""
        loss, _, _ = self._run_batch(batch, 'val_loss')
        return loss

    def test_step(self,batch,batch_idx):
        """Test step: logs ``test_loss`` and returns the loss."""
        loss, _, _ = self._run_batch(batch, 'test_loss')
        return loss

    def configure_optimizers(self):
        """Create AdamW plus a linear warmup/decay schedule.

        Warmup lasts a third of one epoch's steps; the schedule length is the planned
        total number of steps minus that warmup.

        NOTE(review): the warmup/total counts are expressed in *steps*, but the
        scheduler is returned as a bare list, so Lightning applies its default stepping
        interval. Confirm that this is per-step; the Lightning docs describe
        ``{"scheduler": ..., "interval": "step"}`` for per-batch stepping. Changing it
        would also change the effective learning rate, so retune ``lr`` together.
        """
        optimizer = AdamW(self.parameters(), lr=self.lr)

        warmup_steps = self.steps_per_epoch // 3
        total_steps = self.steps_per_epoch * self.n_epochs - warmup_steps
        scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)

        return [optimizer], [scheduler]

In [25]:
# Instantiate the classifier model.
# Training batches per epoch: the training DataLoader keeps the final partial batch
# (it does not set drop_last), so round up instead of flooring.
steps_per_epoch = (len(x_tr) + BATCH_SIZE - 1) // BATCH_SIZE
model = QTagClassifier(n_classes=CLASS_NUM, steps_per_epoch=steps_per_epoch,n_epochs=N_EPOCHS,lr=LR)

Downloading:   0%|          | 0.00/641M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
# Checkpoint callback: keep the two checkpoints with the lowest validation loss.
# Files are named like: QTag-epoch=11-val_loss=0.2397.ckpt
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',# metric logged by QTagClassifier.validation_step
    filename='QTag-{epoch:02d}-{val_loss:.4f}',
    save_top_k=2, # keep only the 2 best checkpoints
    mode='min', # lower val_loss is better
)

In [27]:
# Instantiate the Model Trainer
# trainer = pl.Trainer(max_epochs = N_EPOCHS, gpus =1 , callbacks=[checkpoint_callback], progress_bar_refresh_rate = 30, default_root_dir='/content/drive/Shareddrives/MABOO/brand/model2')

  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [28]:
# Test the Model
# trainer.test(model,datamodule=QTdata_module)

In [29]:
# Train the Model
# trainer.fit(model, QTdata_module)

## Continue training : resume_from_checkpoint

In [32]:
# Build the Trainer so that fitting resumes from the saved epoch-11 checkpoint.
# NOTE(review): `progress_bar_refresh_rate` and `resume_from_checkpoint` both emit
# deprecation warnings in this cell's recorded output; they will need migrating when
# pytorch-lightning is upgraded.
trainer = pl.Trainer(
    max_epochs=N_EPOCHS,
    gpus=1,
    callbacks=[checkpoint_callback],
    progress_bar_refresh_rate=30,
    default_root_dir='/content/drive/Shareddrives/MABOO/brand/model2',
    resume_from_checkpoint="/content/drive/Shareddrives/MABOO/brand/model2/lightning_logs/version_1/checkpoints/QTag-epoch=11-val_loss=0.2397.ckpt",
)

  "Setting `Trainer(resume_from_checkpoint=)` is deprecated in v1.5 and"
  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [31]:
trainer.fit(model, QTdata_module)

  ckpt_path = ckpt_path or self.resume_from_checkpoint
Restoring states from the checkpoint path at /content/drive/Shareddrives/MABOO/brand/model2/lightning_logs/version_1/checkpoints/QTag-epoch=11-val_loss=0.2397.ckpt
  f"The dirpath has changed from {dirpath_from_ckpt!r} to {self.dirpath!r},"
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | bert      | BertModel        | 167 M 
1 | linear_1  | Linear           | 787 K 
2 | linear_2  | Linear           | 4.2 M 
3 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
172 M     Trainable params
0         Non-trainable params
172 M     Total params
689.254   Total estimated model params size (MB)
Restored all states from the checkpoint file at /content/drive/Shareddrives/MABOO/brand/model2/lightning_logs/version_1/checkpoints/QTag-epoch=11-val_loss=0.2397.ckpt


Sanity Checking: 0it [00:00, ?it/s]

Training: 1419it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


## Tensorboard

In [None]:
# Visualize the logs using tensorboard.
%load_ext tensorboard
%tensorboard --logdir /content/drive/Shareddrives/MABOO/brand/model2/lightning_logs/

## Evaluate Model Performance on Test Set

In [33]:
# Size of Test set
print(f'Number of Questions = {len(x_test)}')

Number of Questions = 28379


In [34]:
from torch.utils.data import TensorDataset

# Tokenize every product name in x_test with the same settings QTagDataset uses.
encoded_quests = [
    tokenizer.encode_plus(
        quest,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_token_type_ids=False,
        return_attention_mask=True,
        truncation=True,
        return_tensors='pt',
    )
    for quest in x_test
]

# Concatenate the per-example tensors along the batch dimension.
input_ids = torch.cat([enc['input_ids'] for enc in encoded_quests], dim=0)
attention_masks = torch.cat([enc['attention_mask'] for enc in encoded_quests], dim=0)
labels = torch.tensor(y_test)

# Batch size used for inference.
TEST_BATCH_SIZE = 64

# Sequential sampling keeps predictions aligned with the order of x_test / y_test.
pred_data = TensorDataset(input_ids, attention_masks, labels)
pred_sampler = SequentialSampler(pred_data)
pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=TEST_BATCH_SIZE)

## Prediction on test set

In [35]:
flat_pred_outs = 0
flat_true_labels = 0

In [None]:
# Reload weights from a saved checkpoint, move the model to `device` and switch it to
# evaluation mode.
# NOTE(review): this path is on the MBTI shared drive, whereas the training run above
# writes its checkpoints under MABOO/brand/model2 - confirm this is the intended file.

model = QTagClassifier.load_from_checkpoint("/content/drive/Shareddrives/MBTI/model/lightning_logs/maboo/checkpoints/QTag-epoch=14-val_loss=0.1782.ckpt")
model = model.to(device) # CUDA when available, otherwise CPU (see `device`)
model.eval()

In [37]:
# Run inference over the test set, collecting per-batch class probabilities and the
# matching true labels (both as NumPy arrays).
pred_outs, true_labels = [], []

for batch in pred_dataloader:
    # Move the batch to the model's device and unpack it in TensorDataset order.
    b_input_ids, b_attn_mask, b_labels = (t.to(device) for t in batch)

    with torch.no_grad():
        logits = model(b_input_ids, b_attn_mask)
        # The model is trained with CrossEntropyLoss (one label per example), so softmax
        # over the class dimension gives the class probabilities. An element-wise sigmoid
        # does not, and can saturate several large logits to the same value, hiding the
        # true maximum.
        pred_out = torch.softmax(logits, dim=1).cpu().numpy()
        label_ids = b_labels.cpu().numpy()

    pred_outs.append(pred_out)
    true_labels.append(label_ids)

In [38]:
# pred_outs[0][0]

In [39]:
# true_labels[0][0]

In [40]:
# Combine the results across all batches. 
flat_pred_outs = np.concatenate(pred_outs, axis=0)

# Combine the correct labels for each batch into a single list.
flat_true_labels = np.concatenate(true_labels, axis=0)

In [41]:
flat_pred_outs.shape , flat_true_labels.shape

((28379, 4068), (28379,))

## Predictions of Tags in Test set
The model outputs one score per brand class for every test example (4068 classes here). To obtain the predicted label, we take the index of the highest score in each row.

In [42]:
# Convert per-class scores into predicted class indices (row-wise argmax).
def classify(pred_prob):
    """Return the index of the highest-scoring class for each row.

    Args:
        pred_prob: 2-D array-like of shape (n_samples, n_classes) holding one score
            per class for every sample.

    Returns:
        list[int]: For each row, the position of its maximum value. Ties resolve to
        the first (lowest) index. An empty input yields an empty list.
    """
    scores = np.asarray(pred_prob)
    if scores.shape[0] == 0:
        return []
    # Vectorised argmax replaces a per-row Python max() + list.index() scan.
    return np.argmax(scores, axis=1).tolist()

In [43]:
from sklearn import metrics
scores=[] # Store the list of f1 scores for prediction on each threshold

#convert labels to 1D array
y_true = flat_true_labels.ravel() 

In [44]:
y_true[:10]

array([1869, 1209,  866, 1257, 3251, 1822, 2922, 3811, 3507, 3525])

## Performance Score Evaluation

In [45]:
y_pred_labels = classify(flat_pred_outs)
y_pred = np.array(y_pred_labels).ravel() # Flatten

In [46]:
y_pred[:10]

array([1869, 2971,  866, 1257, 3251, 1822, 2922, 3811, 3507, 3525])

In [47]:
print(metrics.classification_report(y_true,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.50      0.67         6
           1       0.90      0.90      0.90        20
           2       1.00      1.00      1.00         6
           3       1.00      0.33      0.50         6
           4       1.00      1.00      1.00         6
           5       1.00      1.00      1.00         6
           6       1.00      1.00      1.00         6
           7       1.00      1.00      1.00         6
           8       1.00      1.00      1.00         6
           9       1.00      1.00      1.00         6
          10       0.86      1.00      0.92         6
          11       1.00      1.00      1.00         6
          12       1.00      1.00      1.00         6
          13       1.00      1.00      1.00         6
          14       1.00      1.00      1.00         6
          15       0.93      0.87      0.90        15
          16       1.00      1.00      1.00         6
          17       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [48]:
# Map encoded class ids back to brand names for a readable side-by-side comparison.
# A separate name is used for the decoded predictions so the integer `y_pred` array
# used by the metrics cell is not overwritten (re-running that cell stays valid).
y_pred_brands = le.inverse_transform(np.array(y_pred_labels))
y_act = le.inverse_transform(flat_true_labels)

df = pd.DataFrame({'Body':x_test,'Actual Tags':y_act,'Predicted Tags':y_pred_brands})

In [49]:
df.sample(10)

Unnamed: 0,Body,Actual Tags,Predicted Tags
19966,日全食杏仁米豆奶飲品910ml,格林全,格林全
11428,CRBOTANICAL1st高保濕化妝水400mL,Botanical 1st,Botanical 1st
18650,CLOVER敏感肌泡泡沐浴乳450ml補充包,Clover,Clover
4110,華麗牌石英管電暖器HS101,華麗牌,華麗牌
21317,696073逸萱秀速乾空氣感洗髮精700M,逸萱秀 Essentia,逸萱秀 Essentia
19435,日本牛乳石鹼滋卿愛青春調理洗面乳130g,SkinLife,SkinLife
10309,CHAMPION童鞋KFS038水藍KFUS038906KFUS038913KFUS0389,CHAMPION,CHAMPION
23819,Cetaphil舒特膚AD益膚康修護潔膚滋養組,舒特膚,舒特膚
2201,賽吉兒菁萃柔嫩乳霜30ML,SAUGELLA 賽吉兒,SAUGELLA 賽吉兒
25568,施巴55sebamed潤膚乳液10mlx2,施巴Sebamed,施巴Sebamed


In [None]:
# accuracy, Precision
# from sklearn import metrics
# y_test = df["Actual Tags"]
# y_predict = df["Predicted Tags"]
# accuracy = metrics.accuracy_score(y_test, y_predict)
# precision = metrics.precision_score(y_test, y_predict,average="macro")
# recall = metrics.recall_score(y_test, y_predict,average="macro")
# f1 = metrics.f1_score(y_test, y_predict,average="macro")
# print("Accuracy: {:.2f} %".format(accuracy*100))
# print('Precision: {:.2f} %'.format(precision*100))
# print('Recall: {:.2f} %'.format(recall*100))
# print('F1: {:.2f} %'.format(f1*100))

In [None]:
# df.to_csv("/content/drive/Shareddrives/MABOO/brand/result/test_result2.csv",encoding="utf-8")