In [1]:
import pandas as pd
import torch
import torch.optim as optim
import numpy as np
import itertools

from transformers import BertTokenizer
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from BrandClassifier import ThresholdClassifier
from DataPreprocess import BrandDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_4 = pd.read_csv('brand_data\\brand_data_4.csv')
data_1 = pd.read_csv('brand_data\\brand_data_1.csv')
data = pd.concat([data_1.iloc[:, :2], data_4], axis=0)
le= LabelBinarizer()
# X = data['name']
# y = le.fit_transform(data['bra_整合'])
X_train, X_val, y_train, y_val = train_test_split(data['name'].values, data['brand'].values, test_size=0.2, random_state=89)

y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

In [3]:
# initialize training parameters
device = torch.device('cuda')
max_length = 64
batch_size = 128
learning_rate = 5e-4
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

train_dataset = BrandDataset(X_train, y_train, tokenizer=tokenizer, max_length=max_length)
val_dataset = BrandDataset(X_val, y_val, tokenizer=tokenizer, max_length=max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

model = ThresholdClassifier(class_num=len(le.classes_))
# model.load_state_dict(torch.load('saved_model/Brand_classifier_temp.ckpt'))
model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def train(train_loader=None, val_loader=None, model=None, epochs=None, criterion=None, optimizer=None, l1_lambda=None, l2_lambda=0.001):
    
    scheduler_1 = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    scheduler_2 = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=7, factor=0.5)
    all_train_accs = []
    all_train_loss = []
    all_val_accs = []
    all_val_loss = []
    for epoch in itertools.count():
        
        model.train()
        train_loss = []
        train_accs = []
        
        for batch in train_loader:
            
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            y = batch['targets'].to(device, dtype=torch.float32)
            
            logits = model(ids, mask)
            loss = criterion(logits, y)
            
            # L1 regularization with normalized l1
            if l1_lambda is not None:
                L1_regularization = sum(p.abs().sum() for p in model.parameters())
                param_num = sum(p.numel() for p in model.parameters())
                loss += (l1_lambda / param_num) * L1_regularization
            
            # L2 regularization with normalized l2
            if l2_lambda is not None:
                L2_regularization = sum(p.pow(2.0).sum() for p in model.parameters())
                # param_num = sum(p.numel() for p in model.parameters())
                loss += l2_lambda * L2_regularization
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            acc = float(((logits.argmax(dim=-1) == y.argmax(dim=-1)).sum()).detach().cpu() / y.shape[0])
            train_loss.append(loss.item())
            train_accs.append(acc)
        
        train_loss = sum(train_loss) / len(train_loss)
        train_acc = sum(train_accs) / len(train_accs)
        
        all_train_loss.append(train_loss)
        all_train_accs.append(train_acc)
        
        scheduler_1.step()
        scheduler_2.step(train_loss)
        
        if val_loader is not None:
            model.eval()

            valid_loss = []
            valid_accs = []
            
            for batch in val_loader:
                
                ids = batch['ids'].to(device, dtype=torch.long)
                mask = batch['mask'].to(device, dtype=torch.long)
                y = batch['targets'].to(device, dtype=torch.float32)
                
                with torch.no_grad():
                    
                    logits = model(ids, mask)

                    acc = float(((logits.argmax(dim=-1) == y.argmax(dim=-1)).sum()).detach().cpu() / y.shape[0])
                    valid_loss.append(loss.item())
                    valid_accs.append(acc)
            
            valid_loss = sum(valid_loss) / len(valid_loss)
            valid_acc = sum(valid_accs) / len(valid_accs)
            
            all_val_loss.append(valid_loss)
            all_val_accs.append(valid_acc)
            
            if epochs is not None:
                print(f'[ {epoch+1}/{epochs} ] | train_loss = {train_loss:.5f}, train_acc = {train_acc:.5f}, val_loss = {valid_loss:.5f}, val_acc = {valid_acc:.5f}')
            else:
                print(f'[ {epoch+1} ] | train_loss = {train_loss:.5f}, train_acc = {train_acc:.5f}, val_loss = {valid_loss:.5f}, val_acc = {valid_acc:.5f}')
                if epoch+1 == epochs:
                    return model, [[all_train_accs, all_train_loss], [all_val_accs, all_val_loss]]
            
            if np.mean(all_train_accs[-15:]) > train_acc:
                print('Accuracy no longer increase, stop training!')
                return model, [[all_train_accs, all_train_loss], [all_val_accs, all_val_loss]]
        
        else:
            if epochs is not None:
                print(f'[ {epoch+1}/{epochs} ] | train_loss = {train_loss:.5f}, train_acc = {train_acc:.5f}')
            else:
                print(f'[ {epoch+1} ] | train_loss = {train_loss:.5f}, train_acc = {train_acc:.5f}')
                if epoch+1 == epochs:
                    return model, [all_train_accs, all_train_loss]
        
        # model saving
        if train_acc >= max(all_train_accs):
            torch.save(model.state_dict(), 'saved_model/Brand_classifier_temp.ckpt')
        
        # stopping criterion    
        if (len(all_train_accs) >= 30) and (np.mean(all_train_loss[-25:]) > train_loss):
            print('Loss no longer decrease, stop training!')
            return model, [all_train_accs, all_train_loss]

In [5]:
model, history = train(train_loader=train_loader, val_loader=val_loader, model=model, epochs=None, criterion=criterion, optimizer=optimizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[ 1 ] | train_loss = 190.70487, train_acc = 0.03880, val_loss = 190.46538, val_acc = 0.07238
[ 2 ] | train_loss = 190.20262, train_acc = 0.11012, val_loss = 190.09349, val_acc = 0.14595


In [None]:
# torch.save(model, 'saved_model/whole_Brand_classifier.ckpt')