In [1]:
import pandas as pd
import torch
import torch.optim as optim
import numpy as np
import itertools

from transformers import BertTokenizer
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelBinarizer
from BrandClassifier import ThresholdClassifier
from DataPreprocess import BrandDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the product names and their consolidated brand labels.
# A forward slash is used as the separator so the path resolves on Windows and POSIX hosts alike
# (a literal backslash is only a path separator on Windows).
data = pd.read_csv('brand_data/brand_data_4.csv')[['name', 'bra_整合']]

# Binarize the brand labels into indicator columns; `le` is kept at module level for later cells.
# NOTE(review): LabelBinarizer emits a single column when the data holds only two classes —
# the arg-max based accuracy used during training assumes one column per class; confirm the
# label column has more than two distinct brands.
le = LabelBinarizer()
X = data['name']
y = le.fit_transform(data['bra_整合'])

In [3]:
# Use the GPU when one is available and fall back to the CPU otherwise, so that the
# `.to(device)` calls below do not fail on a host without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training hyper-parameters.
max_length = 64
batch_size = 128
learning_rate = 1e-3
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# The whole data set is used for training; no validation split is held out.
train_data = data

# The dataset is built from the `X` / `y` series prepared in the loading cell.
train_dataset = BrandDataset(X, y, tokenizer=tokenizer, max_length=max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

model = ThresholdClassifier()
model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def _unpack_batch(batch, device):
    """Move the 'ids', 'mask' and 'targets' entries of a batch dict onto `device`."""
    ids = batch['ids'].to(device, dtype=torch.long)
    mask = batch['mask'].to(device, dtype=torch.long)
    targets = batch['targets'].to(device, dtype=torch.float32)
    return ids, mask, targets


def _batch_accuracy(logits, targets):
    """Return the fraction of rows whose arg-max prediction equals the arg-max target."""
    hits = (logits.argmax(dim=-1) == targets.argmax(dim=-1)).sum()
    return float(hits.detach().cpu() / targets.shape[0])


def train(train_loader=None, val_loader=None, model=None, epochs=None, criterion=None, optimizer=None, l1_lambda=None, l2_lambda=0.001, device=None):
    """Train `model` and return it together with the per-epoch metric history.

    Parameters
    ----------
    train_loader : iterable of dict
        Yields batches holding the keys 'ids', 'mask' and 'targets'.
    val_loader : iterable of dict, optional
        Same batch layout as `train_loader`. When given, the model is evaluated
        (without gradients) after every training epoch.
    model : torch.nn.Module
        Called as ``model(ids, mask)``; its output is treated as class logits.
    epochs : int, optional
        Upper bound on the number of epochs. ``None`` trains until one of the
        early-stopping rules below fires.
    criterion : callable
        Loss, called as ``criterion(logits, targets)``.
    optimizer : torch.optim.Optimizer
        Optimizer stepped once per batch; two LR schedulers are attached to it.
    l1_lambda : float, optional
        Weight of the L1 penalty, divided by the total parameter count.
        ``None`` disables the penalty.
    l2_lambda : float, optional
        Weight of the squared-L2 penalty (not normalised). ``None`` disables it.
    device : torch.device, optional
        Device the batches are moved to. Defaults to the device of the model's
        first parameter.

    Returns
    -------
    tuple
        ``(model, history)``. Without a `val_loader`, `history` is
        ``[train_accs, train_losses]``; with one it is
        ``[[train_accs, train_losses], [val_accs, val_losses]]``.
        Every inner list holds one value per completed epoch.

    Early stopping
    --------------
    * `epochs` epochs have been completed (when `epochs` is given);
    * a `val_loader` is given and the mean training accuracy of the last 15
      epochs exceeds the current epoch's training accuracy;
    * at least 35 epochs ran and the mean training accuracy of the last 20
      epochs exceeds the current epoch's training accuracy.
    """
    if device is None:
        device = next(model.parameters()).device

    scheduler_1 = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    scheduler_2 = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=10, factor=0.75)

    # The parameter count does not change while training, so compute it once.
    param_num = sum(p.numel() for p in model.parameters())

    all_train_accs = []
    all_train_loss = []
    all_val_accs = []
    all_val_loss = []

    def history():
        # Keep the return layout consistent for every exit path of the loop.
        if val_loader is not None:
            return [[all_train_accs, all_train_loss], [all_val_accs, all_val_loss]]
        return [all_train_accs, all_train_loss]

    for epoch in itertools.count():

        model.train()
        train_loss = []
        train_accs = []

        for batch in train_loader:
            ids, mask, y = _unpack_batch(batch, device)

            logits = model(ids, mask)
            loss = criterion(logits, y)

            # L1 regularization, normalised by the number of parameters.
            if l1_lambda is not None:
                L1_regularization = sum(p.abs().sum() for p in model.parameters())
                loss = loss + (l1_lambda / param_num) * L1_regularization

            # L2 regularization (sum of squared parameters, not normalised).
            if l2_lambda is not None:
                L2_regularization = sum(p.pow(2.0).sum() for p in model.parameters())
                loss = loss + l2_lambda * L2_regularization

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())
            train_accs.append(_batch_accuracy(logits, y))

        train_loss = sum(train_loss) / len(train_loss)
        train_acc = sum(train_accs) / len(train_accs)

        all_train_loss.append(train_loss)
        all_train_accs.append(train_acc)

        scheduler_1.step()
        # The plateau scheduler runs in 'min' mode, so it must watch a quantity that
        # should decrease: the loss, not the accuracy.
        scheduler_2.step(train_loss)

        if val_loader is not None:
            model.eval()

            valid_loss = []
            valid_accs = []

            with torch.no_grad():
                for batch in val_loader:
                    ids, mask, y = _unpack_batch(batch, device)
                    logits = model(ids, mask)

                    # Score the validation batch itself instead of re-using the
                    # loss tensor left over from the last training batch.
                    valid_loss.append(criterion(logits, y).item())
                    valid_accs.append(_batch_accuracy(logits, y))

            valid_loss = sum(valid_loss) / len(valid_loss)
            valid_acc = sum(valid_accs) / len(valid_accs)

            all_val_loss.append(valid_loss)
            all_val_accs.append(valid_acc)

            if epochs is not None:
                print(f'[ {epoch+1}/{epochs} ] | train_loss = {train_loss:.5f}, train_acc = {train_acc:.5f}, val_loss = {valid_loss:.5f}, val_acc = {valid_acc:.5f}')
            else:
                print(f'[ {epoch+1} ] | train_loss = {train_loss:.5f}, train_acc = {train_acc:.5f}, val_loss = {valid_loss:.5f}, val_acc = {valid_acc:.5f}')
        else:
            if epochs is not None:
                print(f'[ {epoch+1}/{epochs} ] | train_loss = {train_loss:.5f}, train_acc = {train_acc:.5f}')
            else:
                print(f'[ {epoch+1} ] | train_loss = {train_loss:.5f}, train_acc = {train_acc:.5f}')

        # Honour the epoch budget; this check only makes sense when `epochs` is set.
        if epochs is not None and epoch + 1 >= epochs:
            return model, history()

        # NOTE(review): with a validation loader this rule has no warm-up period and can
        # fire on the first accuracy dip — kept as in the original; confirm it is intended.
        if val_loader is not None and np.mean(all_train_accs[-15:]) > train_acc:
            print('Accuracy no longer increase, stop training!')
            return model, history()

        if (len(all_train_accs) >= 35) and (np.mean(all_train_accs[-20:]) > train_acc):
            print('Accuracy no longer increase, stop training!')
            return model, history()

In [5]:
# Run training on the full loader with no epoch cap (`epochs=None`), so the loop
# ends only when an early-stopping rule inside `train` fires.
model, history = train(
    train_loader=train_loader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    epochs=None,
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])
torch.Size([128, 64, 768])
torch.Size([128, 768])


KeyboardInterrupt: 

In [9]:
# Scratch tensor: 64 rows by 768 columns of standard-normal samples.
a = torch.randn(64, 768)

In [13]:
# Display `a` with a new leading axis of size 1 inserted at position 0.
a.unsqueeze(0)

tensor([[[ 0.7429,  0.0886,  0.1007,  ...,  0.1293, -0.3862,  2.8118],
         [-0.6036, -0.7348, -0.1388,  ..., -0.8339,  1.8971,  0.3012],
         [-1.6799,  0.3439,  1.3862,  ..., -0.9472,  0.9723, -1.6039],
         ...,
         [-0.3996,  0.2091,  0.6462,  ...,  1.0400, -0.6488, -0.4588],
         [-0.9309, -1.0784, -0.8039,  ...,  0.4364, -1.7029,  0.1990],
         [ 0.3330,  1.2533, -0.8361,  ...,  2.0426, -0.3894, -0.2206]]])