In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelBinarizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv('brand_data\\brand_data_4.csv')[['name', 'bra_整合']]
le= LabelBinarizer()
X = data['name']
y = le.fit_transform(data['bra_整合'])

# class_dict = {}
# for i in enumerate(le.classes_):
#     class_dict[i[1]] = i[0]


In [3]:
device = torch.device('cuda')
max_length = 64
batch_size = 128
epochs = 10
learning_rate = 12e-4
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [4]:
class BrandDataset(Dataset):
    def __init__(self, X, y, tokenizer, max_length):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __getitem__(self, index):
        text = self.X[index]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.y[index], dtype=torch.float32)
        }
    
    def __len__(self):
        return len(self.X)

In [5]:
# train_data = data.sample(frac=0.8, random_state=87)
train_data = data
# val_data = data.drop(train_data.index).reset_index(drop=True)
# train_data = train_data.reset_index(drop=True)

In [6]:
train_dataset = BrandDataset(X, y, tokenizer=tokenizer, max_length=max_length)
# val_dataset = BrandDataset(val_data, tokenizer=tokenizer, max_length=max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [7]:
class ThresholdClassifier(nn.Module):
    def __init__(self):
        super(ThresholdClassifier, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-chinese')
        self.linear_1 = nn.Linear(768, 512)
        self.linear_2 = nn.Linear(512, 256)
        # self.linear_3 = nn.Linear(256, 128)
        self.dropout = nn.Dropout(0.2)
        self.ln_1 = nn.LayerNorm(768)
        self.ln_2 = nn.LayerNorm(512)
        self.ln_3 = nn.LayerNorm(256)
        # self.ln_4 = nn.LayerNorm(128)
        self.linear_out = nn.Linear(256, 121)
    
    def forward(self, input_ids, attention_mask):
        x = self.bert_model(input_ids, attention_mask)
        x = self.ln_1(x[1])
        x = torch.relu(self.linear_1(x))
        x = self.ln_2(x)
        x = torch.relu(self.linear_2(x))
        x = self.ln_3(x)
        # x = self.dropout(x)
        # x = torch.relu(self.linear_3(x))
        # x = self.ln_4(x)
        x = self.linear_out(x)
        # distribution = torch.softmax(x)
        return x # distribution
        

In [8]:
model = ThresholdClassifier()
model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=learning_rate)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [26]:
def train(train_loader=None, val_loader=None, model=None, epochs=None, criterion=None, optimizer=None, l1_lambda=None, l2_lambda=0.001):
    
    scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    
    for epoch in range(epochs):
        
        model.train()
        train_loss = []
        train_accs = []
        
        for batch in train_loader:
            
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            y = batch['targets'].to(device, dtype=torch.float32)
            
            logits = model(ids, mask)
            loss = criterion(logits, y)
            
            # L1 regularization with normalized l1
            if l1_lambda is not None:
                L1_regularization = sum(p.abs().sum() for p in model.parameters())
                param_num = sum(p.numel() for p in model.parameters())
                loss += (l1_lambda / param_num) * L1_regularization
            
            # L2 regularization with normalized l2
            if l2_lambda is not None:
                L2_regularization = sum(p.pow(2.0).sum() for p in model.parameters())
                param_num = sum(p.numel() for p in model.parameters())
                loss += (l2_lambda / param_num) * L2_regularization
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            acc = int(((logits.argmax(dim=-1) == y.argmax(dim=-1)).sum()).detach().cpu()) / y.shape[0]
            train_loss.append(loss.item())
            train_accs.append(acc)
        
        train_loss = sum(train_loss) / len(train_loss)
        train_acc = sum(train_accs) / len(train_accs)
        
        scheduler.step()
        
        if val_loader is not None:
            model.eval()

            valid_loss = []
            valid_accs = []
            
            for batch in val_loader:
                
                ids = batch['ids'].to(device, dtype=torch.long)
                mask = batch['mask'].to(device, dtype=torch.long)
                y = batch['targets'].to(device, dtype=torch.float32)
                
                with torch.no_grad():
                    
                    logits = model(ids, mask)

                    acc = int(((logits.argmax(dim=-1) == y.argmax(dim=-1)).sum()).detach().cpu()) / y.shape[0]
                    valid_loss.append(loss.item())
                    valid_accs.append(acc)
            
            valid_loss = sum(valid_loss) / len(valid_loss)
            valid_acc = sum(valid_accs) / len(valid_accs)
        
            print(f'[ {epoch+1}/{epochs} ] | train_loss = {train_loss:.5f}, train_acc = {train_acc:.5f}, val_loss = {valid_loss:.5f}, val_acc = {valid_acc:.5f}')
        else:
            print(f'[ {epoch+1}/{epochs} ] | train_loss = {train_loss:.5f}, train_acc = {train_acc:.5f}')
        
        if epoch+1 >= 50:
            print("It's over 50 epochs, stop training")
            break

In [27]:
train(train_loader=train_loader, model=model, epochs=100, criterion=criterion, optimizer=optimizer)

[ 1/100 ] | train_loss = 4.72797, train_acc = 0.02170
[ 2/100 ] | train_loss = 4.72408, train_acc = 0.02163
[ 3/100 ] | train_loss = 4.72199, train_acc = 0.02163
[ 4/100 ] | train_loss = 4.72216, train_acc = 0.02170
[ 5/100 ] | train_loss = 4.71940, train_acc = 0.02176
[ 6/100 ] | train_loss = 4.71929, train_acc = 0.02163
[ 7/100 ] | train_loss = 4.71817, train_acc = 0.02163
[ 8/100 ] | train_loss = 4.71722, train_acc = 0.02170
[ 9/100 ] | train_loss = 4.71709, train_acc = 0.02163
[ 10/100 ] | train_loss = 4.71684, train_acc = 0.02163
