In [19]:
from torch.utils.data import DataLoader

from bert_pytorch.model import ALBERT, ALBERTLM
from bert_pytorch.trainer import BERTTrainer
from bert_pytorch.dataset import ALBERTDataset, WordVocab

import torch
import torch.nn as nn
from torch.utils.data import Dataset, WeightedRandomSampler
from torch.optim import Adam

import tqdm

# numpy & pandas
import numpy as np
import pandas as pd

# scikit learn
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import textlib as tl

import warnings
# NOTE(review): with no category/module argument this suppresses every
# warning for the whole session, including deprecation and PyTorch usage
# warnings; consider narrowing it once the noisy sources are known.
warnings.filterwarnings(action='ignore')

In [2]:
# Load the serialized vocabulary; its length sizes the encoder's token
# embedding and it is later used to turn tokens into ids.
# NOTE(review): presumably this has to be the vocabulary the checkpoint was
# trained with — confirm.
vocab_path = '../TCL2021_Telco_Embedding_Dataset/corpora/telco_vocab.dat'
vocab = WordVocab.load_vocab(vocab_path)

In [3]:
# Build an ALBERT encoder whose token embedding is sized to the vocabulary.
# NOTE(review): this instance is overwritten by the checkpoint loaded in the
# next cell, so these hyper-parameters only matter if that load is skipped.
bert = ALBERT(
    vocab_size=len(vocab),
    embed_size=128,
    hidden=256,
    n_layers=8,
    attn_heads=8,
    seq_len=64,
)

In [4]:
# Load the saved ALBERT encoder from disk.
# map_location='cpu' lets the load succeed on CPU-only machines even when the
# checkpoint was written from a GPU; the classifier is moved to the target
# device later with clf.to(device).
# NOTE(review): torch.load unpickles arbitrary Python objects, so only load
# checkpoints from a trusted source. Newer PyTorch releases default to
# weights_only=True, which rejects fully pickled modules; pass
# weights_only=False there if this load fails.
bert = torch.load(
    '../TCL2021_Telco_Embedding_Dataset/albert_model_good/albert_model_weights_only_finetuning/albert.model_weightsonly.ep3',
    map_location='cpu',
)

In [5]:
# Sanity check: display the token-embedding weight matrix of the loaded model.
bert.embedding.token.weight

Parameter containing:
tensor([[-0.2393,  0.0931,  0.2913,  ..., -0.3466, -0.0221,  0.2755],
        [-0.6690, -0.1193, -1.0739,  ...,  0.2408,  0.1327,  0.4912],
        [-0.4424,  0.0456, -0.6106,  ...,  1.4350,  0.1514, -0.5756],
        ...,
        [-0.1260, -0.2287,  0.2891,  ...,  0.2943, -0.4238, -0.8864],
        [-0.4999,  0.0103,  0.0177,  ...,  0.0442, -0.4262,  0.1222],
        [ 0.6505,  0.5411, -0.3057,  ...,  0.3251,  0.2935, -0.2772]],
       requires_grad=True)

In [6]:
class SOPClassifier(nn.Module):
    """Classification head on top of an ALBERT encoder.

    The encoder output at sequence position 0 is projected to ``num_class``
    logits by a single linear layer.
    """

    def __init__(self, bert: ALBERT, num_class):
        """Store the encoder and create the linear output layer.

        Args:
            bert: encoder module; it must expose a ``hidden`` attribute
                giving the width of its output vectors.
            num_class: number of logits the head produces.
        """
        super().__init__()
        self.bert = bert
        self.decode = nn.Linear(in_features=bert.hidden, out_features=num_class)

    def forward(self, x, segment_label):
        """Encode the batch and return the logits for its first position.

        Args:
            x: token-id tensor passed straight to the encoder.
            segment_label: segment-id tensor passed straight to the encoder.

        Returns:
            The linear layer applied to ``encoder_output[:, 0]``.
        """
        encoded = self.bert(x, segment_label)
        first_position = encoded[:, 0]
        return self.decode(first_position)

In [7]:
# Hyper-parameters shared by the classifier, the datasets and the loaders.
num_class = 37   # width of the classifier's output layer
batch_size = 64  # samples per mini-batch
seq_len = 64     # number of token ids in every model input

# Wrap the loaded encoder with the classification head.
clf = SOPClassifier(bert=bert, num_class=num_class)

In [8]:
# Smoke test: push one dummy batch through the classifier and check the
# shape of the returned logits.

# Index tensors use torch.long, the dtype torch.tensor() gives to the
# Python-int id lists built by the dataset.
x = torch.zeros([batch_size, seq_len], dtype=torch.long)
s = torch.ones([batch_size, seq_len], dtype=torch.long)

# Call the module itself rather than .forward() so registered hooks run, and
# disable autograd because no gradient is needed for this check.
with torch.no_grad():
    y = clf(x, s)

# One row of logits per sample, one column per class.
assert y.shape == (batch_size, num_class), y.shape
print(y.shape)
print(y)


torch.Size([64, 37])
tensor([[ 0.1755,  0.1799,  0.0135,  ..., -0.1694, -0.0202,  0.3690],
        [ 0.1956,  0.1741, -0.0095,  ..., -0.1570, -0.0545,  0.3896],
        [ 0.1933,  0.1771,  0.0021,  ..., -0.1937, -0.0451,  0.3882],
        ...,
        [ 0.1735,  0.1760, -0.0054,  ..., -0.1792, -0.0362,  0.3709],
        [ 0.1888,  0.1777,  0.0110,  ..., -0.1755, -0.0439,  0.3807],
        [ 0.1847,  0.1820, -0.0056,  ..., -0.1805, -0.0430,  0.3927]],
       grad_fn=<AddmmBackward>)


In [9]:
# Read the SOP dataset from the first sheet of the Excel workbook.
input_file_name = '../TCL2021_Telco_Embedding_Dataset/dataset/sop_dataset.xlsx'
try:
    df = pd.read_excel(input_file_name, sheet_name=0, engine='openpyxl')
except FileNotFoundError:
    print(f'{input_file_name}이 없습니다! skip!')
    # Every later cell needs `df`; continuing would either raise a NameError
    # on the next line or silently reuse a stale frame from an earlier run.
    raise

print(df.shape)


(65635, 6)


In [10]:
# The first model only uses the sentence text and its cleaned label.
# Columns are selected without mutating `df` (no df.pop), so this cell can be
# re-run without a KeyError and `df` keeps matching what was loaded above.
X = df['sentence']

# Convert the string labels into integer class ids.
label_encoder = preprocessing.LabelEncoder()
y = label_encoder.fit_transform(df['label_clean'])

# The classifier head was built with `num_class` outputs; a larger label set
# would only fail later, inside the loss, with an out-of-range target error.
if len(label_encoder.classes_) > num_class:
    raise ValueError(
        f'dataset has {len(label_encoder.classes_)} distinct labels but the '
        f'classifier was built with num_class={num_class}'
    )

In [11]:
# Split into train / test sets: 20% is held out, the split is stratified on
# the encoded labels, and the seed is fixed so it is reproducible.
# (For a quick debugging run, slice X and y to the first 1000 rows and drop
# the stratify argument.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
    
    


In [12]:
class SOPDataset(Dataset):
    """Dataset of tokenized texts and their integer class labels.

    Every raw text is cleaned, split into sentences and tokenized once in the
    constructor using the ``tl`` (textlib) helpers. Items are returned as
    fixed-length id tensors of ``seq_len`` entries.
    """

    def __init__(self, X, y, vocab, seq_len):
        """Tokenize all texts up front and keep the labels aligned with them.

        Args:
            X: sized iterable of raw texts.
            y: integer labels, one per entry of ``X`` and in the same order.
            vocab: object whose ``to_seq(tokens)`` returns a list of ids.
            seq_len: length of the id sequences returned by ``__getitem__``.

        Raises:
            ValueError: if ``X`` and ``y`` have different lengths.
        """
        if len(X) != len(y):
            raise ValueError(f'X has {len(X)} samples but y has {len(y)} labels')

        self.vocab = vocab
        self.seq_len = seq_len
        self.X = []
        # Labels are collected alongside the texts so that a skipped sample
        # drops its label too; otherwise every later label would shift onto
        # the wrong text.
        kept_labels = []

        print(f'data loading started! size = {len(X)}')
        for i, (text, label) in enumerate(zip(X, y)):
            try:
                # Cleansing; non-text input is reported and skipped.
                cleansed_text = tl.clean_text(text)
            except TypeError:
                print(f'      {i+1} 번째 데이터에 문제가 있어 skip!')
                continue

            # Split the cleaned text into a list of sentences.
            sentences = tl.segment_sentences(cleansed_text)
            # Join the helper's per-sentence output with spaces and split it
            # back into one flat token list for the whole text.
            corpora = ' '.join(tl.get_corpora(sentences)).split(' ')
            self.X.append(corpora)
            kept_labels.append(int(label))

            if i%1000 == 0:
                print(f'{i}th data loading completed!')

        self.y = torch.tensor(kept_labels, dtype=torch.long)

    def __len__(self):
        """Return the number of samples that survived cleansing."""
        return len(self.X)

    def __getitem__(self, item):
        """Return one sample as a dict of tensors.

        Keys are ``input_ids`` and ``segment_ids`` (both padded to
        ``seq_len``) and ``label``.
        """
        # Reserve two positions for the <sos>/<eos> markers.
        x = self.X[item][:self.seq_len - 2]
        tokens = ['<sos>'] + x + ['<eos>']
        # NOTE(review): real tokens and padding both get segment id 0;
        # confirm this matches how the encoder was pretrained.
        segment_ids = [0] * len(tokens)

        # Token to id.
        input_ids = self.vocab.to_seq(tokens)

        # NOTE(review): assumes id 0 is the vocabulary's padding index — confirm.
        n_pad = self.seq_len - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        return {
            'input_ids': torch.tensor(input_ids),
            'segment_ids': torch.tensor(segment_ids),
            # self.y[item] is already a tensor; copy it instead of passing it
            # through torch.tensor(), which warns on tensor input.
            'label': self.y[item].clone().detach(),
        }

In [13]:
# Tokenize both splits up front; the constructor prints its progress.
train_dataset = SOPDataset(X=X_train, y=y_train, vocab=vocab, seq_len=seq_len)
test_dataset = SOPDataset(X=X_test, y=y_test, vocab=vocab, seq_len=seq_len)

data loading started! size = 52508
0th data loading completed!
1000th data loading completed!
2000th data loading completed!
3000th data loading completed!
4000th data loading completed!
5000th data loading completed!
6000th data loading completed!
7000th data loading completed!
8000th data loading completed!
9000th data loading completed!
10000th data loading completed!
11000th data loading completed!
12000th data loading completed!
13000th data loading completed!
14000th data loading completed!
15000th data loading completed!
16000th data loading completed!
17000th data loading completed!
18000th data loading completed!
19000th data loading completed!
20000th data loading completed!
21000th data loading completed!
22000th data loading completed!
23000th data loading completed!
24000th data loading completed!
25000th data loading completed!
26000th data loading completed!
27000th data loading completed!
28000th data loading completed!
29000th data loading completed!
30000th data loadi

In [14]:
# Mini-batch loaders: training batches are shuffled, the evaluation loader
# keeps the dataset order. Each uses one worker process.
train_data_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
)
test_data_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=1,
)

In [15]:
# Cross-entropy over the class logits, optimized with Adam (lr = 1e-4) across
# every parameter of the classifier, i.e. the encoder and the linear head.
criterion = nn.CrossEntropyLoss()
optim = Adam(params=clf.parameters(), lr=1e-4)

In [16]:
# Use the first GPU when CUDA is available, otherwise stay on the CPU.
cuda_condition = torch.cuda.is_available()
if cuda_condition:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Move the classifier to the chosen device; as the last expression of the
# cell this also displays the module structure.
clf.to(device)

SOPClassifier(
  (bert): ALBERT(
    (embedding): BERTEmbedding(
      (token): Embedding(9544, 128)
      (position): Embedding(64, 128)
      (segment): Embedding(2, 128)
      (norm): LayerNorm()
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (factorized): Linear(in_features=128, out_features=256, bias=True)
    (transformer_block): TransformerBlock(
      (attention): MultiHeadedAttention(
        (linear_layers): ModuleList(
          (0): Linear(in_features=256, out_features=256, bias=True)
          (1): Linear(in_features=256, out_features=256, bias=True)
          (2): Linear(in_features=256, out_features=256, bias=True)
        )
        (output_linear): Linear(in_features=256, out_features=256, bias=True)
        (attention): Attention()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feed_forward): FeedForward(
        (w_1): Linear(in_features=256, out_features=1024, bias=True)
        (w_2): Linear(in_features=1024, out_features=256, bias=True)
 

In [17]:
def iterate(clf, optim, epoch, data_loader, criterion, train=True):
    """Run one pass over ``data_loader``, either training or evaluating ``clf``.

    Args:
        clf: classifier module called as ``clf(input_ids, segment_ids)``.
        optim: optimizer stepped after every batch when ``train`` is true.
        epoch: epoch number, used only for progress and log output.
        data_loader: sized iterable of batches; each batch is a dict of
            tensors with the keys ``input_ids``, ``segment_ids`` and ``label``.
        criterion: loss called as ``criterion(logits, labels)``.
        train: when true the model is put in train mode and updated; when
            false it is put in eval mode and autograd is disabled.

    Returns:
        None. Progress is written every 100 batches and the epoch's average
        loss and accuracy are printed at the end.
    """
    str_code = "train" if train else "test"

    if train:
        clf.train()
    else:
        clf.eval()

    # Send batches to wherever the model's parameters live instead of
    # depending on a notebook-level `device` variable.
    first_param = next(clf.parameters(), None)
    batch_device = first_param.device if first_param is not None else torch.device("cpu")

    n_batches = len(data_loader)
    data_iter = tqdm.tqdm(enumerate(data_loader),
                          desc="EP_%s:%d" % (str_code, epoch),
                          total=n_batches,
                          bar_format="{l_bar}{r_bar}")

    sum_loss = 0.0
    total_correct_sop = 0
    total_element_sop = 0

    for i, data in data_iter:
        # 0. move the batch to the model's device
        data = {key: value.to(batch_device) for key, value in data.items()}

        # 1. forward pass; no autograd graph is recorded during evaluation,
        #    which saves memory and time. Calling the module (not .forward)
        #    keeps registered hooks working.
        with torch.set_grad_enabled(bool(train)):
            pred_y = clf(data["input_ids"], data["segment_ids"])
            loss = criterion(pred_y, data["label"])

        # 2. backward and optimization only in train
        if train:
            optim.zero_grad()
            loss.backward()
            optim.step()

        batch_loss = loss.item()
        sum_loss += batch_loss

        # 3. running accuracy over all samples seen so far
        total_correct_sop += pred_y.argmax(dim=-1).eq(data["label"]).sum().item()
        total_element_sop += data["label"].nelement()

        if i % 100 == 0:
            post_fix = {
                "epoch": "[%d/%s]" % (epoch, str_code),
                "iter": "[%d/%d]" % (i, n_batches),
                "avg_loss": sum_loss / (i + 1),
                "sop_acc": total_correct_sop / total_element_sop * 100,
                # despite the key name this is the loss of the current batch
                "total_loss": batch_loss
            }
            data_iter.write(str(post_fix))

    # An empty loader has no metrics; report NaN instead of dividing by zero.
    avg_loss = sum_loss / n_batches if n_batches else float("nan")
    if total_element_sop:
        total_sop_acc = total_correct_sop * 100.0 / total_element_sop
    else:
        total_sop_acc = float("nan")

    print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss,
          "total_sop_acc=", total_sop_acc)

def train(clf, optim, epoch, data_loader, criterion):
    """Run one training pass over ``data_loader`` by delegating to ``iterate``."""
    iterate(clf=clf, optim=optim, epoch=epoch, data_loader=data_loader,
            criterion=criterion, train=True)

def test(clf, optim, epoch, data_loader, criterion):
    """Run one evaluation pass over ``data_loader`` by delegating to ``iterate``.

    ``optim`` is only forwarded; it is accepted so the signature mirrors
    ``train``.
    """
    iterate(clf=clf, optim=optim, epoch=epoch, data_loader=data_loader,
            criterion=criterion, train=False)

In [None]:
# Fine-tune for a fixed number of epochs, evaluating on the held-out split
# after every training pass.
epochs = 100
for epoch in range(epochs):
    train(clf=clf, optim=optim, epoch=epoch,
          data_loader=train_data_loader, criterion=criterion)
    test(clf=clf, optim=optim, epoch=epoch,
         data_loader=test_data_loader, criterion=criterion)