<a href="https://colab.research.google.com/github/easy1103/Statistical_Data_Analysis/blob/main/jhlee/pytorch_kobert_JH.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## KoBERT text embedding

In [None]:
!pip install ipywidgets  # for vscode
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [None]:
import os

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm

In [None]:
from kobert import get_tokenizer
from kobert import get_pytorch_kobert_model

In [None]:
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [None]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so the notebook still runs (slowly) on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Fetch the KoBERT PyTorch model and its vocabulary; downloaded files are
# cached under the ".cache" directory.
bertmodel, vocab = get_pytorch_kobert_model(cachedir=".cache")

/content/.cache/kobert_v1.zip[██████████████████████████████████████████████████]
/content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece[██████████████████████████████████████████████████]


In [None]:
# Name of the dataset to embed. Only 'final' is handled by the cells visible
# here; the other names in the trailing comment look like alternative datasets
# — TODO confirm whether they are still supported.
train_data = 'final' # keywords, fq_keywords, insta_keywords

In [None]:
# Load the TSV for the selected dataset, discarding the first line
# (num_discard_samples=1; presumably the header row).
# NOTE(review): field_indices=[2, 2] selects column 2 twice, while BERTDataset
# is later built with sent_idx=0 only; a single [2] looks sufficient. Confirm
# before changing.
if train_data == 'final':
  dataset_train = nlp.data.TSVDataset('/Assets/0616_final.tsv', field_indices=[2, 2], num_discard_samples=1)
else:
  # Fail right away with a clear message instead of a NameError on
  # `dataset_train` in a later cell.
  raise ValueError("no dataset configured for train_data={!r}".format(train_data))

In [None]:
# Sanity check that the dataset object exists. This prints only the object's
# repr (class name and address), not its rows.
print(dataset_train)

<gluonnlp.data.dataset.TSVDataset object at 0x7fe9da18bb10>


In [None]:
# get_tokenizer() returns the KoBERT tokenizer resource (the recorded output
# shows a cached ".spiece" file being used).
tokenizer = get_tokenizer()
# Tokenizer object handed to BERTDataset; built from that resource and the
# KoBERT vocabulary. lower=False presumably disables lowercasing — TODO confirm.
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [None]:
class BERTDataset(Dataset):
    """Torch dataset of BERT-ready inputs built from one text column.

    Every row of ``dataset`` is converted once, at construction time, with
    ``nlp.data.BERTSentenceTransform``; indexing simply returns the stored
    result. No labels are attached to the items.
    """

    def __init__(self, dataset, sent_idx, bert_tokenizer, max_len,
                 pad, pair):
        """Pre-transform the text found at position ``sent_idx`` of each row.

        ``bert_tokenizer``, ``max_len`` (as ``max_seq_length``), ``pad`` and
        ``pair`` are forwarded unchanged to ``BERTSentenceTransform``.
        """
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        encoded = []
        for row in dataset:
            encoded.append(to_bert_input([row[sent_idx]]))
        self.sentences = encoded

    def __getitem__(self, i):
        """Return the transformed item stored at index ``i``."""
        return self.sentences[i]

    def __len__(self):
        """Return the number of transformed rows."""
        return len(self.sentences)


In [None]:
## Setting parameters
max_len = 256
batch_size = 16
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [None]:
# Transform the text at index 0 of every row into BERT inputs
# (padding enabled, single-sentence mode).
data_train = BERTDataset(
    dataset_train,
    sent_idx=0,
    bert_tokenizer=tok,
    max_len=max_len,
    pad=True,
    pair=False,
)

In [None]:
# No shuffling is requested, so batches (and therefore the saved embeddings)
# stay in the same order as the rows of the input dataset.
# Cap the worker count at the number of CPUs: requesting more workers than
# the machine has makes DataLoader emit a warning and can slow loading down.
num_workers = min(5, os.cpu_count() or 1)
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=num_workers)

  cpuset_checked))


In [None]:
class BERTClassifier(nn.Module):
    """BERT wrapper that returns the pooled sentence representation.

    Despite its name, ``forward`` does NOT apply the linear classification
    head: it returns the BERT pooled output (after optional dropout), which
    this notebook uses as a fixed-size text embedding.

    Args:
        bert: the BERT module to wrap; called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` keyword arguments.
        hidden_size: input width of the (unused) linear head.
        num_classes: output width of the (unused) linear head.
        dr_rate: dropout probability applied to the pooled output; ``None``
            or ``0`` disables dropout.
        params: accepted for interface compatibility; not used.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        # Kept so the module's parameter layout stays unchanged, even though
        # forward() currently returns the pooled output without applying it.
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with 1.0 on the first ``valid_length[i]`` positions of row ``i``.

        Args:
            token_ids: 2-D tensor of token ids (rows are sequences).
            valid_length: per-row count of real (non-padding) tokens; may
                live on a different device than ``token_ids``.

        Returns:
            Float tensor with the same shape and device as ``token_ids``.
        """
        # Vectorized comparison of column positions against each row's length,
        # replacing a Python loop over the batch.
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        positions = torch.arange(token_ids.size(1), device=token_ids.device).unsqueeze(0)
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the pooled BERT output for a batch (dropout applied if configured)."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        outputs = self.bert(input_ids=token_ids,
                            token_type_ids=segment_ids.long(),
                            attention_mask=attention_mask)
        # Index instead of tuple-unpacking: works for a plain
        # (sequence_output, pooled_output, ...) tuple as well as for dict-like
        # model outputs, where unpacking would yield the keys, not the tensors.
        pooler = outputs[1]
        if self.dr_rate:
            return self.dropout(pooler)
        # return self.classifier(pooler)  # classification head intentionally bypassed
        return pooler

In [None]:
# Alternative with dropout on the pooled output:
# model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)
# Wrap KoBERT without dropout (dr_rate defaults to None) and move the module
# to the selected device.
model = BERTClassifier(bertmodel).to(device)

In [None]:
# Split the model parameters into two groups for an optimizer: weight decay
# 0.01 for everything except biases and LayerNorm weights, which get 0.0.
# Only the groups are built here; no optimizer or schedule is created.
no_decay = ['bias', 'LayerNorm.weight']
decay_params = []
no_decay_params = []
for param_name, param in model.named_parameters():
    if any(marker in param_name for marker in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

## KoBERT model
### Text embedding 수행

In [None]:
import numpy as np
import os

# Embed every row with the model and save the resulting (n_rows, 768) matrix
# as a .npy file under the dataset-specific output directory.
if train_data == 'final':
  save_path = '/Assets/kobert_output/final'
else:
  # Fail here with a clear message instead of a NameError on `save_path` below.
  raise ValueError("no output directory configured for train_data={!r}".format(train_data))

os.makedirs(save_path, exist_ok=True)

# Collect per-batch arrays and concatenate once at the end: np.append inside
# the loop re-copies the whole matrix on every batch. The empty float32 seed
# keeps the result at shape (0, 768) even if the dataloader yields nothing.
embedding_batches = [np.empty((0, 768), dtype=np.float32)]

model.eval()
with torch.no_grad():  # inference only: skip building the autograd graph
    for batch_id, (token_ids, valid_length, segment_ids) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)

        out = model(token_ids, valid_length, segment_ids)

        embedding_batches.append(out.cpu().numpy())

        print('{} : {}'.format(batch_id, out.shape))

# X becomes the k-means training set and must be np.float32.
X = np.concatenate(embedding_batches, axis=0)

np.save(os.path.join(save_path, '{}_{}.npy'.format(train_data, X.shape)), X)

  cpuset_checked))


  0%|          | 0/23 [00:00<?, ?it/s]

0 : torch.Size([16, 768])
1 : torch.Size([16, 768])
2 : torch.Size([16, 768])
3 : torch.Size([16, 768])
4 : torch.Size([16, 768])
5 : torch.Size([16, 768])
6 : torch.Size([16, 768])
7 : torch.Size([16, 768])
8 : torch.Size([16, 768])
9 : torch.Size([16, 768])
10 : torch.Size([16, 768])
11 : torch.Size([16, 768])
12 : torch.Size([16, 768])
13 : torch.Size([16, 768])
14 : torch.Size([16, 768])
15 : torch.Size([16, 768])
16 : torch.Size([16, 768])
17 : torch.Size([16, 768])
18 : torch.Size([16, 768])
19 : torch.Size([16, 768])
20 : torch.Size([16, 768])
21 : torch.Size([16, 768])
22 : torch.Size([13, 768])
