In [1]:
%cd /content/drive/MyDrive/tiki_sentiment_analyst/

/content/drive/MyDrive/tiki_sentiment_analyst


In [2]:
import pickle

def _save_pkl(path, obj):
  with open(path, 'wb') as f:
    pickle.dump(obj, f)

def _load_pkl(path):
  with open(path, 'rb') as f:
    obj = pickle.load(f)
  return obj

In [3]:
content_train = _load_pkl('data_tiki/content_train.pkl')
content_test = _load_pkl('data_tiki/content_test.pkl')
label_train = _load_pkl('data_tiki/label_train.pkl')
label_test = _load_pkl('data_tiki/label_test.pkl')

In [4]:
len(content_train), len(content_test), len(label_train), len(label_test)

(19083, 2121, 19083, 2121)

In [5]:
from sklearn.model_selection import train_test_split

content_train, content_val, label_train, label_val = train_test_split(content_train, label_train, test_size=0.2)

In [6]:
len(content_train), len(content_val), len(label_train), len(label_val)

(15266, 3817, 15266, 3817)

In [11]:
!pip install transformers
!pip install fastBPE
!pip install fairseq

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 4.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 48.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 533 kB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 33.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 53.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [6]:
# !wget https://public.vinai.io/PhoBERT_base_transformers.tar.gz
# !tar -xzvf PhoBERT_base_transformers.tar.gz

--2022-01-06 10:33:18--  https://public.vinai.io/PhoBERT_base_transformers.tar.gz
Resolving public.vinai.io (public.vinai.io)... 52.85.146.99, 52.85.146.76, 52.85.146.96, ...
Connecting to public.vinai.io (public.vinai.io)|52.85.146.99|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 322405979 (307M) [application/x-tar]
Saving to: ‘PhoBERT_base_transformers.tar.gz’


2022-01-06 10:33:36 (18.2 MB/s) - ‘PhoBERT_base_transformers.tar.gz’ saved [322405979/322405979]

PhoBERT_base_transformers/
PhoBERT_base_transformers/config.json
PhoBERT_base_transformers/bpe.codes
PhoBERT_base_transformers/model.bin
PhoBERT_base_transformers/dict.txt


In [7]:
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes', 
    default="PhoBERT_base_transformers/bpe.codes",
    required=False,
    type=str,
    help='path to fastBPE BPE'
)
args, unknown = parser.parse_known_args()
bpe = fastBPE(args)

# Load the dictionary
vocab = Dictionary()
vocab.add_from_file("PhoBERT_base_transformers/dict.txt")


In [40]:
text = content_val[1111]
bpe.encode(text)

'nhìn tổng_quan là tạm ổn cách viết theo kiểu việt_@@ nam mang tính hàn_lâm đọc hơi nhàm_chán sẽ cố_gắng đọc hết'

In [41]:
vocab.encode_line('<s> ' + bpe.encode(text) + ' </s>')

tensor([    0,   364, 20204,     8,   918,  4752,   139,   467,    63,   931,
        56444,   542,   156,   294, 24076,   987,  1329,  8631,    38,   977,
          987,   351,     2,     2], dtype=torch.int32)

In [10]:
len(content_train), len(content_val), len(label_train), len(label_val)

(15266, 3817, 15266, 3817)

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
MAX_LEN = 256

train_ids = []
for sent in content_train:
    subwords = '<s> ' + bpe.encode(sent) + ' </s>'
    encoded_sent = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()
    train_ids.append(encoded_sent)

val_ids = []
for sent in content_val:
    subwords = '<s> ' + bpe.encode(sent) + ' </s>'
    encoded_sent = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()
    val_ids.append(encoded_sent)
    
train_ids = pad_sequences(train_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
val_ids = pad_sequences(val_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

In [12]:
train_ids.shape, val_ids.shape

((15266, 256), (3817, 256))

In [13]:
train_masks = []
for sent in train_ids:
    mask = [int(token_id > 0) for token_id in sent]
    train_masks.append(mask)

val_masks = []
for sent in val_ids:
    mask = [int(token_id > 0) for token_id in sent]

    val_masks.append(mask)

In [14]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

train_inputs = torch.tensor(train_ids)
val_inputs = torch.tensor(val_ids)
train_labels = torch.tensor(label_train)
val_labels = torch.tensor(label_val)
train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)

In [15]:
train_inputs.shape, val_inputs.shape, train_labels.shape, val_labels.shape, train_masks.shape, val_masks.shape

(torch.Size([15266, 256]),
 torch.Size([3817, 256]),
 torch.Size([15266]),
 torch.Size([3817]),
 torch.Size([15266, 256]),
 torch.Size([3817, 256]))

In [16]:
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = SequentialSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=8)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=8)

In [17]:
from transformers import RobertaForSequenceClassification, RobertaConfig, AdamW

config = RobertaConfig.from_pretrained(
    "PhoBERT_base_transformers/config.json", from_tf=False, num_labels = 2, output_hidden_states=False,
)
BERT_SA = RobertaForSequenceClassification.from_pretrained(
    "PhoBERT_base_transformers/model.bin",
    config=config
)

You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at PhoBERT_base_transformers/model.bin were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceC

In [18]:
BERT_SA.cuda()
print('Done')

Done


In [19]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    F1_score = f1_score(pred_flat, labels_flat, average='macro')
    
    return accuracy_score(pred_flat, labels_flat), F1_score

In [20]:
import random
from tqdm import tqdm_notebook
import torch
device = 'cuda'
epochs = 10

param_optimizer = list(BERT_SA.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, correct_bias=False)

best_f1_score = 0
for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    total_loss = 0
    BERT_SA.train()
    train_accuracy = 0
    nb_train_steps = 0
    train_f1 = 0
    
    for step, batch in tqdm_notebook(enumerate(train_dataloader)):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        BERT_SA.zero_grad()
        outputs = BERT_SA(b_input_ids, 
            token_type_ids=None, 
            attention_mask=b_input_mask, 
            labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()
        
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_train_accuracy, tmp_train_f1 = flat_accuracy(logits, label_ids)
        train_accuracy += tmp_train_accuracy
        train_f1 += tmp_train_f1
        nb_train_steps += 1
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(BERT_SA.parameters(), 1.0)
        optimizer.step()
        
    avg_train_loss = total_loss / len(train_dataloader)
    print(" Accuracy: {0:.4f}".format(train_accuracy/nb_train_steps))
    print(" F1 score: {0:.4f}".format(train_f1/nb_train_steps))
    print(" Average training loss: {0:.4f}".format(avg_train_loss))

    print("Running Validation...")
    BERT_SA.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    eval_f1 = 0
    for batch in tqdm_notebook(val_dataloader):

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = BERT_SA(b_input_ids, 
            token_type_ids=None, 
            attention_mask=b_input_mask)
            logits = outputs[0]
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            tmp_eval_accuracy, tmp_eval_f1 = flat_accuracy(logits, label_ids)

            eval_accuracy += tmp_eval_accuracy
            eval_f1 += tmp_eval_f1
            nb_eval_steps += 1

    print(" Accuracy: {0:.4f}".format(eval_accuracy/nb_eval_steps))
    print(" F1 score: {0:.4f}".format(eval_f1/nb_eval_steps))
    if eval_f1/nb_eval_steps > best_f1_score:
      torch.save(BERT_SA, "model/model.pth")
      best_f1_score = eval_f1/nb_eval_steps
      
print("Training complete!")

Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

 Accuracy: 0.9115
 F1 score: 0.8973
 Average training loss: 0.2622
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/478 [00:00<?, ?it/s]

 Accuracy: 0.9399
 F1 score: 0.9298
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

 Accuracy: 0.9527
 F1 score: 0.9445
 Average training loss: 0.1730
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/478 [00:00<?, ?it/s]

 Accuracy: 0.9406
 F1 score: 0.9305
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

 Accuracy: 0.9707
 F1 score: 0.9657
 Average training loss: 0.1209
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/478 [00:00<?, ?it/s]

 Accuracy: 0.9467
 F1 score: 0.9373
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

 Accuracy: 0.9819
 F1 score: 0.9788
 Average training loss: 0.0857
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/478 [00:00<?, ?it/s]

 Accuracy: 0.9448
 F1 score: 0.9348
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

KeyboardInterrupt: ignored

In [21]:
model = torch.load("model/model.pth")

In [22]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
MAX_LEN = 256

test_ids = []
for sent in content_test:
    subwords = '<s> ' + bpe.encode(sent) + ' </s>'
    encoded_sent = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()
    test_ids.append(encoded_sent)
    
test_ids = pad_sequences(test_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

In [26]:
test_ids.shape, len(label_test)

((2121, 256), 2121)

In [27]:
test_masks = []
for sent in test_ids:
    mask = [int(token_id > 0) for token_id in sent]
    test_masks.append(mask)

In [28]:
test_inputs = torch.tensor(test_ids)
test_labels = torch.tensor(label_test)
test_masks = torch.tensor(test_masks)

In [29]:
test_inputs.shape, test_labels.shape, test_masks.shape

(torch.Size([2121, 256]), torch.Size([2121]), torch.Size([2121, 256]))

In [30]:
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=8)

In [31]:
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
eval_f1 = 0
for batch in tqdm_notebook(test_dataloader):

    batch = tuple(t.to(device) for t in batch)

    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids, 
        token_type_ids=None, 
        attention_mask=b_input_mask)
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy, tmp_eval_f1 = flat_accuracy(logits, label_ids)

        eval_accuracy += tmp_eval_accuracy
        eval_f1 += tmp_eval_f1
        nb_eval_steps += 1

print(" Accuracy: {0:.4f}".format(eval_accuracy/nb_eval_steps))
print(" F1 score: {0:.4f}".format(eval_f1/nb_eval_steps))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


  0%|          | 0/266 [00:00<?, ?it/s]

 Accuracy: 0.9464
 F1 score: 0.9366
