## PHOBERT FINE-TUNING

In [2]:
import wandb

In [3]:
""" WANDB SETTINGS """

wandb.init(project="INTENT-CLASSIFIER",
           entity="emandai",
           name="Fine-tuning",
           save_code=True,
           notes="Fine-tuning",
           tags=["fine-tune", "all-layer", "[TRAIN] no *other*", "[TEST] 100-calls"],
           config={
               "epochs": 60,
               "lr": 2e-5,
               "batch_size": 16,
               "gradient_accumulation_steps": 2,
               "weight_decay": 0.01,
               "warmup_ratio": 0.06,
               "lr_scheduler_type": "linear"
           })

[34m[1mwandb[0m: Currently logged in as: [33mhosjiu[0m (use `wandb login --relogin` to force relogin)


In [4]:
""" WANDB LOGIN """

wandb.login(relogin=True)



True

In [4]:
from pprint import pprint
import os

from pyvi import ViTokenizer
from vncorenlp import VnCoreNLP
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from torch.utils.data import Dataset
import torch
from datasets import load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoConfig,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

from config import VQC_DATAPATH, phobert_base_checkpoint
import utils
from utils import count_trainable_params

In [5]:
""" LOAD TOKENIZER """

tokenizer = AutoTokenizer.from_pretrained(phobert_base_checkpoint, use_fast=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
""" TEST Tokenizer """

text = "Bánh Xôi Tiêu ông Mẫn rất là ngon."
tokenized_text = tokenizer(text)
pprint(tokenized_text)

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [0, 10122, 29425, 14788, 46, 10578, 59, 8, 1701, 34412, 2],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [7]:
""" TRY DECODE FROM LIST OF GIVEN TOKENS """

tokenizer.decode(tokenized_text["input_ids"])

'<s> Bánh Xôi Tiêu ông Mẫn rất là ngon. </s>'

In [8]:
""" Padding Token """

tokenizer.decode(0)

'< s >'

In [9]:
""" Unknown token """

tokenizer.decode(3)

'< u n k >'

In [10]:
""" TEST Tokenizer """

text = "ký rẹt đi em"
tokenized_text = tokenizer(text)
pprint(tokenized_text)

{'attention_mask': [1, 1, 1, 1, 1, 1, 1],
 'input_ids': [0, 765, 1698, 17987, 57, 193, 2],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0]}


In [11]:
""" TRY TO LOAD MODEL CONFIGURATION """

model_config = AutoConfig.from_pretrained(phobert_base_checkpoint)
print(f"vocab size: {model_config.vocab_size}")
print(f"hidden size: {model_config.hidden_size}")
print(f"num attention heads: {model_config.num_attention_heads}")
print(f"num blocks: {model_config.num_hidden_layers}")
print(f"num labels: {model_config.num_labels}")

vocab size: 64001
hidden size: 768
num attention heads: 12
num blocks: 12
num labels: 2


### Data Preparation

#### Load Data

In [12]:
""" LOAD DATA """

DATA_PATH = os.path.join(VQC_DATAPATH, "data_no_other.xlsx")
data_df = pd.read_excel(DATA_PATH)
data_df.head()

X = data_df["Sample"].tolist()
y = data_df["Intent"].tolist()

#### Data Preprocessing

##### Text Lowering

In [13]:
""" TEXT LOWERING """

X = [text.lower() for text in X]

##### Word Segmentation

In [14]:
""" WORD SEGMENTATION """

' WORD SEGMENTATION '

In [None]:
# pyvi
X = [ViTokenizer.tokenize(text) for text in X]

In [15]:
# RDRSegmenter
annotator = VnCoreNLP("vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size="-Xmx500m")

def word_segmenter(f):
    _concat = lambda x: " ".join([token for token in x[0]])
    def wrapper(*args, **kwargs):
        list_of_tokens = f(*args, **kwargs)
        return _concat(list_of_tokens)
    return wrapper

@word_segmenter
def _ws(text):
    return annotator.tokenize(text)

X = [_ws(text) for text in X]

##### Label Formatting

In [16]:
""" LABEL ENCODING (convert label from string to numeric data)"""

lb = LabelEncoder()
y = lb.fit_transform(y)
print("[Verbose Labels]")
np.unique(y)

[Verbose Labels]


array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41])

In [17]:
print(f"# samples: {len(y)}")

# samples: 1062


##### (Train, Val, Test) Split

In [18]:
""" DATA SPLITING FOR TRAINING, EVALUATING AND TESTING """

TEST_SIZE = 0.1
VAL_SIZE = 0.1
train_texts, test_texts, train_labels, test_labels = train_test_split(X, y, test_size=TEST_SIZE, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=VAL_SIZE, random_state=42)

In [19]:
len(np.unique(train_labels)), len(np.unique(val_labels)), len(np.unique(test_labels))

(42, 39, 38)

##### Text Tokenization

In [20]:
""" TOKENIZE FOR EACH SPLIT """

train_encodings = tokenizer(train_texts, padding="max_length", truncation=True)
val_encodings = tokenizer(val_texts, padding="max_length", truncation=True)
test_encodings = tokenizer(test_texts, padding="max_length", truncation=True)

#### Create PyTorch Dataset

In [21]:
""" CREATE DATASET (including `encodings` and `labels`) """

class VqcDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            item.update({key: torch.tensor(val[idx])})
        item.update({"label": torch.tensor(self.labels[idx])})
        return item

    def __len__(self):
        return len(self.labels)

    # @classmethod
    def get_num_labels(self):
        return len(np.unique(self.labels))

train_dataset = VqcDataset(train_encodings, train_labels)
val_dataset = VqcDataset(val_encodings, val_labels)
test_dataset = VqcDataset(test_encodings, test_labels)

### Load Pre-trained PhoBERT-base checkpoint

In [22]:
""" LOAD PRE-TRAINED PhoBERT MODEL """

# We can use RobertaForSequenceClassification as an alternative
num_labels = train_dataset.get_num_labels()
model = AutoModelForSequenceClassification.from_pretrained("./results/TAPT/checkpoint-2500",
                                                           num_labels=num_labels)

# Re-check model configurations
print(f"👇 MODEL CONFIGURATIONS 👇\n")
print(model.config)

Some weights of the model checkpoint at ./results/TAPT/checkpoint-2500 were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./results/TAPT/checkpoint-2500 and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classi

👇 MODEL CONFIGURATIONS 👇

RobertaConfig {
  "_name_or_path": "./results/TAPT/checkpoint-2500",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABEL_27",
    "28": "LABEL_28"

In [23]:
""" LOAD PRE-TRAINED PhoBERT MODEL """

# We can use RobertaForSequenceClassification as an alternative
num_labels = train_dataset.get_num_labels()
model = AutoModelForSequenceClassification.from_pretrained("./results/pretraining/checkpoint-100000/",
                                                           num_labels=num_labels)

Some weights of the model checkpoint at ./results/pretraining/checkpoint-100000/ were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./results/pretraining/checkpoint-100000/ and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out

### Define Metric

In [24]:
# Define Metric for model evaluation
# Reference: https://huggingface.co/transformers/training.html#fine-tuning-in-pytorch-with-the-trainer-api
metric = load_metric("f1")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions,
                          references=labels,
                          average="macro")

In [None]:
wandb.init(project="INTENT-CLASSIFIER",
           entity="emandai",
           name="Fine-tuning",
           save_code=True,
           notes="Fine-tuning",
           tags=["fine-tune", "all-layer", "[TRAIN] no *other*", "[TEST] 100-calls"],
           config={
               "epochs": 60,
               "lr": 2e-5,
               "batch_size": 16,
               "gradient_accumulation_steps": 2,
               "weight_decay": 0.01,
               "warmup_ratio": 0.06,
               "lr_scheduler_type": "linear"
           })

### Training Settings

In [25]:
# Trainer Argument
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=wandb.config.epochs,
    learning_rate=wandb.config.lr,
    per_device_train_batch_size=wandb.config.batch_size,
    per_device_eval_batch_size=wandb.config.batch_size,
    gradient_accumulation_steps=wandb.config.gradient_accumulation_steps,
    weight_decay=wandb.config.weight_decay,
    warmup_ratio=wandb.config.warmup_ratio,
    lr_scheduler_type=wandb.config.lr_scheduler_type,
    logging_steps=10,
    eval_steps=100,
    save_steps=100,
    save_strategy="steps",
    evaluation_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    save_total_limit=10,
    logging_dir="./logs",
    report_to="wandb",
)

# Fine-tuning using Trainer API from Huggingface
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(5)] # Early Stopping
)

### Fine-tuning All Layers

In [26]:
print(f"total number of trainable parameters: {count_trainable_params(model)}")

# Fine-tuning
trainer.train()

# Finish wandb
wandb.finish(quiet=True)

***** Running training *****
  Num examples = 859
  Num Epochs = 60
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 1620
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


total number of trainable parameters: 154714410


Step,Training Loss,Validation Loss,F1
100,3.3596,3.296652,0.246738
200,2.3842,2.359263,0.476249
300,1.598,1.673203,0.649381
400,1.0376,1.249442,0.691361
500,0.5995,1.019155,0.742193
600,0.381,0.862205,0.731624
700,0.2418,0.809447,0.72559
800,0.1552,0.807439,0.725782
900,0.1184,0.806648,0.725782
1000,0.1026,0.808311,0.71475


***** Running Evaluation *****
  Num examples = 96
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-100
Configuration saved in ./results/checkpoint-100/config.json
Model weights saved in ./results/checkpoint-100/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-700] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 96
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-200
Configuration saved in ./results/checkpoint-200/config.json
Model weights saved in ./results/checkpoint-200/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-800] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 96
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-300
Configuration saved in ./results/checkpoint-300/config.json
Model weights saved in ./results/checkpoint-300/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-900] due to args.save_total_limit
***** Running Evalu

VBox(children=(Label(value=' 4.82MB of 4.82MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

### Fine-tuning Last Layer

In [None]:
# For embedding layers
for name, params in model.roberta.embeddings.named_parameters():
    params.requires_grad = False

# For encoder layers
for name, params in model.roberta.encoder.named_parameters():
    params.requires_grad = False

    if "layer.11" in name:
        params.requires_grad = True

print(f"total number of trainable parameters: {count_trainable_params(model)}")
        
# Fine-tuning
trainer.train()

# Finish wandb
wandb.finish(quiet=True)

### Fine-tune from 11th Layer

In [None]:
# For embedding layers
for name, params in model.roberta.embeddings.named_parameters():
    params.requires_grad = False

# For encoder layers
for name, params in model.roberta.encoder.named_parameters():
    params.requires_grad = False

    if "layer.10" in name:
        params.requires_grad = True
        continue
    
    if "layer.11" in name:
        params.requires_grad = True

print(f"total number of trainable parameters: {count_trainable_params(model)}")
        
# Fine-tuning
trainer.train()

# Finish wandb
wandb.finish(quiet=True)

### Fine-tune from 10th Layer

In [None]:
# For embedding layers
for name, params in model.roberta.embeddings.named_parameters():
    params.requires_grad = False

# For encoder layers
for name, params in model.roberta.encoder.named_parameters():
    params.requires_grad = False
    
    if "layer.9" in name:
        params.requires_grad = True
        continue
    
    if "layer.10" in name:
        params.requires_grad = True
        continue
    
    if "layer.11" in name:
        params.requires_grad = True

print(f"total number of trainable parameters: {sum([params.numel() for params in model.parameters() if params.requires_grad])}")
        
# Fine-tuning
trainer.train()

# Finish wandb
wandb.finish(quiet=True)

### Fine-tune from 9th Layer

In [27]:
# For embedding layers
for name, params in model.roberta.embeddings.named_parameters():
    params.requires_grad = False

# For encoder layers
for name, params in model.roberta.encoder.named_parameters():
    params.requires_grad = False

    if "layer.8" in name:
        params.requires_grad = True
        continue
    
    if "layer.9" in name:
        params.requires_grad = True
        continue
    
    if "layer.10" in name:
        params.requires_grad = True
        continue
    
    if "layer.11" in name:
        params.requires_grad = True

print(f"total number of trainable parameters: {count_trainable_params(model)}")
        
# Fine-tuning
trainer.train()

# evaluate
trainer.evaluate()

# Finish wandb
wandb.finish(quiet=True)

***** Running training *****
  Num examples = 859
  Num Epochs = 60
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 1620
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


total number of trainable parameters: 28974378


Step,Training Loss,Validation Loss,F1
100,3.3904,3.288343,0.277443
200,2.372,2.308057,0.46114
300,1.7054,1.707848,0.601507
400,1.2517,1.32779,0.695901
500,0.8443,1.087066,0.735015
600,0.5914,0.915444,0.750985
700,0.4681,0.815914,0.772123
800,0.3197,0.75085,0.782963
900,0.2488,0.711245,0.782963
1000,0.1923,0.693392,0.773872


***** Running Evaluation *****
  Num examples = 96
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-100
Configuration saved in ./results/checkpoint-100/config.json
Model weights saved in ./results/checkpoint-100/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-700] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 96
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-200
Configuration saved in ./results/checkpoint-200/config.json
Model weights saved in ./results/checkpoint-200/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-800] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 96
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-300
Configuration saved in ./results/checkpoint-300/config.json
Model weights saved in ./results/checkpoint-300/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-900] due to args.save_total_limit
***** Running Evalu

VBox(children=(Label(value=' 4.96MB of 4.96MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

### Fine-tuning Classification Head Only

In [None]:
for name, params in model.named_parameters():
    if "classifier" not in name:
        params.requires_grad = False

print(f"total number of trainable parameters: {count_trainable_params(model)}")

In [None]:
# Fine-tuning
trainer.train()

# Finish wandb
wandb.finish()

### Evaluation

In [28]:
""" DEFINE EVALUATE FUNCTION """

def evaluate(texts, labels, metric="f1"):
    preds = []
    for text in texts:
        input_ids = torch.tensor([tokenizer.encode(text)]).to(device="cuda:0")
        logits = model(input_ids).logits
        prob = torch.softmax(logits, dim=1)
        max_idx = torch.argmax(prob).item()
        preds.append(max_idx)

    if metric == "f1":
        f1 = f1_score(labels, preds, average="macro")
        return f1

In [29]:
""" EVALUATE ON TEST SPLIT """

model.eval()
f1 = evaluate(test_texts, test_labels, metric="f1")
print(f"F1-macro: {f1}")

F1-macro: 0.7495765929976458


In [31]:
""" LOAD REAL TEST DATA """

X_test, y_test = utils.get_test(target="single")

In [31]:
""" SEGMENT REAL TEST DATA """

X_test = [_ws(text) for text in X_test]

In [33]:
""" LABEL ENCODING FOR REAL TEST DATA """

y_test = lb.transform(y_test)

In [33]:
""" EVALUATE ON REAL TEST SET """

model.eval()
f1 = evaluate(X_test, y_test, metric="f1")
print(f"F1-macro: {f1}")

F1-macro: 0.4225098806151392


In [None]:
""" CONFUSION MATRIX [PhoBERT Fine-Tuning] """

cm = confusion_matrix(y_test, preds)

print(len(np.unique(preds)))

plt.figure(figsize=(12, 10))
sns.heatmap(cm)

In [None]:
""" INFERENCE TESTING """

text = "ừ em đóng rất là tốt em đóng rất là tốt và có uy_tín của công_ty chị cho_nên công_ty chị đợt này mới ưu_đãi lại cho em một ờ cái khoản vay bằng tiền_mặt"

# encode = tokenize + numericalize
input_ids = torch.tensor([tokenizer.encode(text)]).to(device="cuda:0")

# forward
logit = model(input_ids).logits
print("Output Logits:\n")
print(logit, end="\n\n")

print("Output Labels:")
prob = torch.softmax(logit, dim=1)
max_idx = torch.argmax(prob).item()
lb.inverse_transform(np.array([max_idx]))

In [None]:
X_test[1000], lb.inverse_transform([y_test[1000]])

## TASK ADAPTIVE PRE-TRAINING

In [92]:
import os
import random

import wandb
from torchtext.vocab import build_vocab_from_iterator
import datasets
from datasets import load_dataset, Dataset
import pandas as pd
from vncorenlp import VnCoreNLP
from pyvi import ViTokenizer
from joblib import Parallel, delayed
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    Trainer,
    pipeline,
    EarlyStoppingCallback
)

from config import (
    VQC_UNLABELED_DATAPATH,
    VQC_DATAPATH,
    phobert_base_checkpoint
)

In [93]:
random.seed(42)
torch.manual_seed(42)
transformers.set_seed(42)

In [70]:
""" WANDB SETTINGS """

wandb.init(project="INTENT-CLASSIFIER",
           entity="emandai",
           name="Task Adaptive Pretraining",
           save_code=True,
           tags=["pretraining", "TAPT"],
           config={
               "epochs": 100,
               "lr": 2e-5,
               "batch_size": 8,
               "gradient_accumulation_steps": 2,
               "weight_decay": 0.01,
               "warmup_ratio": 0.06,
               "lr_scheduler_type": "linear"
           })

In [68]:
""" LOAD PHOBERT CHECKPOINT """

model = AutoModelForMaskedLM.from_pretrained(phobert_base_checkpoint)

loading configuration file https://huggingface.co/vinai/phobert-base/resolve/main/config.json from cache at /home/kiethoang/.cache/huggingface/transformers/a596f267f08b7158c7ab6300b1bf98eb6e1b05e6bcb0d7c18a8070364ee3011b.bbe27b2cac909b2279c83792c2d2b6f159f0a95f5d1c1eb66451da1c89a53609
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 258,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "PhobertTokenizer",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 64001
}

l

### Load Data

In [69]:
# """ LOAD TRANSCRIBED TEXTS """

# path = os.path.join(VQC_UNLABELED_DATAPATH, "transcribed_texts.txt")
# lines_df = pd.read_csv(path, delimiter="\n", names=["text"])

In [82]:
DATA_PATH = os.path.join(VQC_DATAPATH, "pretrain_data_medium.xlsx")

lines_df = pd.read_excel(DATA_PATH, usecols=["Sample"])
lines_df.head()

Unnamed: 0,Sample
0,anh là Ân đây
1,chị là hiếu
2,em tên là hà
3,em hà đây
4,em là hương


### Data Preprocessing

#### Remove Duplication

In [83]:
""" REMOVE DUPLICATE """

lines_df.drop_duplicates(ignore_index=True, inplace=True)
lines_df

Unnamed: 0,Sample
0,anh là Ân đây
1,chị là hiếu
2,em tên là hà
3,em hà đây
4,em là hương
...,...
2584,đợt này bên em có mở ra một cái gói vay vốn ti...
2585,mà không biết là chị cân nhắc được khoản nào k...
2586,thì bên em hỗ trợ cho mình nhận về thì số khoả...
2587,ủa giờ như dịch bệnh này mà lỡ may không có ti...


#### Text Lowering

In [84]:
""" TEXT LOWERING """

for index, row in lines_df.iterrows():
    lines_df.at[index, "Sample"] = row["Sample"].lower()

#### Word Segmentation

In [None]:
# pyvi
ws_texts = Parallel(n_jobs=8)(delayed(ViTokenizer.tokenize)(row["text"]) for _, row in lines_df.iterrows())

In [85]:
# RDRSegmenter
annotator = VnCoreNLP("vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size="-Xmx500m")

def word_segmenter(f):
    _concat = lambda x: " ".join([token for token in x[0]])
    def wrapper(*args, **kwargs):
        list_of_tokens = f(*args, **kwargs)
        return _concat(list_of_tokens)
    return wrapper

@word_segmenter
def _ws(text):
    return annotator.tokenize(text)


ws_texts = [_ws(row["Sample"]) for _, row in lines_df.iterrows()]
for index, row in lines_df.iterrows():
    lines_df.at[index, "Sample"] = _ws(row["Sample"])

#### Create Dataset

In [86]:
""" INITSTANTIATE HF DATASET FROM PANDAS DATAFRAME """
dataset = Dataset.from_pandas(df=lines_df)

""" TRAIN TEST SPLIT """
TEST_SIZE = 0.1
dataset = dataset.train_test_split(test_size=TEST_SIZE)
dataset["validation"] = dataset.pop("test") # For name consistency

#### Text Tokenization (removed)

In [87]:
tokenizer = AutoTokenizer.from_pretrained(phobert_base_checkpoint, use_fast=True)   

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/vinai/phobert-base/resolve/main/config.json from cache at /home/kiethoang/.cache/huggingface/transformers/a596f267f08b7158c7ab6300b1bf98eb6e1b05e6bcb0d7c18a8070364ee3011b.bbe27b2cac909b2279c83792c2d2b6f159f0a95f5d1c1eb66451da1c89a53609
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 258,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "PhobertTokenizer",
  "transformer

In [80]:
# """ ADD NEW TOKENS TO VOCAB """

# # build vocab from the train set
# def yield_tokens_from_hf_dataset(dataset):
#     for text in dataset["text"]:
#         yield text.split()
# vocab = build_vocab_from_iterator(iterator=yield_tokens_from_hf_dataset(dataset["train"]))

# # add new tokens to the tokenizer
# num_added_tokens = tokenizer.add_tokens(list(vocab.stoi)[2:]) # remove <pad> and <unk> token

# print(f"[BEFORE] vocab size: {tokenizer.vocab_size}")
# print(f"[AFTER] vocab size: {len(tokenizer)} (+{len(tokenizer) - tokenizer.vocab_size})")

# # Update embeddings matrix size
# # model.resize_token_embeddings(len(tokenizer))

### Training settings

In [81]:
""" TRAINING SETTINGS """

train_args = TrainingArguments(
    num_train_epochs=wandb.config.epochs,
    learning_rate=wandb.config.lr,
    lr_scheduler_type=wandb.config.lr_scheduler_type,
    weight_decay=wandb.config.weight_decay,
    warmup_ratio=wandb.config.warmup_ratio,
    per_device_train_batch_size=wandb.config.batch_size,
    per_device_eval_batch_size=wandb.config.batch_size,
    gradient_accumulation_steps=wandb.config.gradient_accumulation_steps,
    logging_steps=10,
    logging_strategy="steps",
    eval_steps=100,
    evaluation_strategy="steps",
    save_steps=100,
    save_strategy="steps",
    save_total_limit=10,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    logging_dir="./logs/TAPT",
    output_dir="./results/TAPT",
    report_to="wandb"
)

# Random mask
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15
)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(5)]
)

PyTorch: setting up devices


### Fine-tuning

In [82]:
trainer.train()

wandb.finish(quiet=True)

***** Running training *****
  Num examples = 2330
  Num Epochs = 100
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 14600
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
100,4.4167,4.51052
200,3.9421,3.764014
300,2.9414,3.510015
400,3.1557,3.038373
500,2.8089,2.948938
600,2.8611,2.961706
700,2.7703,2.701191
800,2.451,2.583792
900,2.524,2.395486
1000,2.7414,2.558349


***** Running Evaluation *****
  Num examples = 259
  Batch size = 8
Saving model checkpoint to ./results/TAPT/checkpoint-100
Configuration saved in ./results/TAPT/checkpoint-100/config.json
Model weights saved in ./results/TAPT/checkpoint-100/pytorch_model.bin
Deleting older checkpoint [results/TAPT/checkpoint-1700] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 259
  Batch size = 8
Saving model checkpoint to ./results/TAPT/checkpoint-200
Configuration saved in ./results/TAPT/checkpoint-200/config.json
Model weights saved in ./results/TAPT/checkpoint-200/pytorch_model.bin
Deleting older checkpoint [results/TAPT/checkpoint-1800] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 259
  Batch size = 8
Saving model checkpoint to ./results/TAPT/checkpoint-300
Configuration saved in ./results/TAPT/checkpoint-300/config.json
Model weights saved in ./results/TAPT/checkpoint-300/pytorch_model.bin
Deleting older checkpoint [results/TAPT/ch

VBox(children=(Label(value=' 12.69MB of 12.69MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.…

### Sanity Check

In [83]:
""" MLM TESTING """

# Load pretrained model
emandai_model = AutoModelForMaskedLM.from_pretrained("./results/TAPT/checkpoint-2500")
vinai_model = AutoModelForMaskedLM.from_pretrained(phobert_base_checkpoint)

tokenizer = AutoTokenizer.from_pretrained(phobert_base_checkpoint, use_fast=False)

emandai_unmasker = pipeline("fill-mask", model=emandai_model, tokenizer=tokenizer)
vinai_unmasker = pipeline("fill-mask", model=vinai_model, tokenizer=tokenizer)

loading configuration file ./results/TAPT/checkpoint-2500/config.json
Model config RobertaConfig {
  "_name_or_path": "vinai/phobert-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 258,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "PhobertTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 64001
}

loading weights file ./results/TAPT/checkpoint-2500/pytorch_model.bin
All model checkpoint weights were used when initializing RobertaForMaskedLM.



In [84]:
emandai_model.eval()
vinai_model.eval()

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [85]:
""" MLM TESTING """

text = _ws("lấy tiền gửi cho cháu đi học thêm")
text

'lấy tiền gửi cho cháu đi học thêm'

In [86]:
emandai_unmasker("lấy tiền gửi cho cháu đi học <mask>")

[{'sequence': 'lấy tiền gửi cho cháu đi học thêm',
  'score': 0.6051039099693298,
  'token': 143,
  'token_str': 't h ê m'},
 {'sequence': 'lấy tiền gửi cho cháu đi học đại_học',
  'score': 0.1962745040655136,
  'token': 956,
  'token_str': 'đ ạ i _ h ọ c'},
 {'sequence': 'lấy tiền gửi cho cháu đi học ngoại_ngữ',
  'score': 0.05282137915492058,
  'token': 4408,
  'token_str': 'n g o ạ i _ n g ữ'},
 {'sequence': 'lấy tiền gửi cho cháu đi học nữa',
  'score': 0.03260941058397293,
  'token': 348,
  'token_str': 'n ữ a'},
 {'sequence': 'lấy tiền gửi cho cháu đi học nước_ngoài',
  'score': 0.01431557722389698,
  'token': 516,
  'token_str': 'n ư ớ c _ n g o à i'}]

In [21]:
vinai_unmasker("lấy tiền gửi cho cháu đi học <mask>")

[{'sequence': 'lấy tiền gửi cho cháu đi học.',
  'score': 0.9702707529067993,
  'token': 5,
  'token_str': '.'},
 {'sequence': 'lấy tiền gửi cho cháu đi học :',
  'score': 0.013175510801374912,
  'token': 27,
  'token_str': ':'},
 {'sequence': 'lấy tiền gửi cho cháu đi học ;',
  'score': 0.006559078581631184,
  'token': 65,
  'token_str': ';'},
 {'sequence': 'lấy tiền gửi cho cháu đi học...',
  'score': 0.006110189016908407,
  'token': 135,
  'token_str': '...'},
 {'sequence': 'lấy tiền gửi cho cháu đi học?',
  'score': 0.00104904908221215,
  'token': 114,
  'token_str': '?'}]

## SVM BASELINE

In [27]:
from pprint import pformat

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

In [28]:
pipeline = Pipeline([
    ("vect", TfidfVectorizer()),
    ("scaler", MaxAbsScaler()),
    ("clf", OneVsRestClassifier(LinearSVC(random_state=1337), n_jobs=-1))
])
grid = [{
        "vect__ngram_range": [(1, 1), (1, 2), (2, 2)],
        "vect__max_df": [0.2, 0.5, 0.75, 1.0],
        "clf__estimator__C": [0.01, 0.1, 1, 10],
        "clf__estimator__class_weight": ["balanced", None]
}]

gridcv = GridSearchCV(estimator=pipeline,
                      param_grid=grid,
                      cv=5,
                      scoring="f1_macro",
                      n_jobs=-1,
                      return_train_score=True)
gridcv.fit([*train_texts, *val_texts], [*train_labels, *val_labels])

# Statistic verbose
print(f"Best score: {gridcv.best_score_:.3f}", end="\n\n")
print(f"Best params:\n {pformat(gridcv.best_params_)}", end="\n\n")
print(f"Best estimator:\n {pformat(gridcv.best_estimator_)}", end="\n\n")

linear_svm = gridcv.best_estimator_



Best score: 0.733

Best params:
 {'clf__estimator__C': 0.1,
 'clf__estimator__class_weight': 'balanced',
 'vect__max_df': 0.5,
 'vect__ngram_range': (1, 1)}

Best estimator:
 Pipeline(steps=[('vect', TfidfVectorizer(max_df=0.5)),
                ('scaler', MaxAbsScaler()),
                ('clf',
                 OneVsRestClassifier(estimator=LinearSVC(C=0.1,
                                                         class_weight='balanced',
                                                         random_state=1337),
                                     n_jobs=-1))])





In [29]:
def _evaluate(model, X, y):
    y_preds = model.predict(X)
    f1_scores = f1_score(y, y_preds, average="macro")
    return f1_scores

# Evalute on dev set
# preds = linear_svm.predict(val_texts)
# f1_scores = f1_score(val_labels, preds, average="macro")
# dev_f1_score = _evaluate(linear_svm, val_texts, val_labels)
# print(f"F1 macro [DEV SET]: {dev_f1_score}")

# Evalute on test set
test_f1_score = _evaluate(linear_svm, test_texts, test_labels)
print(f"F1 macro [TEST SET]: {test_f1_score}")

F1 macro [TEST SET]: 0.6370415684180867


In [34]:
# Evalute on REAL test set
real_test_f1_score = _evaluate(linear_svm, X_test, y_test)
print(f"F1 macro [REAL TEST SET]: {real_test_f1_score}")

F1 macro [REAL TEST SET]: 0.3203896073664704


In [None]:
""" CONFUSION MATRIX [SVM] """

y_preds = linear_svm.predict(X_test)
cm = confusion_matrix(y_test, y_preds)

print(len(np.unique(y_preds)))

plt.figure(figsize=(12, 10))
sns.heatmap(cm)