# PID

In [1]:
import os

os.getpid()

124

# Prepare

In [54]:
import datetime
import pathlib
import json

import torch as pt
import transformers
from transformers import (
    BertModel, BertConfig, BertTokenizer, AutoTokenizer,
    AlbertModel, AlbertConfig
)
from tokenizers import BertWordPieceTokenizer
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

## params

In [3]:
transformers.logging.set_verbosity_error()

today_dt = datetime.datetime.today()

today_yyyymmdd = today_dt.strftime("%Y%m%d")

max_seq_len = 128
test_rate = 0.2

batch_size = 8

In [4]:
# local model
model_dir = "/home/ccuulinay/pretrained/albert_chinese_small/"
# If it's TF model, it's checkpoint files or keras bin file in model folder.
# model_ckpt = pathlib.Path(model_dir) / "albert_model.ckpt.index"
# model_config = pathlib.Path(model_dir) / "albert_config_base.json"
vocab_file = pathlib.Path(model_dir) / "vocab.txt"

tokenizer = BertTokenizer.from_pretrained(model_dir)
t_tknrs = BertWordPieceTokenizer(str(vocab_file))
t_tknrs.enable_truncation(max_length=max_seq_len)

albert_model = AlbertModel.from_pretrained(model_dir)
albert_config = AlbertConfig.from_pretrained(model_dir)

## test-out

In [5]:
test_input_txt = "几天心情很好啊，买了很多东西，我特别喜欢，终于有了自己喜欢的电子产品，这次总算可以好好学习了。"
text_input_txt_list = ['我爱你','猫不是狗']
encoded_pos = tokenizer.encode(test_input_txt)
input_ids = pt.tensor(encoded_pos).view(-1, len(encoded_pos))
outputs = albert_model(input_ids)

In [6]:
outputs[0].shape, outputs[1].shape

(torch.Size([1, 49, 384]), torch.Size([1, 384]))

In [7]:
len(test_input_txt)

47

In [8]:
# a = t_tknrs.encode(test_input_txt)
b = tokenizer(*text_input_txt_list, padding="max_length", truncation=True, max_length=max_seq_len)
b

{'input_ids': [101, 2769, 4263, 872, 102, 4344, 679, 3221, 4318, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [9]:
class FullConnAlbertCls(pt.nn.Module):
    def __init__(self, 
        bert_model, bert_config, num_class, 
        dropout_rate=0.4
    ):
        super(FullConnAlbertCls, self).__init__()
        self.bert_model = bert_model
        self.dropout = pt.nn.Dropout(dropout_rate)
        self.fc1 = pt.nn.Linear(bert_config.hidden_size, bert_config.hidden_size)
        self.fc2 = pt.nn.Linear(bert_config.hidden_size, num_class)

    def forward(self, token_ids, type_ids=None):
        # model output[0] with size [batch_size, max_seq_len/seq_len, hidden_size]
        if type_ids is not None:
            x = self.bert_model(token_ids, type_ids)[1]
        else:
            x = self.bert_model(token_ids)[1]
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.dropout(x)
        out = self.fc2(x) # With size [batch_size, num_class]
        return out

# Data


## Read data

In [10]:
data_f = "/mnt/e/data_worlds/中文保险小样本多任务竞赛-Data-Baseline-V3-20220509/data/opensource_sample_500.json"

with open(data_f, "r") as f:
    raw_d = json.load(f)
    print(raw_d.get("mrc").keys())
    raw_df = pd.DataFrame(raw_d.get("mrc").get("dureader_yesno").get("data_list"))
    raw_df["label_y"] = raw_df["label"].map({
        "Yes": 1,
        "No": 0,
        "Depends": 2
    })

dict_keys(['cmrc', 'drcd', 'dureader_robust', 'dureader_checklist', 'dureader_yesno', 'cail2019', 'cail2020', 'c3'])


In [11]:
raw_df["label_y"].value_counts()

1    263
0    163
2     74
Name: label_y, dtype: int64

In [12]:
263 / (263 + 163 + 74)

0.526

In [13]:
raw_df[["text_a", "text_b", "label_y"]].head()

Unnamed: 0,text_a,text_b,label_y
0,首都机场t2有免税店吗,2号楼和3号楼的国际出发和国际到达行李提取厅都有免税店。,1
1,有姓催的吗,今河南省的方城县有催氏族人分布。,1
2,马尼拉要签证吗,需要签证。,1
3,美国股市是t+0的吗,中国股票是t加1模式，美国是t加0,1
4,滴滴打车会便宜吗,一个是快车，收费规则是时间+公里数，比如一公里1块钱一分钟1块钱，走一公里用了3分钟就是4块...,2


## Preprocess data
1. Encode data - tokenize/put into TensorDataSet

In [14]:
# a = t_tknrs.encode(
#     *(train_df[["text_a", "text_b"]].iloc[0].values)
# )
# a.pad(128)


# tokenizer(
#     *(train_df[["text_a", "text_b"]].iloc[0].values)
#     , padding="max_length", truncation=True, max_length=max_seq_len
#     # , return_tensors='pt'
# )

def encode_data(
    data, labels=None, 
    tokenzier=None, max_len=128, padding="max_length", truncation=True,
    output_tensor=True
):
    X_word_ids = []
    X_masks = []
    X_seq = []
    
    for x in tqdm(data):
        if isinstance(x, str):
            t_encoder = tokenizer(
                x
                , padding="max_length", truncation=True, max_length=max_len
                # , return_tensors='pt'
            )
        else:
            t_encoder = tokenizer(
                x[0], x[1]
                , padding="max_length", truncation=True, max_length=max_len
                # , return_tensors='pt'
            )
        X_word_ids.append(t_encoder["input_ids"])
        X_masks.append(t_encoder["attention_mask"])
        X_seq.append(t_encoder["token_type_ids"])
    
    X_word_ids = np.array(X_word_ids)
    X_masks = np.array(X_masks)
    X_seq = np.array(X_seq)
    
    if labels is not None:
        Y = [
            int(y) for y in tqdm(labels)
        ]
        
        return {
            "input_word_ids": X_word_ids,
            "attention_mask": X_masks,
            "token_type_ids": X_seq,
            "label": Y
        }
    else:
        return {
            "input_word_ids": X_word_ids,
            "attention_mask": X_masks,
            "token_type_ids": X_seq
        }
    
def flat_accuracy(preds, labels):
    """A function for calculating accuracy scores"""
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, pred_flat)

In [15]:
class DataGen(pt.utils.data.Dataset):
    
    def __init__(self, data, label):
        self.data = data
        self.lable = label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return np.array(self.data[index], np.array(self.label[index]))



In [16]:
raw_input_d = encode_data(
    raw_df[["text_a", "text_b"]].values,
    raw_df["label_y"].values,
    tokenizer
)

input_tensor = pt.utils.data.TensorDataset(
    pt.tensor(raw_input_d["input_word_ids"]),
    pt.tensor(raw_input_d["token_type_ids"]),
    pt.tensor(raw_input_d["attention_mask"]),
    pt.tensor(raw_input_d["label"])
)

100%|██████████| 500/500 [00:00<00:00, 2265.13it/s]
100%|██████████| 500/500 [00:00<00:00, 1170285.71it/s]


## Split train and test

In [17]:
train_df, test_df = train_test_split(
    raw_df[["text_a", "text_b", "label_y"]], test_size=test_rate, random_state=42, shuffle=True
)

train_input_d = encode_data(
    train_df[["text_a", "text_b"]].values,
    train_df["label_y"].values,
    tokenizer
)
test_input_d = encode_data(
    test_df[["text_a", "text_b"]].values,
    test_df["label_y"].values,
    tokenizer
)

train_input = pt.utils.data.TensorDataset(
    pt.tensor(train_input_d["input_word_ids"]),
    pt.tensor(train_input_d["token_type_ids"]),
    pt.tensor(train_input_d["attention_mask"]),
    pt.tensor(train_input_d["label"])
)

test_input = pt.utils.data.TensorDataset(
    pt.tensor(test_input_d["input_word_ids"]),
    pt.tensor(test_input_d["token_type_ids"]),
    pt.tensor(test_input_d["attention_mask"]),
    pt.tensor(test_input_d["label"])
)

100%|██████████| 400/400 [00:00<00:00, 2281.01it/s]
100%|██████████| 400/400 [00:00<00:00, 1142862.13it/s]
100%|██████████| 100/100 [00:00<00:00, 2053.06it/s]
100%|██████████| 100/100 [00:00<00:00, 639375.61it/s]


In [18]:
train_data_loader = pt.utils.data.DataLoader(
    train_input, batch_size=8
)

test_data_loader = pt.utils.data.DataLoader(
   test_input, batch_size=1
)

# Build cls

In [19]:
cls = FullConnAlbertCls(albert_model, albert_config, 3)
device = pt.device("cuda:0") if pt.cuda.is_available() else "cpu"
cls = cls.to(device)

# loss func and optimizer
criterion = pt.nn.CrossEntropyLoss() # CrossEntropyLoss, which already combines the softmax plus the negative-likelihood

optimizer = pt.optim.SGD(
    cls.parameters(), lr=0.007, momentum=0.9, weight_decay=1e-4
)

In [20]:
list(train_data_loader)[0][3].to(device)

tensor([1, 1, 0, 1, 1, 0, 1, 1], device='cuda:0')

In [21]:
# outputs.detach().cpu().numpy()
# outputs.argmax(1)

In [22]:
len(train_input)

400

In [23]:
for epoch in range(100):
    total_loss = 0
    accu = 0
    
    cls.train()
    for step,batch in enumerate(train_data_loader):
        labels=batch[3].to(device)
        token_ids = batch[0].to(device)
        type_ids = batch[1].to(device)
        
        # print(token_ids)
        
        outputs  = cls(token_ids = token_ids,
            type_ids = type_ids,
            # attention_mask=batch[2].to(device),
            # labels=batch[3].to(device)
        )    #输出loss 和 每个分类对应的输出，softmax后才是预测是对应分类的概率
        
        # loss, logits = outputs.loss, outputs.logits
        # logits = outputs.argmax(1)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        
        # total_loss += loss.cpu().data.numpy()
        # total_loss += loss.cuda().data.cpu().numpy()
        loss.backward()
        
        pt.nn.utils.clip_grad_norm_(cls.parameters(), 1.0)
        optimizer.step()
        
        total_loss += loss.data.cpu().numpy()
        accu += (outputs.argmax(1) == labels).sum().cpu().data.numpy()

            
    cls.eval()
    
    test_loss_sum = 0
    test_accu = 0
    print(f"++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print('Epoch:' , epoch)
    for i, batch in enumerate(test_data_loader):
        labels=batch[3].to(device)
        token_ids = batch[0].to(device)
        type_ids = batch[1].to(device)
        with pt.no_grad():
            outputs  = cls(
                token_ids = token_ids,
                type_ids = type_ids,
                # attention_mask=batch[2].to(device),
                # labels=batch[3].to(device)
            )    #输出loss 和 每个分类对应的输出，softmax后才是预测是对应分类的概率
            loss = criterion(outputs, labels)
            # print(f"loss: {loss}")
            test_loss_sum  += loss.data.cpu().numpy()
            test_accu += (outputs.argmax(1) == labels).sum().cpu().data.numpy()
            
    print(f"train loss: {total_loss / len(train_input)}, train accuracy: {accu/ len(train_input)} ")
    print(f"test loss: {test_loss_sum / len(test_input)}, test accuracy: {test_accu/ len(test_input)} ")
    print(f"++++++++++++++++++++++++++++++++++++++++++++++++++++++")

++++++++++++++++++++++++++++++++++++++++++++++++++++++
Epoch: 0
train loss: 0.13672034934163094, train accuracy: 0.44 
test loss: 0.9654735785722732, test accuracy: 0.58 
++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++
Epoch: 1
train loss: 0.13020988255739213, train accuracy: 0.46 
test loss: 0.961404920220375, test accuracy: 0.58 
++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++
Epoch: 2
train loss: 0.127969002276659, train accuracy: 0.4675 
test loss: 0.9565609896183014, test accuracy: 0.53 
++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++
Epoch: 3
train loss: 0.1231536403298378, train accuracy: 0.52 
test loss: 0.893791743516922, test accuracy: 0.62 
++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++
Epoch: 4
train loss: 0.11397727891802788, tr

++++++++++++++++++++++++++++++++++++++++++++++++++++++
Epoch: 36
train loss: 7.09540961906896e-06, train accuracy: 1.0 
test loss: 2.708929895881065, test accuracy: 0.62 
++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++
Epoch: 37
train loss: 7.608994060319674e-06, train accuracy: 1.0 
test loss: 2.7051730309561752, test accuracy: 0.62 
++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++
Epoch: 38
train loss: 6.445751646424469e-06, train accuracy: 1.0 
test loss: 2.706259582798666, test accuracy: 0.62 
++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++
Epoch: 39
train loss: 7.372726770427107e-06, train accuracy: 1.0 
test loss: 2.7085485395543767, test accuracy: 0.62 
++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++
Epoch: 40
train loss: 5.878107103853

train loss: 3.7058863881611613e-06, train accuracy: 1.0 
test loss: 2.8017923095253625, test accuracy: 0.62 
++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++
Epoch: 73
train loss: 4.233885227904466e-06, train accuracy: 1.0 
test loss: 2.8115698296633083, test accuracy: 0.62 
++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++
Epoch: 74
train loss: 3.931008980089245e-06, train accuracy: 1.0 
test loss: 2.8117777423235477, test accuracy: 0.62 
++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++
Epoch: 75
train loss: 3.4020215207419826e-06, train accuracy: 1.0 
test loss: 2.812325235938947, test accuracy: 0.62 
++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++
Epoch: 76
train loss: 3.210800035731154e-06, train accuracy: 1.0 
test loss: 2.814059286439924, t

In [26]:
print("Model's state_dict:")
for param_tensor in cls.state_dict():
    print(param_tensor, "\t", cls.state_dict()[param_tensor].size())

Model's state_dict:
bert_model.embeddings.position_ids 	 torch.Size([1, 512])
bert_model.embeddings.word_embeddings.weight 	 torch.Size([21128, 128])
bert_model.embeddings.position_embeddings.weight 	 torch.Size([512, 128])
bert_model.embeddings.token_type_embeddings.weight 	 torch.Size([2, 128])
bert_model.embeddings.LayerNorm.weight 	 torch.Size([128])
bert_model.embeddings.LayerNorm.bias 	 torch.Size([128])
bert_model.encoder.embedding_hidden_mapping_in.weight 	 torch.Size([384, 128])
bert_model.encoder.embedding_hidden_mapping_in.bias 	 torch.Size([384])
bert_model.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.weight 	 torch.Size([384])
bert_model.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.bias 	 torch.Size([384])
bert_model.encoder.albert_layer_groups.0.albert_layers.0.attention.query.weight 	 torch.Size([384, 384])
bert_model.encoder.albert_layer_groups.0.albert_layers.0.attention.query.bias 	 torch.Size([384])
bert_model.encoder.alb

## Save model states

In [30]:
pt.save(cls.state_dict(), "./output_models/pt_albert_cls_states.pt")

pt.save(cls, "./output_models/pt_albert_cls.pt")

In [28]:
cv_cls = FullConnAlbertCls(albert_model, albert_config, 3)
cv_cls.load_state_dict(pt.load("./output_models/pt_albert_cls_states.pt"))

<All keys matched successfully>

In [31]:
ecv_cls = pt.load("./output_models/pt_albert_cls.pt")

In [51]:
cv_cls = cv_cls.to(device)
cv_cls.eval()
preds = []
test_lbls = []

for i, batch in enumerate(test_data_loader):
    with pt.no_grad():
        pred_outs = cv_cls(

            token_ids = batch[0].to(device),
            type_ids = batch[1].to(device),
        )
        pred_logits = pred_outs.detach().cpu().numpy()
        preds += list(np.argmax(pred_logits, axis=1))
        test_lbls += list(batch[3].cpu().numpy())

In [55]:
accuracy_score(preds, test_lbls)

0.64

In [57]:
print(classification_report(preds, test_lbls))

              precision    recall  f1-score   support

           0       0.63      0.49      0.55        35
           1       0.75      0.77      0.76        57
           2       0.21      0.38      0.27         8

    accuracy                           0.64       100
   macro avg       0.53      0.54      0.53       100
weighted avg       0.66      0.64      0.65       100

