# 文本相似度

## Step1 載入套件

In [1]:
!pip install transformers[torch]
!pip install datasets
!pip install evaluate
!pip install trainer

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Step2 下載數據集

In [4]:
dataset = load_dataset("json", data_files="/content/drive/MyDrive/Colab_Notebooks/NLP_tutorial/Transformers 大祕寶/transformers-code/sunny_huginfs_NLP/advance_task/04文本相似度/simCLUE_train_pair_1w.json", split="train")
dataset

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 10000
})

In [5]:
dataset[0]

{'sentence1': '找一部小时候的动画片', 'sentence2': '求一部小时候的动画片。谢了', 'label': '1'}

## Step3 切分數據集

In [6]:
datasets = dataset.train_test_split(test_size=0.2)
datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
})

## Step4 資料預處理

In [7]:
sen_0 = datasets['train'][:1]
sen_0

{'sentence1': ['最后，议案宣称今年己征收的人头税为数不足，应在全国范围普遍加征每人四先令，凡拒绝向政府重新宣誓的人应付双倍的税款。'],
 'sentence2': ['不难想象，这一切，非但不能安慰他，反而更增加了他的痛苦。'],
 'label': ['0']}

### 預處理結果測試

In [8]:
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

sentences = []
labels = []
for sen1, sen2, label in zip(sen_0["sentence1"], sen_0["sentence2"], sen_0["label"]):
    # print(sen1)
    # print(sen2)
    sentences.append(sen1)
    sentences.append(sen2)
    labels.append(1 if int(label) == 1 else -1)

print(sentences)
print(labels)

tokenized_examples = tokenizer(sentences, max_length=128, truncation=True, padding="max_length")
print(tokenized_examples)

tokenized_examples_ = {k: [v[i: i + 2] for i in range(0, len(v), 2)] for k, v in tokenized_examples.items()} # batch 表示 2D -> 3D
print(tokenized_examples_)




tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

['最后，议案宣称今年己征收的人头税为数不足，应在全国范围普遍加征每人四先令，凡拒绝向政府重新宣誓的人应付双倍的税款。', '不难想象，这一切，非但不能安慰他，反而更增加了他的痛苦。']
[-1]
{'input_ids': [[101, 3297, 1400, 8024, 6379, 3428, 2146, 4917, 791, 2399, 2346, 2519, 3119, 4638, 782, 1928, 4925, 711, 3144, 679, 6639, 8024, 2418, 1762, 1059, 1744, 5745, 1741, 3249, 6881, 1217, 2519, 3680, 782, 1724, 1044, 808, 8024, 1127, 2867, 5318, 1403, 3124, 2424, 7028, 3173, 2146, 6292, 4638, 782, 2418, 802, 1352, 945, 4638, 4925, 3621, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 679, 7410, 2682, 6496, 8024, 6821, 671, 1147, 8024, 7478, 852, 679, 5543, 2128, 2720, 800, 8024, 1353, 5445, 3291, 1872, 1217, 749, 800, 4638, 4578, 5736, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [9]:
import torch

tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

def process_function(examples):
    sentences = []
    labels = []
    for sen1, sen2, label in zip(examples["sentence1"], examples["sentence2"], examples["label"]):
        sentences.append(sen1)
        sentences.append(sen2)
        labels.append(1 if int(label) == 1 else -1) #　強制二分類
    # input_ids, attention_mask, token_type_ids
    tokenized_examples = tokenizer(sentences, max_length=128, truncation=True, padding="max_length")
    tokenized_examples = {k: [v[i: i + 2] for i in range(0, len(v), 2)] for k, v in tokenized_examples.items()}
    tokenized_examples["labels"] = labels
    return tokenized_examples

tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)
tokenized_datasets

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [10]:
tokenized_datasets["train"][0]['labels']

-1

In [11]:
print(tokenized_datasets["train"][0])

{'input_ids': [[101, 3297, 1400, 8024, 6379, 3428, 2146, 4917, 791, 2399, 2346, 2519, 3119, 4638, 782, 1928, 4925, 711, 3144, 679, 6639, 8024, 2418, 1762, 1059, 1744, 5745, 1741, 3249, 6881, 1217, 2519, 3680, 782, 1724, 1044, 808, 8024, 1127, 2867, 5318, 1403, 3124, 2424, 7028, 3173, 2146, 6292, 4638, 782, 2418, 802, 1352, 945, 4638, 4925, 3621, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 679, 7410, 2682, 6496, 8024, 6821, 671, 1147, 8024, 7478, 852, 679, 5543, 2128, 2720, 800, 8024, 1353, 5445, 3291, 1872, 1217, 749, 800, 4638, 4578, 5736, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Step5 建立模型

In [12]:
from transformers import BertForSequenceClassification, BertPreTrainedModel, BertModel
from typing import Optional
from transformers.configuration_utils import PretrainedConfig
from torch.nn import CosineSimilarity, CosineEmbeddingLoss

class DualModel(BertPreTrainedModel):

    def __init__(self, config: PretrainedConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.bert = BertModel(config)
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Step1 分别獲得sentenceA 和 sentenceB的输入
        senA_input_ids, senB_input_ids = input_ids[:, 0], input_ids[:, 1] # [batch: 0] [batch: 1]
        senA_attention_mask, senB_attention_mask = attention_mask[:, 0], attention_mask[:, 1]
        senA_token_type_ids, senB_token_type_ids = token_type_ids[:, 0], token_type_ids[:, 1]

        # Step2 分别獲得sentenceA 和 sentenceB的向量表示
        senA_outputs = self.bert(
            senA_input_ids,
            attention_mask=senA_attention_mask,
            token_type_ids=senA_token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        senA_pooled_output = senA_outputs[1]    # [batch, hidden=768] [CLS] output

        senB_outputs = self.bert(
            senB_input_ids,
            attention_mask=senB_attention_mask,
            token_type_ids=senB_token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        senB_pooled_output = senB_outputs[1]    # [batch, hidden]

        # step3 計算相似度

        cos = CosineSimilarity()(senA_pooled_output, senB_pooled_output)    # [batch, ]

        # step4 計算loss

        loss = None
        if labels is not None:
            loss_fct = CosineEmbeddingLoss(0.3)
            loss = loss_fct(senA_pooled_output, senB_pooled_output, labels)

        output = (cos,)
        return ((loss,) + output) if loss is not None else output

model = DualModel.from_pretrained("hfl/chinese-macbert-base")

pytorch_model.bin:   0%|          | 0.00/412M [00:00<?, ?B/s]

## Step6 建立評估函數

In [13]:
import evaluate

acc_metric = evaluate.load("accuracy")
f1_metirc = evaluate.load("f1")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [14]:
def eval_metric(eval_predict):
    predictions, labels = eval_predict
    predictions = [int(p > 0.7) for p in predictions]
    labels = [int(l > 0) for l in labels]
    # predictions = predictions.argmax(axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metirc.compute(predictions=predictions, references=labels)
    acc.update(f1)
    return acc



## Step7 撰寫TrainingArguments

In [15]:
train_args = TrainingArguments(output_dir="./dual_model",      # 输出文件夹
                               per_device_train_batch_size=32,  # 训练时的batch_size
                               per_device_eval_batch_size=32,  # 验证时的batch_size
                               logging_steps=10,                # log 打印的频率
                               evaluation_strategy="epoch",     # 评估策略
                               save_strategy="epoch",           # 保存策略
                               save_total_limit=3,              # 最大保存数
                               learning_rate=2e-5,              # 学习率
                               weight_decay=0.01,               # weight_decay
                               metric_for_best_model="f1",      # 设定评估指标
                               load_best_model_at_end=True)     # 训练完成后加载最优模型



## Step8 建立Trainer

In [16]:
trainer = Trainer(model=model,
                  args=train_args,
                  train_dataset=tokenized_datasets["train"],
                  eval_dataset=tokenized_datasets["test"],
                  compute_metrics=eval_metric)

## Step9 模型訓練

In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1909,0.183527,0.793,0.757327
2,0.1769,0.174736,0.81,0.774614
3,0.1141,0.172357,0.807,0.76969


TrainOutput(global_step=750, training_loss=0.17786832841237385, metrics={'train_runtime': 365.7704, 'train_samples_per_second': 65.615, 'train_steps_per_second': 2.05, 'total_flos': 3157275967488000.0, 'train_loss': 0.17786832841237385, 'epoch': 3.0})

### - config.json:
This file contains the configuration of the model—such as the architecture, the size of the model, and various hyperparameters. It allows for the model to be re-loaded with the same configuration settings it was initially trained with.

### - model.safernsensors
(likely model.<something>.bin): It looks like there may be a typo in the filename. Usually, this would be model.bin or pytorch_model.bin, which is the actual saved model weights. This binary file holds the state_dict of the PyTorch model, which includes all the learned parameters of the model.

### - optimizer.pt:
This is a PyTorch file that saves the state of the optimizer. It includes the current values of all optimizer parameters, like learning rate, weight decay, etc., which are necessary for continuing training from the exact point it was saved.

### - scheduler.pt:
This file contains the state of the learning rate scheduler. If you're using a learning rate scheduler to change the learning rate over time, this file saves its state so that you can resume training with the learning rate schedule intact.

### - trainer_state.json: This JSON file contains the state of the Trainer itself, including information about the number of training steps, the current epoch, and other training-related metadata.

### - training_args.bin:
This binary file contains the arguments used to set up the training process. This includes all the arguments that were passed to the TrainingArguments class in the transformers library, which governs the behavior of the Trainer.

### - rng_state.pth:
This file stores the state of the random number generator (RNG) used during training. Saving the RNG state is crucial for reproducibility purposes, as it allows you to recreate the exact same sequence of random numbers, which is essential for debugging and for re-creating experiments exactly.

## Step10 模型評估

In [18]:
trainer.evaluate(tokenized_datasets["test"])

{'eval_loss': 0.17473584413528442,
 'eval_accuracy': 0.81,
 'eval_f1': 0.7746144721233688,
 'eval_runtime': 9.6132,
 'eval_samples_per_second': 208.048,
 'eval_steps_per_second': 6.554,
 'epoch': 3.0}

## Step11 模型預測

In [19]:
class SentenceSimilarityPipeline:

    def __init__(self, model, tokenizer) -> None:
        self.model = model.bert
        self.tokenizer = tokenizer
        self.device = model.device

    def preprocess(self, senA, senB):
        return self.tokenizer([senA, senB], max_length=128, truncation=True, return_tensors="pt", padding=True)

    def predict(self, inputs):
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        return self.model(**inputs)[1]  # [2, 768]

    def postprocess(self, logits):
        cos = CosineSimilarity()(logits[None, 0, :], logits[None,1, :]).squeeze().cpu().item()
        return cos

    def __call__(self, senA, senB, return_vector=False):
        inputs = self.preprocess(senA, senB)
        logits = self.predict(inputs)
        result = self.postprocess(logits)
        if return_vector:
            return result, logits
        else:
            return result

In [20]:
pipe = SentenceSimilarityPipeline(model, tokenizer)

In [22]:
pipe("我愛台灣", "台灣真棒", return_vector=True)

(0.839076042175293,
 tensor([[-0.9461, -0.8767,  0.9334,  ...,  0.5494,  0.4754, -0.9137],
         [-0.7233, -0.9641,  0.9992,  ..., -0.0028, -0.3723, -0.9966]],
        device='cuda:0', grad_fn=<TanhBackward0>))

In [25]:
# save 模型參數

save_directory = "/content/drive/MyDrive/Colab_Notebooks/NLP_tutorial/Transformers 大祕寶/transformers-code/sunny_huginfs_NLP/advance_task/04文本相似度"
model.save_pretrained(save_directory)


In [24]:
!pwd

/content
