# Method 1: Cross-Model

<div>
<img src="figs/07_plan_1.jpg" width="1000"/>
</div>

## Dataset
https://github.com/CLUEbenchmark/SimCLUE

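We only use a 10,000-pair subset of it. The code below loads it from `data/train_pair_1w.json`, so place the downloaded file under `data/`: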
!wget https://raw.githubusercontent.com/zyds/transformers-code/master/02-NLP%20Tasks/12-sentence_similarity/train_pair_1w.json

## Step1 导入相关包

In [19]:
import os
# os.environ["WORLD_SIZE"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
from transformers import DataCollatorWithPadding
import evaluate
from transformers import pipeline

## Step2 数据集准备

In [2]:


dataset = load_dataset("json", data_files="data/train_pair_1w.json", split="train")
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 10000
})

In [3]:
datasets = dataset.train_test_split(test_size=0.2)
datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
})

In [4]:
datasets['train']['label'][:5]

['0', '1', '0', '1', '1']

In [5]:
datasets['train']['sentence1'][:5]

['不过，他的心里从来没有产生过能够达到仆么目的的想法。',
 '我说。地地道道的真货，阿尔夫说，读吧。',
 '一个穿着绿色毛衣和牛仔裤的男人坐在草坪椅上，脚下有许多购物袋。',
 '可是现在呢，您听，不是门钩在响吗？',
 '世界上最高的山是什么山啊']

In [6]:

checkpoint = "hfl/chinese-macbert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def process_function(examples):
    """Tokenize a batch of sentence pairs for binary classification.

    Each ``sentence1``/``sentence2`` pair is encoded together as one input
    (truncated to at most 128 tokens), and the string labels are cast to
    ``int`` and stored under ``"labels"``.
    """
    encoded = tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        max_length=128,
        truncation=True,
    )
    encoded["labels"] = list(map(int, examples["label"]))
    return encoded

tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)
tokenized_datasets

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [7]:
print(tokenized_datasets['train'][0])

{'input_ids': [101, 679, 6814, 8024, 800, 4638, 2552, 7027, 794, 3341, 3766, 3300, 772, 4495, 6814, 5543, 1916, 6809, 1168, 789, 720, 4680, 4638, 4638, 2682, 3791, 511, 102, 671, 702, 2134, 3136, 4312, 8024, 671, 702, 2134, 3136, 4312, 800, 2552, 7027, 1353, 1908, 2682, 6887, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': 0}


In [8]:
tokenizer.decode(tokenized_datasets['train'][0]['input_ids'])

'[CLS] 不 过 ， 他 的 心 里 从 来 没 有 产 生 过 能 够 达 到 仆 么 目 的 的 想 法 。 [SEP] 一 个 宗 教 狂 ， 一 个 宗 教 狂 他 心 里 反 复 想 道 。 [SEP]'

## 创建模型

In [14]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 创建评估函数

In [9]:
acc_metric = evaluate.load("accuracy")
f1_metirc = evaluate.load("f1")

In [16]:
def eval_metric(eval_predict):
    """Compute accuracy and F1 from classification logits.

    Args:
        eval_predict: A ``(logits, labels)`` pair; the predicted class is
            the argmax over the last axis of ``logits``.

    Returns:
        The accuracy result dict extended with the F1 result.
    """
    logits, references = eval_predict
    predicted_classes = logits.argmax(axis=-1)
    scores = acc_metric.compute(predictions=predicted_classes, references=references)
    scores.update(f1_metirc.compute(predictions=predicted_classes, references=references))
    return scores

## 训练

In [10]:
# Training configuration for the cross-encoder (single-model) approach.
train_args = TrainingArguments(num_train_epochs=3,  # number of training epochs
                               output_dir="./cross_model",      # output directory
                               per_device_train_batch_size=32,  # per-device batch size for training
                               per_device_eval_batch_size=128,  # per-device batch size for evaluation
                               logging_steps=10,                # logging frequency (in steps)
                               evaluation_strategy="epoch",     # evaluation strategy: once per epoch
                               save_strategy="epoch",           # checkpoint strategy: once per epoch
                               save_total_limit=3,              # maximum number of checkpoints kept
                               learning_rate=2e-5,              # learning rate
                               weight_decay=0.01,               # weight decay
                               metric_for_best_model="f1",      # metric used to pick the best model
                               load_best_model_at_end=True,     # load the best model once training finishes
                               report_to="none")                # do not report to TensorBoard

In [20]:
trainer = Trainer(model=model, 
                  args=train_args, 
                  train_dataset=tokenized_datasets["train"], 
                  eval_dataset=tokenized_datasets["test"], 
                  data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                  compute_metrics=eval_metric)

In [21]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=375, training_loss=0.19637160642941792, metrics={'train_runtime': 112.9805, 'train_samples_per_second': 212.426, 'train_steps_per_second': 3.319, 'total_flos': 1577252110237440.0, 'train_loss': 0.19637160642941792, 'epoch': 3.0})

## 模型评估

In [24]:
trainer.evaluate()



{'eval_loss': 0.2675735354423523,
 'eval_accuracy': 0.906,
 'eval_f1': 0.8766404199475065,
 'eval_runtime': 2.5152,
 'eval_samples_per_second': 795.174,
 'eval_steps_per_second': 3.181,
 'epoch': 3.0}

## 预测

In [22]:
model.config.id2label = {0: "不相似", 1: "相似"}
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)


In [23]:
pipe({'text': '我喜欢你', 'text_pair': '我爱你'})

{'label': '不相似', 'score': 0.7726563811302185}

In [26]:
pipe({'text': '我喜欢北京', 'text_pair': '北京是我喜欢的城市'})

{'label': '相似', 'score': 0.9191399812698364}

In [27]:
pipe({'text': '在干嘛', 'text_pair': '苹果是红色的'})

{'label': '不相似', 'score': 0.992119312286377}

## 延伸：如何从多个文本中找到最相似的？
<div>
<img src="figs/07_multiple_spans.jpg" width="1000"/>
</div>

In [11]:
def process_function(examples):
    """Tokenize a batch of sentence pairs with float targets.

    Same pair encoding as the classification setup (both sentences in one
    input, truncated to at most 128 tokens), but the labels are cast to
    ``float`` so they can be used as regression targets.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    batch = tokenizer(first_sentences, second_sentences, max_length=128, truncation=True)
    batch["labels"] = list(map(float, examples["label"]))
    return batch

tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)
print(tokenized_datasets['train'][0])

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'input_ids': [101, 679, 6814, 8024, 800, 4638, 2552, 7027, 794, 3341, 3766, 3300, 772, 4495, 6814, 5543, 1916, 6809, 1168, 789, 720, 4680, 4638, 4638, 2682, 3791, 511, 102, 671, 702, 2134, 3136, 4312, 8024, 671, 702, 2134, 3136, 4312, 800, 2552, 7027, 1353, 1908, 2682, 6887, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': 0.0}


In [12]:
# turn classification to regression
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


```
if self.config.problem_type == "regression":
    loss_fct = MSELoss()
    if self.num_labels == 1:
        loss = loss_fct(logits.squeeze(), labels.squeeze())
    else:
        loss = loss_fct(logits, labels)
```

In [13]:
def eval_metric(eval_predict):
    """Compute accuracy and F1 for the single-output (regression) model.

    Both the predicted scores and the float labels are binarized with a 0.5
    threshold before being passed to the metrics.

    Args:
        eval_predict: A ``(predictions, labels)`` pair of array-likes with
            one score / one label per example.

    Returns:
        The accuracy result dict extended with the F1 result.
    """
    import numpy as np  # local import keeps this cell self-contained

    predictions, labels = eval_predict
    # With num_labels=1 the model emits one value per example (presumably
    # shape (N, 1)). Flatten first so that every element is a true scalar:
    # calling int() on a size-1 array is deprecated since NumPy 1.25.
    predictions = (np.asarray(predictions).reshape(-1) > 0.5).astype(int)
    labels = (np.asarray(labels).reshape(-1) > 0.5).astype(int)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metirc.compute(predictions=predictions, references=labels)
    acc.update(f1)
    return acc

In [16]:
trainer = Trainer(model=model, 
                  args=train_args, 
                  train_dataset=tokenized_datasets["train"], 
                  eval_dataset=tokenized_datasets["test"], 
                  data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                  compute_metrics=eval_metric)

In [17]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.126,0.080431,0.8985,0.872247
2,0.0888,0.075497,0.905,0.869146
3,0.0601,0.063843,0.9165,0.891347


TrainOutput(global_step=375, training_loss=0.10146339257558187, metrics={'train_runtime': 112.3048, 'train_samples_per_second': 213.704, 'train_steps_per_second': 3.339, 'total_flos': 1575001524836736.0, 'train_loss': 0.10146339257558187, 'epoch': 3.0})

In [20]:
model.config.id2label = {0: "不相似", 1: "相似"}
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)

pipe({'text': '我喜欢北京', 'text_pair': '北京是我喜欢的城市'})

{'label': '不相似', 'score': 1.0}

In [23]:
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0, function_to_apply='none')
result = pipe({'text': '我喜欢北京', 'text_pair': '北京是我喜欢的城市'})
result['label'] = '相似' if result['score'] > 0.5 else '不相似'
result

{'label': '相似', 'score': 0.659388542175293}

In [24]:
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0, function_to_apply='none')
result = pipe({'text': '我喜欢北京', 'text_pair': '天气不错'})
result['label'] = '相似' if result['score'] > 0.5 else '不相似'
result

{'label': '不相似', 'score': 0.04212227836251259}

# Method 2: Vector match

<div>
<img src="figs/07_plan_2.jpg" width="1000"/>
</div>

<div>
<img src="figs/07_sentence_pair.jpg" width="1000"/>
</div>

In [1]:
import os
# os.environ["WORLD_SIZE"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
from transformers import DataCollatorWithPadding
import evaluate
from transformers import pipeline

In [27]:
dataset = load_dataset("json", data_files="data/train_pair_1w.json", split="train")
datasets = dataset.train_test_split(test_size=0.2)


checkpoint = "hfl/chinese-macbert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def process_function(examples):
    """Tokenize each sentence separately and regroup the results in pairs.

    Every example contributes two independently tokenized sentences, padded
    to a fixed length of 128 so both fit in one ``[2, 128]`` entry. Labels
    are mapped to ``1`` (label equals 1) or ``-1`` (anything else).
    """
    triples = list(zip(examples["sentence1"], examples["sentence2"], examples["label"]))

    # Interleave: [s1_a, s2_a, s1_b, s2_b, ...]
    flat_sentences = [sentence for first, second, _ in triples for sentence in (first, second)]
    pair_labels = [1 if int(raw) == 1 else -1 for _, _, raw in triples]

    encoded = tokenizer(flat_sentences, max_length=128, truncation=True, padding='max_length')

    # keys in encoded: ['input_ids', 'token_type_ids', 'attention_mask']
    # Regroup consecutive rows (even index, odd index) into sentence pairs.
    paired = {}
    for key, rows in encoded.items():
        paired[key] = [[left, right] for left, right in zip(rows[0::2], rows[1::2])]
    paired["labels"] = pair_labels
    return paired

tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)
tokenized_datasets

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [28]:
tokenizer.decode(tokenized_datasets['train'][0]['input_ids'][0]), tokenizer.decode(tokenized_datasets['train'][0]['input_ids'][1])

('[CLS] 三 个 女 孩 站 在 舞 台 前 的 伞 下 。 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] 三 把 伞 站 在 三 个 女 孩 下 面 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD

In [29]:
datasets['train']['sentence1'][0], datasets['train']['sentence2'][0]

('三个女孩站在舞台前的伞下。', '三把伞站在三个女孩下面')

In [30]:
len(tokenized_datasets['train'][0]['input_ids'])

2

## 构建自己的模型

In [31]:
from transformers import BertPreTrainedModel, PretrainedConfig, BertModel
from typing import Optional, Tuple, Union
from torch.nn import CosineEmbeddingLoss, CosineSimilarity

```
class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()
```

In [32]:
class DualModel(BertPreTrainedModel):
    """Bi-encoder that scores a sentence pair by cosine similarity.

    Both sentences are encoded by the same BERT backbone; their pooled
    outputs are compared with cosine similarity. When labels are supplied,
    a ``CosineEmbeddingLoss`` (margin 0.3) is computed on the two embeddings.
    """

    def __init__(self, config: PretrainedConfig):
        super().__init__(config)
        self.bert = BertModel(config)
        self.post_init()

    @staticmethod
    def _split_sentence_pair(
        pair: Optional[torch.Tensor],
    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
        """Split a ``[batch, 2, ...]`` tensor into its two sentence slices.

        ``None`` is passed through as ``(None, None)`` so optional inputs can
        be omitted and left to the encoder's own defaults.
        """
        if pair is None:
            return None, None
        return pair[:, 0], pair[:, 1]

    def _pooled_embedding(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
        token_type_ids: Optional[torch.Tensor],
        **shared_kwargs,
    ) -> torch.Tensor:
        """Run one sentence through BERT and return its pooled output."""
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **shared_kwargs,
        )
        # Index 1 is the pooled [CLS] representation: [batch_size, hidden_size]
        return outputs[1]

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Tuple[torch.Tensor]:
        """Encode both sentences and compare them.

        Args:
            input_ids: Token ids with the two sentences stacked on axis 1
                (indexed as ``[:, 0]`` and ``[:, 1]``).
            attention_mask: Optional mask laid out like ``input_ids``.
            token_type_ids: Optional segment ids laid out like ``input_ids``.
            labels: Optional targets for ``CosineEmbeddingLoss``; cast to
                float before use.
            position_ids, head_mask, inputs_embeds, output_attentions,
            output_hidden_states, return_dict: Forwarded unchanged to both
                encoder calls.

        Returns:
            ``(loss, cos)`` when ``labels`` is given, otherwise ``(cos,)``,
            where ``cos`` holds one cosine similarity per example.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # step 1: get sentence 1 and sentence 2 inputs
        senA_input_ids, senB_input_ids = input_ids[:, 0], input_ids[:, 1]
        # The mask and segment ids are optional in the signature, so only
        # slice them when they were actually provided.
        senA_attention_mask, senB_attention_mask = self._split_sentence_pair(attention_mask)
        senA_token_type_ids, senB_token_type_ids = self._split_sentence_pair(token_type_ids)

        # Arguments that are identical for both encoder passes.
        shared_kwargs = dict(
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # step 2: get sentence 1 and sentence 2 embeddings
        # each: [batch_size, hidden_size]
        senA_pooled_output = self._pooled_embedding(
            senA_input_ids, senA_attention_mask, senA_token_type_ids, **shared_kwargs
        )
        senB_pooled_output = self._pooled_embedding(
            senB_input_ids, senB_attention_mask, senB_token_type_ids, **shared_kwargs
        )

        # step 3: calculate similarity
        # [batch_size]
        cos = CosineSimilarity()(senA_pooled_output, senB_pooled_output)

        # step 4: calculate loss
        loss = None
        if labels is not None:
            loss_fct = CosineEmbeddingLoss(margin=0.3)
            loss = loss_fct(senA_pooled_output, senB_pooled_output, labels.float())

        # step 5: return
        output = (cos,)
        return ((loss,) + output) if loss is not None else output


In [33]:
model = DualModel.from_pretrained(checkpoint)      

## 创建评估函数

In [34]:
acc_metric = evaluate.load("accuracy")
f1_metirc = evaluate.load("f1")

def eval_metric(eval_predict):
    """Compute accuracy and F1 from cosine-similarity scores.

    Args:
        eval_predict: A ``(scores, labels)`` pair with one similarity score
            and one label per example.

    Returns:
        The accuracy result dict extended with the F1 result.
    """
    cosine_scores, raw_labels = eval_predict
    # A pair counts as "similar" when its score exceeds 0.7.
    predicted = [1 if score > 0.7 else 0 for score in cosine_scores]
    # Original labels take values in {-1, 1}; convert them to {0, 1}.
    references = [1 if label > 0 else 0 for label in raw_labels]
    results = acc_metric.compute(predictions=predicted, references=references)
    results.update(f1_metirc.compute(predictions=predicted, references=references))
    return results

## training

In [35]:
# Training configuration for the dual-encoder (vector match) approach.
train_args = TrainingArguments(num_train_epochs=5,  # number of training epochs
                               output_dir="./dual_model",      # output directory
                               per_device_train_batch_size=64,  # per-device batch size for training
                               per_device_eval_batch_size=128,  # per-device batch size for evaluation
                               logging_steps=10,                # logging frequency in steps (original note, unverified: only takes effect when report_to != "none")
                               evaluation_strategy="epoch",     # evaluation strategy: once per epoch
                               save_strategy="epoch",           # checkpoint strategy: once per epoch
                               save_total_limit=2,              # maximum number of checkpoints kept
                               learning_rate=2e-5,              # learning rate
                               weight_decay=0.01,               # weight decay
                               metric_for_best_model="f1",      # metric used to pick the best model
                               load_best_model_at_end=True,     # load the best model once training finishes
                               report_to="none")                # do not report to TensorBoard

In [36]:
trainer = Trainer(model=model, 
                  args=train_args, 
                  train_dataset=tokenized_datasets["train"], 
                  eval_dataset=tokenized_datasets["test"], 
                #   data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                  compute_metrics=eval_metric)

In [37]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2393,0.220697,0.7435,0.72345
2,0.188,0.19395,0.7785,0.750141
3,0.1578,0.189919,0.781,0.737095
4,0.148,0.185648,0.7885,0.748364
5,0.1262,0.183992,0.789,0.745783




TrainOutput(global_step=315, training_loss=0.1800709533312964, metrics={'train_runtime': 287.8632, 'train_samples_per_second': 138.955, 'train_steps_per_second': 1.094, 'total_flos': 5262126612480000.0, 'train_loss': 0.1800709533312964, 'epoch': 5.0})

In [39]:
trainer.evaluate()



{'eval_loss': 0.1939496546983719,
 'eval_accuracy': 0.7785,
 'eval_f1': 0.7501410039481106,
 'eval_runtime': 5.0579,
 'eval_samples_per_second': 395.421,
 'eval_steps_per_second': 1.582,
 'epoch': 5.0}

## prediction

In [74]:
class SentenceSimilarityPipeline:
    """Score the similarity of two sentences with a trained dual encoder.

    The pipeline tokenizes both sentences as one batch of two rows, encodes
    them with the model's ``bert`` submodule and returns the cosine
    similarity of the two pooled embeddings.

    Note: the model's train/eval mode is left untouched; put the model in
    eval mode beforehand if dropout should be disabled.
    """

    def __init__(self, model, tokenizer):
        # Only the BERT encoder is used for inference.
        self.model = model.bert
        self.tokenizer = tokenizer
        self.device = model.device

    def preprocess(self, sentence1, sentence2):
        """Tokenize both sentences together, padded to the longer one."""
        inputs = self.tokenizer([sentence1, sentence2], max_length=128, truncation=True, return_tensors="pt", padding=True)
        return inputs

    def predict(self, inputs):
        """Return the pooled embeddings of both sentences.

        Runs under ``torch.no_grad()`` so no autograd graph is built during
        inference.
        """
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            # [2, hidden_size] (768 for the checkpoint used in this notebook)
            return self.model(**inputs)[1]

    def post_process(self, embeddings):
        """Return the cosine similarity of the two embeddings as a float."""
        # Indexing with None keeps a leading batch axis: each operand is
        # [1, hidden_size], which matches CosineSimilarity's default dim=1.
        cos = CosineSimilarity()(embeddings[None, 0, :], embeddings[None, 1, :]).squeeze().cpu().item()
        return cos

    def __call__(self, sentence1, sentence2, return_vector=False):
        """Score one sentence pair.

        Args:
            sentence1: First sentence.
            sentence2: Second sentence.
            return_vector: When true, also return the two embeddings.

        Returns:
            The cosine similarity, or ``(similarity, embeddings)`` when
            ``return_vector`` is true.
        """
        inputs = self.preprocess(sentence1, sentence2)
        embeddings = self.predict(inputs)
        cos = self.post_process(embeddings)
        if return_vector:
            return cos, embeddings
        return cos

In [66]:
pipe = SentenceSimilarityPipeline(model, tokenizer)
pipe('我喜欢北京', '北京是我喜欢的城市')

torch.Size([2, 768])


0.8576623797416687

In [75]:
pipe = SentenceSimilarityPipeline(model, tokenizer)
pipe('我喜欢北京', '北京是我喜欢的城市', return_vector=True)

(0.8576623797416687,
 tensor([[-0.7191, -0.6413,  0.8583,  ...,  0.6953,  0.4166,  0.0423],
         [-0.8430, -0.9357,  0.3875,  ...,  0.6625, -0.0318, -0.6222]],
        device='cuda:0', grad_fn=<TanhBackward0>))

In [76]:
pipe.preprocess('我喜欢北京', '北京是我喜欢的城市')

{'input_ids': tensor([[ 101, 2769, 1599, 3614, 1266,  776,  102,    0,    0,    0,    0],
        [ 101, 1266,  776, 3221, 2769, 1599, 3614, 4638, 1814, 2356,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}