In [1]:
# 导入必要的库
import torch  # PyTorch深度学习库
import torch.nn.functional as F  # PyTorch函数式接口，包含各种神经网络函数

from torch import Tensor  # 导入Tensor类型，用于类型提示
from modelscope import AutoTokenizer, AutoModel  # 从modelscope导入自动分词器和模型加载器


# Last-token pooling: pick, for every sequence in the batch, the hidden state
# of its final non-padding token.
def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    """Return the hidden state at the last attended position of each sequence.

    Args:
        last_hidden_states: Hidden states indexed as [row, position, ...]
            (presumably (batch, seq_len, hidden) -- confirm against the model).
        attention_mask: Mask indexed as [row, position]; the code sums it, so
            attended positions are expected to be 1 and padding 0.

    Returns:
        One hidden-state vector per row of the batch.
    """
    row_count = attention_mask.shape[0]
    # If the final column of the mask sums to the number of rows, every
    # sequence ends on a real token, i.e. the batch is left-padded.
    attended_at_end = attention_mask[:, -1].sum()
    if attended_at_end == row_count:
        return last_hidden_states[:, -1]

    # Otherwise (right padding): the last real token of each row sits at
    # index (number of attended tokens - 1).
    final_positions = attention_mask.sum(dim=1) - 1
    row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_index, final_positions]


# Prompt builder: combines a task description and a query into the
# two-line "Instruct / Query" format.
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Format a task description and a query as a single instruction prompt.

    Args:
        task_description: Short description of the retrieval task.
        query: The user query to embed.

    Returns:
        The text ``Instruct: <task_description>`` followed by a newline and
        ``Query: <query>``.
    """
    template = 'Instruct: {}\nQuery: {}'
    return template.format(task_description, query)


# Every query must be accompanied by a short instruction describing the task.
# Task description: given a web search query, retrieve passages that answer it.
task = 'Given a web search query, retrieve relevant passages that answer the query'
# Wrap each raw search query in the instruction prompt via get_detailed_instruct.
queries = [
    get_detailed_instruct(task, raw_query)
    for raw_query in (
        'how much protein should a female eat',  # protein intake for women
        'summit define',                         # definition of "summit"
    )
]
queries

['Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: how much protein should a female eat',
 'Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: summit define']

In [2]:
# Retrieval documents are embedded as-is; no instruction prefix is added.
documents = [
    # Passage about protein intake for women.
    "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    # Passage defining "summit".
    "Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments.",
]
# Single input list: the queries come first, followed by the documents.
input_texts = [*queries, *documents]

# Local directory containing the model files.
# NOTE(review): this is an absolute, machine-specific path; adjust it for your environment.
model_dir = "D:\\AInewModels\\iic\\gte_Qwen2-1___5B-instruct"
# Maximum sequence length passed to the tokenizer.
max_length = 8192

# Load the tokenizer and the model from the same directory.
# trust_remote_code=True permits running custom code shipped with the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModel.from_pretrained(model_dir, trust_remote_code=True)

# Tokenize all input texts in one batch:
#   padding=True        -> pad shorter sequences so the batch is rectangular
#   truncation=True     -> cut sequences longer than max_length
#   return_tensors='pt' -> return PyTorch tensors
batch_dict = tokenizer(
    input_texts,
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors='pt',
)
batch_dict

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

{'input_ids': tensor([[151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
         151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
         151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
         151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
         151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
         151643, 151643,    641,   1235,     25,  16246,    264,   3482,   2711,
           3239,     11,  17179,   9760,  46769,    429,   4226,    279,   3239,
            198,   2859,     25,   1246,   1753,  12833,   1265,    264,   8778,
           8180, 151643],
        [151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
         151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
         151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
         151643, 151643, 151643, 151643, 151643, 151643, 151643, 1516

In [3]:
# Feed the tokenized batch to the model and collect its outputs.
# This is inference only, so autograd tracking is switched off: without
# torch.no_grad() the forward pass records a computation graph and keeps
# intermediate activations alive, which wastes memory for no benefit here.
with torch.no_grad():
    outputs = model(**batch_dict)
outputs

BaseModelOutputWithPast(last_hidden_state=tensor([[[-11.0522,   4.3318,   1.9510,  ...,   0.3746,  -3.5137,   0.6425],
         [-11.0522,   4.3318,   1.9510,  ...,   0.3746,  -3.5137,   0.6425],
         [-11.0522,   4.3318,   1.9510,  ...,   0.3746,  -3.5137,   0.6425],
         ...,
         [ -2.7279,   1.8041,   4.2044,  ...,  -1.7949,  -2.2994,   4.7448],
         [ -4.1622,   4.0888,   3.8063,  ...,  -5.2322,  -0.8833,   4.1890],
         [ -3.5131,   8.2277,   2.9399,  ...,  -0.8273,   2.2476,   3.6179]],

        [[ -6.0841,   2.2134,  -1.1036,  ...,  -0.9619,  -2.8129,  -1.4385],
         [ -6.0841,   2.2134,  -1.1036,  ...,  -0.9619,  -2.8129,  -1.4385],
         [ -6.0841,   2.2134,  -1.1036,  ...,  -0.9619,  -2.8129,  -1.4385],
         ...,
         [  3.1297,  -6.1933,   2.4328,  ...,   1.4724,  -6.8661,   1.5094],
         [  2.3083,   4.8896,  -0.9277,  ...,   0.4504,  -0.5087,   2.1959],
         [ -1.8018,   6.4114,  -0.6933,  ...,   0.1595,   0.5742,   1.5695]],

  

In [4]:

# Pool the final hidden states into one vector per input text using last_token_pool.
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# L2-normalize each embedding (p=2 norm along dim=1, the feature dimension)
# so that the dot products below are cosine similarities.
embeddings = F.normalize(embeddings, p=2, dim=1)

# input_texts was built as queries followed by documents, so the split point is
# the number of queries; deriving it from len(queries) keeps the slicing
# correct if the query list changes (previously a hard-coded 2).
num_queries = len(queries)

# Query-vs-document similarity matrix: rows are queries, columns are documents.
# The factor 100 rescales cosine similarity from [-1, 1] to [-100, 100].
scores = (embeddings[:num_queries] @ embeddings[num_queries:].T) * 100
# Print the similarity scores as nested Python lists.
print(scores.tolist())

[[78.49691009521484, 17.042865753173828], [14.924494743347168, 75.37960815429688]]
