In [1]:
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np

In [3]:
# ! pip install transformers

In [None]:
# pip install transformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
# MODEL_NAME = "hfl/chinese-pert-large"
MODEL_NAME = "hfl/chinese-roberta-wwm-ext"
# MODEL_NAME = "hfl/rbt3"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME,
                 hidden_dropout_prob=0,
                 attention_probs_dropout_prob=0,)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

获取训练集中的物料名称向量

In [2]:
trainData = pd.read_csv("./Data_train.csv", index_col=0)
train_mat_names = list(trainData["MAT_NAME（物料名称）"].drop_duplicates())
trainData[:5]

Unnamed: 0_level_0,id（供应商代码）,MAT_NAME（物料名称）
QUOTE_MAT_LINE_ID（报价单行号）,Unnamed: 1_level_1,Unnamed: 2_level_1
8e24b75ce1c811ebaf97005056b12bb8,6603,液压缸吊座
f3afd8f4e1dc11ebaf97005056b12bb8,20958,六角头螺栓-C级GB/T5780-2000
1b99daf8e1de11ebaf97005056b12bb8,41982,高压水银荧光灯泡
1b9aac07e1de11ebaf97005056b12bb8,41982,手电筒螺口小灯泡（圆头）
bff9cf95e1e511ebaf97005056b12bb8,20238,外接头


In [4]:
def get_matVec(mat_names):
    input_id = tokenizer(mat_names, padding=True, truncation=True, max_length=10, return_tensors="pt")
    input_ids = input_id["input_ids"]
    input_chunks = torch.chunk(input_ids, 10, 0)
    outputs = []
    for chunk in input_chunks:
        chunk = chunk.to(device)
        with torch.no_grad():  
            output = model(chunk)['pooler_output']
        output = output.cpu().detach()
        torch.cuda.empty_cache()
        outputs.append(output)
    matvec = torch.cat(outputs,0)
    return matvec

In [None]:
train_mat_names = list(trainData["MAT_NAME（物料名称）"].drop_duplicates())
trainMatVec = get_matVec(train_mat_names)
# 训练数据中的物料名称向量
trainMatVec.shape

In [6]:
torch.save(trainMatVec, "trainMatVec"+".pt")

获取测试集中的物料名称向量

In [4]:
testData = pd.read_csv("./Data_test.csv", index_col = 0)
test_mat_names = list(testData["MAT_NAME（物料名称）"].drop_duplicates())

In [7]:
testData[:3]

Unnamed: 0_level_0,id（供应商代码）,MAT_NAME（物料名称）
QUOTE_MAT_LINE_ID（报价单行号）,Unnamed: 1_level_1,Unnamed: 2_level_1
75be1c94b70811ecbb62005056a00443,22071,PDC可开闭式钻头
75bde9a7b70811ecbb62005056a00443,22071,超高压水力送水装置
75be1fe0b70811ecbb62005056a00443,22071,高压水便


In [None]:
testMatVec = get_matVec(test_mat_names)
# 测试数据中的物料名称向量
testMatVec.shape

torch.Size([9391, 768])

In [12]:
torch.save(testMatVec, "testMatVec"+".pt")

In [5]:
trainMatVec = torch.load("trainMatVec"+".pt")
testMatVec = torch.load("testMatVec"+".pt")
train_mat_index = pd.DataFrame(np.arange(len(train_mat_names)), index = train_mat_names)
test_mat_index = pd.DataFrame(np.arange(len(test_mat_names)), index = test_mat_names)

计算测试集中物料名称与训练集中的物料名称的相似度

In [6]:
def cos_similar(p, q):
    sim_matrix = p.matmul(q.transpose(-2, -1))
    a = torch.norm(p, p=2, dim=-1)
    b = torch.norm(q, p=2, dim=-1)
    sim_matrix /= a.unsqueeze(-1)
    sim_matrix /= b.unsqueeze(-2)
    return sim_matrix

torch.cuda.empty_cache()
with torch.no_grad(): 
    # sim_mat_score = torch.cdist(testMatVec,trainMatVec,p=2)
    # sim_mat_score = F.cosine_similarity(testMatVec.unsqueeze(1), trainMatVec.unsqueeze(0), dim=2)
    sim_mat_score = cos_similar(testMatVec,trainMatVec)
    sim_mat_score = sim_mat_score.sort(axis=1, descending=True)
    sim_mat_index = sim_mat_score.indices[:,:10]
    sim_mat_score = sim_mat_score.values[:,:10] 
    torch.cuda.empty_cache()
torch.cuda.empty_cache()
print(sim_mat_score.shape)
print(sim_mat_index.shape)

torch.Size([9391, 10])
torch.Size([9391, 10])


In [7]:
sim_mat_index[:3]

tensor([[31287, 22788, 19393,  2549, 29353, 26875, 12694,  6232,  7099, 12706],
        [31289, 30265, 18943, 28708, 20327, 11074, 11073, 16645, 16643,  3409],
        [10565,  7198,  6002, 14922, 25644,  1311, 15300, 27975,  2657, 24914]])

In [8]:
sim_mat_score[:3]

tensor([[1.0000, 0.9709, 0.9708, 0.9698, 0.9694, 0.9687, 0.9687, 0.9686, 0.9685,
         0.9682],
        [1.0000, 0.9829, 0.9804, 0.9796, 0.9784, 0.9778, 0.9778, 0.9766, 0.9761,
         0.9751],
        [1.0000, 0.9839, 0.9838, 0.9837, 0.9837, 0.9829, 0.9825, 0.9824, 0.9818,
         0.9816]])

In [9]:
train_mat_index.loc[test_mat_names[:3]]

Unnamed: 0,0
PDC可开闭式钻头,31287
超高压水力送水装置,31289
高压水便,10565


In [10]:
test_mat_index[:3]

Unnamed: 0,0
PDC可开闭式钻头,0
超高压水力送水装置,1
高压水便,2


In [11]:
torch.save(sim_mat_index, "test-train-sim_mat_index-cosine"+".pt")
torch.save(sim_mat_score, "test-train-sim_mat_score-cosine"+".pt")

In [6]:
sim_mat_index = torch.load("test-train-sim_mat_index-cosine"+".pt")
sim_mat_score = torch.load("test-train-sim_mat_score-cosine"+".pt")

获取历史每个物料对应的可提供供应商

In [12]:
train_id_mat = trainData.copy()
train_id_mat["mat_id"] = train_id_mat["MAT_NAME（物料名称）"].apply(lambda x:train_mat_index.loc[x])
train_id_mat[:2]

Unnamed: 0_level_0,id（供应商代码）,MAT_NAME（物料名称）,mat_id
QUOTE_MAT_LINE_ID（报价单行号）,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8e24b75ce1c811ebaf97005056b12bb8,6603,液压缸吊座,0
f3afd8f4e1dc11ebaf97005056b12bb8,20958,六角头螺栓-C级GB/T5780-2000,1


In [13]:
mat_supply_ids = train_id_mat.groupby(by="mat_id")["id（供应商代码）"].apply(set)
# 提供物料的历史供应商
mat_supply_ids[:200]

mat_id
0                                                 {6603}
1                                                {20958}
2                                                {41982}
3                                                {41982}
4                                  {27177, 21484, 20238}
                             ...                        
195                         {43944, 10298, 22123, 33944}
196                         {43944, 10298, 22123, 33944}
197                  {13094, 43944, 22123, 33944, 10298}
198                         {43944, 10298, 22123, 33944}
199    {43944, 22123, 24431, 32919, 33944, 13945, 10298}
Name: id（供应商代码）, Length: 200, dtype: object

In [14]:
query_mat_index = test_mat_index.loc[testData["MAT_NAME（物料名称）"].to_list()]
query_mat_index = query_mat_index[0].to_list()
# 物料在相似度矩阵中的下标
query_mat_index[:5]

[0, 1, 2, 3, 4]

In [15]:
item_sim_mat = pd.DataFrame(sim_mat_index[query_mat_index].tolist())
item_sim_mat[:4]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,31287,22788,19393,2549,29353,26875,12694,6232,7099,12706
1,31289,30265,18943,28708,20327,11074,11073,16645,16643,3409
2,10565,7198,6002,14922,25644,1311,15300,27975,2657,24914
3,31288,29353,19123,18596,22354,1508,27985,14118,21359,18490


In [16]:
item_sim_score = pd.DataFrame(sim_mat_score[query_mat_index].tolist())
item_sim_score[:4]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.999999,0.970874,0.970842,0.96977,0.969441,0.968724,0.968707,0.968632,0.968453,0.968153
1,1.0,0.98286,0.980387,0.979624,0.978356,0.977822,0.977822,0.976629,0.976105,0.975116
2,1.0,0.983881,0.983783,0.983716,0.983662,0.982918,0.982466,0.982356,0.981823,0.981575
3,1.0,0.990682,0.990647,0.987569,0.987022,0.986968,0.986776,0.985769,0.985563,0.98474


In [17]:
item_sim_mat.shape
# 行数是测试集数量，列数对应在训练集中相关的前10个物料向量

(55461, 10)

In [18]:
from operator import add
from functools import reduce
def simMat2supp(x, suppNum = 50):
    return reduce(add, mat_supply_ids.loc[x].apply(list))[:suppNum]
    # mat_supply_ids.loc[x]
    # return x
    # return mat_supply_ids.loc[x]
item_sim_mat[:10].apply(simMat2supp, axis=1)

0    [35674, 37260, 35434, 525, 20240, 33777, 17499...
1    [35674, 37260, 35434, 31480, 34758, 34270, 137...
2    [23586, 12036, 3750, 41896, 4777, 35434, 8107,...
3    [35674, 37260, 35434, 3994, 25356, 23586, 3750...
4    [9600, 26309, 41896, 35434, 37260, 30351, 1068...
5    [41060, 1357, 21137, 36468, 32351, 14356, 2471...
6    [41060, 1357, 21137, 36468, 32351, 14356, 2471...
7    [35674, 37260, 35434, 8258, 25356, 28973, 3609...
8    [35674, 37260, 35434, 3994, 25356, 23586, 3750...
9    [35674, 37260, 35434, 26309, 27660, 43181, 204...
dtype: object

In [19]:
result1 = item_sim_mat.apply(simMat2supp, axis=1)
result1 = pd.DataFrame(result1)
result1 = result1.set_index(testData.index)
result1.shape

(55461, 1)

In [20]:
testData["recommend"] = result1
testData[:3]

Unnamed: 0_level_0,id（供应商代码）,MAT_NAME（物料名称）,recommend
QUOTE_MAT_LINE_ID（报价单行号）,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
75be1c94b70811ecbb62005056a00443,22071,PDC可开闭式钻头,"[35674, 37260, 35434, 525, 20240, 33777, 17499..."
75bde9a7b70811ecbb62005056a00443,22071,超高压水力送水装置,"[35674, 37260, 35434, 31480, 34758, 34270, 137..."
75be1fe0b70811ecbb62005056a00443,22071,高压水便,"[23586, 12036, 3750, 41896, 4777, 35434, 8107,..."


In [21]:
sumRecall = testData.apply(lambda x:x["id（供应商代码）"] in x["recommend"], axis=1).sum()
recall = sumRecall / len(testData)
print(recall)

0.23237950992589387


In [22]:
testData.to_csv("testDataRecommendCosineSim"+".csv")

In [23]:
from operator import add
from functools import reduce
def simMat2supp(x, suppNum = 50):
    suplists = list(mat_supply_ids.loc[x].apply(list))
    scores = list(item_sim_score.loc[x.name])
    return pd.Series([suplists,scores])
item_sim_mat[:10].apply(simMat2supp, axis=1)

Unnamed: 0,0,1
0,"[[35674, 37260, 35434], [525], [20240, 33777, ...","[0.9999993443489075, 0.9708736538887024, 0.970..."
1,"[[35674, 37260, 35434], [31480, 34758, 34270],...","[0.9999995827674866, 0.9828600883483887, 0.980..."
2,"[[23586, 12036, 3750, 41896, 4777, 35434, 8107...","[1.000000238418579, 0.9838810563087463, 0.9837..."
3,"[[35674, 37260, 35434], [3994, 25356, 23586, 3...","[0.9999999403953552, 0.9906815886497498, 0.990..."
4,"[[9600, 26309, 41896, 35434, 37260, 30351, 106...","[1.0000004768371582, 0.9836571216583252, 0.981..."
5,"[[41060, 1357, 21137, 36468, 32351], [14356, 2...","[0.9999992251396179, 0.9947300553321838, 0.992..."
6,"[[41060, 1357, 21137, 36468, 32351], [14356, 2...","[0.9999992251396179, 0.9947300553321838, 0.992..."
7,"[[35674, 37260, 35434], [8258, 25356, 28973, 3...","[1.0000001192092896, 0.9905298948287964, 0.985..."
8,"[[35674, 37260, 35434], [3994, 25356, 23586, 3...","[0.9999999403953552, 0.9906815886497498, 0.990..."
9,"[[35674, 37260, 35434], [26309, 27660, 43181, ...","[0.9999998211860657, 0.9797412157058716, 0.976..."


In [24]:
result = item_sim_mat.apply(simMat2supp, axis=1)

In [25]:
result = result.rename(columns={0:'supps',1:'scores'})

In [26]:
result

Unnamed: 0,supps,scores
0,"[[35674, 37260, 35434], [525], [20240, 33777, ...","[0.9999993443489075, 0.9708736538887024, 0.970..."
1,"[[35674, 37260, 35434], [31480, 34758, 34270],...","[0.9999995827674866, 0.9828600883483887, 0.980..."
2,"[[23586, 12036, 3750, 41896, 4777, 35434, 8107...","[1.000000238418579, 0.9838810563087463, 0.9837..."
3,"[[35674, 37260, 35434], [3994, 25356, 23586, 3...","[0.9999999403953552, 0.9906815886497498, 0.990..."
4,"[[9600, 26309, 41896, 35434, 37260, 30351, 106...","[1.0000004768371582, 0.9836571216583252, 0.981..."
...,...,...
55456,"[[26309, 40970, 8107, 5941, 22071], [14517, 39...","[0.9862000346183777, 0.9861775636672974, 0.985..."
55457,"[[6791, 40970, 25356, 33424, 33944, 31399, 439...","[1.0000001192092896, 0.9857597947120667, 0.984..."
55458,"[[6791, 32520, 40970, 25356, 2829, 38313, 2897...","[0.9864628911018372, 0.9797140955924988, 0.978..."
55459,"[[26309, 22071, 40970, 5941, 16983], [8258, 25...","[0.9999992847442627, 0.9907602667808533, 0.989..."


In [198]:
result.to_csv("testDataRecommendCosineSim.csv")

In [27]:
result = result.set_index(testData.index)

In [28]:
result

Unnamed: 0_level_0,supps,scores
QUOTE_MAT_LINE_ID（报价单行号）,Unnamed: 1_level_1,Unnamed: 2_level_1
75be1c94b70811ecbb62005056a00443,"[[35674, 37260, 35434], [525], [20240, 33777, ...","[0.9999993443489075, 0.9708736538887024, 0.970..."
75bde9a7b70811ecbb62005056a00443,"[[35674, 37260, 35434], [31480, 34758, 34270],...","[0.9999995827674866, 0.9828600883483887, 0.980..."
75be1fe0b70811ecbb62005056a00443,"[[23586, 12036, 3750, 41896, 4777, 35434, 8107...","[1.000000238418579, 0.9838810563087463, 0.9837..."
75be2260b70811ecbb62005056a00443,"[[35674, 37260, 35434], [3994, 25356, 23586, 3...","[0.9999999403953552, 0.9906815886497498, 0.990..."
75be23f0b70811ecbb62005056a00443,"[[9600, 26309, 41896, 35434, 37260, 30351, 106...","[1.0000004768371582, 0.9836571216583252, 0.981..."
...,...,...
1c4d361fff1e11eca911005056a03da3,"[[26309, 40970, 8107, 5941, 22071], [14517, 39...","[0.9862000346183777, 0.9861775636672974, 0.985..."
1c4d1337ff1e11eca911005056a03da3,"[[6791, 40970, 25356, 33424, 33944, 31399, 439...","[1.0000001192092896, 0.9857597947120667, 0.984..."
1c4e7d3fff1e11eca911005056a03da3,"[[6791, 32520, 40970, 25356, 2829, 38313, 2897...","[0.9864628911018372, 0.9797140955924988, 0.978..."
1c4d0adaff1e11eca911005056a03da3,"[[26309, 22071, 40970, 5941, 16983], [8258, 25...","[0.9999992847442627, 0.9907602667808533, 0.989..."


In [29]:
pd.to_pickle(result, "testDataRecommendCosineSim.pkl")

In [30]:
result[:10].apply(lambda x:x["supps"], axis=1)

QUOTE_MAT_LINE_ID（报价单行号）
75be1c94b70811ecbb62005056a00443    [[35674, 37260, 35434], [525], [20240, 33777, ...
75bde9a7b70811ecbb62005056a00443    [[35674, 37260, 35434], [31480, 34758, 34270],...
75be1fe0b70811ecbb62005056a00443    [[23586, 12036, 3750, 41896, 4777, 35434, 8107...
75be2260b70811ecbb62005056a00443    [[35674, 37260, 35434], [3994, 25356, 23586, 3...
75be23f0b70811ecbb62005056a00443    [[9600, 26309, 41896, 35434, 37260, 30351, 106...
89577c03b70811ecbb62005056a00443    [[41060, 1357, 21137, 36468, 32351], [14356, 2...
89579cfab70811ecbb62005056a00443    [[41060, 1357, 21137, 36468, 32351], [14356, 2...
aeb9e338b70811ecbb62005056a00443    [[35674, 37260, 35434], [8258, 25356, 28973, 3...
aeba525db70811ecbb62005056a00443    [[35674, 37260, 35434], [3994, 25356, 23586, 3...
aeb9c5dbb70811ecbb62005056a00443    [[35674, 37260, 35434], [26309, 27660, 43181, ...
dtype: object

In [31]:
def scoreBroadCast(x, suppNum = 50):
    suppslist = []
    scoreslist = []
    for i in range(len(x['supps'])):
        suppslist.extend(x['supps'][i])
        scoreslist.extend([x['scores'][i]]*len(x['supps'][i]))
    return pd.Series([suppslist[:suppNum],scoreslist[:suppNum]])

In [32]:
broadResult = result.apply(scoreBroadCast, axis=1)

In [33]:
broadResult = broadResult.rename(columns={0:'supps',1:'scores'})
broadResult[:10]

Unnamed: 0_level_0,supps,scores
QUOTE_MAT_LINE_ID（报价单行号）,Unnamed: 1_level_1,Unnamed: 2_level_1
75be1c94b70811ecbb62005056a00443,"[35674, 37260, 35434, 525, 20240, 33777, 17499...","[0.9999993443489075, 0.9999993443489075, 0.999..."
75bde9a7b70811ecbb62005056a00443,"[35674, 37260, 35434, 31480, 34758, 34270, 137...","[0.9999995827674866, 0.9999995827674866, 0.999..."
75be1fe0b70811ecbb62005056a00443,"[23586, 12036, 3750, 41896, 4777, 35434, 8107,...","[1.000000238418579, 1.000000238418579, 1.00000..."
75be2260b70811ecbb62005056a00443,"[35674, 37260, 35434, 3994, 25356, 23586, 3750...","[0.9999999403953552, 0.9999999403953552, 0.999..."
75be23f0b70811ecbb62005056a00443,"[9600, 26309, 41896, 35434, 37260, 30351, 1068...","[1.0000004768371582, 1.0000004768371582, 1.000..."
89577c03b70811ecbb62005056a00443,"[41060, 1357, 21137, 36468, 32351, 14356, 2471...","[0.9999992251396179, 0.9999992251396179, 0.999..."
89579cfab70811ecbb62005056a00443,"[41060, 1357, 21137, 36468, 32351, 14356, 2471...","[0.9999992251396179, 0.9999992251396179, 0.999..."
aeb9e338b70811ecbb62005056a00443,"[35674, 37260, 35434, 8258, 25356, 28973, 3609...","[1.0000001192092896, 1.0000001192092896, 1.000..."
aeba525db70811ecbb62005056a00443,"[35674, 37260, 35434, 3994, 25356, 23586, 3750...","[0.9999999403953552, 0.9999999403953552, 0.999..."
aeb9c5dbb70811ecbb62005056a00443,"[35674, 37260, 35434, 26309, 27660, 43181, 204...","[0.9999998211860657, 0.9999998211860657, 0.999..."


In [34]:
broadResult.to_csv("testDataRecommendFormattedCosineSim.csv")

In [73]:
len(broadResult.iloc[0,1])

28

In [124]:
lists = mat_supply_ids.loc[[197,198,199]].apply(list)

In [126]:
x = [197,198,199]
lists = mat_supply_ids.loc[x].apply(list)

In [167]:
lists

mat_id
197                  [13094, 43944, 22123, 33944, 10298]
198                         [43944, 10298, 22123, 33944]
199    [43944, 22123, 24431, 32919, 33944, 13945, 10298]
Name: id（供应商代码）, dtype: object

In [132]:
lists.apply(len).apply(lambda x:[1]) * lists.apply(len)

mat_id
197          [1, 1, 1, 1, 1]
198             [1, 1, 1, 1]
199    [1, 1, 1, 1, 1, 1, 1]
Name: id（供应商代码）, dtype: object

In [108]:
for i in range(10):
    

[6603]
[20958]
[41982]
