In [1]:
import numpy as np
import os
import pandas as pd
from sentence_transformers import SentenceTransformer, util

from sklearn.metrics.pairwise import cosine_similarity

import csv
import pickle
import time
import faiss

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
print(torch.cuda.is_available())

True


In [3]:
## 测试，3万个分句后的description和 300多个分句后的abstract，每个dataframe里面都有一列是id

In [4]:
df=pd.read_csv("0-29999.csv") #这里分好的句子，三万条
df = df.loc[0:5000,:] #选取5000条测试

data=df.description.to_list()
df_abstract =pd.read_csv("abstract_seg.csv")
df_abstract.shape

(382, 4)

In [5]:
length_df = df.shape[0]
df.shape

(5001, 8)

## embed abstract, max_corpus_size = 100000，当然此处我们只有三百多条，无所谓的，如果没有pkl文件则encode，有的话直接导入

In [6]:
model_name = 'paraphrase-multilingual-mpnet-base-v2' #768的,多语言
model = SentenceTransformer(model_name,device='cuda')#gpu比cpu快20倍


dataset_path = "abstract_seg.csv"
# 数据集里选择多少条构建候选语料库
max_corpus_size = 100000
# embed后的语料保存在本地文件的名称，
abstra_embedding_cache_path = 'abstract-embeddings-{}-size-{}.pkl'.format(model_name.replace('/', '_'), max_corpus_size)


#Check if embedding cache path exists
if not os.path.exists(abstra_embedding_cache_path):
    corpus_sentences = list()  #list保证顺序，set不行
    with open(dataset_path, encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            corpus_sentences.append(row['abstract']) #提取abstract这一列
            if len(corpus_sentences) >= max_corpus_size:  
                break

    corpus_sentences = list(corpus_sentences)
    print("Encode the corpus. This might take a while")
    corpus_embeddings = model.encode(corpus_sentences, show_progress_bar=True, convert_to_numpy=True)

    print("Store file on disc")
    with open(abstra_embedding_cache_path, "wb") as fOut:
        pickle.dump({'abstra_sentences': corpus_sentences, 'abstra_embeddings': corpus_embeddings}, fOut)
else:
    print("Load pre-computed embeddings from disc")
    with open(abstra_embedding_cache_path, "rb") as fIn:
        cache_data = pickle.load(fIn)
        ab_corpus_sentences = cache_data['abstra_sentences'] #原始句子
        ab_corpus_embeddings = cache_data['abstra_embeddings'] #每个句子的embedding


Load pre-computed embeddings from disc


## 创建index，应把abstract当成corpus，输入decription查询（用description创建也可以，这只是个example）原则上最好谁大用谁，这样语料库是比较大的，但问题在于这个东西是要给faiss去训练的，如果用description去训练，可能会学到一些广告之类的，不是我们最终的目的，所以此处先用abstract

In [7]:
# 预训练模型的编码输出特征维度
embedding_size = 768

# 查询前K个最匹配结果，自定义
top_k_hits = 10
# 聚类的数量，这个数量一般介于4*sqrt(N) 到 16*sqrt(N)，N是语料库的大小，sqrt380* 14 =260,越大越快
n_clusters = 260
# 在最相关的多少个簇中搜索答案，这个参数越大查的越全，消耗的时间也就越多，准确度越高
_nprobe = 10

# 使用gpu version

quantizer = faiss.IndexFlatIP(embedding_size) #size=768
index_ivf = faiss.IndexIVFFlat(quantizer, embedding_size, n_clusters, faiss.METRIC_INNER_PRODUCT)
## here we could use METRIC_L2, by default it performs inner-product search
# Inverted File Index (IVF) index consists of search scope reduction through clustering. It’s a very popular index as it’s easy to use, 
#with high search-quality and reasonable search-speed.

index_ivf.nprobe = _nprobe

# make it an IVF GPU index
res = faiss.StandardGpuResources()
gpu_index_ivf = faiss.index_cpu_to_gpu(res, 0, index_ivf)

# 训练
# 因为对向量做点积计算以进行召回，先对所有语料的编码进行normalize
ab_corpus_embeddings = ab_corpus_embeddings / np.linalg.norm(ab_corpus_embeddings, axis=1)[:, None]
gpu_index_ivf.train(ab_corpus_embeddings)
gpu_index_ivf.add(ab_corpus_embeddings)

## 找出top1 的abtract-description pair，并把pub id放到 description中当成一列，5000条查询需要一分钟的样子

In [14]:
%%time
top_k_hits =1 
pub_id = list()
for i in range(0,len(df)):
    inp_question = df.iloc[i]['description']

#     start_time = time.time()
    question_embedding = model.encode(inp_question)
    #FAISS works with inner product (dot product). When we normalize vectors to unit length, inner product is equal to cosine similarity!!!
    question_embedding = question_embedding / np.linalg.norm(question_embedding)
    question_embedding = np.expand_dims(question_embedding, axis=0)
    # Search in FAISS. It returns a matrix with distances and corpus ids.
    distances, corpus_ids = gpu_index_ivf.search(question_embedding, top_k_hits) #distance 即score， corpurs id是在语料库（abstract中每条分好句子的abstract的index）
    #有了这个index那么可以把那一行的abstract pub id取出来！
    # We extract corpus ids and scores for the first query
#     location = [{'corpus_id': id} for id in corpus_id[0]
#     hits = [{'corpus_id': id, 'score': score} for id, score in zip(corpus_ids[0], distances[0])] ##去掉一个括号，原来是双括号
#     hits = sorted(hits, key=lambda x: x['score'], reverse=True)#从大到小
#     end_time = time.time()
    pub_id.append(df_abstract.iloc[int(corpus_ids[0])]['Publication number']) 
   

Wall time: 1min 4s


In [15]:
print(np.array(pub_id).shape)

(5001,)


In [16]:
pub_id ###是按description的index的顺序的

['CN113129366A',
 'CN306985044S',
 'CN110413367A',
 'CN105262596B',
 'CN113129366A',
 'CN110807728A',
 'CN109753177A',
 'CN111797013A',
 'CN111526425A',
 'CN306985044S',
 'CN111526425A',
 'CN110413367A',
 'CN306985044S',
 'CN112149542A',
 'CN111968028A',
 'CN112149542A',
 'CN112149542A',
 'CN112149542A',
 'CN111582376A',
 'WO2020207174A1',
 'CN111526425A',
 'CN110413367A',
 'CN108345630A',
 'CN105262596B',
 'CN108846636A',
 'TW202021346A',
 'TW202021346A',
 'TW202021346A',
 'TW202021346A',
 'TW202021346A',
 'TW202021346A',
 'WO2020207174A1',
 'CN306985044S',
 'CN110276345A',
 'TW202021346A',
 'CN109753177A',
 'CN111968028A',
 'CN110009101A',
 'CN111968028A',
 'CN306985044S',
 'CN109947973B',
 'CN110688508A',
 'CN109543113A',
 'CN306985044S',
 'CN113129366A',
 'TW202021346A',
 'CN306985044S',
 'CN110413367A',
 'TW202021346A',
 'TW202021346A',
 'TW202021346A',
 'CN110688508A',
 'CN111526425A',
 'CN110807728A',
 'CN110688508A',
 'CN112149542A',
 'CN111352634A',
 'CN112149542A',
 'CN112149

In [19]:
df['top1pub_id']= pd.DataFrame(pub_id) 
df.head()#不错不错

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,unified_publisher_id,unified_publisher_name,app_id,unified_app_id,description,category1,top1pub_id
0,0,0,56289c8802ac6486a70013a5,"Facebook, Inc.",284882215,55c530a702ac64f9c0002dff,"Connect with friends, family and people who sh...",6005,CN113129366A
1,0,0,56289c8802ac6486a70013a5,"Facebook, Inc.",284882215,55c530a702ac64f9c0002dff,"Communicate privately, watch your favorite con...",6005,CN306985044S
2,0,0,56289c8802ac6486a70013a5,"Facebook, Inc.",284882215,55c530a702ac64f9c0002dff,"On Facebook, keeping up with the people who ma...",6005,CN110413367A
3,0,0,56289c8802ac6486a70013a5,"Facebook, Inc.",284882215,55c530a702ac64f9c0002dff,"Discover, enjoy and do more together",6005,CN105262596B
4,0,0,56289c8802ac6486a70013a5,"Facebook, Inc.",284882215,55c530a702ac64f9c0002dff,Stay up to date with your loved ones:,6005,CN113129366A


In [26]:
from tqdm import tqdm
import time
top_k_hits =1 
pub_id = list()

for i in tqdm(range(0,5001)):
    inp_question = df.iloc[i]['description']

#     start_time = time.time()
    question_embedding = model.encode(inp_question)
    #FAISS works with inner product (dot product). When we normalize vectors to unit length, inner product is equal to cosine similarity!!!
    question_embedding = question_embedding / np.linalg.norm(question_embedding)
    question_embedding = np.expand_dims(question_embedding, axis=0)
    # Search in FAISS. It returns a matrix with distances and corpus ids.
    distances, corpus_ids = gpu_index_ivf.search(question_embedding, top_k_hits) #distance 即score， corpurs id是在语料库（abstract中每条分好句子的abstract的index）
    #有了这个index那么可以把那一行的abstract pub id取出来！
    # We extract corpus ids and scores for the first query
#     location = [{'corpus_id': id} for id in corpus_id[0]
#     hits = [{'corpus_id': id, 'score': score} for id, score in zip(corpus_ids[0], distances[0])] ##去掉一个括号，原来是双括号
#     hits = sorted(hits, key=lambda x: x['score'], reverse=True)#从大到小
#     end_time = time.time()
    pub_id.append(df_abstract.iloc[int(corpus_ids[0])]['Publication number']) 


  0%|          | 0/5001 [00:00<?, ?it/s][A
  0%|          | 8/5001 [00:00<01:05, 76.10it/s][A
  0%|          | 16/5001 [00:00<01:03, 78.26it/s][A
  0%|          | 24/5001 [00:00<01:03, 78.45it/s][A
  1%|          | 32/5001 [00:00<01:02, 79.00it/s][A
  1%|          | 41/5001 [00:00<01:01, 80.45it/s][A
  1%|          | 50/5001 [00:00<01:01, 80.65it/s][A
  1%|          | 59/5001 [00:00<01:01, 80.04it/s][A
  1%|▏         | 68/5001 [00:00<01:01, 79.66it/s][A
  2%|▏         | 77/5001 [00:00<01:01, 80.31it/s][A
  2%|▏         | 86/5001 [00:01<01:01, 80.19it/s][A
  2%|▏         | 95/5001 [00:01<01:02, 78.93it/s][A
  2%|▏         | 103/5001 [00:01<01:02, 77.79it/s][A
  2%|▏         | 112/5001 [00:01<01:02, 78.66it/s][A
  2%|▏         | 120/5001 [00:01<01:04, 75.33it/s][A
  3%|▎         | 128/5001 [00:01<01:05, 74.24it/s][A
  3%|▎         | 136/5001 [00:01<01:04, 75.07it/s][A
  3%|▎         | 144/5001 [00:01<01:04, 75.26it/s][A
  3%|▎         | 152/5001 [00:01<01:03, 76.24it/s

KeyboardInterrupt: 