In [2]:
import numpy as np

import os
import pandas as pd
from sentence_transformers import SentenceTransformer, util

from sklearn.metrics.pairwise import cosine_similarity

import tensorflow as tf

## 功能：给定一个list of string，找出跟某句话最像的句子

In [4]:
# Pretrained sentence encoder; every sentence is mapped to a 768-dimensional embedding.
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Toy corpus whose sentences are compared against each other.
# NOTE(review): "earting" looks like a typo for "eating"; it is left untouched
# because the recorded outputs in this notebook were produced with it.
sentences = [
    'I love UofT',
    'I really love UofT',
    'I like UofT',
    "I don't like UofT",
    "I hate UofT",
    "I like eating bread",
    "I like earting bread in UofT",
    "I want to study in UofT",
    "I study in my home",
    "educate in UofT",
]

In [5]:
# Hold the corpus in a one-column DataFrame so sentences can be fetched by row position.
sentences_df = pd.DataFrame({'sentences': sentences})

# Preview the first rows.
sentences_df.head()

Unnamed: 0,sentences
0,I love UofT
1,I really love UofT
2,I like UofT
3,I don't like UofT
4,I hate UofT


In [6]:
# Encode every sentence into a dense vector. A plain list is passed because
# `encode` is documented to accept a string or a list of strings.
# NOTE(review): a pandas Series appears to work only while its index is the
# default 0..n-1 -- confirm against the installed sentence-transformers version.
embeddings = model.encode(sentences_df['sentences'].tolist())

# Print each sentence next to its embedding.
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print(embedding.shape)  # each sentence is a 768-dimensional embedding
    print("Embedding:", embedding)
    print("")

Sentence: I love UofT
(768,)
Embedding: [ 1.48847699e-02  3.31860423e-01  2.40926123e+00  3.54210705e-01
  5.04279546e-02 -1.07229330e-01 -5.55348277e-01  1.03152001e+00
  4.63475317e-01 -3.96950036e-01 -3.12554687e-01  6.20300114e-01
  3.28807175e-01  5.17476976e-01  1.07935250e+00  1.00930683e-01
 -6.49985492e-01 -4.31200743e-01  3.88473988e-01 -8.41502190e-01
 -3.10197055e-01 -3.80280524e-01 -4.45015520e-01 -1.01002252e+00
 -3.83376718e-01 -5.68908870e-01  5.31689048e-01 -1.54027021e+00
  6.31164014e-02  7.51606971e-02 -2.10418597e-01  2.90869862e-01
  7.72911072e-01 -2.02971220e-01  2.68494189e-01  8.18363905e-01
 -5.48317917e-02  2.86592662e-01 -2.61739582e-01 -3.93916726e-01
  2.10861373e+00 -7.77404726e-01  1.09025419e+00 -6.18715584e-01
 -1.21357322e+00 -4.69253868e-01 -9.90091324e-01  3.76046538e-01
  4.00614232e-01 -1.10332561e+00 -1.64356958e-02 -6.90018713e-01
  1.32902935e-01 -1.62569106e-01 -3.57575893e-01  3.49718094e-01
 -3.02040894e-02 -4.55458045e-01  5.85311830e-01  

In [7]:
def most_similar(index, similarity, documents=None):
    """Print all other documents, ranked from most to least similar to one document.

    Parameters
    ----------
    index : int
        Row position of the query document. A negative position counts from
        the end, as usual for Python indexing.
    similarity : 2-D array-like
        Pairwise similarity matrix; ``similarity[a][b]`` is the score between
        documents ``a`` and ``b``.
    documents : pandas.DataFrame, optional
        Frame whose ``"sentences"`` column holds the document texts, in the
        same row order as ``similarity``. Defaults to the notebook-level
        ``sentences_df``.

    Returns
    -------
    None
        The ranking is printed, not returned.
    """
    if documents is None:
        documents = sentences_df
    texts = documents["sentences"]

    scores = similarity[index]
    # Resolve a negative position so the query row is still recognised (and
    # skipped) when compared with the non-negative positions from argsort.
    query_pos = index if index >= 0 else index + len(scores)

    print(f'Document: {texts.iloc[index]}')
    print('\n')
    print('Similar Documents:')

    # argsort sorts ascending; reversing it yields the highest similarity first.
    ranked_positions = np.argsort(scores)[::-1]

    for pos in ranked_positions:
        if pos == query_pos:  # skip the query document itself
            continue

        print('\n')
        print(f'Document: {texts.iloc[pos]}')
        print(f'Cos Similarity is: {scores[pos]}')

In [8]:
# Encode the corpus again. A plain list is passed because `encode` is
# documented to accept a string or a list of strings.
# NOTE(review): a pandas Series appears to work only while its index is the
# default 0..n-1 -- confirm against the installed sentence-transformers version.
embeddings = model.encode(sentences_df['sentences'].tolist())

# Cosine similarity between every pair of embeddings.
calculated_similarities = cosine_similarity(embeddings)

# Which sentences are most similar to sentence 0?
most_similar(0, calculated_similarities)

Document: I love UofT


Similar Documents:


Document: I really love UofT
Cos Similarity is: 0.9818402528762817


Document: I like UofT
Cos Similarity is: 0.9349514245986938


Document: educate in UofT
Cos Similarity is: 0.5952590107917786


Document: I like earting bread in UofT
Cos Similarity is: 0.5203454494476318


Document: I want to study in UofT
Cos Similarity is: 0.4918694794178009


Document: I like eating bread
Cos Similarity is: 0.44390997290611267


Document: I don't like UofT
Cos Similarity is: 0.3969878554344177


Document: I hate UofT
Cos Similarity is: 0.39125508069992065


Document: I study in my home
Cos Similarity is: 0.16622042655944824


## 另一种方法：util 里面也有 cos similarity；写个 loop 循环，找出哪两句话最像，并排序

In [9]:
# Pairwise cosine similarity of every embedding with every other embedding;
# the layout is reminiscent of a variance-covariance matrix.
cosine_scores = util.cos_sim(embeddings, embeddings)# 类似variance -covariance matrix的感觉
# Number of rows (10). NOTE(review): this is not the last expression of the
# cell, so its value is not displayed.
len(cosine_scores)#10
# The printed matrix has 1.0 entries where a sentence is compared with itself,
# and it is symmetric (row 0, column 1 equals row 1, column 0), so the mirrored
# half should be dropped; once that is done the last row adds nothing new.
print(cosine_scores)#观察发现有1的similarity，是因为自己跟自己算的，第一行的第二个是第二行的第一个，应去掉，最后一行没有有用信息，

tensor([[1.0000, 0.9818, 0.9350, 0.3970, 0.3913, 0.4439, 0.5203, 0.4919, 0.1662,
         0.5953],
        [0.9818, 1.0000, 0.9256, 0.3842, 0.3755, 0.4800, 0.5505, 0.4559, 0.1485,
         0.5500],
        [0.9350, 0.9256, 1.0000, 0.4416, 0.3610, 0.5286, 0.5728, 0.5167, 0.1785,
         0.6015],
        [0.3970, 0.3842, 0.4416, 1.0000, 0.9167, 0.1908, 0.2061, 0.3096, 0.0659,
         0.3211],
        [0.3913, 0.3755, 0.3610, 0.9167, 1.0000, 0.1362, 0.1821, 0.3030, 0.0782,
         0.3015],
        [0.4439, 0.4800, 0.5286, 0.1908, 0.1362, 1.0000, 0.9089, 0.2604, 0.1831,
         0.2274],
        [0.5203, 0.5505, 0.5728, 0.2061, 0.1821, 0.9089, 1.0000, 0.3781, 0.2142,
         0.3526],
        [0.4919, 0.4559, 0.5167, 0.3096, 0.3030, 0.2604, 0.3781, 1.0000, 0.4794,
         0.8063],
        [0.1662, 0.1485, 0.1785, 0.0659, 0.0782, 0.1831, 0.2142, 0.4794, 1.0000,
         0.4131],
        [0.5953, 0.5500, 0.6015, 0.3211, 0.3015, 0.2274, 0.3526, 0.8063, 0.4131,
         1.0000]])


In [10]:
# Collect every unordered sentence pair exactly once. The second index always
# starts one above the first, which skips both the self-comparisons (score 1)
# and the mirrored duplicates; the first index therefore only needs to reach
# the second-to-last row (8). The indices are stored next to the score so the
# sentences can be looked up later.
simi = [
    {'index': [i, j], 'score': cosine_scores[i][j]}
    for i in range(len(cosine_scores) - 1)
    for j in range(i + 1, len(cosine_scores))
]
print(simi)  # list of dicts

[{'index': [0, 1], 'score': tensor(0.9818)}, {'index': [0, 2], 'score': tensor(0.9350)}, {'index': [0, 3], 'score': tensor(0.3970)}, {'index': [0, 4], 'score': tensor(0.3913)}, {'index': [0, 5], 'score': tensor(0.4439)}, {'index': [0, 6], 'score': tensor(0.5203)}, {'index': [0, 7], 'score': tensor(0.4919)}, {'index': [0, 8], 'score': tensor(0.1662)}, {'index': [0, 9], 'score': tensor(0.5953)}, {'index': [1, 2], 'score': tensor(0.9256)}, {'index': [1, 3], 'score': tensor(0.3842)}, {'index': [1, 4], 'score': tensor(0.3755)}, {'index': [1, 5], 'score': tensor(0.4800)}, {'index': [1, 6], 'score': tensor(0.5505)}, {'index': [1, 7], 'score': tensor(0.4559)}, {'index': [1, 8], 'score': tensor(0.1485)}, {'index': [1, 9], 'score': tensor(0.5500)}, {'index': [2, 3], 'score': tensor(0.4416)}, {'index': [2, 4], 'score': tensor(0.3610)}, {'index': [2, 5], 'score': tensor(0.5286)}, {'index': [2, 6], 'score': tensor(0.5728)}, {'index': [2, 7], 'score': tensor(0.5167)}, {'index': [2, 8], 'score': tens

In [11]:
# NOTE(review): this cell repeats the pair-building step of the preceding cell;
# `simi` is rebuilt from scratch, so running it again is harmless.
# For each row, pair it only with the rows after it, so self-comparisons
# (score 1) and mirrored duplicates are never collected.
simi = []
for row in range(len(cosine_scores) - 1):
    simi.extend(
        {'index': [row, col], 'score': cosine_scores[row][col]}
        for col in range(row + 1, len(cosine_scores))
    )
print(simi)  # list of dicts

[{'index': [0, 1], 'score': tensor(0.9818)}, {'index': [0, 2], 'score': tensor(0.9350)}, {'index': [0, 3], 'score': tensor(0.3970)}, {'index': [0, 4], 'score': tensor(0.3913)}, {'index': [0, 5], 'score': tensor(0.4439)}, {'index': [0, 6], 'score': tensor(0.5203)}, {'index': [0, 7], 'score': tensor(0.4919)}, {'index': [0, 8], 'score': tensor(0.1662)}, {'index': [0, 9], 'score': tensor(0.5953)}, {'index': [1, 2], 'score': tensor(0.9256)}, {'index': [1, 3], 'score': tensor(0.3842)}, {'index': [1, 4], 'score': tensor(0.3755)}, {'index': [1, 5], 'score': tensor(0.4800)}, {'index': [1, 6], 'score': tensor(0.5505)}, {'index': [1, 7], 'score': tensor(0.4559)}, {'index': [1, 8], 'score': tensor(0.1485)}, {'index': [1, 9], 'score': tensor(0.5500)}, {'index': [2, 3], 'score': tensor(0.4416)}, {'index': [2, 4], 'score': tensor(0.3610)}, {'index': [2, 5], 'score': tensor(0.5286)}, {'index': [2, 6], 'score': tensor(0.5728)}, {'index': [2, 7], 'score': tensor(0.5167)}, {'index': [2, 8], 'score': tens

##### 看top5的几个pair

In [12]:
# Rank the pairs from the highest to the lowest score.
simi = sorted(simi, key=lambda pair: pair['score'], reverse=True)

# Report the five best-matching sentence pairs.
for pair in simi[:5]:
    first, second = pair['index']
    print(f"sentence 1:{sentences[first]}, sencentce 2:{sentences[second]}, Score: {pair['score']}")

sentence 1:I love UofT, sencentce 2:I really love UofT, Score: 0.9818404316902161
sentence 1:I love UofT, sencentce 2:I like UofT, Score: 0.9349513053894043
sentence 1:I really love UofT, sencentce 2:I like UofT, Score: 0.9255992770195007
sentence 1:I don't like UofT, sencentce 2:I hate UofT, Score: 0.9167152643203735
sentence 1:I like eating bread, sencentce 2:I like earting bread in UofT, Score: 0.908858060836792


In [11]:
import faiss
import warnings
# Silences every Python warning from this point on.
# NOTE(review): this also hides deprecation and numerical warnings; consider
# narrowing the filter to the specific categories that are known to be noisy.
warnings.filterwarnings('ignore')

In [33]:
import numpy as np
import os
import pandas as pd
from sentence_transformers import SentenceTransformer, util

from sklearn.metrics.pairwise import cosine_similarity

import csv
import pickle
import time

# import tensorflow as tf

In [1]:
#读取AI的，并把AI keyword当成description
# Load the AI dataset; the AI keywords are used as the description.
# Rows with any missing value are dropped, then the first 5000 remaining rows are kept.
# NOTE(review): the recorded output is a NameError, apparently because this cell
# was executed (In [1]) before pandas had been imported in that kernel session.
AI = pd.read_csv('AI_identifier_new.csv').dropna().head(5000)
AI.head()

NameError: name 'pd' is not defined

In [24]:
# Keep only the column at position 2 (the abstract, per the original note)
# as a one-column DataFrame.
AI_abstract = AI.iloc[:, 2].to_frame()
AI_abstract.shape

(5000, 1)

In [29]:
#读取app的
# Load the app metadata, keep a handful of columns, drop rows with any missing
# value, and keep the first 5000 remaining rows.
app = (
    pd.read_csv('final_app_info.csv')[
        ['unified_publisher_id', 'app_id', 'unified_app_id', 'publisher_name', 'description']
    ]
    .dropna()
    .head(5000)
)
app.shape

(5000, 5)

In [30]:
# Preview the first rows of the filtered app table.
app.head()

Unnamed: 0,unified_publisher_id,app_id,unified_app_id,publisher_name,description
0,56289c8802ac6486a70013a5,284882215,55c530a702ac64f9c0002dff,"Facebook, Inc.","Connect with friends, family and people who sh..."
1,56289c8802ac6486a70013a5,com.facebook.phone,559cd21dc063bdebe300000e,Facebook,Make your phone smarter with Hello. Built by M...
2,56289c8802ac6486a70013a5,com.facebook.pages.app,55c5057102ac64f9c00020d2,Facebook,"With Facebook Business Suite (Pages Manager), ..."
3,56289c8802ac6486a70013a5,com.facebook.moments,559cd18ac063bdebe3000007,Facebook,Moments is going away soon and will be removed...
4,56289c8802ac6486a70013a5,com.facebook.orca,55c501db02ac64f9c0001f52,Facebook,"Be together whenever, with our free* all-in-on..."


In [31]:
# Keep only the column at position 4 (the description) as a one-column DataFrame.
app_descrip = app.iloc[:, 4].to_frame()
app_descrip.shape

(5000, 1)