In [2]:
!pip install transformers
!pip install -U sentence-transformers
!pip install faiss-cpu



In [3]:
import pandas as pd
import numpy as np
import torch
import time
import itertools

import faiss
from transformers import  BertModel, BertTokenizer
from sentence_transformers import SentenceTransformer
from transformers import FNetTokenizer, FNetModel

In [4]:
# df = pd.read_csv('/content/drive/MyDrive/citation2.csv')
# df.dropna(inplace=True)

big_sentence = pd.read_csv('/content/drive/MyDrive/big_sentence.csv')

In [5]:
def get_tokens(tokenizer, sentences, max_length=128):
  input_ids = []
  attention_mask = []
  token_type_ids = []

  for sent in sentences:
      encoded_dict = tokenizer.encode_plus(
                          sent,                      # Sentence to encode.
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                          max_length = max_length,           # Pad & truncate all sentences.
                          pad_to_max_length = True,
                          return_attention_mask = True,   # Construct attn. masks.
                          return_tensors = 'pt',     # Return pytorch tensors.
                    )
      
      input_ids.append(encoded_dict['input_ids'])
      attention_mask.append(encoded_dict['attention_mask'])
      token_type_ids.append(encoded_dict['token_type_ids'])

  input_ids = torch.cat(input_ids, dim=0)
  attention_mask = torch.cat(attention_mask, dim=0)
  token_type_ids = torch.cat(token_type_ids, dim=0)
  return input_ids, attention_mask, token_type_ids


In [6]:
def search_bert(index, model, tokenizer, query, max_length=128, sentences=None, return_attention_mask=True):
   t=time.time()
   encoded_dict = tokenizer.encode_plus(
                        query,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_length,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = return_attention_mask,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
   model.eval()
   with torch.no_grad():
     output = model(**encoded_dict)

   query_vector = output.last_hidden_state.permute(1,0,2).detach().numpy()
   query_vector = np.mean(query_vector, axis=0)
   k = 5
   top_k = index.search(query_vector, k)
   print('totaltime: {}'.format(time.time()-t))
   output = [data[_id] for _id in top_k[1].tolist()[0]]
   sent_b = model.encode(output)
   cos_simil = cosine_similarity(query_vector, sent_b)
   return [sentences[_id] for _id in top_k[1].tolist()[0]]

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
def final_result(index=None, model=None, tokenizer=None, max_length=128, model_bert=True, sentences=None, return_attention_mask=True):
  query = str(input())
  if model_bert:
    results, cos_simil = search_bert(index=index, model=model, tokenizer=tokenizer, query=query, max_length=max_length, sentences=sentences, return_attention_mask=return_attention_mask)
  else:
    results, cos_simil = search_sbert(query=query, index=index, data=sentences)
  print('results :')
  for i in range(len(results)):
    print('\t','Cosine Similarity: ' + str(cos_simil[i]) + '  ' +str(results[i]))

SBERT + FAISS

In [9]:
def search_sbert(query, index=None, data=None):
   t=time.time()
   query_vector = sbert_model.encode([query])
   k = 5
   top_k = index.search(query_vector, k)
   print('totaltime: {}'.format(time.time()-t))
   output = [data[_id] for _id in top_k[1].tolist()[0]]
   sent_b = sbert_model.encode(output)
   cos_simil = cosine_similarity(query_vector, sent_b)
   return output, cos_simil[0]

In [10]:
#Возьмем для примера 100 предложений 
# sentences = df['sentence'].sample(100).unique()
sentences = big_sentence['sentence'].sample(4000).unique()

In [11]:
MODEL_NAME = 'bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12'

sbert_tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
sbert_model = SentenceTransformer(MODEL_NAME)

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/bionlp_bluebert_pubmed_uncased_L-12_H-768_A-12 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
encoded_data = sbert_model.encode(sentences)

sbert_index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
sbert_index.add_with_ids(encoded_data, np.array(range(0, len(sentences))))
faiss.write_index(sbert_index, 'search')

In [15]:
final_result(index=sbert_index, model=sbert_model, tokenizer=sbert_tokenizer, max_length=128, model_bert=False, sentences=sentences)

namely the impact of cancer on life
totaltime: 0.032404422760009766
results :
	 Cosine Similarity: 0.7649097  Provision for the non-partisan cultural working through of the shared traumatic experience in the form of periodic reminders of the loss and reiteration of its meaning, and of the heroism of those who suffered expressed in media, arts, public works, monuments, and occasions of public mourning were encouraged as they have been found to be useful in post disaster situations [106].
	 Cosine Similarity: 0.7538009  This process also provides an opportunity to think about how students recognize the authority in other professional departments before a "symbolic and psychological transformation" [15].
	 Cosine Similarity: 0.7636231  The PGWBI reflects psychological well-being (or otherwise); it is based on theories of evaluation of the domestic environment and is an appropriate means of determining the distortion produced by TS within the household [30,32].
	 Cosine Similarity: 0.74661

BERT + FAISS

In [None]:
MODEL_NAME = 'bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12'
bert_model = BertModel.from_pretrained(MODEL_NAME)
bert_tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)


Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12 were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
input_ids, attention_mask, token_type_ids = get_tokens(tokenizer=bert_tokenizer, sentences=sentences, max_length=128)
bert_model.eval()
with torch.no_grad():
  output = bert_model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

encoded_data = output.last_hidden_state.permute(1,0,2).detach().numpy()
encoded_data = np.mean(encoded_data, axis=0)

bert_index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
bert_index.add_with_ids(encoded_data, np.array(range(0, len(sentences))))
faiss.write_index(bert_index, 'search')




In [None]:
final_result(index=bert_index, model=bert_model, tokenizer=bert_tokenizer, max_length=128, model_bert=True, sentences=sentences, return_attention_mask=True)

cancer




True
totaltime: 0.5208303928375244
results :
	 Along with some others [4,7,24], we have given the TALE group the rank of 'class' containing several 'gene families'; this maintains consistent terminology throughout the present paper.
	 The fact that these months coincide with a period of relatively cool daytime temperatures and relatively humid conditions is also likely to be highly significant in terms of adult survivorship and vectorial capacity [2].
	 And in the context of quality improvement Berwick talks about pragmatic science, by which he means methods of observation and reflection that are systematic, theoretically grounded, often quantitative, and powerful, but are not RCTs [15].
	 In Pl, the production of carbapenem has been speculated to control the likely growth of the insect gut flora following their potential migration into the hemocoel of the infected insect [51].
	 Those patients admitted with severe sepsis or who developed severe sepsis during the first 24 hours in the 

FNET + FAISS

In [10]:
MODEL_NAME = "google/fnet-base"
fnet_tokenizer = FNetTokenizer.from_pretrained(MODEL_NAME)
fnet_model = FNetModel.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at google/fnet-base were not used when initializing FNetModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing FNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [1]:
input_ids, _, token_type_ids = get_tokens(tokenizer=fnet_tokenizer, sentences=sentences, max_length=128)
fnet_model.eval()
with torch.no_grad():
  output = fnet_model(input_ids=input_ids, token_type_ids=token_type_ids)

encoded_data = output.last_hidden_state.permute(1,0,2).detach().numpy()
encoded_data = np.mean(encoded_data, axis=0)

fnet_index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
fnet_index.add_with_ids(encoded_data, np.array(range(0, len(sentences))))
faiss.write_index(fnet_index, 'search')


NameError: ignored

In [1]:
final_result(index=fnet_index, model=fnet_model, tokenizer=fnet_tokenizer, max_length=128, model_bert=True, sentences=sentences, return_attention_mask=False)

NameError: ignored