In [1]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 268 kB/s eta 0:00:01
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 829 kB/s eta 0:00:01
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 841 kB/s eta 0:00:01
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 959 kB/s eta 0:00:01
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 115 kB/s eta 0:00:01
Collecting regex!=2019.12.17
  Downloading regex-2022.6.2-cp39-cp39-manylinux_2_17_x86_64.

### Make sure you have run `./DATA/preprocess.ipynb` first.
### Make sure you have installed the dependency: `pip install -U sentence-transformers`.
### The first time you use this notebook, run the following to build and save the document embeddings:
```
transformer = Transformer("all-MiniLM-L12-v2")
transformer.preprocess(False)
```

In [3]:
import json
import numpy as np
from scipy.spatial import distance
from sentence_transformers import SentenceTransformer


In [4]:
# Base directory that every data path in this notebook is resolved against.
source = "./"


def f_source(s):
  """Return the path of ``s`` located under the ``source`` directory.

  Trailing slashes on ``source`` are dropped before joining, so the default
  ``"./"`` yields ``"./DATA/..."`` rather than ``".//DATA/..."``.

  Args:
    s: Path relative to ``source``, e.g. ``"DATA/crawler/data/NLP.json"``.

  Returns:
    ``source`` and ``s`` joined by exactly one ``/``.
  """
  return source.rstrip("/") + "/" + s
class Transformer:
  """Embedding-based search over the abstracts of a JSON document collection.

  Typical use: construct, call ``preprocess`` once, then call ``query``.

  Attributes:
    model: The ``SentenceTransformer`` used to encode abstracts and queries.
    documents: Dict loaded from ``DOCUMENTS_PATH``; each value is indexed with
      ``"abstract"`` during preprocessing.
    representation: ``None`` until ``preprocess`` has run, afterwards a dict
      mapping document key -> ``numpy`` array holding that abstract's embedding.
  """

  # Both paths are relative to the notebook-level ``source`` directory and are
  # resolved through ``f_source``.
  DOCUMENTS_PATH = "DATA/crawler/data/NLP.json"
  REPRESENTATION_PATH = "DATA/Module_data/transformer.json"

  def __init__(self, model_name='all-MiniLM-L6-v2'):
    """Load the embedding model and the document collection.

    Args:
      model_name: Model identifier handed to ``SentenceTransformer``.
    """
    print(f"downloading model {model_name}")
    self.model = SentenceTransformer(model_name)
    self.documents = self._read_json(self.DOCUMENTS_PATH)
    self.representation = None

  @staticmethod
  def _read_json(relative_path):
    """Parse the JSON file at ``relative_path`` (resolved via ``f_source``)."""
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(f_source(relative_path), "r") as handle:
      return json.load(handle)

  def preprocess(self, pre_use=False):
    """Populate ``self.representation`` with one embedding per abstract.

    Args:
      pre_use: When false, every string abstract is encoded and the result is
        written to ``REPRESENTATION_PATH`` first. When true, that step is
        skipped and the previously written file is used as is.

    In both cases the representation is then (re)loaded from
    ``REPRESENTATION_PATH`` and its values are converted to ``numpy`` arrays.
    """
    if not pre_use:
      print("creating representation for docs")
      # Documents whose abstract is not a string (e.g. JSON null) are skipped.
      keys = [
        key for key, doc in self.documents.items()
        if isinstance(doc["abstract"], str)
      ]
      docs = [self.documents[key]["abstract"] for key in keys]
      embeddings = self.model.encode(docs)
      # ``tolist()`` turns each embedding into plain lists so it is JSON-serializable.
      self.representation = {
        key: embedding.tolist() for key, embedding in zip(keys, embeddings)
      }
      addr = f_source(self.REPRESENTATION_PATH)
      print(f"saving docs_rep in {addr}")
      # Close (and therefore flush) the file before it is read back below.
      with open(addr, "w") as handle:
        handle.write(json.dumps(self.representation))
    print("loading docs_rep")
    stored = self._read_json(self.REPRESENTATION_PATH)
    self.representation = {key: np.array(vector) for key, vector in stored.items()}

  def query(self, input_str: str, k=10):
    """Return the ``k`` documents whose abstract embedding is closest to a query.

    Args:
      input_str: Query text; encoded with the same model as the abstracts.
      k: Maximum number of results to return.

    Returns:
      A tuple ``(article_id, article)``: ``article_id`` is a list of
      ``(document key, cosine distance)`` pairs sorted by ascending distance,
      and ``article`` is the list of matching entries of ``self.documents``
      in the same order.

    Raises:
      RuntimeError: If ``preprocess`` has not been called yet.
    """
    if self.representation is None:
      raise RuntimeError(
        "no document representation loaded; call preprocess() before query()"
      )
    q = self.model.encode(input_str)
    # presumably ``q`` and the stored vectors are 1-D and of equal length, as
    # ``distance.cosine`` requires — verify against the model's output.
    scored = [
      (key, np.abs(distance.cosine(q, vector)))
      for key, vector in self.representation.items()
    ]
    article_id = sorted(scored, key=lambda pair: pair[1])[:k]
    article = [self.documents[key] for key, _ in article_id]
    return (article_id, article)



In [5]:
# First run: load the model, then encode every abstract and save the result to disk.
transformer = Transformer(model_name="all-MiniLM-L12-v2")
transformer.preprocess(pre_use=False)

downloading model all-MiniLM-L12-v2


Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/573 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/352 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

creating representation for docs
saving docs_rep in .//DATA/Module_data/transformer.json
loading docs_rep


In [6]:
# Later runs: skip the encoding step and load the representation saved on disk.
transformer = Transformer(model_name="all-MiniLM-L12-v2")
transformer.preprocess(pre_use=True)

downloading model all-MiniLM-L12-v2
loading docs_rep


In [8]:
# Free-text query: a prose description of term-frequency / inverse-document-frequency weighting.
query_text = "statistical measure that evaluates how relevant a word is to a document in a collection of documents. This is done by multiplying two metrics: how many times a word appears in a document, and the inverse document frequency of the word across a set of documents."
ranking, articles = transformer.query(query_text)

# Each ranking entry is a (document key, cosine distance) pair, closest first.
for hit, doc in zip(ranking, articles):
    print(hit)
    # Uncomment to also show the matched article's details:
    # print(doc["title"])
    # print(doc["abstract"])

('1b181af10a3319b48b99db494aebc1ab5b23f383', 0.43540612644222365)
('8b6b50ff16c289db6cbd66b46476679b27a0138f', 0.4427571412531979)
('b53162dffd265f3c3181371f2800d09e068ebf90', 0.46761421030745176)
('a06be899a51f1994232908977b76c3278ac86e9d', 0.47057690935727836)
('a8a0079b3814ec711dde28073e9c55fa765e11ea', 0.48584911136569897)
('830af3db2fd9a725919bdd4162a90a59b95534a1', 0.48932039974482144)
('662f3a5253d66d58dda4c982cfb63aa103aada9e', 0.49138043288384536)
('a008bffc0062bc46225dbb8758ed00c2b41cd042', 0.5022781009245005)
('3fdcaf958b7d184c4d84d07a76236b29b1e934fd', 0.5034677048431673)
('b550e2cb28e97ebd3b4e149abd7401619b01aaa8', 0.5062498120849198)
