# Install and Import Dependencies

In [None]:
! pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.0.0.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.7 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.9.0-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 13.0 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 60.4 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.0.14-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 2.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 58.3 MB/s 
Collecting huggingface-hub
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_

In [None]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
from google.colab import drive
import os

# Load Data

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch to the notebook folder on the mounted Drive. An absolute path (rooted at the
# '/content/drive' mount point) keeps this cell idempotent: building the path from
# os.getcwd() breaks as soon as the cell is run a second time, because the working
# directory has already moved.
os.chdir('/content/drive/MyDrive/Colab Notebooks')

In [None]:
# Read the abstracts table from the current working directory; it is joined to the
# paper metadata on 'eid' further down.
abstracts = pd.read_csv('abstracts.csv')

In [None]:
# Paper metadata: keep a single row per 'eid' and discard the columns that are not
# needed downstream.
drop = ['Unnamed: 0', 'doi', 'afid', 'coverDate', 'publicationName', 'source_id', 'citedby_count']
titles = (
    pd.read_csv('papers.csv')
    .drop_duplicates(subset=['eid'])
    .drop(columns=drop)
)

In [None]:
# Left-join the abstracts onto the paper metadata, then discard rows whose
# 'description' is missing. The index is intentionally not reset: a later cell
# addresses a row by its label from this merge.
merged = (
    titles
    .merge(abstracts, how='left', on='eid')
    .dropna(subset=['description'])
)

In [None]:
# (rows, columns) left after the merge and the missing-description filter.
merged.shape

(767968, 5)

# Load and Encode

In [None]:
# Load the pretrained 'allenai-specter' model through sentence-transformers
# (the captured output below shows its files being downloaded).
model = SentenceTransformer('allenai-specter')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=690.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3296.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=622.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=122.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=229.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=439832305.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=53.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=461628.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=331.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=222296.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=190.0, style=ProgressStyle(description_…




In [None]:
# One paper arrived without a title; the correct title was found by searching Google
# for its DOI. `.at` silently appends a brand-new row when the label is absent, so
# check that the row exists before writing to it.
MISSING_TITLE_LABEL = 817948
if MISSING_TITLE_LABEL not in merged.index:
  raise KeyError(f'row label {MISSING_TITLE_LABEL} not found in merged; refusing to append a new row')
merged.at[MISSING_TITLE_LABEL, 'title'] = 'Review and analysis of fuel cell system modelling and control'

In [None]:
# Build one model input string per paper, in merged's row order: title and abstract
# joined by '[SEP]'. Zipping the two columns produces the same strings as iterating
# with iterrows() but skips building a Series object for every row.
paper_texts = [
    title + '[SEP]' + description
    for title, description in zip(merged['title'], merged['description'])
]

In [None]:
# Spot-check the first constructed input string.
paper_texts[0]

'Regulation of steroid sulphatase expression and activity in breast cancer[SEP]Steroid sulphatase (STS) catalysis the conversion of oestrone sulphate (E1S) to oestrone (E1) and its action in breast tumours makes a major contribution to in situ oestrogen production in this tissue. Although expression of STS mRNA and STS activity are increased in malignant breast tissues compared with that in non-malignant tissues, little is known about the regulation of its expression or activity. In the present study we have used a RT-PCR technique to investigate the regulation of STS mRNA expression in cultured breast tissue fibroblasts and MCF-7 cells. STS mRNA expression was readily detectable in fibroblasts derived from breast tissue proximal to tumours, breast tumour tissue and reduction mammoplasty tissue. For two pre-menopausal subjects, STS mRNA expression was similar in proximal and tumour fibroblasts whereas for a third, post-menopausal subject, expression in breast tumour fibroblasts was 2.4

In [None]:
# Embed every paper text with the model; convert_to_tensor=True asks for the result
# as a tensor. With one entry per paper this is presumably the slowest cell in the
# notebook.
corpus_embeddings = model.encode(paper_texts, convert_to_tensor=True)

# Search Corpus

In [None]:
def search_papers(title, abstract, top_k=10):
  """Print the corpus papers most similar to the given title and abstract.

  The query is formatted the same way as the corpus entries
  (title + '[SEP]' + abstract), embedded with the notebook-level ``model`` and
  compared against ``corpus_embeddings`` using ``util.semantic_search``.

  Args:
    title: Title of the query paper.
    abstract: Abstract of the query paper.
    top_k: Maximum number of hits to retrieve. Defaults to 10, so calls that
      omit it return the same number of hits as before.

  Returns:
    None. The raw hit list and one "score title" line per hit are printed.
  """
  query_embedding = model.encode(title + '[SEP]' + abstract, convert_to_tensor=True)

  # semantic_search returns one hit list per query; only one query is sent here.
  search_hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]

  print('Paper:', title)
  print('Most similar papers:')
  print(search_hits)
  for hit in search_hits:
    # corpus_id is a position in the encoded text list, which was built in merged's
    # row order, so the lookup must be positional (.iloc), not label-based.
    related_paper = merged.iloc[hit['corpus_id']]
    print(hit['score'], related_paper['title'])

In [None]:
# Sanity check: query with the paper shown above as paper_texts[0]; it is expected to
# come back as its own best match with a similarity score close to 1.
search_papers(title='Regulation of steroid sulphatase expression and activity in breast cancer',
              abstract='Steroid sulphatase (STS) catalysis the conversion of oestrone sulphate (E1S) to oestrone (E1) and its action in breast tumours makes a major contribution to in situ oestrogen production in this tissue. Although expression of STS mRNA and STS activity are increased in malignant breast tissues compared with that in non-malignant tissues, little is known about the regulation of its expression or activity. In the present study we have used a RT-PCR technique to investigate the regulation of STS mRNA expression in cultured breast tissue fibroblasts and MCF-7 cells. STS mRNA expression was readily detectable in fibroblasts derived from breast tissue proximal to tumours, breast tumour tissue and reduction mammoplasty tissue. For two pre-menopausal subjects, STS mRNA expression was similar in proximal and tumour fibroblasts whereas for a third, post-menopausal subject, expression in breast tumour fibroblasts was 2.4-fold that in proximal fibroblasts. The cytokine tumour necrosis factor α (TNFα) or the STS inhibitor, 2-methoxyoestrone-3-O-sulphamate, had no effect on STS mRNA expression in fibroblasts. STS mRNA was detectable in MCF-7 cells but neither TNFα nor interleukin 6 (IL-6) affected its expression. Transient transfection of COS-1 and MCF-7 cells with a STS cDNA lacking STS 5′ and 3′ sequences increased activity 17-fold and 2-fold, respectively. TNFα plus IL-6 increased STS activity in mock transfected MCF-7 cells and further increased STS activity in transfected MCF-7 cells. This indicates that activation can occur independently of STS promoter and enhancer elements. In conjunction with the lack of regulation of STS mRNA it suggest that TNFα and IL-6 may increase STS activity via a post-translational modification of the enzyme or by increasing substrate availability. © 2001 Elsevier Science Ltd.')

Paper: Regulation of steroid sulphatase expression and activity in breast cancer
Most similar papers:
[{'corpus_id': 0, 'score': 0.9999999403953552}, {'corpus_id': 2, 'score': 0.7828977704048157}, {'corpus_id': 5, 'score': 0.6106774806976318}, {'corpus_id': 4, 'score': 0.5788747668266296}, {'corpus_id': 8, 'score': 0.5634571313858032}, {'corpus_id': 3, 'score': 0.529052734375}, {'corpus_id': 7, 'score': 0.47814470529556274}, {'corpus_id': 1, 'score': 0.4768742322921753}, {'corpus_id': 9, 'score': 0.4410049021244049}, {'corpus_id': 6, 'score': 0.4281136989593506}]
0.9999999403953552 Regulation of steroid sulphatase expression and activity in breast cancer
0.7828977704048157 Inhibition of steroid sulphatase activity by tricyclic coumarin sulphamates
0.6106774806976318 In vivo analysis of Cajal body movement, separation, and joining in live human cells
0.5788747668266296 Analysing cache effects in distribution sorting
0.5634571313858032 Y chromosome microdeletions in idiopathic azoospermi

# Save Embeddings

In [None]:
# Copy the embeddings to host memory, convert them to a NumPy array explicitly and
# write them to a .npy file in the working directory.
np.save(
    'specter_embeddings.npy',
    corpus_embeddings.cpu().numpy(),
)

In [None]:
# Check the embedding matrix dimensions; the first dimension should match the number
# of rows reported by merged.shape earlier in the notebook.
corpus_embeddings.shape

torch.Size([767968, 768])