In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.10.0-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 5.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 43.1 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 6.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 42.8 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 23.0 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
  

In [None]:
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances, linear_kernel
import pandas as pd
import numpy as np
from numpy import genfromtxt
import torch
import math

## Define model

In [None]:
# Checkpoint name shared by the tokenizer and the encoder loaded below.
#model_name = 'sentence-transformers/bert-base-nli-mean-tokens' # alternative: uncomment to use this BERT checkpoint instead
model_name = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Load categories list and their embeddings

(Build a look-up table of category embeddings that were obtained with ClinicalBERT.)

In [None]:
# Category list; the CSV is read without a header row, so its columns are labelled 0, 1, ...
url = 'https://raw.githubusercontent.com/casszhao/FAIR/main/0901_full_list.csv'
sorted_cat = pd.read_csv(url, header=None)

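Build a dictionary that maps each position in the category list to its category name; it is used later to look up the names of the most similar embeddings.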

In [None]:
# Take column 0 of the loaded table and drop duplicate names, keeping first-seen order.
# NOTE: this rebinds `sorted_cat` from a DataFrame to a list, so the loading cell
# has to be run again before this cell can be re-run.
sorted_cat = list(dict.fromkeys(sorted_cat[0].to_list()))

# Lower-case every category name.
a = (name.lower() for name in sorted_cat)
lower_cat = list(a)

# Position in the list -> lower-cased category name.
dic = dict(enumerate(lower_cat))

In [None]:
!wget https://github.com/casszhao/FAIR/raw/main/candidate_emebdding.zip
!unzip candidate_emebdding.zip
pooled_emb_array = genfromtxt('candidate_emebdding.csv', delimiter=',')
pooled_emb_array

--2021-09-07 08:10:02--  https://github.com/casszhao/FAIR/raw/main/candidate_emebdding.zip
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/casszhao/FAIR/main/candidate_emebdding.zip [following]
--2021-09-07 08:10:02--  https://raw.githubusercontent.com/casszhao/FAIR/main/candidate_emebdding.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16853560 (16M) [application/zip]
Saving to: ‘candidate_emebdding.zip’


2021-09-07 08:10:03 (122 MB/s) - ‘candidate_emebdding.zip’ saved [16853560/16853560]

Archive:  candidate_emebdding.zip
  inflating: candidate_emebdding.csv  


array([[ 0.19291636,  0.20812984, -0.09044071, ..., -0.06806536,
         0.17539963, -0.10158916],
       [-0.13815784,  0.10289687,  0.2408838 , ...,  0.08752791,
        -0.06546359, -0.35536486],
       [ 0.41229472,  0.24765727,  0.25566274, ...,  0.10910136,
         0.14033268, -0.01267239],
       ...,
       [ 0.31545714, -0.07819685, -0.03867328, ...,  0.26055223,
         0.13849889,  0.0446896 ],
       [ 0.25228524,  0.44211665,  0.09751181, ...,  0.21495025,
        -0.12337268,  0.06839384],
       [ 0.30591077,  0.37866551, -0.16940035, ...,  0.10000668,
        -0.03836991, -0.10729711]])

## Given any topic, get the top 20 most similar categories

In [None]:
# Free-text topic to look up; the trailing #@param annotation exposes it as a form field.
topic = "trouble sleeping" #@param {type:"string"}

# Other example inputs: Alzheimer's disease / Parkinson's disease / Social relation

In [None]:
def get_request_array(request, MAX_TOKEN):
  """Embed a piece of text as one mean-pooled vector.

  The text is tokenized with the notebook-level `tokenizer` (truncated or
  padded to exactly MAX_TOKEN tokens), run through the notebook-level `model`,
  and the last-layer token embeddings are averaged over the positions the
  attention mask marks as real tokens, so padding does not dilute the mean.

  Args:
    request: the text to embed (a single string).
    MAX_TOKEN: sequence length the input is truncated/padded to.

  Returns:
    A NumPy array of shape (1, hidden_size) holding the pooled embedding.
  """
  # Calling the tokenizer directly replaces the deprecated `encode_plus`.
  request_token = tokenizer(request, max_length=MAX_TOKEN,
                            truncation=True, padding='max_length',
                            return_tensors='pt')

  # Inference only: no_grad avoids building an autograd graph for the forward pass.
  with torch.no_grad():
    request_outputs = model(**request_token)
  request_embeddings = request_outputs.last_hidden_state

  # Broadcast the attention mask over the hidden dimension so it can zero out padding.
  request_mask = request_token['attention_mask'].unsqueeze(-1).expand(request_embeddings.size()).float()
  request_summed = torch.sum(request_embeddings * request_mask, 1)
  # Clamp the token count so an all-zero mask cannot cause a division by zero.
  request_summed_mask = torch.clamp(request_mask.sum(1), min=1e-9)
  request_mean_pooled = request_summed / request_summed_mask
  return request_mean_pooled.numpy()

In [None]:
# Embed the chosen topic, truncating/padding it to 20 tokens.
topic_array = get_request_array(request=topic, MAX_TOKEN=20)

## cosine similarity

Input a topic embedding array and get the categories whose embeddings are most similar to it by cosine similarity.

In [None]:
def cosine_simi_list_for_one(topic_array, top_k=20):
  """Return the categories most cosine-similar to a topic embedding.

  Compares `topic_array` against every row of the notebook-level
  `pooled_emb_array` and maps the best-scoring row indices to names through
  the notebook-level `dic`.

  Args:
    topic_array: 2-D array whose first row is the topic embedding.
    top_k: number of categories to return (default 20). The previous
      implementation sliced `[:21]` and so returned 21 items despite being
      documented as "top 20".

  Returns:
    A list of up to `top_k` category names, most similar first. An index with
    no entry in `dic` is returned as the index itself.
  """
  simi_scores = cosine_similarity(topic_array, pooled_emb_array)[0]
  # Stable ascending argsort, reversed: descending similarity with the same
  # tie order as sorting the indices and then reversing the list.
  ranked_index = np.argsort(simi_scores, kind='stable')[::-1][:top_k].tolist()
  return [dic.get(i, i) for i in ranked_index]

In [None]:
# Rank the categories by cosine similarity to the topic; the bare last line displays the list.
simi20_for_onewords = cosine_simi_list_for_one(topic_array)
simi20_for_onewords

['sore throat',
 'cold sore',
 'food poisoning',
 'binge eating',
 'vertigo',
 'chest pain',
 'deforestation',
 'diarrhea',
 'food contamination',
 'sleeping sickness commission',
 'developing countries',
 'substance abuse',
 'water contamination',
 'health disasters',
 'soil pollution',
 'sugary drink tax',
 'dystonia',
 'social relation',
 'gender role',
 'scabies',
 'asbestosis']

## euclidean_distances

In [None]:
# A smaller Euclidean distance means a closer (more similar) category.
def eucli_distance_list_for_one(topic_array, top_k=20):
  """Return the categories nearest to a topic embedding by Euclidean distance.

  Compares `topic_array` against every row of the notebook-level
  `pooled_emb_array` and maps the nearest row indices to names through the
  notebook-level `dic`.

  Args:
    topic_array: 2-D array whose first row is the topic embedding.
    top_k: number of categories to return (default 20). The previous
      implementation sliced `[:21]` and so returned 21 items.

  Returns:
    A list of up to `top_k` category names, nearest first. An index with no
    entry in `dic` is returned as the index itself.
  """
  distances = euclidean_distances(topic_array, pooled_emb_array)[0]
  # Stable ascending argsort: nearest first, ties kept in index order.
  ranked_index = np.argsort(distances, kind='stable')[:top_k].tolist()
  return [dic.get(i, i) for i in ranked_index]

In [None]:
# Rank the categories by Euclidean distance to the topic instead.
# NOTE: this rebinds `simi20_for_onewords`, replacing the cosine-similarity ranking.
simi20_for_onewords = eucli_distance_list_for_one(topic_array)
simi20_for_onewords

['sore throat',
 'cold sore',
 'food poisoning',
 'binge eating',
 'vertigo',
 'chest pain',
 'deforestation',
 'diarrhea',
 'food contamination',
 'sleeping sickness commission',
 'developing countries',
 'substance abuse',
 'dystonia',
 'health disasters',
 'sugary drink tax',
 'soil pollution',
 'asbestosis',
 'gender role',
 'scabies',
 'water contamination',
 'hookworm infection']