In [None]:
import torch
import pandas as pd
from transformers import BertModel
from transformers import BertTokenizer
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
# Single-word ontology terms to probe in the embedding space.
singleList = [
    'obsession',
    'compulsion',
    'urge',
    'thought',
    'obsessive',
    'intrusive',
    'impairment',
]

# Two-word ontology terms; the words of each term are joined by an underscore.
biGramList = [
    'intrusive_thought',
    'compulsive_behavior',
    'aggressive_obsession',
    'sexual_obsession',
    'contamination_obsession',
    'hoarding_obsession',
    'religious_obsession',
    'symmetry_obsession',
    'somatic_obsession',
    'doubt_obsession',
    'mental_act',
]

In [None]:
# Load the tokenizer and the encoder from the same pretrained checkpoint.
BERT_CHECKPOINT = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT, output_hidden_states=True)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def bert_bert_similarities(ontologyClass, Bert_embedding_list, ontology_classes_List, k=20):
    """Return the vocabulary tokens whose static embeddings are closest to a term.

    The term's vector is compared, by cosine similarity, against every row of
    the word-embedding matrix of the notebook-level ``model``; the best matches
    are converted back to token strings with the notebook-level ``tokenizer``.

    Parameters
    ----------
    ontologyClass : str
        The term to look up; it must be present in ``ontology_classes_List``.
    Bert_embedding_list : sequence of torch.Tensor
        One 1-D embedding per term, aligned index-for-index with
        ``ontology_classes_List``.
    ontology_classes_List : list of str
        The terms, in the same order as ``Bert_embedding_list``.
    k : int, optional
        How many nearest tokens to return (default 20). Values larger than the
        vocabulary size are clamped to the vocabulary size.

    Returns
    -------
    list of (float, str)
        ``(similarity rounded to 3 decimals, token)`` pairs, ordered from most
        to least similar.

    Raises
    ------
    ValueError
        If ``ontologyClass`` is not found by ``ontology_classes_List.index``.
    """
    print(ontologyClass)
    print('===============')

    vocab_embeddings = model.embeddings.word_embeddings.weight
    class_index = ontology_classes_List.index(ontologyClass)
    query_vector = Bert_embedding_list[class_index]

    # Only a ranking is needed, so skip autograd bookkeeping.
    with torch.no_grad():
        # The (1, dim) query broadcasts against the (vocab, dim) matrix, so no
        # explicit expand to the vocabulary size is required.
        cos_similarities = torch.nn.functional.cosine_similarity(
            query_vector.unsqueeze(0), vocab_embeddings, dim=1
        )
        # Never ask topk for more entries than there are vocabulary rows.
        top_k = min(k, cos_similarities.size(0))
        top_similarities, top_indices = torch.topk(cos_similarities, k=top_k)

    # Hand plain Python ints to the tokenizer rather than a tensor.
    top_tokens = tokenizer.convert_ids_to_tokens(top_indices.tolist())

    return list(zip([round(score, 3) for score in top_similarities.tolist()], top_tokens))

In [None]:

# Static (context-free) input-embedding table of the loaded model; detached
# because these lookups never need gradients.
embedding_matrix = model.embeddings.word_embeddings.weight.detach()
# Ask the tokenizer for the unknown-token id instead of hard-coding 100.
unk_id = tokenizer.unk_token_id

Bert_embedding_list_single = []  # one vector per entry of singleList, same order
tokens100_single = []            # words with no whole-word entry in the vocabulary

for ontologyTerm in singleList:
    word_vectors = []
    for word in ontologyTerm.lower().split('_'):
        word_index = tokenizer.convert_tokens_to_ids(word)
        if word_index == unk_id:
            # Out-of-vocabulary word: every such word would otherwise receive
            # the same [UNK] vector. Represent it by the mean of its WordPiece
            # sub-token embeddings so distinct words stay distinct.
            tokens100_single.append(word)
            piece_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word)) or [unk_id]
            word_vectors.append(embedding_matrix[piece_ids].mean(dim=0))
        else:
            word_vectors.append(embedding_matrix[word_index])

    # Each word contributes equally; for a one-word term the mean is the
    # word's own vector.
    Bert_embedding_list_single.append(torch.stack(word_vectors).mean(dim=0))

# Sanity check: every term must have exactly one embedding, in list order.
assert len(Bert_embedding_list_single) == len(singleList)

set(tokens100_single), len(set(tokens100_single))

({'compulsion', 'impairment', 'intrusive', 'obsessive'}, 4)

In [None]:
# Nearest vocabulary tokens to the single-word term 'obsession'.
bert_bert_similarities(
    ontologyClass='obsession',
    Bert_embedding_list=Bert_embedding_list_single,
    ontology_classes_List=singleList,
)

obsession


[(1.0, 'obsession'),
 (0.651, 'obsessed'),
 (0.636, 'fascination'),
 (0.588, 'urges'),
 (0.58, 'fantasies'),
 (0.573, 'irrational'),
 (0.57, 'insistence'),
 (0.569, 'preoccupied'),
 (0.567, 'worries'),
 (0.565, 'furiously'),
 (0.564, 'instincts'),
 (0.564, 'madness'),
 (0.563, 'flirting'),
 (0.562, 'instinct'),
 (0.561, '[unused47]'),
 (0.561, 'frenzy'),
 (0.561, '##म'),
 (0.558, 'Nightmare'),
 (0.557, 'psychedelic'),
 (0.557, 'torment')]

In [None]:
# Nearest vocabulary tokens to the single-word term 'urge'.
bert_bert_similarities(
    ontologyClass='urge',
    Bert_embedding_list=Bert_embedding_list_single,
    ontology_classes_List=singleList,
)

urge


[(1.0, 'urge'),
 (0.702, 'urges'),
 (0.573, 'urging'),
 (0.57, 'urged'),
 (0.535, 'urgency'),
 (0.519, 'encourages'),
 (0.51, 'encourage'),
 (0.496, 'invite'),
 (0.49, 'desire'),
 (0.489, 'obsession'),
 (0.483, 'uneasy'),
 (0.482, 'sentiments'),
 (0.479, 'invites'),
 (0.477, 'compelled'),
 (0.475, 'tempted'),
 (0.469, '##ề'),
 (0.469, 'obligation'),
 (0.469, 'temptation'),
 (0.469, 'provoked'),
 (0.467, 'ơ')]

In [None]:
# Nearest vocabulary tokens to the single-word term 'thought'.
bert_bert_similarities(
    ontologyClass='thought',
    Bert_embedding_list=Bert_embedding_list_single,
    ontology_classes_List=singleList,
)

thought


[(1.0, 'thought'),
 (0.594, 'Thought'),
 (0.579, 'believed'),
 (0.454, 'thinks'),
 (0.448, 'think'),
 (0.435, 'thinking'),
 (0.424, 'felt'),
 (0.392, 'deemed'),
 (0.39, 'presumed'),
 (0.376, 'thoughts'),
 (0.358, 'Thinking'),
 (0.356, 'considered'),
 (0.349, 'Think'),
 (0.348, 'said'),
 (0.341, 'believes'),
 (0.332, 'speculated'),
 (0.322, 'regarded'),
 (0.321, 'assumed'),
 (0.314, 'supposed'),
 (0.313, 'suspected')]

In [None]:
# Static (context-free) input-embedding table of the loaded model; detached
# because these lookups never need gradients.
embedding_matrix = model.embeddings.word_embeddings.weight.detach()
# Ask the tokenizer for the unknown-token id instead of hard-coding 100.
unk_id = tokenizer.unk_token_id

Bert_embedding_list_bi = []  # one vector per entry of biGramList, same order
tokens100_bi = []            # words with no whole-word entry in the vocabulary

for ontologyTerm in biGramList:
    word_vectors = []
    for word in ontologyTerm.lower().split('_'):
        word_index = tokenizer.convert_tokens_to_ids(word)
        if word_index == unk_id:
            # Out-of-vocabulary word: every such word would otherwise receive
            # the same [UNK] vector, which makes terms such as
            # 'hoarding_obsession' and 'somatic_obsession' indistinguishable.
            # Use the mean of the word's WordPiece sub-token embeddings instead.
            tokens100_bi.append(word)
            piece_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word)) or [unk_id]
            word_vectors.append(embedding_matrix[piece_ids].mean(dim=0))
        else:
            word_vectors.append(embedding_matrix[word_index])

    # Each word of the term contributes equally to the term vector.
    Bert_embedding_list_bi.append(torch.stack(word_vectors).mean(dim=0))

# Sanity check: every term must have exactly one embedding, in list order.
assert len(Bert_embedding_list_bi) == len(biGramList)

set(tokens100_bi), len(set(tokens100_bi))

({'compulsive', 'hoarding', 'intrusive', 'somatic'}, 4)

In [None]:
# biGramList[0] is 'intrusive_thought'.
bert_bert_similarities(
    ontologyClass=biGramList[0],
    Bert_embedding_list=Bert_embedding_list_bi,
    ontology_classes_List=biGramList,
)

intrusive_thought


[(0.737, '[UNK]'),
 (0.707, 'thought'),
 (0.476, 'Thought'),
 (0.464, 'believed'),
 (0.36, 'regarded'),
 (0.357, 'thinking'),
 (0.352, 'Thinking'),
 (0.35, 'deemed'),
 (0.35, 'thinks'),
 (0.345, 'speculated'),
 (0.342, 'thoughts'),
 (0.337, 'think'),
 (0.335, 'assumed'),
 (0.333, 'felt'),
 (0.333, 'envisioned'),
 (0.332, 'Introduction'),
 (0.33, '西'),
 (0.329, 'considered'),
 (0.328, 'imagined'),
 (0.326, 'ideas')]

In [None]:
# biGramList[1] is 'compulsive_behavior'.
bert_bert_similarities(
    ontologyClass=biGramList[1],
    Bert_embedding_list=Bert_embedding_list_bi,
    ontology_classes_List=biGramList,
)

compulsive_behavior


[(0.778, '[UNK]'),
 (0.718, 'behavior'),
 (0.612, 'behaviors'),
 (0.523, 'behaviour'),
 (0.493, 'behavioral'),
 (0.421, 'behave'),
 (0.384, 'γ'),
 (0.366, 'ρ'),
 (0.365, 'χ'),
 (0.364, 'β'),
 (0.363, 'α'),
 (0.36, 'morphology'),
 (0.358, '##vior'),
 (0.355, '##®'),
 (0.354, 'shapes'),
 (0.352, 'characterization'),
 (0.35, '≈'),
 (0.35, 'ε'),
 (0.348, 'properties'),
 (0.346, 'topography')]

In [None]:
# biGramList[2] is 'aggressive_obsession'.
bert_bert_similarities(
    ontologyClass=biGramList[2],
    Bert_embedding_list=Bert_embedding_list_bi,
    ontology_classes_List=biGramList,
)

aggressive_obsession


[(0.835, 'obsession'),
 (0.76, 'aggressive'),
 (0.634, 'aggressively'),
 (0.609, 'obsessed'),
 (0.602, 'relentless'),
 (0.588, 'aggression'),
 (0.581, 'ruthless'),
 (0.574, 'urges'),
 (0.568, '1803'),
 (0.561, 'oversized'),
 (0.56, 'irrational'),
 (0.56, 'eclectic'),
 (0.558, '##ミ'),
 (0.558, 'Assault'),
 (0.557, 'rebellious'),
 (0.557, 'brutality'),
 (0.555, 'insistence'),
 (0.554, 'ornate'),
 (0.554, '1881'),
 (0.553, 'frantic')]

In [None]:
# biGramList[3] is 'sexual_obsession'.
bert_bert_similarities(
    ontologyClass=biGramList[3],
    Bert_embedding_list=Bert_embedding_list_bi,
    ontology_classes_List=biGramList,
)

sexual_obsession


[(0.822, 'obsession'),
 (0.788, 'sexual'),
 (0.762, 'Sexual'),
 (0.657, 'sexuality'),
 (0.641, 'homosexual'),
 (0.64, '##sexual'),
 (0.637, 'sexually'),
 (0.607, 'erotic'),
 (0.6, 'fantasies'),
 (0.598, 'homosexuality'),
 (0.596, 'sensual'),
 (0.595, 'flirting'),
 (0.588, 'orgasm'),
 (0.586, 'obsessed'),
 (0.583, 'Spiritual'),
 (0.583, 'fascination'),
 (0.579, 'urges'),
 (0.576, 'Romantic'),
 (0.573, 'pornography'),
 (0.572, 'mystical')]

In [None]:
# biGramList[4] is 'contamination_obsession'.
bert_bert_similarities(
    ontologyClass=biGramList[4],
    Bert_embedding_list=Bert_embedding_list_bi,
    ontology_classes_List=biGramList,
)

contamination_obsession


[(0.832, 'obsession'),
 (0.8, 'contamination'),
 (0.624, 'fascination'),
 (0.616, 'contaminated'),
 (0.608, 'obsessed'),
 (0.58, '1764'),
 (0.579, 'domination'),
 (0.576, 'pollution'),
 (0.573, '1895'),
 (0.571, '1761'),
 (0.568, 'littered'),
 (0.567, '##म'),
 (0.566, 'scandal'),
 (0.566, '##明'),
 (0.563, 'riots'),
 (0.56, 'ת'),
 (0.56, '[unused47]'),
 (0.559, '##ミ'),
 (0.559, 'rumours'),
 (0.559, 'disgusted')]

In [None]:
# biGramList[5] is 'hoarding_obsession'.
bert_bert_similarities(
    ontologyClass=biGramList[5],
    Bert_embedding_list=Bert_embedding_list_bi,
    ontology_classes_List=biGramList,
)

hoarding_obsession


[(0.789, 'obsession'),
 (0.728, '[UNK]'),
 (0.546, 'fascination'),
 (0.538, 'obsessed'),
 (0.534, '正'),
 (0.526, '下'),
 (0.523, '山'),
 (0.518, '三'),
 (0.517, '大'),
 (0.517, '宮'),
 (0.515, '一'),
 (0.514, '川'),
 (0.514, '生'),
 (0.513, 'ọ'),
 (0.511, '公'),
 (0.509, '東'),
 (0.509, '方'),
 (0.509, '##म'),
 (0.508, '上'),
 (0.508, '十')]

In [None]:
# biGramList[6] is 'religious_obsession'.
bert_bert_similarities(
    ontologyClass=biGramList[6],
    Bert_embedding_list=Bert_embedding_list_bi,
    ontology_classes_List=biGramList,
)

religious_obsession


[(0.852, 'obsession'),
 (0.8, 'religious'),
 (0.73, 'Religious'),
 (0.66, 'religions'),
 (0.653, 'rituals'),
 (0.639, 'ideological'),
 (0.638, 'mystical'),
 (0.633, 'Spiritual'),
 (0.632, 'theological'),
 (0.629, 'religion'),
 (0.627, 'Religion'),
 (0.624, 'biblical'),
 (0.618, 'spirituality'),
 (0.617, 'theologian'),
 (0.616, 'believers'),
 (0.616, 'Philosophical'),
 (0.615, 'devotees'),
 (0.614, 'irrational'),
 (0.614, 'obsessed'),
 (0.614, 'ecclesiastical')]

In [None]:
# biGramList[7] is 'symmetry_obsession'.
bert_bert_similarities(
    ontologyClass=biGramList[7],
    Bert_embedding_list=Bert_embedding_list_bi,
    ontology_classes_List=biGramList,
)

symmetry_obsession


[(0.833, 'symmetry'),
 (0.818, 'obsession'),
 (0.59, 'obsessed'),
 (0.587, 'symmetrical'),
 (0.58, 'fascination'),
 (0.578, 'rituals'),
 (0.576, '[unused20]'),
 (0.575, 'giggling'),
 (0.57, '##ₚ'),
 (0.568, '1893'),
 (0.563, 'Madness'),
 (0.563, '1881'),
 (0.563, '##社'),
 (0.561, '1796'),
 (0.561, '1862'),
 (0.56, 'octagonal'),
 (0.56, '##明'),
 (0.56, '[unused55]'),
 (0.558, 'puzzles'),
 (0.558, 'fantasies')]

In [None]:
# biGramList[8] is 'somatic_obsession'.
bert_bert_similarities(
    ontologyClass=biGramList[8],
    Bert_embedding_list=Bert_embedding_list_bi,
    ontology_classes_List=biGramList,
)

somatic_obsession


[(0.789, 'obsession'),
 (0.728, '[UNK]'),
 (0.546, 'fascination'),
 (0.538, 'obsessed'),
 (0.534, '正'),
 (0.526, '下'),
 (0.523, '山'),
 (0.518, '三'),
 (0.517, '大'),
 (0.517, '宮'),
 (0.515, '一'),
 (0.514, '川'),
 (0.514, '生'),
 (0.513, 'ọ'),
 (0.511, '公'),
 (0.509, '東'),
 (0.509, '方'),
 (0.509, '##म'),
 (0.508, '上'),
 (0.508, '十')]

In [None]:
# biGramList[9] is 'doubt_obsession'.
bert_bert_similarities(
    ontologyClass=biGramList[9],
    Bert_embedding_list=Bert_embedding_list_bi,
    ontology_classes_List=biGramList,
)

doubt_obsession


[(0.796, 'doubt'),
 (0.791, 'obsession'),
 (0.682, 'doubts'),
 (0.634, 'doubted'),
 (0.615, 'dispute'),
 (0.594, 'disputes'),
 (0.592, 'worries'),
 (0.584, 'suspicions'),
 (0.576, 'obsessed'),
 (0.57, 'fascination'),
 (0.568, '1862'),
 (0.565, '1897'),
 (0.564, 'denying'),
 (0.562, 'myths'),
 (0.561, 'objections'),
 (0.559, '##म'),
 (0.559, '1881'),
 (0.559, 'resentment'),
 (0.557, 'disagreements'),
 (0.556, 'desperation')]

In [None]:
# biGramList[10] is 'mental_act'.
bert_bert_similarities(
    ontologyClass=biGramList[10],
    Bert_embedding_list=Bert_embedding_list_bi,
    ontology_classes_List=biGramList,
)

mental_act


[(0.731, 'mental'),
 (0.728, 'act'),
 (0.567, 'Mental'),
 (0.502, 'Act'),
 (0.496, 'acts'),
 (0.46, 'mentally'),
 (0.442, 'Acts'),
 (0.44, 'acted'),
 (0.394, 'Acting'),
 (0.386, 'acting'),
 (0.366, 'psychological'),
 (0.356, 'psychiatric'),
 (0.345, '1794'),
 (0.345, 'emotional'),
 (0.343, 'Psychological'),
 (0.343, 'insanity'),
 (0.34, '1804'),
 (0.337, '1788'),
 (0.337, '1796'),
 (0.336, 'functioned')]