# Information Retrieval with BERT


In [1]:
#import libraries
from transformers import BertTokenizer, BertModel

import numpy as np # for vector operations
from sklearn.metrics.pairwise import cosine_similarity # for cosine similarity

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Create the tokenizer and the model from the same pretrained checkpoint
model_name = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(model_name) # load the tokenizer matching the checkpoint
model = BertModel.from_pretrained(model_name) # load the pretrained BERT model


In [10]:
# Build a toy corpus: nine sentences sharing the "X is a subset of Y." template,
# plus one sentence about homework -- the only document that mentions the
# topic of the query defined below.
documents = [
    "Machine Learning is a subset of Artificial Intelligence.",
    "Natural Language Processing is a subset of Artificial Intelligence.",
    "Deep Learning is a subset of Machine Learning.",
    "Artificial Intelligence is a subset of Computer Science.",
    "Computer Science is a subset of Science.",
    "Science is a subset of Knowledge.",
    "Knowledge is a subset of Information.",
    "Information is a subset of Data.",
    "Data is a subset of Knowledge.",
    "I am doing my homework."
]

# Query whose similarity to each document will be scored
query = "What is homework?"

In [11]:
# get embedding function
def get_embedding(text):
    """Return a mean-pooled BERT embedding of ``text`` as a NumPy array.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        text: A single string, or a list of strings that is embedded as one
            padded batch.

    Returns:
        A float NumPy array with one row per input text (a single row when
        ``text`` is one string); each row is the average of that text's
        token vectors from the model's last hidden state.
    """
    import torch  # local import: the notebook's import cell does not import torch

    # tokenization; padding only has an effect when a list of texts is passed
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # run the model in inference mode: no autograd graph is built for a
    # forward pass whose gradients are never used
    with torch.no_grad():
        outputs = model(**inputs)

    # get last hidden state (one vector per token position)
    last_hidden_state = outputs.last_hidden_state

    # create text representation: average over real tokens only.
    # The attention mask is 1 for real tokens and 0 for padding, so padded
    # positions no longer dilute the mean when texts of different lengths are
    # batched. For a single string the mask is all ones and this equals the
    # plain mean over the sequence dimension (up to float rounding).
    mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden_state.dtype)
    token_sum = (last_hidden_state * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embedding = token_sum / token_count

    # return vector as numpy array
    return embedding.detach().numpy()

In [12]:
# Embed each document on its own call, then stack the per-document rows
# into a single matrix (one row per document).
doc_embeddings = np.vstack(list(map(get_embedding, documents)))

# Embed the query with the same function so both live in the same vector space.
query_embedding = get_embedding(query)

# Cosine similarity of the query against every document row.
similarities = cosine_similarity(X=query_embedding, Y=doc_embeddings)

In [13]:
# Display the raw similarity matrix: one row for the query, one column per document
similarities

array([[0.53380096, 0.51289576, 0.48994267, 0.55721414, 0.591884  ,
        0.5880113 , 0.548114  , 0.5134174 , 0.53442657, 0.72444445]],
      dtype=float32)

In [14]:
# Print every document followed by its cosine similarity to the query,
# in corpus order (row 0 of the matrix holds the scores for the single query).
for doc, score in zip(documents, similarities[0]):
    print(f"Document: {doc} \n{score}")

Document: Machine Learning is a subset of Artificial Intelligence. 
0.5338009595870972
Document: Natural Language Processing is a subset of Artificial Intelligence. 
0.5128957629203796
Document: Deep Learning is a subset of Machine Learning. 
0.48994266986846924
Document: Artificial Intelligence is a subset of Computer Science. 
0.5572141408920288
Document: Computer Science is a subset of Science. 
0.5918840169906616
Document: Science is a subset of Knowledge. 
0.5880113244056702
Document: Knowledge is a subset of Information. 
0.548114001750946
Document: Information is a subset of Data. 
0.5134174227714539
Document: Data is a subset of Knowledge. 
0.5344265699386597
Document: I am doing my homework. 
0.7244444489479065
