In [1]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import umap
import os
import json 

In [2]:
# Directory holding the pretrained weights and vocabulary (the default path's
# name indicates SciBERT, scivocab, uncased).
# Set the SCIBERT_MODEL_PATH environment variable to load from somewhere else;
# the original local directory stays as the fallback so existing setups keep
# working without any change.
model_version = os.environ.get(
    'SCIBERT_MODEL_PATH', 'c:/Users/aadam/scibert_scivocab_uncased'
)
# Forwarded to the tokenizer so input text is lower-cased before tokenization.
do_lower_case = True
model = BertModel.from_pretrained(model_version)
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)

In [3]:
def embed_text(text, model):
    """Run ``text`` through ``model`` and return its last hidden state.

    The text is tokenized with the module-level ``tokenizer`` and fed to the
    model as a single-item batch.

    Args:
        text: The string to embed.
        model: A callable invoked as ``model(input_ids)`` whose output can be
            indexed with ``[0]`` to obtain the last hidden state.

    Returns:
        The first element of the model output. For a BERT-style model this is
        presumably shaped (1, num_tokens, hidden_size) -- TODO confirm.

    NOTE(review): no truncation is requested from the tokenizer, so very long
    texts may exceed the model's maximum sequence length -- confirm inputs
    stay within that limit.
    """
    # unsqueeze(0) adds the leading batch dimension (batch size 1).
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
    # Embeddings are only used for inference here, so skip autograd tracking;
    # otherwise every call keeps a full computation graph alive in memory.
    with torch.no_grad():
        outputs = model(input_ids)
    # The last hidden state is the first element of the output tuple.
    return outputs[0]

def get_similarity(em, em2):
    """Return the cosine similarity between two batches of embeddings.

    Args:
        em: Tensor of embeddings; each row is compared against each row of
            ``em2``.
        em2: Tensor of embeddings with the same feature size as ``em``.

    Returns:
        Whatever ``cosine_similarity`` returns for the two converted arrays
        (printed elsewhere in this notebook as a nested ``[[value]]`` array
        when each input has a single row).
    """
    # detach() drops any autograd history; cpu() is a no-op for tensors that
    # are already on the CPU and prevents numpy() from raising for tensors
    # that live on another device.
    first = em.detach().cpu().numpy()
    second = em2.detach().cpu().numpy()
    return cosine_similarity(first, second)

In [5]:
# Sample paper abstracts used as inputs for the embedding-similarity comparison.
# abs1: polymer solar cells (referred to as "semiconductors" in the score labels).
abs1 = "The polymer solar cell (PSC) technology has continued to be developed, and the power conversion efficiency (PCE) has now exceeded 10%. The rapid improvement of PCEs in the last decade has mainly resulted from versatile synthetic efforts for conjugated polymers as electron-donors and fullerene derivatives as electron-acceptors. This Feature Article highlights recent exploration of unique, attractive building blocks, i.e., quinoidal units, phospholes, porphyrins, and fluorinated aromatic rings, which can be incorporated into low bandgap conjugated polymers. As candidates for the next-generation acceptor materials that replace the benchmark acceptor, [6,6]-phenyl-C61-butyric acid methyl ester ([60]PCBM), fullerene bisadduct regioisomers are also overviewed. Furthermore, we summarized recent attempts for the construction of one-dimensionally confined, organic donor–acceptor heterojunction nanorods and their applications to photovoltaic and optoelectronic devices. The topics in this article are not intended to cover an exhaustive list of PSC research studies, but involve the fundamental aspect to stimulate further studies for getting new insights into the structure–property relationship in PSC devices."
# abs2: review of organic semiconductors and organic photovoltaics.
abs2 = "In recent years, organic semiconductors have emerged as a promising and, in some situations, viable commercial alternative to traditional inorganic materials such as silicon. Organic-based light emitting diodes, photovoltaic devices, photodetectors, and transistors have attracted intense interest in the scientific community. In this review, we first present a discussion of the fundamental electronic nature of organic semiconductors, processing techniques, and their application to two main classes of optoelectronic devices, light emitting diodes, and photovoltaics. The second part of the review introduces organic photovoltaics in depth, including their operation principles, development history, current state of the art, and routes for further improvement."
# abs3: pure mathematics (character varieties); the "math" abstract in the score labels.
abs3 = "We study the (relative) character variety of the three-holed projective plane and the action of the mapping class group on it. We describe a domain of discontinuity for this action, which strictly contains the set of primitive stable representations defined by Minsky, and also the set of convex-cocompact characters."
# abs4: amorphous silicon germanium thin-film solar cells.
abs4 = "This work studies hydrogenated amorphous silicon germanium films, deposited by hot wire chemical vapor deposition, to be used as low band gap absorber material in thin film solar cells. Material properties, such as the bonding configurations, the ambipolar diffusion length and the optical band gap, were examined as a function of the substrate temperature and germanium content. Our best materials were incorporated in single junction solar cells with high long-wavelength response and a tandem solar cell with an efficiency of 10.42%."
# abs5: thin-film CdS/CdTe heterojunction solar cells.
abs5 = "This letter describes the fabrication and characteristics of high‐efficiency thin‐film CdS/CdTe heterojunction solar cells. CdS films have been prepared by chemical bath deposition and p‐CdTe films have been deposited by close‐spaced sublimation. A CdS/CdTe solar cell of greater than 1 cm2 area with an AM1.5 efficiency of 15.8 is reported."
# Order matters: the comparison code refers to these abstracts by position (0-4).
abstract_list = [abs1, abs2, abs3, abs4, abs5]


In [12]:
# Average each abstract's embed_text output over dim 1 to get one vector per
# abstract, then concatenate those vectors along dim 0 so that row i of
# abstract_embedding corresponds to abstract_list[i].
pooled_vectors = [embed_text(document, model).mean(1) for document in abstract_list]
abstract_embedding = torch.cat(pooled_vectors, dim=0)

# Each entry: (printed label, row of the first abstract, row of the second abstract).
score_pairs = (
    ("Score for abstracts about semiconductors:", 0, 1),
    ("Score for abstract about semiconductors vs math:", 0, 2),
    ("Score for another abstract about semiconductors vs math:", 1, 2),
    ("Score for abstract about thin films vs math:", 3, 2),
    ("Score for abstracts about thin films:", 3, 4),
    ("Score for abstract about semiconductors vs thin films:", 1, 4),
)

for label, first_row, second_row in score_pairs:
    # unsqueeze(0) restores the leading dimension removed by indexing a
    # single row, so get_similarity receives two 2-D inputs.
    left = abstract_embedding[first_row].unsqueeze(0)
    right = abstract_embedding[second_row].unsqueeze(0)
    print(label, get_similarity(left, right))

Score for abstracts about semiconductors: [[0.9338739]]
Score for abstract about semiconductors vs math: [[0.75004995]]
Score for another abstract about semiconductors vs math: [[0.7380649]]
Score for abstract about thin films vs math: [[0.7706391]]
Score for abstracts about thin films: [[0.93154097]]
Score for abstract about semiconductors vs thin films: [[0.8610289]]
