In [8]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import umap
import os
import json 

In [4]:
model_version = 'c:/Users/aadam/scibert_scivocab_uncased'
do_lower_case = True
model = BertModel.from_pretrained(model_version)
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)

In [5]:
def embed_text(text, model):
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)  # Batch size 1
    outputs = model(input_ids)
    last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    return last_hidden_states 

def get_similarity(em, em2):
    return cosine_similarity(em.detach().numpy(), em2.detach().numpy())

In [13]:
# We will use a mean of all word embeddings. To do that we will take mean over dimension 1 which is the sequence length.
coronavirus_em = embed_text("", model).mean(1)
mers_em = embed_text("Middle East Respiratory Virus", model).mean(1)
flu_em = embed_text("vefvsdvnjkafdnvkjfdln", model).mean(1)
bog_em = embed_text("Bog", model).mean(1)
covid_2019 = embed_text("COVID-2019", model).mean(1)
print("Similarity for Your Mom and Flu:" + str(get_similarity(coronavirus_em, flu_em)))
print("Similarity for Coronavirus and MERs:" + str(get_similarity(coronavirus_em, mers_em)))
print("Similarity for Coronavirus and COVID-2019:" + str(get_similarity(coronavirus_em, covid_2019)))
print("Similarity for Coronavirus and Bog:" + str(get_similarity(coronavirus_em, bog_em)))

Similarity for Your Mom and Flu:[[0.5256629]]
Similarity for Coronavirus and MERs:[[0.5779236]]
Similarity for Coronavirus and COVID-2019:[[0.72343904]]
Similarity for Coronavirus and Bog:[[0.73704994]]


In [6]:
reducer = umap.UMAP()

def make_the_embeds(number_files, start_range=0, 
                    the_path="C:/Users/aadam/Desktop/sciBERT/Data/100_papers_scibert_opv.json"):
    json_file = json.load(open(the_path))
    title_embedding_list = [] 
    title_list = []

    for i in json_file["search-results"]["entry"]:
        title = i["dc:title"]
        title_embedding_list.append(embed_text(title, model).mean(1))
        title_list.append(title)
    return torch.cat(title_embedding_list, dim = 0), title_list
    
embed_list, title_list = make_the_embeds(200)
red = reducer.fit_transform(embed_list.detach().numpy())#

Hi
Study of p-type doping effect on P3HT: ICBA based organic photovoltaic solar cell performance
Influence of active layer thickness on photovoltaic performance of PTB7:PC70BM bulk heterojunction solar cell
Analysis of the GaSb-p<ce:supÂ loc=post>+</ce:sup>/GaSb-p/GaSb-n<ce:supÂ loc=post>+</ce:sup>/GaSb-n structure performances at room temperature, for thermo-photovoltaic applications
Quasi-white light emission involving FÃ¶rster resonance energy transfer in a new organic inorganic tin chloride based material (AMPS)[SnCl<ce:infÂ loc=post>6</ce:inf>]H<ce:infÂ loc=post>2</ce:inf>O
Life-cycle assessment of cradle-to-grave opportunities and environmental impacts of organic photovoltaic solar panels compared to conventional technologies
Advances in approaches and methods for self-cleaning of solar photovoltaic panels
Adaptive Genetic Algorithm Based Multi-Objective Optimization for Photovoltaic Cell Design Parameter Extraction
A review of photovoltaic module technologies for increased perfo

In [12]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper
from bokeh.palettes import Spectral10, Category20c
from bokeh.palettes import magma
import pandas as pd
output_notebook()

In [13]:
def make_plot(red, title_list, number=200, color = True, color_mapping_cat=None, color_cats = None, bg_color="white"):   
    digits_df = pd.DataFrame(red, columns=('x', 'y'))
    if color_mapping_cat:
        digits_df['colors'] = color_mapping_cat
    digits_df['digit'] = title_list
    datasource = ColumnDataSource(digits_df)
    plot_figure = figure(
    title='UMAP projection of the article title embeddings',
    plot_width=890,
    plot_height=600,
    tools=('pan, wheel_zoom, reset'),
    background_fill_color = bg_color
    )
    plot_figure.legend.location = "top_left",
    plot_figure.add_tools(HoverTool(tooltips="""
    <div>
    <div>
        <img src='@image' style='float: left; margin: 5px 5px 5px 5px'/>
    </div>
    <div>
        <span style='font-size: 10px; color: #224499'></span>
        <span style='font-size: 10px'>@digit</span>
    </div>
    </div>
    """))
    if color:   
        color_mapping = CategoricalColorMapper(factors=title_list, palette=magma(number))
        plot_figure.circle(
            'x',
            'y',
            source=datasource,
            color=dict(field='digit', transform=color_mapping),
            line_alpha=0.6,
            fill_alpha=0.6,
            size=7
        )
        show(plot_figure)
    elif color_mapping_cat:
        color_mapping = CategoricalColorMapper(factors=color_cats, palette=magma(len(color_cats)+2)[2:])
        plot_figure.circle(
            'x',
            'y',
            source=datasource,
            color=dict(field='colors', transform=color_mapping),
            line_alpha=0.6,
            fill_alpha=0.6,
            size=8,
            legend_field='colors'
        )
        show(plot_figure)
    else:
        
        plot_figure.circle(
            'x',
            'y',
            source=datasource,
            color=dict(field='digit'),
            line_alpha=0.6,
            fill_alpha=0.6,
            size=7
        )
        show(plot_figure)
    
make_plot(red, title_list, number=200)

In [18]:
embed_list2, title_list2 = make_the_embeds(401, 201)
red2 = reducer.fit_transform(embed_list.detach().numpy())
print(len(title_list2))
make_plot(red2, title_list2, number=198)

Hi
Study of p-type doping effect on P3HT: ICBA based organic photovoltaic solar cell performance
Influence of active layer thickness on photovoltaic performance of PTB7:PC70BM bulk heterojunction solar cell
Analysis of the GaSb-p<ce:supÂ loc=post>+</ce:sup>/GaSb-p/GaSb-n<ce:supÂ loc=post>+</ce:sup>/GaSb-n structure performances at room temperature, for thermo-photovoltaic applications
Quasi-white light emission involving FÃ¶rster resonance energy transfer in a new organic inorganic tin chloride based material (AMPS)[SnCl<ce:infÂ loc=post>6</ce:inf>]H<ce:infÂ loc=post>2</ce:inf>O
Life-cycle assessment of cradle-to-grave opportunities and environmental impacts of organic photovoltaic solar panels compared to conventional technologies
Advances in approaches and methods for self-cleaning of solar photovoltaic panels
Adaptive Genetic Algorithm Based Multi-Objective Optimization for Photovoltaic Cell Design Parameter Extraction
A review of photovoltaic module technologies for increased perfo

Search attempt on titles:

In [15]:
import collections
q1 = "organic photovoltaics"
search_terms = embed_text(q1, model).mean(1)

In [16]:
def top_n_closest(search_term_embedding, title_embeddings, original_titles, n=10):
    proximity_dict = {}
    i = 0 
    for title_embedding in title_embeddings:
        proximity_dict[original_titles[i]] = {"score": get_similarity(title_embedding.unsqueeze(0),search_term_embedding), 
                                              "title_embedding":title_embedding.unsqueeze(0)}
        i+=1
    order_dict = collections.OrderedDict({k: v for k, v in sorted(proximity_dict.items(), key=lambda item: item[1]["score"])})
    proper_list = list(order_dict.keys())[-n:]
    return proper_list, order_dict

In [19]:
top_titles, order_dict = top_n_closest(search_terms, embed_list2, title_list+title_list2)

top_titles

['Organic solar cells characterized by dark lock-in thermography',
 'Highly efficient betanin dye based ZnO and ZnO/Au Schottky barrier solar cell',
 'Polymer-Ceramic Nanocomposites and Converging Technologies',
 'Electrochemical copolymerization of thiophene derivatives; a precursor to photovoltaic devices',
 'Plasmon enhanced photovoltaic effect in metallically nanomodified photocells',
 'IIe-1: Photoelectrochemical Solar Cells',
 'IIe-1: Photoelectrochemical solar cells',
 'Recent developments in thin film solar cells',
 'Solid state dye solar cell modules',
 'Photovoltaic Devices from Organic Semiconductors']