In [50]:
import json
import random
import pyperclip
import torch
import numpy as np
import itertools
import operator
from dataclasses import dataclass
from transformers import AutoTokenizer, AutoModel
from seqlbtoolkit.embs import build_bert_token_embeddings
from seqlbtoolkit.text import split_overlength_bert_input_sequence

from tokenizations import get_alignments
from open_ner.base.dataset import load_data_from_json


In [39]:
# Load the pretrained tokenizer that every later cell uses to split words into wordpieces.
# NOTE(review): `add_prefix_space` is normally an option of byte-level BPE tokenizers;
# confirm whether it has any effect for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert-base-uncased',
    add_prefix_space=True,
)

In [57]:
# Example passage, already split into word-level tokens (one string per word or punctuation mark).
# The cells below pass it to the tokenizer with `is_split_into_words=True`.
tks = ['The', 'HOMO-LUMO', '(', 'HL', ')', 'gap', 'of', 'each', 'structure', 'calculated', 'using', 'the', 'hybrid', 'B3LYP', 'functional', 'is', 'notably', 'smaller', ',', 'by', '∼', '0.4', 'eV', ',', 'than', 'that', 'using', 'the', 'meta-hybrid', 'M06', 'functional', ',', 'however', ',', 'the', 'calculated', 'optical', 'gaps', 'are', 'only', 'marginally', 'smaller', ',', 'with', 'a', 'difference', 'of', '∼', '0.1', 'eV', '.', 'In', 'Table', '2', ',', 'we', 'also', 'provide', 'the', 'character', 'of', 'the', 'first', 'allowed', 'excitations', 'only', 'for', 'contributions', 'larger', 'than', '4', '%', '.', 'The', 'first', 'excitation', ',', 'as', 'calculated', 'by', 'each', 'of', 'the', 'functional', 'for', 'all', 'three', 'structures', ',', 'clearly', 'exhibits', 'a', 'single', '-', 'configuration', 'character', '.', 'In', 'Fig.', '7', ',', 'we', 'have', 'plotted', 'the', 'isosurfaces', '(', 'isovalue', '=', '0.02', ')', 'of', 'the', 'HOMO', 'and', 'LUMO', 'for', 'both', 'structures', '.', 'In', 'both', 'cases', 'the', 'HOMO', 'extends', 'evenly', 'over', 'the', 'main', 'body', '.', 'For', 'the', 'LUMO', 'of', 'each', 'structure', 'the', 'delocalizations', 'are', 'also', 'similar', '.', 'The', 'LUMO', 'of', 'P1', 'and', 'P2', 'extends', 'over', 'the', 'main', 'structure', 'but', 'considerably', 'more', 'over', 'the', 'triazole', 'group', 'than', 'in', 'the', 'case', 'of', 'the', 'respective', 'HOMOs', '.', 'To', 'quantify', 'the', 'contributions', 'of', 'the', 'moieties', 'to', 'the', 'frontier', 'orbitals', 'we', 'have', 'calculated', 'the', 'total', 'and', 'partial', 'density', 'of', 'states', '(', 'PDOS', ').', 'The', 'PDOSs', 'for', 'P1', 'and', 'P2', 'are', 'shown', 'in', 'Fig.', 'S6', '(', 'ESI', '†', ').', 'We', 'partition', 'all', 'of', 'the', 'structures', 'into', 'the', 'silolodithiophene', '(', 'SDT', ')', 'and', 'fluorobenzotriazole', '(', 'FBT', ')', 'moieties', '.', 'As', 'expected', ',', 'structures', 'P1', 'and', 'P2', 'have', 'significant', 
'similarities', 'on', 'the', 'delocalization', 'of', 'the', 'frontier', 'orbitals', '.']

In [41]:
# Wordpiece strings for the pre-split passage; special tokens are requested explicitly.
bert_tks = tokenizer.tokenize(
    tks,
    is_split_into_words=True,
    add_special_tokens=True,
)

In [52]:
# Goal: find the words in `tks` that produce no wordpiece at all and replace them
# with the tokenizer's unknown token, so every word maps to at least one wordpiece.

IGNORE_ID = -100  # sentinel for "not the first wordpiece of a word"; any value < 0 works

# `word_ids()` gives, for each wordpiece, the index of the word in `tks` it came from;
# entries that come from no word (e.g. special tokens) are None.
# `list(tks)` keeps the cell re-runnable if `tks` is not a plain list in a stale kernel.
raw_word_ids = tokenizer(list(tks), is_split_into_words=True).word_ids(batch_index=0)
ori_tk_ids = np.arange(len(tks))

# Map None to the sentinel up front so every array below has an integer dtype and
# no element-wise comparison against None is needed.
word_ids = np.asarray(
    [IGNORE_ID if wid is None else wid for wid in raw_word_ids],
    dtype=np.int64,
)

# Despite its name, this holds the word id of the *previous* wordpiece
# (a shift to the right by one); the name is kept for any later cell that uses it.
word_ids_shifted_left = np.concatenate(([IGNORE_ID], word_ids[:-1]))

# A wordpiece starts a word when its word id differs from the previous piece's id
# and it belongs to a real word.
is_first_wordpiece = (word_ids_shifted_left != word_ids) & (word_ids != IGNORE_ID)
word_ids[~is_first_wordpiece] = IGNORE_ID

# Word indices that never show up in `word_ids` produced no wordpiece.
dropped_tk_ids = np.setdiff1d(ori_tk_ids, word_ids)

# Keep `tks` a plain list of str:
#   * a fixed-width NumPy string array would silently truncate the unk token when
#     every word is shorter than it;
#   * the next cell passes `tks` back to the tokenizer, which expects a list of
#     strings for pre-tokenized input.
tks = list(tks)
for tk_id in dropped_tk_ids:
    tks[tk_id] = tokenizer.unk_token


In [58]:
# Encode the pre-split passage; the result is inspected in the next cell.
enc = tokenizer(
    tks,
    is_split_into_words=True,
)

In [60]:
# Display the input ids produced for the passage.
enc["input_ids"]

[101,
 1996,
 24004,
 1011,
 11320,
 5302,
 1006,
 1044,
 2140,
 1007,
 6578,
 1997,
 2169,
 3252,
 10174,
 2478,
 1996,
 8893,
 1038,
 2509,
 2135,
 2361,
 8360,
 2003,
 5546,
 3760,
 1010,
 2011,
 100,
 1014,
 1012,
 1018,
 23408,
 1010,
 2084,
 2008,
 2478,
 1996,
 18804,
 1011,
 8893,
 1049,
 2692,
 2575,
 8360,
 1010,
 2174,
 1010,
 1996,
 10174,
 9380,
 16680,
 2024,
 2069,
 14785,
 2135,
 3760,
 1010,
 2007,
 1037,
 4489,
 1997,
 100,
 1014,
 1012,
 1015,
 23408,
 1012,
 1999,
 2795,
 1016,
 1010,
 2057,
 2036,
 3073,
 1996,
 2839,
 1997,
 1996,
 2034,
 3039,
 4654,
 26243,
 10708,
 2069,
 2005,
 5857,
 3469,
 2084,
 1018,
 1003,
 1012,
 1996,
 2034,
 4654,
 26243,
 3370,
 1010,
 2004,
 10174,
 2011,
 2169,
 1997,
 1996,
 8360,
 2005,
 2035,
 2093,
 5090,
 1010,
 4415,
 10637,
 1037,
 2309,
 1011,
 9563,
 2839,
 1012,
 1999,
 20965,
 1012,
 1021,
 1010,
 2057,
 2031,
 27347,
 1996,
 11163,
 26210,
 12172,
 2015,
 1006,
 11163,
 10175,
 5657,
 1027,
 1014,
 1012,
 6185,
 1007,
 1