In [84]:
import torch
import json
import logging
import numpy as np
import itertools
import operator
from lstm_crf.dataset import Dataset
from lstm_crf.args import Config
from transformers import AutoTokenizer, AutoModel
from nltk.tokenize import word_tokenize, sent_tokenize

from seqlbtoolkit.text import split_overlength_bert_input_sequence
from seqlbtoolkit.data import merge_list_of_lists
from seqlbtoolkit.io import save_json

# Logger for this notebook, keyed by the current module name.
logger = logging.getLogger(name=__name__)

In [None]:
# Parse the training JSON file into `data`; the file handle is closed as soon
# as its contents have been read.
with open('./data/MSIE/train.json', mode='r', encoding='utf-8') as f:
    data = json.loads(f.read())

In [None]:
# Take the 'text' field (nested under 'data') of the record stored under key '0'.
# Presumably a list of token strings, since later cells ' '.join() the same field
# of other records -- TODO confirm against the dataset schema.
tks = data['0']['data']['text']

In [None]:
# Load the pretrained tokenizer for the 'distilbert-base-uncased' checkpoint.
# The cells below use tokenizer.tokenize(...) to measure sequence lengths.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# Run the library splitter over every record. The id is printed before each
# call, so the last number shown identifies the record being processed if the
# call raises. Assumes the record keys are the stringified indices
# '0' .. str(len(data) - 1). The splitter's return value is not kept.
for data_id in range(len(data)):
    print(data_id)
    tks = data[str(data_id)]['data']['text']
    split_overlength_bert_input_sequence(tks, tokenizer)


In [25]:
# Use the record stored under key '38' as the working example for the
# splitting experiments in the following cells.
ori_tks = data['38']['data']['text']


In [26]:
# Re-tokenize the example record with NLTK: join its original tokens with single
# spaces and run word_tokenize on the result.
# NOTE: this overwrites the `tks` assigned by earlier cells.
tks = word_tokenize(' '.join(ori_tks))

In [30]:
# Prototype 1: recursively halve an over-length token sequence at sentence
# boundaries until every piece tokenizes to fewer than `max_seq_length`
# sub-word tokens (special tokens included in the count).
#
# Inputs (from earlier cells): `tks` (list of word tokens), `tokenizer`.
# Outputs: `sent_tks_list` (list of token lists whose concatenation is `tks`)
#          and `bert_length_list` (their sub-word lengths).
# Raises:  ValueError if an over-length piece has no sentence boundary strictly
#          inside it (previously this case looped forever, because the "second
#          half" was empty and the first half was never shortened).
max_seq_length = 512

# Start with the whole document as one piece. If it is already short enough the
# loop body never runs and `sent_tks_list` is simply `[tks]`.
sent_tks_list = [tks]
bert_length_list = [len(tokenizer.tokenize(' '.join(t), add_special_tokens=True)) for t in sent_tks_list]

while (np.asarray(bert_length_list) >= max_seq_length).any():
    sep_sent_tks_list = list()

    for tks_list, bert_len in zip(sent_tks_list, bert_length_list):

        # pieces that already fit are carried over unchanged
        if bert_len < max_seq_length:
            sep_sent_tks_list.append(tks_list)
            continue

        # Word count of each sentence in this piece. The cumulative sums are the
        # candidate cut positions, expressed as indices into `tks_list`.
        # NOTE(review): this assumes re-tokenizing the joined text reproduces
        # the token count of `tks_list` -- confirm for the tokenizer in use.
        sent_lens = [len(word_tokenize(sep_sent)) for sep_sent in sent_tokenize(' '.join(tks_list))]
        end_ids = np.cumsum(sent_lens, dtype=int)

        # Only cuts strictly inside the piece make progress: a cut at 0 or at or
        # past the end would leave one half empty and the other unchanged.
        interior_end_ids = end_ids[(end_ids > 0) & (end_ids < len(tks_list))]
        if interior_end_ids.size == 0:
            raise ValueError(
                f'Cannot split an over-length segment of {len(tks_list)} tokens '
                f'({bert_len} sub-word tokens) at a sentence boundary'
            )

        # try to separate sentences as evenly as possible
        halfway_idx = np.argmin((interior_end_ids - len(tks_list) / 2) ** 2)
        split_at = int(interior_end_ids[halfway_idx])
        sep_sent_tks_list.append(tks_list[:split_at])  # split 1
        sep_sent_tks_list.append(tks_list[split_at:])  # split 2

    sent_tks_list = sep_sent_tks_list
    bert_length_list = [len(tokenizer.tokenize(' '.join(t), add_special_tokens=True)) for t in sent_tks_list]

    # progress trace: sub-word lengths after each round of splitting
    print(bert_length_list)

# Splitting must neither drop nor duplicate tokens.
sent_lengths = [len(s) for s in sent_tks_list]
assert np.sum(sent_lengths) == len(tks), \
    f'Text splitting failed: {tks} ---> {sent_tks_list}'



[2506, 2306]
[1193, 1315, 1109, 1199]
[612, 583, 736, 581, 608, 503, 597, 604]
[309, 305, 286, 299, 550, 188, 300, 283, 316, 294, 503, 344, 255, 268, 338]
[309, 305, 286, 299, 137, 415, 188, 300, 283, 316, 294, 503, 344, 255, 268, 338]


In [43]:
# Sentence-split the space-joined tokens with NLTK's sent_tokenize.
# `sent_list` is not referenced by any other cell visible in this notebook.
sent_list = sent_tokenize(' '.join(tks))

In [81]:
# Prototype 2: split at sentence granularity. Each sentence's sub-word length is
# computed once; segments are then described by `split_points` (indices into
# `tks_seq_list`) and any segment that is too long is cut at the sentence
# boundary closest to its midpoint (measured in sub-word tokens).
#
# Inputs (from earlier cells): `tks`, `tokenizer`, `max_seq_length`.
# Outputs: `split_points` (sorted sentence indices, including 0 and the total)
#          and `split_tks_seq_list` (one list of tokenized sentences per segment).
# Raises:  ValueError if an over-length segment consists of a single sentence
#          and therefore cannot be split at a sentence boundary.
#
# NOTE(review): each sentence is measured with add_special_tokens=True, so a
# segment's summed length presumably counts the special tokens once per
# sentence and over-estimates the true length -- confirm.
tks_seq_list = [word_tokenize(sent) for sent in sent_tokenize(' '.join(tks))]
seq_bert_len_list = [len(tokenizer.tokenize(' '.join(tks_seq), add_special_tokens=True)) for tks_seq in tks_seq_list]
split_points = [0, len(tks_seq_list)]
split_bert_lens = [sum(seq_bert_len_list[split_points[i]:split_points[i+1]]) for i in range(len(split_points)-1)]

while (np.asarray(split_bert_lens) >= max_seq_length).any():

    new_split_points = list()
    for idx, bert_len in enumerate(split_bert_lens):
        print(idx, bert_len)

        # Same threshold as the `while` condition. The original used `>` here,
        # so a segment of exactly `max_seq_length` kept the loop alive without
        # ever being split.
        if bert_len < max_seq_length:
            continue

        seq_bert_len_sub_list = seq_bert_len_list[split_points[idx]:split_points[idx+1]]
        if len(seq_bert_len_sub_list) < 2:
            raise ValueError(
                f'Cannot split segment {idx}: a single sentence has {bert_len} '
                f'sub-word tokens (limit {max_seq_length})'
            )

        seq_bert_len_sub_accu_list = list(itertools.accumulate(seq_bert_len_sub_list, operator.add))
        print(f"bert len: {bert_len}, accu list: {seq_bert_len_sub_accu_list}")

        # Try to separate sentences as evenly as possible. The last cumulative
        # value is the segment end itself; choosing it would only duplicate an
        # existing split point, so it is excluded from the candidates.
        split_offset = int(np.argmin((np.array(seq_bert_len_sub_accu_list[:-1]) - bert_len / 2) ** 2))
        new_split_points.append(split_offset + split_points[idx] + 1)
        print(f"split offset: {split_offset}")
    print(f"new split points: {new_split_points}")

    split_points += new_split_points
    split_points.sort()

    split_bert_lens = [sum(seq_bert_len_list[split_points[i]:split_points[i+1]]) for i in range(len(split_points)-1)]
    print(f"split bert lengths: {split_bert_lens}")

split_tks_seq_list = [tks_seq_list[split_points[i]:split_points[i+1]] for i in range(len(split_points)-1)]



0 4984
bert len: 4984, accu list: [82, 156, 183, 287, 317, 355, 402, 504, 585, 630, 727, 788, 832, 922, 963, 1021, 1089, 1117, 1182, 1191, 1224, 1235, 1257, 1288, 1329, 1378, 1793, 1867, 1936, 1985, 2039, 2082, 2133, 2167, 2227, 2235, 2297, 2357, 2458, 2505, 2586, 2711, 2749, 2830, 2859, 2910, 2948, 2973, 3003, 3036, 3106, 3214, 3269, 3281, 3293, 3364, 3368, 3390, 3501, 3570, 3631, 3635, 3676, 3736, 3741, 3803, 3868, 3932, 3940, 4093, 4142, 4193, 4228, 4267, 4290, 4309, 4360, 4422, 4479, 4556, 4599, 4636, 4650, 4727, 4821, 4908, 4931, 4984]
split offset: 39
new split points: [40]
split bert lengths: [2505, 2479]
0 2505
bert len: 2505, accu list: [82, 156, 183, 287, 317, 355, 402, 504, 585, 630, 727, 788, 832, 922, 963, 1021, 1089, 1117, 1182, 1191, 1224, 1235, 1257, 1288, 1329, 1378, 1793, 1867, 1936, 1985, 2039, 2082, 2133, 2167, 2227, 2235, 2297, 2357, 2458, 2505]
split offset: 22
1 2479
bert len: 2479, accu list: [81, 206, 244, 325, 354, 405, 443, 468, 498, 531, 601, 709, 764, 776, 

In [32]:
from chemdataextractor.nlp.tokenize import ChemWordTokenizer

In [33]:
# Instantiate ChemDataExtractor's ChemWordTokenizer with its default arguments.
cwt = ChemWordTokenizer()

In [34]:
# Tokenize the space-joined original tokens of the example record with
# ChemWordTokenizer and display the resulting token list.
cwt.tokenize(' '.join(ori_tks))

['Three',
 'donor',
 '-',
 'acceptor',
 'type',
 'of',
 'narrow',
 'band',
 'gap',
 'conjugated',
 'polymers',
 'with',
 'enlarged',
 'coplanar',
 'skeleton',
 'were',
 'synthesized',
 'via',
 'Stille',
 'copolymerization',
 'of',
 'indacenodithiophene',
 '(',
 'IDT',
 ')',
 'and',
 'naphtho',
 '[1,2-c:5,6-c]bis(1,2,5-thiadiazole)',
 '(',
 'NT',
 ')',
 'based',
 'monomers.',
 'The',
 'energy',
 'levels',
 ',',
 'absorption',
 'spectra',
 'and',
 'band',
 'gaps',
 'of',
 'the',
 'resulting',
 'polymers',
 'were',
 'well',
 'tuned',
 'by',
 'utilizing',
 'different',
 'thiophene',
 'derivatives',
 'as',
 'spacer',
 'between',
 'IDT',
 'and',
 'NT',
 'units',
 ',',
 'and',
 'polymer',
 'PIDT-C12NT',
 'which',
 'employed',
 'bithiophene',
 'attached',
 'with',
 'dodecyl',
 'side',
 'chain',
 'as',
 'spacer',
 'exhibited',
 'superior',
 'properties',
 'compared',
 'to',
 'the',
 'other',
 'two',
 'copolymers.',
 'All',
 'polymers',
 'exhibited',
 'deep',
 'highest',
 'occupied',
 'molecular

In [35]:
from chemdataextractor.doc import Paragraph

In [36]:
# Wrap the same space-joined text in a ChemDataExtractor Paragraph; the cells
# below read its sentences and tokens.
para = Paragraph(' '.join(ori_tks))

In [39]:
# Display the paragraph's sentences as a list of raw strings.
para.raw_sentences

['Three donor - acceptor type of narrow band gap conjugated polymers with enlarged coplanar skeleton were synthesized via Stille copolymerization of indacenodithiophene ( IDT ) and naphtho [1,2-c:5,6-c]bis(1,2,5-thiadiazole) ( NT ) based monomers.',
 'The energy levels , absorption spectra and band gaps of the resulting polymers were well tuned by utilizing different thiophene derivatives as spacer between IDT and NT units , and polymer PIDT-C12NT which employed bithiophene attached with dodecyl side chain as spacer exhibited superior properties compared to the other two copolymers.',
 'All polymers exhibited deep highest occupied molecular orbital energy levels and subsequently lead to high open circuit voltages of fabricated solar cell devices.',
 'Best performance of the bulk - heterojunction solar cells with a power conversion efficiency of 5.05% was achieved with PIDT-C12NT as donor and (6,6)-phenyl-C 71 -butyric acid methyl ester ( PC71BM ) as acceptor , which can be attributed t

In [93]:
# Replace `tks` with the Paragraph's tokens. The following cells index it
# (tks[0]) and flatten it with merge_list_of_lists, which suggests one token
# list per sentence -- TODO confirm.
# NOTE: this overwrites the `tks` used by the splitting cells above.
tks = para.tokens

In [97]:
# Length of the first entry of `tks`.
len(tks[0])

34

In [87]:
# Length of `tks` after passing it through merge_list_of_lists
# (presumably a one-level flatten -- verify against seqlbtoolkit).
len(merge_list_of_lists(tks))

3318

In [92]:
# Same count, built from each sentence's own `tokens`; serves as a cross-check
# against the value obtained from `para.tokens` in the previous cell.
len(merge_list_of_lists([sent.tokens for sent in para.sentences]))

3318

In [96]:
# `end` attribute of the first sentence -- presumably its end character offset
# within the paragraph text; TODO confirm against ChemDataExtractor's docs.
para.sentences[0].end

245