In [1]:
import json
import os
import random
import re
import sys

sys.path.append('../..')

import regex
import numpy as np
from tqdm.auto import tqdm
from chemdataextractor.doc import Paragraph
from seqlbtoolkit.Data import txt_to_token_span, span_dict_to_list
from seqlbtoolkit.Utils import remove_combining_marks, format_text
from textspan import align_spans

from transformers import AutoTokenizer

random.seed(42)

In [9]:
text = "Miscibility in blends involving copolymers of styrene and acrylonitrile (SAN's) has been an interesting subject of diversified studies. Most notably, the miscibility of SAN with poly(methyl methacrylate) (PMMA) and with poly(ε-caprolactone) (PCL) has been widely reported since 1974. Miscibility in SAN with acrylic polymers other than PMMA has not been reported until lately (1991) when Kishore et al. and Mandal et al. independently and simultaneously reported miscibility of poly(phenyl acrylate) with SAN's (15−35 or 11.5−32 wt % AN, respectively). In many reported miscible binary blend systems whose constituents involve at least one copolymer, it has been commonly observed that miscibility occurs in a range of copolymer compositions. This phenomenon has been attributed to the so-called “copolymer effect” by some investigators. The “copolymer effect” suggests that mutual repulsion between the constituents (copolymer units) of the copolymer prevails over other interactions and that the repulsion in the copolymer leads to miscibility in the homopolymer−copolymer pair."

# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
# NOTE(review): the same checkpoint is loaded again in the later cell that
# defines the carrageenan sample text — one of the two loads looks redundant; confirm.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [6]:
# Read the annotation file: JSON Lines, one JSON record per line.
data_file = '../data/polymer_names/multimodal_annos.jsonl'
with open(data_file, 'r', encoding='utf-8') as f:
    # Iterating the handle yields the same lines readlines() would return.
    json_ls = list(map(json.loads, f))

In [14]:
# Turn every annotated record into per-sentence token lists plus token-level
# entity spans.  The two output lists are index-aligned: entry i of spans_list
# belongs to entry i of tokens_list.
tokens_list = []
spans_list = []

for record in tqdm(json_ls):
    raw_text = record['text']
    clean_text = format_text(raw_text)
    annos = record['spans']

    # Map the annotated character offsets from the raw text onto the
    # formatted text.
    raw_offsets = [(anno['start'], anno['end']) for anno in annos]
    mapped = align_spans(raw_offsets, raw_text, clean_text)

    char_labels = {}
    for pieces, anno in zip(mapped, annos):
        # Each annotation must map to at least one piece in the formatted text.
        assert len(pieces) >= 1
        # Collapse the aligned pieces into a single span: first start .. last end.
        char_labels[(pieces[0][0], pieces[-1][1])] = anno['label']

    for sent in Paragraph(clean_text).sentences:
        words = [tk.text for tk in sent.tokens]
        # Keep only spans lying entirely inside this sentence and re-base them
        # to sentence-local character offsets.  A span that crosses a sentence
        # boundary matches no sentence and is therefore dropped.
        local_spans = {
            (start - sent.start, end - sent.start): label
            for (start, end), label in char_labels.items()
            if sent.start <= start and end <= sent.end
        }
        token_spans = txt_to_token_span(words, sent.text, local_spans)

        tokens_list.append(words)
        spans_list.append(token_spans)

100%|██████████| 702/702 [00:07<00:00, 88.37it/s] 


In [55]:
f_tokens_list = []
f_spans_list = []

# only keep the POLYMER entities
# NOTE(review): the split cell further down indexes tokens_list / spans_list,
# not these filtered lists — confirm which pair is meant to be exported.
for sent_tokens, sent_spans in zip(tokens_list, spans_list):
    polymer_spans = {
        (start, end): label
        for (start, end), label in sent_spans.items()
        if label == 'POLYMER'
    }
    # Sentences with a POLYMER span are always kept; the others are kept with
    # probability 0.15.  `or` short-circuits, so a random number is drawn only
    # for sentences without a POLYMER span.
    if polymer_spans or random.random() < 0.15:
        f_tokens_list.append(sent_tokens)
        f_spans_list.append(polymer_spans)

In [56]:
# Number of sentences kept by the POLYMER filtering step.
len(f_tokens_list)

2372

In [35]:
# Tokens of the first kept sentence.
f_tokens_list[0]

['Modification',
 'of',
 'sulfonated',
 'poly(ether',
 'ether',
 'ketone',
 ')',
 '(',
 'SPEEK',
 ')',
 'membrane',
 'was',
 'attempted',
 'by',
 'blending',
 'charged',
 'surface',
 'modifying',
 'macromolecule',
 '(',
 'cSMM',
 ')',
 '.']

In [31]:
# Entity spans of the first kept sentence, keyed by (start, end) token indices;
# in the displayed output the end index is exclusive (e.g. (8, 9) covers token 8 only).
f_spans_list[0]

{(3, 7): 'POLYMER', (8, 9): 'POLYMER'}

In [5]:
# Shuffle the sentence indices and cut them 70% / 15% / 15% into
# train / validation / test; each boundary is rounded up.
# NOTE(review): the indices range over tokens_list, not f_tokens_list — confirm.
inst_ids = list(range(len(tokens_list)))
random.shuffle(inst_ids)

n_inst = len(inst_ids)
train_end = int(np.ceil(0.7 * n_inst))
valid_end = int(np.ceil(0.85 * n_inst))

train_ids = inst_ids[:train_end]
valid_ids = inst_ids[train_end:valid_end]
test_ids = inst_ids[valid_end:]

In [6]:
def _to_instances(ids):
    """Build one {'text': tokens, 'label': span list} record per sentence index.

    Reads the notebook-level tokens_list / spans_list and preserves the order
    of ``ids``.
    """
    return [
        {'text': tokens_list[i], 'label': span_dict_to_list(spans_list[i])}
        for i in ids
    ]


# For every split: the ordered record list, and the same records keyed by
# their position within the split.
train_list = _to_instances(train_ids)
train_dict = dict(enumerate(train_list))

valid_list = _to_instances(valid_ids)
valid_dict = dict(enumerate(valid_list))

test_list = _to_instances(test_ids)
test_dict = dict(enumerate(test_list))

# Write each split to its own JSON file under ../data/pet.
# Create the output directory first: open(..., 'w') does not create missing
# parent directories, so a fresh checkout would fail with FileNotFoundError.
os.makedirs('../data/pet', exist_ok=True)

# The integer keys of the split dicts are written as JSON strings by json.dump.
for out_path, split_dict in (
    ('../data/pet/train.json', train_dict),
    ('../data/pet/valid.json', valid_dict),
    ('../data/pet/test.json', test_dict),
):
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(split_dict, f, ensure_ascii=False, indent=2)

In [7]:
# Sentence-length statistics (in tokens) over every sentence in tokens_list.
sent_lens = list(map(len, tokens_list))
max_len, mean_len = np.max(sent_lens), np.mean(sent_lens)

# Dataset summary written next to the split files.
# NOTE(review): 'entity_types' and 'num_labels' are hard-coded; the filter cell
# above matches the label 'POLYMER', which is not in this list — confirm both
# values against the labels that are actually exported.
meta_info = {
    'entity_types': ['pol_IUPAC', 'pol_traditional', 'pol_acronym'],
    'train_size': len(train_ids),
    'valid_size': len(valid_ids),
    'test_size': len(test_ids),
    # numpy results are cast to plain Python int / float before serialising
    'max_length': int(max_len),
    'mean_length': float(mean_len),
    'num_labels': 5
}
with open('../data/pet/meta.json', 'w', encoding='utf-8') as f:
    json.dump(meta_info, f, ensure_ascii=False, indent=2)

In [11]:
# Sample sentence fragment containing non-ASCII characters (prime, Greek iota)
# used to inspect the tokenizer output in the next cell.
text = 'us to assign them to the repeating carrabiose 2,4′-disulfate of ι-carrageenan.'
# NOTE(review): an earlier cell already loads this same checkpoint into
# `tokenizer`; this reload looks redundant — confirm before removing.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [12]:
# Show how the tokenizer splits the sample text into sub-word pieces.
tokenizer.tokenize(text)

['us',
 'to',
 'assign',
 'them',
 'to',
 'the',
 'repeating',
 'carr',
 '##abi',
 '##ose',
 '2',
 ',',
 '4',
 '′',
 '-',
 'di',
 '##sul',
 '##fat',
 '##e',
 'of',
 'ι',
 '-',
 'carr',
 '##age',
 '##ena',
 '##n',
 '.']

In [8]:
# Scratch check: length of the plain-ASCII spelling, as a baseline for the
# accented spelling measured in the next cell.
len('sao')

3

In [9]:
len('São')

4

In [10]:
# Scratch check: str.split keeps the empty strings around leading, trailing and
# adjacent separators, so two separators yield three empty pieces.
'^ssp^ssp'.split('^ssp')

['', '', '']

In [13]:
txt = 'The number‐average molecular weight (Mn) and polydispersity ratio ­(Mw/Mn) were estimated on the basis of a polystyrene calibration.'
# Run the sample sentence through format_text and keep the result for inspection.
# NOTE(review): the sample appears to be chosen for its non-ASCII hyphen
# characters — confirm what format_text is expected to do with them.
new_txt = format_text(txt)