In [1]:
import sys

sys.path.insert(0, '/home/feedback/working/feedback/models_training/longformer/sumbission/codes/new_transformers_branch/transformers/src')

In [2]:
import re
from collections import Counter
from collections import deque
from glob import glob

import pandas as pd
import numpy as np

import h5py
import dill as pickle
from tqdm import tqdm

import spacy
from spacy import displacy

import torch
from transformers import DebertaV2TokenizerFast

In [3]:
tokenizer = DebertaV2TokenizerFast.from_pretrained('microsoft/deberta-v3-large')
tokenizer.model_max_length = 2048

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
data = pd.read_csv('../../../feedback-prize-2021/train.csv')
data.loc[data.discourse_id==1623258656795.0, 'discourse_text'] =  data.loc[data.discourse_id==1623258656795.0, 
                                                                           'discourse_text'].map(lambda x: x.replace('florida', 'LOCATION_NAME')).values

In [5]:
label_names = ['None', 'Lead', 'Position', 'Evidence', 'Claim',
               'Concluding Statement', 'Counterclaim', 'Rebuttal']

In [6]:
colors = {
            'Lead': '#8000ff',
            'Position': '#2b7ff6',
            'Evidence': '#2adddd',
            'Claim': '#80ffb4',
            'Concluding Statement': 'd4dd80',
            'Counterclaim': '#ff8042',
            'Rebuttal': '#ff0000'
         }
options = {"ents": list(colors.keys()), "colors": colors}

In [7]:
token_maps = dict(zip(colors, range(1, 2 * len(colors), 2)))

In [8]:
token_maps

{'Lead': 1,
 'Position': 3,
 'Evidence': 5,
 'Claim': 7,
 'Concluding Statement': 9,
 'Counterclaim': 11,
 'Rebuttal': 13}

In [9]:
def make_more_targets(targets):
    linkage = np.zeros((len(targets), 2), 'f4')
    class_index = np.zeros((len(targets),), 'f4')
    linkage_mask = np.ones((len(targets),), 'f4')
    
    current_target = -2
    for ix in range(1, len(targets) - 1):
        if ((current_target % 2 == 0 and current_target == targets[ix]) 
              or (targets[ix] == current_target + 1 and current_target % 2 == 1)):
            linkage[ix - 1, 1] = 1
            linkage[ix, 0] = 1
        
        current_target = targets[ix]
        class_index[:] = [x // 2 for x in targets + 1]
    
    link_sums = (linkage * np.array([2, 1])).sum(-1).astype('i4')
    
    bi =  np.zeros((len(targets), 2), 'f4')
    bi[link_sums < 2, 0] = 1
    bi[link_sums >= 2, 1] = 1
    
    bio = np.array(bi)
    bio[targets == 0] = 0
    
    bies =  np.zeros((len(targets), 4), 'f4')
    bies[:, :2] = bi
    bies[link_sums == 0] = (0, 0, 0, 1)
    bies[link_sums == 2] = (0, 0, 1, 0)
    
    bieso = np.array(bies)
    bieso[targets==0] = 0
    
    return class_index, bi, bio, bies, bieso

In [10]:
def combine_labels(class_index, bi, bio, bies, bieso):
    
    combined_bi = class_index * 2 + bi[:, 0]
    combined_bies = class_index * 4 + bies @ np.array([0, 1, 2, 3])
    
    non_o_index = np.where(class_index != 0)[0]
    
    combined_bieso = np.array(class_index)
    combined_bieso[non_o_index] = (class_index[non_o_index] - 1) * 4 + bieso[non_o_index] @ np.array([1, 2, 3, 4])
    
    combined_bio = np.array(class_index)
    combined_bio[non_o_index] = (class_index[non_o_index] - 1) * 2 + bio[non_o_index] @ np.array([1, 2])
    
    return combined_bi, combined_bio, combined_bies, combined_bieso

In [11]:
# add +1 num text to fill the last text with 0, reason? I don't know why
num_texts = len(glob('../../../feedback-prize-2021/train/*.txt')) + 1

In [12]:
def make_one_hot(indices, num_labels):
    array = np.zeros((len(indices), num_labels))
    array[np.arange(len(indices)), indices.astype('i4')] = 1

    return array

## create dataset

- [🔥Transformer: tokenize a text [for beginner]🌱](tokenizer_outshttps://www.kaggle.com/sytuannguyen/transformer-tokenize-a-text-for-beginner)

In [14]:
# deverta3 newline tokens is gone?
fix_text = lambda x: x.replace('\n', '‽')

regexp = re.compile('[0-9a-zA-z]')

52

### Token labeling version 1

In [58]:
id_to_ix_map = {}
broken_indices = []

for filename_ix, filename in tqdm(enumerate(glob('../../../feedback-prize-2021/train/*.txt')), total=num_texts - 1):
    
    # read the textfile by ID
    ID = filename.split('/')[-1].split('.')[0]
    with open(filename) as f:
        text = fix_text(f.read().strip())
    
    # convert text to token
    # -------------------------------------------------------------------------------------------------------------
    # return_offsets_mapping: (optional) Set to True to return (char_start, char_end) for each token (default False).
    #                         If using Python's tokenizer, this method will raise NotImplementedError.
    #                         This one is only available on
    # -------------------------------------------------------------------------------------------------------------
    tokenizer_outs = tokenizer(text, return_offsets_mapping=True)
    tokenizer_outs['input_ids'] = [input_id if input_id != 126861 else 128000 for input_id in tokenizer_outs['input_ids']]
    
    # get text meta information from specific ID
    id_df = data.loc[data.id == ID].sort_values('discourse_start')

    ent_boundaries = []
    pointer = 0
    
    total_pred_n = 0
    for row_id, row in id_df.iterrows():
        total_pred_n += len(row.predictionstring.split(' '))
        
        # fix_text = lambda x: x.replace('\n', '‽')
        entity_text = fix_text(row.discourse_text.strip())

        # regex to find text start with alphanumeric (a-zA-Z0-9)
        entity_text = entity_text[next(regexp.finditer(entity_text)).start():]
        
        # if the first character length is 1, then check the previous text chunk
        if len(entity_text.split()[0]) == 1 and pointer != 0:
            entity_start_ix = text[pointer:].index(entity_text)
            prev_text = text[:pointer + entity_start_ix]
            
            # current text is not the beginning and the previous text last char is alphanumeric
            if pointer + entity_start_ix > 0 and prev_text[-1].isalpha():
                broken_indices.append((filename_ix, ID))
                print('cut entity ', filename_ix, ID)
                cut_word_chunk_size = len(prev_text.split()[-1])
                
                # if the previous text last word length is not 1
                if cut_word_chunk_size > 1:
                    # TODO: add `:` at the end
                    # -------------------------------------------------------
                    # s it can tell me if I enjoy the club or activity that I signed up
                    # o if you would dear principal please consider policy number 1 over 2.
                    # t they could all come to the same conclusion.

                    # i
                    # i
                    # t
                    entity_text = entity_text[next(regexp.finditer(entity_text[1:])).start() + 1:]

        # TODO: what is the meaning of this?
        if row.discourse_id in (1620147556527.0, 1622983056026.0):
            pointer += 10

        offset = text[pointer:].index(entity_text)
        starts_at = offset + pointer
        ent_boundaries.append((starts_at, starts_at + len(entity_text), row.discourse_type))
        pointer = starts_at + len(entity_text)
        
    # [(0, 'Lead', 'start'), (174, 'Lead', 'end'), (176, 'Position', 'start'), (271, 'Position', 'end')]
    all_boundaries = deque([])
    for ent_boundary in ent_boundaries:
        for position, boundary_type in zip(ent_boundary[:2], ('start', 'end')):
            discourse_type = ent_boundary[-1]
            all_boundaries.append((position, discourse_type, boundary_type))
            
    current_target = 0
    targets = np.zeros(len(tokenizer_outs['input_ids']), 'i8')
    token_positions = np.vstack(tokenizer_outs['offset_mapping']).astype('i4')
    
#     print('token ids', tokenizer_outs['input_ids'])
#     print(f"token len {len(tokenizer_outs['input_ids'])}, boundary len {len(all_boundaries)}")
    
    for token_ix in range(len(tokenizer_outs['input_ids'])):
        token_start_ix, token_end_ix = tokenizer_outs['offset_mapping'][token_ix]
        
        cur_pos, cur_dis_type, cur_bound_type = all_boundaries[0]

        if token_end_ix != 0 \
           and (cur_bound_type == 'end' and token_end_ix >= cur_pos) \
           or (cur_bound_type == 'start' and token_end_ix > cur_pos):
            
            if len(all_boundaries) > 1:
                next_pos, next_dis_type, next_bound_type = all_boundaries[1]
            if cur_bound_type == 'start':
                # token map {'Lead': 1, 'Position': 3, ..., 'Rebuttal': 13}
                current_target = token_maps[cur_dis_type]
                targets[token_ix] = current_target
                
                if token_end_ix == next_pos:
                    current_target = 0
                    all_boundaries.popleft()
                else:
                    current_target += 1
            else:
                # If there is more entity left to consider and current is already on the next pos
                if len(all_boundaries) > 1 and token_end_ix > next_pos:
                    
                    # can this actually happen?
                    if token_start_ix >= next_pos:
                        assert text[cur_pos - 1] == '¨'

                    all_boundaries.popleft()
                    current_target = token_maps[cur_dis_type]
                    targets[token_ix] = current_target
                    current_target += 1
                else:
                    if token_start_ix >= cur_pos:
                        current_target = 0

                    targets[token_ix] = current_target
                    current_target = 0

            all_boundaries.popleft()
            if not all_boundaries:
                break
        else:
            targets[token_ix] = current_target

    print((targets > 0).sum())
    class_index, bi, bio, bies, bieso = make_more_targets(targets)
    combined_bi, combined_bio, combined_bies, combined_bieso = combine_labels(class_index, bi, bio, bies, bieso)
    assert (combined_bio[1:-1] == targets[1:-1]).all()
    num_tokens = len(targets)
    

  0% 2/15594 [00:00<18:14, 14.24it/s]

287
527
282
376


  0% 8/15594 [00:00<13:13, 19.64it/s]

276
178
308
374
577


  0% 12/15594 [00:00<18:15, 14.22it/s]

532
408
346
876


  0% 17/15594 [00:01<23:43, 10.94it/s]

637
290
289
303
266
1192


  0% 19/15594 [00:02<28:59,  8.96it/s]

844





KeyboardInterrupt: 

### Token labeling version 2

- [TensorFlow - LongFormer - NER - [CV 0.633]](https://www.kaggle.com/cdeotte/tensorflow-longformer-ner-cv-0-633)

In [73]:
def token_labeling(tokenizer_outs, ent_boundaries, token_maps):

    targets = np.zeros(len(tokenizer_outs['input_ids']), 'i8')

    offsets = tokenizer_outs['offset_mapping']
    offset_index = 0    
    for a, b, discourse_type in ent_boundaries:
        if offset_index > len(offsets) - 1:
            break

        c = offsets[offset_index][0]
        d = offsets[offset_index][1]
        beginning = True
        while b > c:
            if (c >= a) & (b >= d):
                if beginning:
                    targets[offset_index] = token_maps[discourse_type]
                    beginning = False
                else:
                    targets[offset_index] = token_maps[discourse_type] + 1

            offset_index += 1
            if offset_index > len(offsets) - 1:
                break

            c = offsets[offset_index][0]
            d = offsets[offset_index][1]
            
    return targets

In [103]:
id_to_ix_map = {}
broken_indices = []

for filename_ix, filename in tqdm(enumerate(glob('../../../feedback-prize-2021/train/*.txt')), total=num_texts - 1):
    
    # read the textfile by ID
    ID = filename.split('/')[-1].split('.')[0]
    with open(filename) as f:
        text = fix_text(f.read().strip())
    
    # convert text to token
    # -------------------------------------------------------------------------------------------------------------
    # return_offsets_mapping: (optional) Set to True to return (char_start, char_end) for each token (default False).
    #                         If using Python's tokenizer, this method will raise NotImplementedError.
    #                         This one is only available on
    # -------------------------------------------------------------------------------------------------------------
    tokenizer_outs = tokenizer(text, return_offsets_mapping=True)
    tokenizer_outs['input_ids'] = [input_id if input_id != 126861 else 128000 for input_id in tokenizer_outs['input_ids']]
    
    # get text meta information from specific ID
    id_df = data.loc[data.id == ID].sort_values('discourse_start')

    ent_boundaries = []
    pointer = 0
    
    total_pred_n = 0
    for row_id, row in id_df.iterrows():
        total_pred_n += len(row.predictionstring.split(' '))
        
        # fix_text = lambda x: x.replace('\n', '‽')
        entity_text = fix_text(row.discourse_text.strip())

        # regex to find text start with alphanumeric (a-zA-Z0-9)
        entity_text = entity_text[next(regexp.finditer(entity_text)).start():]
        
        # if the first character length is 1, then check the previous text chunk
        if len(entity_text.split()[0]) == 1 and pointer != 0:
            entity_start_ix = text[pointer:].index(entity_text)
            prev_text = text[:pointer + entity_start_ix]
            
            # current text is not the beginning and the previous text last char is alphanumeric
            if pointer + entity_start_ix > 0 and prev_text[-1].isalpha():
                broken_indices.append((filename_ix, ID))
                print('cut entity ', filename_ix, ID)
                cut_word_chunk_size = len(prev_text.split()[-1])
                
                # if the previous text last word length is not 1
                if cut_word_chunk_size > 1:
                    entity_text = entity_text[next(regexp.finditer(entity_text[1:])).start() + 1:]

        # TODO: what is the meaning of this?
        if row.discourse_id in (1620147556527.0, 1622983056026.0):
            pointer += 10

        offset = text[pointer:].index(entity_text)
        starts_at = offset + pointer
        ent_boundaries.append((starts_at, starts_at + len(entity_text), row.discourse_type))
        pointer = starts_at + len(entity_text)
        
    # [(0, 'Lead', 'start'), (174, 'Lead', 'end'), (176, 'Position', 'start'), (271, 'Position', 'end')]
    all_boundaries = deque([])
    for ent_boundary in ent_boundaries:
        for position, boundary_type in zip(ent_boundary[:2], ('start', 'end')):
            discourse_type = ent_boundary[-1]
            all_boundaries.append((position, discourse_type, boundary_type))
            
    current_target = 0
    targets = np.zeros(len(tokenizer_outs['input_ids']), 'i8')
    token_positions = np.vstack(tokenizer_outs['offset_mapping']).astype('i4')
    
#     print('token ids', tokenizer_outs['input_ids'])
#     print(f"token len {len(tokenizer_outs['input_ids'])}, boundary len {len(all_boundaries)}")
    
    for token_ix in range(len(tokenizer_outs['input_ids'])):
        token_start_ix, token_end_ix = tokenizer_outs['offset_mapping'][token_ix]
        
        cur_pos, cur_dis_type, cur_bound_type = all_boundaries[0]

        if token_end_ix != 0 \
           and (cur_bound_type == 'end' and token_end_ix >= cur_pos) \
           or (cur_bound_type == 'start' and token_end_ix > cur_pos):
            
            if len(all_boundaries) > 1:
                next_pos, next_dis_type, next_bound_type = all_boundaries[1]
            if cur_bound_type == 'start':
                # token map {'Lead': 1, 'Position': 3, ..., 'Rebuttal': 13}
                current_target = token_maps[cur_dis_type]
                targets[token_ix] = current_target
                
                if token_end_ix == next_pos:
                    current_target = 0
                    all_boundaries.popleft()
                else:
                    current_target += 1
            else:
                # If there is more entity left to consider and current is already on the next pos
                if len(all_boundaries) > 1 and token_end_ix > next_pos:
                    
                    # can this actually happen?
                    if token_start_ix >= next_pos:
                        assert text[cur_pos - 1] == '¨'

                    all_boundaries.popleft()
                    current_target = token_maps[cur_dis_type]
                    targets[token_ix] = current_target
                    current_target += 1
                else:
                    if token_start_ix >= cur_pos:
                        current_target = 0

                    targets[token_ix] = current_target
                    current_target = 0

            all_boundaries.popleft()
            if not all_boundaries:
                break
        else:
            targets[token_ix] = current_target

    print('sergei version', (targets > 0).sum())
#     print('sergei version', targets)
            
    targets = token_labeling(tokenizer_outs, ent_boundaries, token_maps)
    print('chris version', (targets > 0).sum())
#     print('chris version', targets)
    print('-' * 30)
    
    class_index, bi, bio, bies, bieso = make_more_targets(targets)
    combined_bi, combined_bio, combined_bies, combined_bieso = combine_labels(class_index, bi, bio, bies, bieso)
    assert (combined_bio[1:-1] == targets[1:-1]).all()
    num_tokens = len(targets)
    
    break
    

  0% 0/15594 [00:00<?, ?it/s]

sergei version 287
chris version 285
------------------------------





AssertionError: 

In [104]:
combined_bio[1:-1]

array([ 1.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        0.,  0.,  3.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,
        4.,  4.,  4.,  0.,  7.,  8.,  8.,  8.,  8.,  8.,  8.,  8.,  8.,
        8.,  8.,  8.,  8.,  0.,  5.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,
        6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,
        6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,
        6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,
        6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,
        6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,
        6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,
        6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,
        6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6

In [105]:
targets[1:-1]

array([ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  0,  0,  3,  4,  4,  4,  4,  4,  4,  4,  4,  4,
        4,  4,  4,  4,  0,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
        8,  0,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  0,  0,
        9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10

In [85]:
linkage = np.zeros((len(targets), 2), 'f4')
class_index = np.zeros((len(targets),), 'f4')
linkage_mask = np.ones((len(targets),), 'f4')

In [87]:
-2 % 2

0

In [93]:
targets

array([ 3,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
        4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  0,
        5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  0,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,
        8,  8,  8,  8,  8,  8,  8,  8,  8,  0,  5,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  0,  0,  9, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10,  0])

In [94]:
class_index

array([2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 0.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 0., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
       4., 4., 4., 4., 4., 4., 4., 4., 4., 0., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 0., 0., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 0.], dtype=float32)

In [88]:
current_target = -2
for ix in range(1, len(targets) - 1):
    # 1. its the token included inside entity but not the first
    # 2. token right after the first token inside the entity 
    if ((current_target % 2 == 0 and current_target == targets[ix])
          or (targets[ix] == current_target + 1 and current_target % 2 == 1)):
        linkage[ix - 1, 1] = 1
        linkage[ix, 0] = 1

    current_target = targets[ix]
    
    # literally class?
    class_index[:] = [x // 2 for x in targets + 1]

link_sums = (linkage * np.array([2, 1])).sum(-1).astype('i4')

bi = np.zeros((len(targets), 2), 'f4')
bi[link_sums < 2, 0] = 1
bi[link_sums >= 2, 1] = 1

bio = np.array(bi)
bio[targets == 0] = 0

bies =  np.zeros((len(targets), 4), 'f4')
bies[:, :2] = bi
bies[link_sums == 0] = (0, 0, 0, 1)
bies[link_sums == 2] = (0, 0, 1, 0)

bieso = np.array(bies)
bieso[targets==0] = 0


In [92]:
targets

array([ 3,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
        4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  0,
        5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  0,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,
        8,  8,  8,  8,  8,  8,  8,  8,  8,  0,  5,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  0,  0,  9, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10,  0])

In [98]:
link_sums

array([0, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 0, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 0, 1,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 0, 1, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 2, 1, 2, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 0], dtype=int32)

In [89]:
link_sums

array([0, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 0, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 0, 1,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 0, 1, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 2, 1, 2, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 0], dtype=int32)

### Original Sergei Code vs Chris

- [TensorFlow - LongFormer - NER - [CV 0.633]](https://www.kaggle.com/cdeotte/tensorflow-longformer-ner-cv-0-633)

In [84]:
id_to_ix_map = {}
broken_indices = []

for filename_ix, filename in tqdm(enumerate(glob('../../../feedback-prize-2021/train/*.txt')), total=num_texts - 1):
    
    # read the textfile by ID
    ID = filename.split('/')[-1].split('.')[0]
    with open(filename) as f:
        text = fix_text(f.read().strip())
    
    # convert text to token
    # -------------------------------------------------------------------------------------------------------------
    # return_offsets_mapping: (optional) Set to True to return (char_start, char_end) for each token (default False).
    #                         If using Python's tokenizer, this method will raise NotImplementedError.
    #                         This one is only available on
    # -------------------------------------------------------------------------------------------------------------
    tokenizer_outs = tokenizer(text, return_offsets_mapping=True)
    tokenizer_outs['input_ids'] = [input_id if input_id != 126861 else 128000 for input_id in tokenizer_outs['input_ids']]
    
    # get text meta information from specific ID
    id_df = data.loc[data.id == ID].sort_values('discourse_start')

    ent_boundaries = []
    pointer = 0
    
    total_pred_n = 0
    for row_id, row in id_df.iterrows():
        total_pred_n += len(row.predictionstring.split(' '))
        
        # fix_text = lambda x: x.replace('\n', '‽')
        entity_text = fix_text(row.discourse_text.strip())

        # regex to find text start with alphanumeric (a-zA-Z0-9)
        entity_text = entity_text[next(regexp.finditer(entity_text)).start():]
        
        # if the first character length is 1, then check the previous text chunk
        if len(entity_text.split()[0]) == 1 and pointer != 0:
            entity_start_ix = text[pointer:].index(entity_text)
            prev_text = text[:pointer + entity_start_ix]
            
            # current text is not the beginning and the previous text last char is alphanumeric
            if pointer + entity_start_ix > 0 and prev_text[-1].isalpha():
                broken_indices.append((filename_ix, ID))
                print('cut entity ', filename_ix, ID)
                cut_word_chunk_size = len(prev_text.split()[-1])
                
                # if the previous text last word length is not 1
                if cut_word_chunk_size > 1:
                    entity_text = entity_text[next(regexp.finditer(entity_text[1:])).start() + 1:]

        # TODO: what is the meaning of this?
        if row.discourse_id in (1620147556527.0, 1622983056026.0):
            pointer += 10

        offset = text[pointer:].index(entity_text)
        starts_at = offset + pointer
        ent_boundaries.append((starts_at, starts_at + len(entity_text), row.discourse_type))
        pointer = starts_at + len(entity_text)
        
    # [(0, 'Lead', 'start'), (174, 'Lead', 'end'), (176, 'Position', 'start'), (271, 'Position', 'end')]
    all_boundaries = [(z, x[-1], t) for x in ent_boundaries for z, t in zip(x[:2], ('start', 'end'))]

    current_target = 0
    targets = np.zeros(len(tokenizer_outs['input_ids']), 'i8')
    token_positions = np.vstack(tokenizer_outs['offset_mapping']).astype('i4')
    
#     print('token ids', tokenizer_outs['input_ids'])
#     print(f"token len {len(tokenizer_outs['input_ids'])}, boundary len {len(all_boundaries)}")
    
    for token_ix in range(len(tokenizer_outs['input_ids'])):
        token_start_ix, token_end_ix = tokenizer_outs['offset_mapping'][token_ix]
        
        
        if token_end_ix != 0 and (all_boundaries[0][2] == 'end' and token_end_ix >= all_boundaries[0][0])\
                            or (all_boundaries[0][2] == 'start' and token_end_ix > all_boundaries[0][0]):
            if all_boundaries[0][2] == 'start':
                current_target = token_maps[all_boundaries[0][1]]
                targets[token_ix] = current_target
                if token_end_ix == all_boundaries[1][0]:
                    current_target = 0
                    all_boundaries.pop(0)
                else:
                    current_target += 1
            else:
                if len(all_boundaries) > 1 and token_end_ix > all_boundaries[1][0]:
                    if token_start_ix >= all_boundaries[1][0]:
                        assert text[all_boundaries[0][0] - 1] == '¨'
                    all_boundaries.pop(0)
                    current_target = token_maps[all_boundaries[0][1]]
                    targets[token_ix] = current_target
                    current_target += 1
                else:
                    if token_start_ix >= all_boundaries[0][0]:
                        current_target = 0
                    targets[token_ix] = current_target
                    current_target = 0

            all_boundaries.pop(0)
            if not all_boundaries:
                break
        else:
            targets[token_ix] = current_target

    print('sergei version', (targets > 0).sum())
#     print('sergei version', targets)
            
    targets = token_labeling(tokenizer_outs, ent_boundaries, token_maps)
    print('chris version', (targets > 0).sum())
#     print('chris version', targets)
    print('-' * 30)    
    

  0% 11/15594 [00:00<02:28, 104.74it/s]

sergei version 287
chris version 285
------------------------------
sergei version 527
chris version 521
------------------------------
sergei version 282
chris version 276
------------------------------
sergei version 376
chris version 370
------------------------------
sergei version 276
chris version 271
------------------------------
sergei version 178
chris version 171
------------------------------
sergei version 308
chris version 304
------------------------------
sergei version 374
chris version 367
------------------------------
sergei version 577
chris version 569
------------------------------
sergei version 532
chris version 529
------------------------------
sergei version 408
chris version 404
------------------------------
sergei version 346
chris version 338
------------------------------
sergei version 876
chris version 867
------------------------------
sergei version 637
chris version 631
------------------------------
sergei version 290
chris version 283
-----------

  0% 32/15594 [00:00<02:36, 99.56it/s] 

sergei version 419
chris version 417
------------------------------
sergei version 823
chris version 816
------------------------------
sergei version 463
chris version 457
------------------------------
sergei version 405
chris version 403
------------------------------
sergei version 170
chris version 167
------------------------------
sergei version 609
chris version 600
------------------------------
sergei version 748
chris version 741
------------------------------
sergei version 533
chris version 532
------------------------------
sergei version 384
chris version 377
------------------------------
sergei version 531
chris version 523
------------------------------
sergei version 196
chris version 194
------------------------------
sergei version 221
chris version 214
------------------------------
sergei version 930
chris version 926
------------------------------
sergei version 621
chris version 615
------------------------------
sergei version 638
chris version 633
-----------

  0% 53/15594 [00:00<02:34, 100.58it/s]

sergei version 393
chris version 389
------------------------------
sergei version 905
chris version 901
------------------------------
sergei version 372
chris version 367
------------------------------
sergei version 168
chris version 166
------------------------------
sergei version 450
chris version 448
------------------------------
sergei version 461
chris version 451
------------------------------
sergei version 309
chris version 307
------------------------------
sergei version 733
chris version 722
------------------------------
sergei version 238
chris version 234
------------------------------
sergei version 212
chris version 205
------------------------------
sergei version 330
chris version 329
------------------------------
sergei version 551
chris version 544
------------------------------
sergei version 701
chris version 695
------------------------------
sergei version 454
chris version 447
------------------------------
sergei version 374
chris version 368
-----------

  0% 74/15594 [00:00<02:37, 98.39it/s] 

sergei version 545
chris version 539
------------------------------
sergei version 1059
chris version 1049
------------------------------
sergei version 278
chris version 272
------------------------------
sergei version 580
chris version 577
------------------------------
sergei version 313
chris version 312
------------------------------
sergei version 615
chris version 607
------------------------------
sergei version 679
chris version 673
------------------------------
sergei version 525
chris version 519
------------------------------
sergei version 458
chris version 453
------------------------------
sergei version 216
chris version 209
------------------------------
sergei version 441
chris version 435
------------------------------
sergei version 824
chris version 818
------------------------------
sergei version 674
chris version 667
------------------------------
sergei version 530
chris version 524
------------------------------
sergei version 442
chris version 438
---------

  1% 90/15594 [00:00<02:38, 97.92it/s]

sergei version 716
chris version 714
------------------------------
sergei version 333
chris version 326
------------------------------
sergei version 357
chris version 353
------------------------------
sergei version 659
chris version 657
------------------------------
sergei version 697
chris version 695
------------------------------
sergei version 387
chris version 381
------------------------------
sergei version 206
chris version 203
------------------------------
sergei version 672
chris version 665
------------------------------
sergei version 188
chris version 185
------------------------------





KeyboardInterrupt: 