In [1]:
import sys

sys.path.insert(0, '/home/feedback/working/feedback/models_training/longformer/sumbission/codes/new_transformers_branch/transformers/src')

In [2]:
import re
from collections import Counter
from collections import deque
from glob import glob

import pandas as pd
import numpy as np

import h5py
import dill as pickle
from tqdm import tqdm

import spacy
from spacy import displacy

import torch
from transformers import DebertaV2TokenizerFast

In [3]:
# Fast DeBERTa-v3 tokenizer; the "fast" variant is required later because the
# labeling code asks for `return_offsets_mapping=True`.
tokenizer = DebertaV2TokenizerFast.from_pretrained('microsoft/deberta-v3-large')
# Raise the tokenizer's advertised maximum sequence length to 2048 tokens.
# NOTE(review): presumably so long essays are tokenized without a length warning — confirm.
tokenizer.model_max_length = 2048

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Load the discourse annotations (one row per annotated discourse element).
data = pd.read_csv('../../../feedback-prize-2021/train.csv')
# Patch the discourse with id 1623258656795: replace 'florida' with 'LOCATION_NAME'
# in its `discourse_text`.
# NOTE(review): presumably the essay .txt file contains the anonymised placeholder while
# the CSV kept the original word, which would break the substring search used to locate
# the entity in the essay — confirm against the raw files.
data.loc[data.discourse_id==1623258656795.0, 'discourse_text'] =  data.loc[data.discourse_id==1623258656795.0, 
                                                                           'discourse_text'].map(lambda x: x.replace('florida', 'LOCATION_NAME')).values

In [5]:
# Human-readable class names. Index 0 is the "no discourse" class; the remaining
# seven names are the discourse types in the same order as the keys of `colors`.
label_names = ['None', 'Lead', 'Position', 'Evidence', 'Claim',
               'Concluding Statement', 'Counterclaim', 'Rebuttal']

In [6]:
# Hex display colour for every discourse type, used by displaCy when rendering
# entity spans. Key order matters: `token_maps` is derived from it.
colors = {
    'Lead': '#8000ff',
    'Position': '#2b7ff6',
    'Evidence': '#2adddd',
    'Claim': '#80ffb4',
    # Fixed: the leading '#' was missing, so this was not a valid hex colour.
    'Concluding Statement': '#d4dd80',
    'Counterclaim': '#ff8042',
    'Rebuttal': '#ff0000',
}
# displaCy "ent" style options: which labels to highlight and with which colours.
options = {"ents": list(colors.keys()), "colors": colors}

In [7]:
# Map each discourse type to an odd "begin" id (1, 3, 5, ...); the labeling
# functions use the following even id (begin id + 1) for continuation tokens.
token_maps = {discourse_type: 2 * position + 1 for position, discourse_type in enumerate(colors)}

In [8]:
token_maps

{'Lead': 1,
 'Position': 3,
 'Evidence': 5,
 'Claim': 7,
 'Concluding Statement': 9,
 'Counterclaim': 11,
 'Rebuttal': 13}

In [9]:
def make_more_targets(targets):
    """Derive class indices and several span-tagging encodings from BIO token targets.

    ``targets`` holds one integer per token: 0 for "outside", an odd id for the
    first token of an entity and that id + 1 for its continuation tokens.
    The first and the last token are never linked to a neighbour (the scan
    covers positions ``1 .. len(targets) - 2`` only).

    Two neighbouring tokens are "linked" when the second one continues the run
    of the first: either both carry the same even id (this includes runs of 0)
    or an odd id is followed by that id + 1.

    Parameters
    ----------
    targets : array-like of int, shape (n,)

    Returns
    -------
    class_index : float32 array, shape (n,)
        ``(target + 1) // 2`` — 0 for outside, 1.. for the discourse classes.
    bi : float32 array, shape (n, 2)
        One-hot (begin, inside): "inside" when the token is linked to its predecessor.
    bio : float32 array, shape (n, 2)
        ``bi`` with the rows of outside tokens (target 0) zeroed.
    bies : float32 array, shape (n, 4)
        One-hot (begin, inside, end, single).
    bieso : float32 array, shape (n, 4)
        ``bies`` with the rows of outside tokens zeroed.
    """
    targets = np.asarray(targets)
    num_tokens = len(targets)

    # Computed once up front. It used to be recomputed on every loop iteration
    # (quadratic work) and was never computed at all for sequences of length <= 2.
    class_index = ((targets + 1) // 2).astype('f4')

    # linkage[i, 0] == 1: token i continues token i - 1
    # linkage[i, 1] == 1: token i is continued by token i + 1
    linkage = np.zeros((num_tokens, 2), 'f4')

    previous_target = -2  # sentinel that can never link to a real target
    for ix in range(1, num_tokens - 1):
        this_target = targets[ix]
        same_even_run = previous_target % 2 == 0 and previous_target == this_target
        begin_then_inside = previous_target % 2 == 1 and this_target == previous_target + 1
        if same_even_run or begin_then_inside:
            linkage[ix - 1, 1] = 1
            linkage[ix, 0] = 1
        previous_target = this_target

    # Encode the two link flags in one integer:
    # 0 = isolated, 1 = linked forward only, 2 = linked backward only, 3 = both.
    link_sums = (linkage * np.array([2, 1])).sum(-1).astype('i4')

    bi = np.zeros((num_tokens, 2), 'f4')
    bi[link_sums < 2, 0] = 1   # no backward link -> begin
    bi[link_sums >= 2, 1] = 1  # backward link -> inside

    bio = np.array(bi)
    bio[targets == 0] = 0

    bies = np.zeros((num_tokens, 4), 'f4')
    bies[:, :2] = bi
    bies[link_sums == 0] = (0, 0, 0, 1)  # isolated token -> single
    bies[link_sums == 2] = (0, 0, 1, 0)  # backward link only -> end

    bieso = np.array(bies)
    bieso[targets == 0] = 0

    return class_index, bi, bio, bies, bieso

In [10]:
def combine_labels(class_index, bi, bio, bies, bieso):
    """Fuse the per-token class index with each tagging scheme into one label id.

    Returns ``(combined_bi, combined_bio, combined_bies, combined_bieso)``:

    * ``combined_bi``    = class * 2 + begin flag
    * ``combined_bies``  = class * 4 + position of the active BIES tag (0..3)
    * ``combined_bio``   = 0 for outside tokens, else (class - 1) * 2 + 1 (begin) or + 2 (inside)
    * ``combined_bieso`` = 0 for outside tokens, else (class - 1) * 4 + 1..4 for B/I/E/S
    """
    begin_flag = bi[:, 0]
    bies_tag = bies @ np.array([0, 1, 2, 3])
    combined_bi = class_index * 2 + begin_flag
    combined_bies = class_index * 4 + bies_tag

    # Only tokens that belong to a discourse class get a non-zero id in the "O" schemes.
    is_entity = class_index != 0
    zero_based_class = class_index[is_entity] - 1

    combined_bio = np.copy(class_index)
    combined_bio[is_entity] = zero_based_class * 2 + bio[is_entity] @ np.array([1, 2])

    combined_bieso = np.copy(class_index)
    combined_bieso[is_entity] = zero_based_class * 4 + bieso[is_entity] @ np.array([1, 2, 3, 4])

    return combined_bi, combined_bio, combined_bies, combined_bieso

In [11]:
# Number of training essays plus one extra slot.
# NOTE(review): the extra slot is apparently meant to stay zero-filled after the last
# essay; the original author did not record why it is needed — confirm before relying on it.
num_texts = len(glob('../../../feedback-prize-2021/train/*.txt')) + 1

In [12]:
def make_one_hot(indices, num_labels):
    """Return a ``(len(indices), num_labels)`` float array with a single 1 per row.

    ``indices`` must be a NumPy array; its values are cast to int32 and used as
    the column of the 1 in the corresponding row.
    """
    num_rows = len(indices)
    one_hot = np.zeros((num_rows, num_labels))
    row_ids = np.arange(num_rows)
    column_ids = indices.astype('i4')
    one_hot[row_ids, column_ids] = 1

    return one_hot

## create dataset

- [🔥Transformer: tokenize a text [for beginner]🌱](https://www.kaggle.com/sytuannguyen/transformer-tokenize-a-text-for-beginner)

In [13]:
# Newlines are swapped for a single rarely used placeholder character so that they
# survive tokenization. The replacement has the same length as '\n', so character
# offsets into the text are unchanged.
# NOTE(review): presumably needed because the DeBERTa-v3 tokenizer drops newline
# characters — confirm.
def fix_text(x):
    """Return ``x`` with every newline replaced by the placeholder character '‽'."""
    return x.replace('\n', '‽')


# Matches one ASCII letter or digit; used to skip leading punctuation/whitespace of a
# discourse text. Fixed: the former range `A-z` also matched the characters
# [ \ ] ^ _ and the backtick, which sit between 'Z' and 'a' in ASCII.
regexp = re.compile('[0-9a-zA-Z]')

### Token labeling comparison

- [TensorFlow - LongFormer - NER - [CV 0.633]](https://www.kaggle.com/cdeotte/tensorflow-longformer-ner-cv-0-633)

In [74]:
def sergei_token_labeling(tokenizer_outs, ent_boundaries, token_maps):
    """Assign a BIO-style target id to every token by sweeping over entity boundaries.

    Every entity contributes a 'start' and an 'end' boundary. Tokens are visited
    in order; a token whose end offset passes the next pending boundary opens an
    entity (begin id ``token_maps[type]``) or closes it. Tokens in between get
    the running target: begin id + 1 while inside an entity, 0 otherwise. Unlike
    a strict containment test, a token that only partially overlaps an entity is
    still labeled.

    Parameters
    ----------
    tokenizer_outs : mapping
        Needs ``'input_ids'`` (only its length is used) and ``'offset_mapping'``
        (one ``(char_start, char_end)`` pair per token; special tokens are ``(0, 0)``).
    ent_boundaries : sequence of (char_start, char_end, discourse_type)
        Entities in text order.
    token_maps : dict
        Discourse type -> odd begin id.

    Returns
    -------
    numpy int64 array with one target per token. Tokens after the last entity
    boundary keep the initial value 0.

    Notes
    -----
    One sanity ``assert`` below reads the module-level variable ``text`` (the
    essay currently being processed); it is not passed in as an argument.
    """
    num_tokens = len(tokenizer_outs['input_ids'])
    targets = np.zeros(num_tokens, 'i8')

    # Flatten the entities into an ordered queue of (position, type, 'start' | 'end').
    all_boundaries = deque()
    for ent_boundary in ent_boundaries:
        discourse_type = ent_boundary[-1]
        for position, boundary_type in zip(ent_boundary[:2], ('start', 'end')):
            all_boundaries.append((position, discourse_type, boundary_type))

    # Without any entity every token is "outside" (this used to raise IndexError).
    if not all_boundaries:
        return targets

    current_target = 0
    for token_ix in range(num_tokens):
        token_start_ix, token_end_ix = tokenizer_outs['offset_mapping'][token_ix]

        cur_pos, cur_dis_type, cur_bound_type = all_boundaries[0]

        # Same condition as before, with the and/or precedence spelled out.
        reaches_end = token_end_ix != 0 and cur_bound_type == 'end' and token_end_ix >= cur_pos
        passes_start = cur_bound_type == 'start' and token_end_ix > cur_pos
        if not (reaches_end or passes_start):
            # No boundary reached: keep the running label.
            targets[token_ix] = current_target
            continue

        if len(all_boundaries) > 1:
            next_pos, next_dis_type, _ = all_boundaries[1]

        if cur_bound_type == 'start':
            # token map {'Lead': 1, 'Position': 3, ..., 'Rebuttal': 13}
            current_target = token_maps[cur_dis_type]
            targets[token_ix] = current_target

            if token_end_ix == next_pos:
                # Single-token entity: its 'end' boundary is consumed right away.
                current_target = 0
                all_boundaries.popleft()
            else:
                # Following tokens get the "inside" id.
                current_target += 1
        else:
            # The token reaches the end of the current entity AND already extends
            # past the start of the next one.
            if len(all_boundaries) > 1 and token_end_ix > next_pos:

                # can this actually happen?
                if token_start_ix >= next_pos:
                    # NOTE: `text` is the module-level essay text, not a parameter.
                    assert text[cur_pos - 1] == '¨'

                all_boundaries.popleft()
                # Fixed: the token opens the NEXT entity, so it must take that entity's
                # begin id. It used to reuse the type of the entity that is ending,
                # which mislabeled the whole following entity.
                current_target = token_maps[next_dis_type]
                targets[token_ix] = current_target
                current_target += 1
            else:
                # A token that starts at/after the entity end is already outside of it.
                if token_start_ix >= cur_pos:
                    current_target = 0

                targets[token_ix] = current_target
                current_target = 0

        all_boundaries.popleft()
        if not all_boundaries:
            break

    return targets

In [75]:
def chris_token_labeling(tokenizer_outs, ent_boundaries, token_maps):
    """Label tokens with BIO-style ids using a strict containment test.

    Tokens and entities are walked together, left to right. A token is labeled
    only when its character span lies completely inside the entity span: the
    first such token gets ``token_maps[discourse_type]`` and every later one
    that id + 1. Special tokens with offsets ``(0, 0)`` and tokens that merely
    overlap an entity stay 0.

    Returns a numpy int64 array with one target per entry of ``input_ids``.
    """
    targets = np.zeros(len(tokenizer_outs['input_ids']), 'i8')
    offsets = tokenizer_outs['offset_mapping']
    last_index = len(offsets) - 1

    cursor = 0  # index of the next token to examine; never moves backwards
    for entity_start, entity_end, discourse_type in ent_boundaries:
        if cursor > last_index:
            break

        is_first_token = True
        while cursor <= last_index:
            token_start, token_end = offsets[cursor][0], offsets[cursor][1]
            if token_start >= entity_end:
                # This token belongs to a later entity (or to none): keep it for the next round.
                break

            if token_start == 0 and token_end == 0:
                targets[cursor] = 0
            elif entity_start <= token_start and token_end <= entity_end:
                begin_id = token_maps[discourse_type]
                if is_first_token:
                    targets[cursor] = begin_id
                    is_first_token = False
                else:
                    targets[cursor] = begin_id + 1

            cursor += 1

    return targets

In [76]:
def get_entity_boundary(id_df, tokenizer_outs):
    """Locate every annotated discourse of one essay as character offsets in `text`.

    The rows of ``id_df`` are processed in order; each discourse text is searched
    in the essay starting from where the previous one ended, so the rows are
    expected to be sorted by their position in the essay.

    Relies on module-level state that is NOT passed in: ``text`` (the essay),
    ``fix_text``, ``regexp``, ``broken_indices``, ``filename_ix`` and ``ID``.
    As a side effect it appends to ``broken_indices`` and prints a message when
    an annotation appears to start in the middle of a word.

    Parameters
    ----------
    id_df : DataFrame
        Rows with ``predictionstring``, ``discourse_text``, ``discourse_id`` and
        ``discourse_type``.
    tokenizer_outs :
        Not used by this function.

    Returns
    -------
    list of ``(char_start, char_end, discourse_type)`` tuples.

    Raises
    ------
    StopIteration
        If a discourse text contains no character matched by ``regexp``.
    ValueError
        If a discourse text cannot be found in the remaining essay text.
    """
    ent_boundaries = []
    # Search position in `text`: the end of the previously located entity.
    pointer = 0
    
    # NOTE(review): accumulated but never read afterwards.
    total_pred_n = 0
    for row_id, row in id_df.iterrows():
        total_pred_n += len(row.predictionstring.split(' '))
        
        # Apply the same newline replacement that was applied to the essay text.
        entity_text = fix_text(row.discourse_text.strip())

        # Drop everything before the first character matched by the module-level `regexp`.
        entity_text = entity_text[next(regexp.finditer(entity_text)).start():]
        
        # If the first word is a single character (and this is not the first entity),
        # check whether that character is really the tail of a word that was cut in two.
        if len(entity_text.split()[0]) == 1 and pointer != 0:
            entity_start_ix = text[pointer:].index(entity_text)
            prev_text = text[:pointer + entity_start_ix]
            
            # The entity does not start the essay and the character right before it is
            # alphabetic (str.isalpha), i.e. the annotation begins inside a word.
            if pointer + entity_start_ix > 0 and prev_text[-1].isalpha():
                broken_indices.append((filename_ix, ID))
                print('cut entity ', filename_ix, ID)
                cut_word_chunk_size = len(prev_text.split()[-1])
                
                # If the word fragment before the entity is longer than one character,
                # drop the stray first character and restart at the next `regexp` match.
                if cut_word_chunk_size > 1:
                    entity_text = entity_text[next(regexp.finditer(entity_text[1:])).start() + 1:]

        # Hard-coded exception for two specific discourses: start searching 10 characters later.
        # NOTE(review): the reason is not recorded; presumably it skips an earlier, unwanted
        # occurrence of the same text — confirm.
        if row.discourse_id in (1620147556527.0, 1622983056026.0):
            pointer += 10

        # First occurrence of the entity text at or after `pointer`.
        offset = text[pointer:].index(entity_text)
        starts_at = offset + pointer
        ent_boundaries.append((starts_at, starts_at + len(entity_text), row.discourse_type))
        pointer = starts_at + len(entity_text)
            
    return ent_boundaries

In [77]:
def show_sample(ID, ent_boundaries):
    """Render one essay in the notebook with its discourse spans highlighted.

    Parameters
    ----------
    ID : str
        Essay id; the text is read from ``../../../feedback-prize-2021/train/{ID}.txt``.
    ent_boundaries : iterable of (char_start, char_end, discourse_type)
        Spans to highlight, as character offsets into the stripped essay text.

    Uses the module-level displaCy ``options`` for labels and colours.
    """
    with open(f'../../../feedback-prize-2021/train/{ID}.txt') as f:
        # Fixed: strip exactly like the main loop does before computing boundaries.
        # The offsets refer to the stripped text, so rendering the raw text shifted
        # every span for files that begin with whitespace.
        org_text = f.read().strip()

    ents = [
        {'start': entity_start, 'end': entity_end, 'label': discourse_type}
        for entity_start, entity_end, discourse_type in ent_boundaries
    ]

    spacy.displacy.render({'text': org_text, 'ents': ents, 'title': ID}, style="ent",
                          options=options, manual=True, jupyter=True)

In [142]:
def target2entity(tokenizer_outs, targets, token_maps):
    """Convert per-token BIO-style targets back into character-level entity spans.

    A token whose target is one of the begin ids in ``token_maps`` opens a new
    entity (closing any entity that is still open). A target of 0 closes the
    open entity. Any other target leaves the open entity running.

    Parameters
    ----------
    tokenizer_outs : mapping
        Needs ``'offset_mapping'``: one ``(char_start, char_end)`` pair per token.
    targets : sequence of int
        One target id per token.
    token_maps : dict
        Discourse type -> begin id.

    Returns
    -------
    list of ``(char_start, char_end, discourse_type)`` in token order, where
    ``char_start`` is the start offset of the entity's first token and
    ``char_end`` the end offset of its last token.
    """
    id2label = {v: k for k, v in token_maps.items()}
    offset = tokenizer_outs['offset_mapping']

    # Pass 1: find the (first token, last token, label) triple of each entity.
    label_token_group = []
    start_position = None  # None <=> no entity is currently open
    label = None
    for pos, label_id in enumerate(targets):
        if label_id in id2label:
            # A begin id: close whatever is open, then open the new entity here.
            if start_position is not None:
                label_token_group.append((start_position, pos - 1, label))
            label = id2label[label_id]
            start_position = pos
        elif label_id == 0 and start_position is not None:
            # Left the entity.
            label_token_group.append((start_position, pos - 1, label))
            start_position = None

    # Fixed: an entity still open at the end of the sequence now keeps its last token.
    # Before, that token was always cut off, and an entity that began on the very
    # last token was dropped completely.
    if start_position is not None:
        label_token_group.append((start_position, len(targets) - 1, label))

    # Pass 2: convert token indices to character offsets.
    label_char_group = []
    for token_start_id, token_end_id, label in label_token_group:
        char_start = offset[token_start_id][0]
        char_end = offset[token_end_id][1]

        label_char_group.append((char_start, char_end, label))

    return label_char_group

In [161]:
def compare_boundary(bound1, bound2):
    """Total absolute offset difference between two aligned lists of entity spans.

    Both arguments are sequences of ``(start, end, label)`` triples. The i-th
    span of ``bound1`` is compared with the i-th span of ``bound2``; labels are
    ignored. ``bound2`` must be at least as long as ``bound1``.
    """
    aligned_pairs = ((bound1[ix], bound2[ix]) for ix in range(len(bound1)))
    return sum(
        abs(first_start - second_start) + abs(first_end - second_end)
        for (first_start, first_end, _), (second_start, second_end, _) in aligned_pairs
    )

In [172]:
id_to_ix_map = {}
# (filename_ix, ID) pairs appended by get_entity_boundary for annotations cut mid-word.
broken_indices = []

for filename_ix, filename in tqdm(enumerate(glob('../../../feedback-prize-2021/train/*.txt')), total=num_texts - 1):

    # Essay id = file name without directory and extension.
    ID = filename.split('/')[-1].split('.')[0]

    with open(filename) as f:
        text = fix_text(f.read().strip())

    # Annotations of this essay, in reading order.
    id_df = data.loc[data.id == ID].sort_values('discourse_start')

    tokenizer_outs = tokenizer(text, return_offsets_mapping=True)
    # Remap token id 126861 to 128000.
    # NOTE(review): presumably 126861 is what the tokenizer emits for the newline
    # placeholder and 128000 is a dedicated newline token id — confirm.
    tokenizer_outs['input_ids'] = [input_id if input_id != 126861 else 128000 for input_id in tokenizer_outs['input_ids']]

    # [(char_start, char_end, discourse_type), ...]
    ent_boundaries = get_entity_boundary(id_df, tokenizer_outs)

    # Label the tokens with both strategies ...
    sergei_targets = sergei_token_labeling(tokenizer_outs, ent_boundaries, token_maps)
    chris_targets = chris_token_labeling(tokenizer_outs, ent_boundaries, token_maps)

    # ... map the labels back to character spans ...
    sergei_recovered_ent_boundaries = target2entity(tokenizer_outs, sergei_targets, token_maps)
    chris_recovered_ent_boundaries = target2entity(tokenizer_outs, chris_targets, token_maps)

    # ... and measure how far each round trip drifts from the annotated boundaries.
    sergei_diff = compare_boundary(ent_boundaries, sergei_recovered_ent_boundaries)
    chris_diff = compare_boundary(ent_boundaries, chris_recovered_ent_boundaries)

    print(ID)
    print('sergei diff', sergei_diff)
    print('chris version', chris_diff)
    print('-' * 30)

    # Fixed: `targets` was never assigned in this notebook, so the lines below only
    # worked with a stale variable left in the kernel (NameError after Restart & Run All).
    # NOTE(review): sergei_targets is used as the reference labeling — confirm this choice.
    targets = sergei_targets
    class_index, bi, bio, bies, bieso = make_more_targets(targets)
    combined_bi, combined_bio, combined_bies, combined_bieso = combine_labels(class_index, bi, bio, bies, bieso)
    # Sanity check: the combined BIO ids must reproduce the targets (first/last token excluded).
    assert (combined_bio[1:-1] == targets[1:-1]).all()
    num_tokens = len(targets)

    # Stop at this essay so the cells below can inspect it.
    if ID == '0F10DC1AE015':
        break
    

  0% 3/15594 [00:00<09:53, 26.29it/s]

CF48D9415493
sergei diff 3
chris version 12
------------------------------
9741E87BE7CE
sergei diff 7
chris version 26
------------------------------
43B24ED85767
sergei diff 6
chris version 20
------------------------------
7C898E56E23C
sergei diff 6
chris version 22
------------------------------
EB9D8FFB4F45
sergei diff 6
chris version 22
------------------------------
F45F03EBB2CA
sergei diff 7
chris version 19
------------------------------


  0% 9/15594 [00:00<10:08, 25.61it/s]

75BD2CB94662
sergei diff 4
chris version 11
------------------------------
5FB75CCB31F8
sergei diff 8
chris version 33
------------------------------
46527BA000FB
sergei diff 9
chris version 48
------------------------------
16585724607E
sergei diff 4
chris version 18
------------------------------
4434757F78FB
sergei diff 5
chris version 16
------------------------------
9FD65F435FD4
sergei diff 9
chris version 62
------------------------------


  0% 12/15594 [00:00<10:59, 23.61it/s]

0F10DC1AE015
sergei diff 10
chris version 31
------------------------------





In [177]:
ent_boundaries, sergei_recovered_ent_boundaries, chris_recovered_ent_boundaries

([(0, 376, 'Lead'),
  (386, 478, 'Position'),
  (530, 624, 'Claim'),
  (634, 1394, 'Evidence'),
  (1406, 1505, 'Claim'),
  (1530, 2064, 'Evidence'),
  (2086, 2165, 'Claim'),
  (2166, 2877, 'Evidence'),
  (2888, 3061, 'Counterclaim'),
  (3063, 3504, 'Evidence'),
  (3506, 3655, 'Rebuttal'),
  (3657, 4404, 'Evidence'),
  (4406, 4854, 'Concluding Statement')],
 [(0, 376, 'Lead'),
  (385, 478, 'Position'),
  (529, 624, 'Claim'),
  (633, 1394, 'Evidence'),
  (1405, 1505, 'Claim'),
  (1529, 2064, 'Evidence'),
  (2085, 2165, 'Claim'),
  (2165, 2877, 'Evidence'),
  (2887, 3061, 'Counterclaim'),
  (3062, 3504, 'Evidence'),
  (3506, 3655, 'Rebuttal'),
  (3656, 4404, 'Evidence'),
  (4406, 4854, 'Concluding Statement')],
 [(0, 376, 'Lead'),
  (387, 478, 'Position'),
  (539, 624, 'Claim'),
  (635, 1394, 'Evidence'),
  (1415, 1505, 'Claim'),
  (1531, 2064, 'Evidence'),
  (2088, 2165, 'Claim'),
  (2167, 2877, 'Evidence'),
  (2892, 3061, 'Counterclaim'),
  (3064, 3504, 'Evidence'),
  (3506, 3655, 'Rebu

### Comparing with Text Entity Visualization
- original
- sergei
- chris

In [173]:
show_sample(ID, ent_boundaries)

In [174]:
show_sample(ID, sergei_recovered_ent_boundaries)

In [175]:
show_sample(ID, chris_recovered_ent_boundaries)