# Imports

In [1]:
%run PII_Util.py

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
text_test = """Meet Jane Doe, a brilliant student at XYZ University. She can be reached at jane.doe@email.com or through her phone number +1234567890. Jane resides at 123 Main Street, Cityville. Her student ID is 987654 and her personal website is www.janedoe.com. Connect with her on social media using the username @janedoe.

Meanwhile, John Smith, another outstanding student, can be contacted at john.smith@email.com or at +9876543210. John lives at 456 Oak Avenue, Townsville. His student ID is 123456, and you can visit his personal blog at www.johnsmithblog.com. Follow him on Twitter with the handle @johnsmith123.

For any inquiries about the university's programs, you can contact the administration office at admin@xyzuniversity.edu or call +5551234567. The office is located at 789 University Boulevard.

Visit our official website at www.xyzuniversity.edu for more information on courses and admission procedures.

"""


In [3]:
text_test

"Meet Jane Doe, a brilliant student at XYZ University. She can be reached at jane.doe@email.com or through her phone number +1234567890. Jane resides at 123 Main Street, Cityville. Her student ID is 987654 and her personal website is www.janedoe.com. Connect with her on social media using the username @janedoe.\n\nMeanwhile, John Smith, another outstanding student, can be contacted at john.smith@email.com or at +9876543210. John lives at 456 Oak Avenue, Townsville. His student ID is 123456, and you can visit his personal blog at www.johnsmithblog.com. Follow him on Twitter with the handle @johnsmith123.\n\nFor any inquiries about the university's programs, you can contact the administration office at admin@xyzuniversity.edu or call +5551234567. The office is located at 789 University Boulevard.\n\nVisit our official website at www.xyzuniversity.edu for more information on courses and admission procedures.\n\n"

# Trainer API

## Instantiate

In [4]:
%%time
model_adapter = Yanis_Adapter(threshold = 0.1)
# text = text_test
# text = df_train.loc[0].full_text
# labels_true = df_train.loc[0].labels
# threshold = 0.1

CPU times: total: 6.47 s
Wall time: 12 s


# Results Analysis

In [5]:
from datasets import load_dataset, load_from_disk
from collections import Counter

In [6]:
%%time
def pad_lists(lst, pad_value, max_length = None):
    if max_length is None:
        max_length = max(len(sublist) for sublist in lst)
    padded_lst = np.full((len(lst), max_length), pad_value)
    for i, sublist in enumerate(lst):
        padded_lst[i, :len(sublist)] = sublist
    return padded_lst

# Example list of lists with varying lengths
# list_of_lists = [[1, 2, 3], [4, 5], [6], [7, 8, 9, 10]]

# # Padding value
# pad_value = -100

# # Convert to numpy array with padding
# padded_array = pad_lists(tokenized_datasets['labels'], pad_value)
# padded_array.shape

CPU times: total: 0 ns
Wall time: 0 ns


In [7]:
%%time
tokenized_datasets = load_from_disk("./in/tokenized_datasets")

CPU times: total: 328 ms
Wall time: 3.66 s


In [8]:
%%time
loaded_predictions = np.load("./in/preds01.npy")

CPU times: total: 734 ms
Wall time: 20.1 s


In [9]:
%%time
labels_padded = pad_lists(tokenized_datasets['labels'], -100, loaded_predictions.shape[1])

CPU times: total: 3.86 s
Wall time: 3.88 s


In [10]:
tokenized_datasets

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'input_ids', 'attention_mask', 'words', 'word_labels', 'word_ids'],
    num_rows: 6807
})

# Align Predictions

## Refactored

In [11]:
list_word_ids = [[word_id if word_id is not None else -100 for word_id in word_ids] for word_ids in tokenized_datasets['word_ids'] ]

word_ids_padded = pad_lists(list_word_ids, -100,  loaded_predictions.shape[1])

In [12]:
np_probs = loaded_predictions
true_labels_id = labels_padded
threshold = 0.18

#     logits, true_labels_id = eval_preds
#     np_probs = np.argmax(logits, axis=-1)
#     tokens = inputs.tokens()

label2id =  model_adapter.model.config.label2id
o_index = label2id['O']

np_sorted_indices = np.argsort(np_probs)
np_max_indices = np_sorted_indices[:,:, -1]
# np_max = np_sorted_indices[np.arange(np_max_indices.shape[0]), np_max_indices]
np_max_prob = np.take_along_axis(np_probs, np_max_indices[:, :, np.newaxis], axis=2).squeeze()

np_2nd_max_indices = np_sorted_indices[:, :, -2]
# np_2nd_max = np_sorted_indices[np.arange(np_sorted_indices.shape[0]), np_max_indices]
np_2nd_max_prob = np.take_along_axis(np_probs, np_2nd_max_indices[:, :, np.newaxis], axis=2).squeeze()

np_O_mask = np_max_indices == o_index
np_threshold_mask = np_2nd_max_prob > threshold

np_replace_mask = np_threshold_mask & np_O_mask

np_label_ids = np.where(np_replace_mask, np_2nd_max_indices, np_max_indices)


# Postprocess labels, convert irrelevant labels to 'O'
np_labels_irrelevant = np.array(model_adapter.labels_irrelevant)
label_ids_mask = np.isin(np_label_ids, np_labels_irrelevant)
np_label_ids[label_ids_mask] = o_index

flat_label_ids = np_label_ids.flatten()
flat_true_labels_id = true_labels_id.flatten()

# Remove padding preds for metrics
mask_padding_inv = flat_true_labels_id != -100
flat_true_labels_id = flat_true_labels_id[mask_padding_inv]
flat_label_ids = flat_label_ids[mask_padding_inv]


In [13]:
pred_tokens = np_label_ids

#Exclude beginning [CLS] Token 
#Also excludes end padding tokens by not appending row size (?)
row_size = word_ids_padded.shape[1]
# diff_array = np.diff(word_ids_padded[:,:], axis=1)
diff_array = np.diff(word_ids_padded[:,:], axis=1, append=row_size)

#Calculate row boundaries
non_zero = np.where(diff_array != 0, 1, 0) # Orig
# non_zero = np.where(diff_array > 0, 1, 0)  # Changed
group_ids = np.cumsum(non_zero, axis=1) # Orig
# group_ids = np.cumsum(non_zero, axis=1) + 1 # + 1 because x diffs = x+1 groups
row_boundaries = np.cumsum(group_ids.max(axis = 1))
# row_boundaries = np.cumsum(group_ids.max(axis = 1)) + 1

#Calculate splits, split into groups, Truncate > 10
split_indices =  np.where(diff_array.ravel() != 0)[0] + 1 # Orig
# split_indices =  np.where(diff_array.ravel() > 0)[0] + 1
# list_groups = np.split(pred_tokens.ravel(), split_indices)
list_groups = np.split(pred_tokens[:,:].ravel(), split_indices)
list_groups = [group[:10] if len(group) >= 10 else group for group in list_groups]

#30s Calculate modes per group
word_labels = [Counter(arr.tolist()).most_common(1)[0][0] if len(arr) > 0 else None for arr in list_groups]

#Fast (split into rows)
np_word_labels = np.array(word_labels)
list_pred_incomp = np.split(np_word_labels, row_boundaries)

#10s Initialize row arrays for word-level predictions
list_pred_words = [np.full((len(words)), model_adapter.O_id, dtype='int8') for words in tokenized_datasets['words']]

#5s #Get word_indices not skipped in word_ids
list_word_indices = []
for word_ids in tokenized_datasets['word_ids']:
    indices = set(word_ids)
    indices.discard(None)
    list_word_indices.append(np.array(list(indices)))
    
#200ms Align the processed word-level predictions to initialized array
for pred_words, word_indices, pred_incomp in zip(list_pred_words, list_word_indices, list_pred_incomp):
    pred_words[word_indices] = pred_incomp[1:-1]

In [674]:
list_pred_words[0][list_word_indices[0]] = list_pred_incomp[0][1:-1]

In [682]:
for pred_words, word_indices, pred_incomp in zip(list_pred_words[1:], list_word_indices[1:], list_pred_incomp[1:]):
    pred_words[word_indices] = pred_incomp[1:]

In [702]:
i = 0

for pred_words, word_indices, pred_incomp in zip(list_pred_words, list_word_indices, list_pred_incomp):
    try:
#         pred_words[word_indices] = pred_incomp[:-1]
        pred_words[word_indices] = pred_incomp[1:-1]
        i+=1
    except Exception as e:
        print(i)
        raise e

0


ValueError: shape mismatch: value array of shape (691,) could not be broadcast to indexing result of shape (690,)

### Debug

In [710]:
list_groups[row_boundaries[0]: row_boundaries[0]+15]

[array([2], dtype=int64),
 array([2], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64)]

In [711]:
list_groups[row_boundaries[0] - 15: row_boundaries[0]]

[array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([2], dtype=int64),
 array([2, 2], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64),
 array([0], dtype=int64)]

In [714]:
list_groups[row_boundaries[10] - 15: row_boundaries[10]]

[array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([ 0,  0,  0,  0, 14, 14, 14,  0, 14, 14], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64),
 array([0], dtype=int64)]

In [451]:
list_groups[row_boundaries[0]-1]

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [545]:
list_groups[row_boundaries[1]-2]

array([0, 0, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [544]:
list_groups[row_boundaries[3]-4]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [670]:
len(list_pred_incomp)

6808

In [720]:
i = 6806

print(f'{list_word_indices[i].shape} | {list_pred_incomp[i].shape}')

(767,) | (769,)


In [242]:
row_boundaries.shape

(6807,)

In [250]:
len(list_pred_incomp)

6808

In [251]:
list_pred_incomp[-1].shape

(1,)

In [243]:
row_boundaries[0]

689

In [245]:
row_boundaries[1] - row_boundaries[0]

519

In [233]:
len(list_pred_incomp)

6808

In [240]:
list_word_indices[-1].shape

(767,)

In [239]:
list_pred_incomp[-2].shape

(766,)

In [365]:
len(list_pred_incomp)

6808

In [374]:
list_pred_incomp[6786].shape

(0,)

In [364]:
row_boundaries.shape

(6807,)

In [220]:
row_boundaries[0]

689

In [224]:
list_pred_incomp[1][:15]

array([2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [222]:
len(list_pred_incomp[0])

689

In [235]:
pred_words[word_indices].shape

(520,)

In [219]:
len(word_indices)

690

In [236]:
pred_incomp.shape

(519,)

In [154]:
group_ids

array([[  1,   2,   3, ..., 690, 690, 690],
       [  1,   2,   3, ..., 520, 520, 520],
       [  1,   2,   3, ..., 677, 677, 677],
       ...,
       [  1,   2,   3, ..., 592, 592, 592],
       [  0,   1,   2, ..., 637, 637, 637],
       [  1,   2,   3, ..., 767, 767, 767]])

In [25]:
# np.save("./in/np_label_ids.npy", np_label_ids)
# np.savez("./in/list_pred_words.npz", *list_pred_words)

# Inspect

In [202]:
temp_tokens =  tokenized_datasets[0]['tokens']
temp_words = tokenized_datasets[0]['words']
temp_word_ids =  tokenized_datasets[0]['word_ids']
temp_word_indices = list_word_indices[0]

In [209]:
diff_array[0][700:720]

array([   1,    1,    1,    1,    1,    1,    0,    2,    1,    1,    1,
          1,    1,    1,    1, -851,    0,    0,    0,    0])

In [210]:
word_ids_padded[0,700:720]

array([ 736,  736,  737,  738,  739,  740,  741,  742,  742,  744,  745,
        746,  747,  748,  749,  750,  751, -100, -100, -100])

In [187]:
np_label_ids[0].shape

(3072,)

In [190]:
np_label_ids[0][0]

2

In [195]:
true_labels_id[0][0]

0

In [199]:
np.where(true_labels_id[0] != -100)[0].shape

(718,)

In [204]:
len(temp_word_ids)

718

In [194]:
np.where(true_labels_id[0] == -100)

(array([ 718,  719,  720, ..., 3069, 3070, 3071], dtype=int64),)

In [193]:
np.where(np_label_ids[0] == -100)

(array([], dtype=int64),)

In [192]:
np.where(np_label_ids[0] == 2)

(array([  0,  11,  12,  13,  51,  52,  53,  54,  55,  56,  57,  58,  59,
        460, 461, 462, 706, 707, 708, 717, 718, 719, 720, 721, 722, 723,
        724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736,
        737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749,
        750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762,
        763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775,
        776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788,
        789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801,
        802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814,
        815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827,
        828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840,
        841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853,
        854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866,
        867, 868, 869, 870, 871, 872, 873, 874, 875

In [172]:
temp_tokens[-15:]

['▁Avril',
 '▁2021',
 '▁-',
 '▁Nathalie',
 '▁S',
 'ylla',
 '▁Annex',
 '▁1',
 '▁:',
 '▁Mind',
 '▁Map',
 '▁Shared',
 '▁facilities',
 '▁project',
 '[SEP]']

In [183]:
temp_word_indices[680:]

array([741, 742, 744, 745, 746, 747, 748, 749, 750, 751])

In [185]:
temp_words[741:]

['Nathalie',
 'Sylla',
 '\n\n',
 'Annex',
 '1',
 ':',
 'Mind',
 'Map',
 'Shared',
 'facilities',
 'project',
 '\n\n']

In [176]:
word_labels[680:700]

[2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0]

In [174]:
np_label_ids[0][700:717]

array([0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [166]:
np_label_ids[1][:15]

array([0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [160]:
list_pred_incomp[1][:15]

array([2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [164]:
cur_tokens[:15]

['[CLS]',
 '▁Diego',
 '▁Estrada',
 '▁Design',
 '▁Thinking',
 '▁Assignment',
 '▁Visualization',
 '▁Tool',
 '▁Challenge',
 '▁&',
 '▁Selection',
 '▁The',
 '▁elderly',
 '▁were',
 '▁having']

In [165]:
cur_words[:15]

['Diego',
 'Estrada',
 '\n\n',
 'Design',
 'Thinking',
 'Assignment',
 '\n\n',
 'Visualization',
 'Tool',
 '\n\n',
 'Challenge',
 '&',
 'Selection',
 '\n\n',
 'The']

In [162]:
list_word_ids[1][:15]

[-100, 0, 1, 3, 4, 5, 7, 8, 10, 11, 12, 14, 15, 16, 17]

In [159]:
word_labels[690:700]

[2, 2, 2, 0, 0, 0, 0, 0, 0, 0]

array([0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [158]:
len(list_word_indices[0])

690

In [139]:
cur_tokens[:15]

['[CLS]',
 '▁Diego',
 '▁Estrada',
 '▁Design',
 '▁Thinking',
 '▁Assignment',
 '▁Visualization',
 '▁Tool',
 '▁Challenge',
 '▁&',
 '▁Selection',
 '▁The',
 '▁elderly',
 '▁were',
 '▁having']

In [55]:
cur_word_ids[:15]

[None, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 10, 12]

In [56]:
list_groups[:15]

[array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0, 0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([2], dtype=int64),
 array([2, 2], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64)]

In [51]:
cur_tokens[:14]

['[CLS]',
 '▁Design',
 '▁Thinking',
 '▁for',
 '▁innovation',
 '▁reflex',
 'ion',
 '▁-',
 '▁Avril',
 '▁2021',
 '▁-',
 '▁Nathalie',
 '▁S',
 'ylla']

In [65]:
flat_label_ids[:14]

array([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2], dtype=int64)

In [46]:
cur_tokens[:20]

['[CLS]',
 '▁Design',
 '▁Thinking',
 '▁for',
 '▁innovation',
 '▁reflex',
 'ion',
 '▁-',
 '▁Avril',
 '▁2021',
 '▁-',
 '▁Nathalie',
 '▁S',
 'ylla',
 '▁Challenge',
 '▁&',
 '▁selection',
 '▁The',
 '▁tool',
 '▁I']

In [40]:
list_groups[10]

array([2, 2], dtype=int64)

In [78]:
print(f'temp_words: {len(cur_words)} | pred_words: {len(cur_pred_words)} | word_indices: {len(cur_word_indices)} | pred_incomp: {len(cur_pred_incomp)}')

temp_words: 601 | pred_words: 601 | word_indices: 555 | pred_incomp: 555


In [74]:
len(list_pred_incomp)

6808

In [92]:
list_pred_incomp[-1]

array([0])

In [75]:
list_pred_incomp[-1]

array([0])

In [80]:
cur_word_indices

array([  0,   1,   2,   3,   5,   6,   8,   9,  10,  11,  12,  13,  14,
        15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
        29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
        42,  43,  44,  45,  46,  47,  48,  50,  51,  52,  53,  54,  55,
        56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,
        70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,
        83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  95,  96,
        97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 109, 110,
       112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124,
       125, 126, 127, 128, 130, 131, 132, 133, 134, 135, 136, 137, 138,
       139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 151, 152,
       153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 166,
       167, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180,
       181, 182, 183, 184, 185, 187, 188, 189, 190, 191, 192, 19

In [88]:
row_boundaries[-5] - row_boundaries[-6]

555

In [79]:
list_word_indices[i_cur]

array([  0,   1,   2,   3,   5,   6,   8,   9,  10,  11,  12,  13,  14,
        15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
        29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
        42,  43,  44,  45,  46,  47,  48,  50,  51,  52,  53,  54,  55,
        56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,
        70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,
        83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  95,  96,
        97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 109, 110,
       112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124,
       125, 126, 127, 128, 130, 131, 132, 133, 134, 135, 136, 137, 138,
       139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 151, 152,
       153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 166,
       167, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180,
       181, 182, 183, 184, 185, 187, 188, 189, 190, 191, 192, 19

# Visualization

In [14]:
tokenized_datasets

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'input_ids', 'attention_mask', 'words', 'word_labels', 'word_ids'],
    num_rows: 6807
})

In [43]:
tokenized_datasets[0]

{'document': 7,
 'full_text': "Design Thinking for innovation reflexion-Avril 2021-Nathalie Sylla\n\nChallenge & selection\n\nThe tool I use to help all stakeholders finding their way through the complexity of a project is the  mind map.\n\nWhat exactly is a mind map? According to the definition of Buzan T. and Buzan B. (1999, Dessine-moi  l'intelligence. Paris: Les Éditions d'Organisation.), the mind map (or heuristic diagram) is a graphic  representation technique that follows the natural functioning of the mind and allows the brain's  potential to be released. Cf Annex1\n\nThis tool has many advantages:\n\n•  It is accessible to all and does not require significant material investment and can be done  quickly\n\n•  It is scalable\n\n•  It allows categorization and linking of information\n\n•  It can be applied to any type of situation: notetaking, problem solving, analysis, creation of  new ideas\n\n•  It is suitable for all people and is easy to learn\n\n•  It is fun and encourages

In [15]:
nlp = spacy.load("en_core_web_sm")

In [723]:
%%time

i_cur = 6807 - 5
i_cur = 0

cur_sample = tokenized_datasets[i_cur]

cur_pred_ids = list_pred_words[i_cur]
cur_text = cur_sample['full_text']
cur_label_words = cur_sample['word_labels']
cur_pred_words = [model.config.id2label[e] for e in cur_pred_ids]

cur_words = cur_sample['words']


#For comparisons only
cur_tokens = cur_sample['tokens']
cur_word_ids = cur_sample['word_ids']
# cur_group = list_groups[i_cur]
cur_word_indices = list_word_indices[i_cur]
cur_pred_incomp = list_pred_incomp[i_cur]


doc = nlp(cur_text)


CPU times: total: 266 ms
Wall time: 816 ms


In [101]:
print(f'list_pred_incomp: {len(list_pred_incomp)} | list_pred_words: {len(list_pred_words)} | list_word_indices: {len(list_word_indices)} | dataset: {len(tokenized_datasets)}')

list_pred_incomp: 6808 | list_pred_words: 6807 | list_word_indices: 6807 | dataset: 6807


In [102]:
print(f'temp_words: {len(cur_words)} | pred_words: {len(cur_pred_words)} | word_indices: {len(cur_word_indices)} | pred_incomp: {len(cur_pred_incomp)}')

temp_words: 563 | pred_words: 563 | word_indices: 520 | pred_incomp: 520


In [41]:
# cur_tokens = tokenized_datasets['tokens'][i_cur]

In [21]:
model_adapter.model.config.id2label

{0: 'O',
 1: 'Racial or ethnic information',
 2: 'Name',
 3: 'Sexual orientation',
 4: 'Phone Numbers',
 5: 'Health insurance information',
 6: 'Religious beliefs',
 7: 'Political opinions',
 8: 'Social Security number',
 9: 'Civil state',
 10: 'Medical record numbers',
 11: 'Trade union membership',
 12: 'Physical addresses',
 13: 'Philosophical beliefs',
 14: 'Email address',
 15: 'Genetic information',
 16: 'Financial information',
 17: 'Biometric infomration'}

In [39]:
cur_words[9]

'Nathalie'

In [657]:
cur_words[:15]

['Reporting',
 'process',
 '\n\n',
 'by',
 'Gilberto',
 'Gamboa',
 '\n\n',
 'Challenge',
 '\n\n',
 'I',
 'received',
 'a',
 'promotion',
 'of',
 'being']

## First sample

In [724]:
cur_tokens = cur_sample['tokens']
cur_token_ids = np_label_ids[i_cur][:len(cur_tokens)]
cur_token_labels = [model.config.id2label[label_id] for label_id in cur_token_ids]

nlp = spacy.load("en_core_web_sm")
doc = Doc(nlp.vocab, words=cur_tokens)

In [725]:
visualize_label(nlp, doc, cur_tokens, cur_token_labels, options = options_pii)

### Debug

In [458]:
row_boundaries[0]

691

In [459]:
len(list_groups[0:691])

691

In [460]:
list_groups[690]

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [461]:
list_groups[680:691]

[array([2], dtype=int64),
 array([2, 2], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)]

In [384]:
diff_array[0][700:750]

array([   1,    1,    1,    1,    1,    1,    0,    2,    1,    1,    1,
          1,    1,    1,    1, -851,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0])

In [356]:
temp_split_indices = np.where(diff_array[0] != 0)[0] + 1

In [457]:
temp_split_indices

array([  1,   2,   3,   4,   6,   7,   8,   9,  10,  11,  13,  14,  15,
        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
        29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
        42,  43,  44,  45,  46,  47,  48,  49,  50,  52,  54,  55,  57,
        59,  60,  61,  62,  65,  66,  67,  70,  71,  72,  73,  74,  77,
        81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,
        94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106,
       107, 108, 109, 111, 112, 113, 114, 115, 116, 117, 119, 120, 121,
       122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
       135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147,
       148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
       161, 162, 163, 164, 165, 166, 168, 169, 170, 171, 172, 173, 174,
       175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187,
       188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 19

In [462]:
temp_list_groups = np.split(pred_tokens[0,1:], temp_split_indices)
len(temp_list_groups)

691

In [463]:
temp_list_groups[680:]

[array([2], dtype=int64),
 array([2, 2], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([2, 2, 2, ..., 0, 0, 0], dtype=int64)]

In [381]:
cur_token_ids[700:]

array([0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2], dtype=int64)

In [382]:
len(cur_word_ids)

718

In [383]:
cur_word_ids[700:]

[736,
 736,
 737,
 738,
 739,
 740,
 741,
 742,
 742,
 744,
 745,
 746,
 747,
 748,
 749,
 750,
 751,
 None]

In [726]:
doc = nlp(cur_text)
visualize_label(nlp, doc, cur_words, cur_pred_words, options = options_pii)

In [727]:
visualize_label(nlp, doc, cur_words, cur_label_words, options = options_pii)

## 2nd sample

In [535]:
row_boundaries[1]

1212

In [536]:
row_boundaries[1] - row_boundaries[0]

521

In [425]:
len(list_word_indices[1])

520

In [451]:
list_groups[row_boundaries[0]-1]

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [545]:
list_groups[row_boundaries[1]-2]

array([0, 0, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [544]:
list_groups[row_boundaries[3]-4]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [547]:
list_groups[row_boundaries[1] - 15: row_boundaries[1]]

[array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0, 0, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64),
 array([0], dtype=int64)]

In [449]:
list_groups[row_boundaries[0]:row_boundaries[1]]]

array([0, 0, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [437]:
list_groups[row_boundaries[0]: row_boundaries[0]+15]

[array([2], dtype=int64),
 array([2], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64)]

In [443]:
row_boundaries[1]

1212

In [446]:
1211 - row_boundaries[0]

520

In [537]:
list_groups[1200:1212]

[array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0, 0, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64),
 array([0], dtype=int64)]

In [466]:
list_groups[1211]

array([0], dtype=int64)

In [467]:
cur_token_ids.shape

(525,)

In [470]:
cur_token_ids[-15:]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [469]:
cur_tokens[-15:]

['▁.',
 '▁(',
 '▁This',
 '▁story',
 '▁is',
 '▁fictional',
 '▁and',
 '▁was',
 '▁created',
 '▁for',
 '▁solving',
 '▁the',
 '▁assignment',
 '▁)',
 '[SEP]']

In [103]:
len(cur_tokens)

525

In [104]:
len(cur_words)

563

In [99]:
len(cur_pred_ids)

563

In [106]:
cur_tokens[:10]

['[CLS]',
 '▁Diego',
 '▁Estrada',
 '▁Design',
 '▁Thinking',
 '▁Assignment',
 '▁Visualization',
 '▁Tool',
 '▁Challenge',
 '▁&']

In [402]:
cur_pred_ids

array([2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [111]:
cur_word_ids[:10]

[None, 0, 1, 3, 4, 5, 7, 8, 10, 11]

In [109]:
cur_word_indices[:10]

array([ 0,  1,  3,  4,  5,  7,  8, 10, 11, 12])

In [108]:
cur_pred_ids[:10]

array([2, 2, 0, 2, 0, 0, 0, 0, 0, 0], dtype=int8)

In [677]:
cur_tokens = cur_sample['tokens']
cur_token_ids = np_label_ids[i_cur][:len(cur_tokens)]
cur_token_labels = [model.config.id2label[label_id] for label_id in cur_token_ids]

nlp = spacy.load("en_core_web_sm")
doc = Doc(nlp.vocab, words=cur_tokens)

In [680]:
visualize_label(nlp, doc, cur_tokens, cur_token_labels, options = options_pii)

In [681]:
visualize_label(nlp, doc, cur_words, cur_pred_words, options = options_pii)

In [439]:
visualize_label(nlp, doc, cur_words, cur_label_words, options = options_pii)

## 5th Sample

### Old

In [31]:
visualize_label(nlp, doc, cur_words, cur_pred_words, options = options_pii)

### New

In [260]:
visualize_label(nlp, doc, cur_words, cur_pred_words, options = options_pii)

In [32]:
visualize_label(nlp, doc, cur_words, cur_label_words, options = options_pii)

## Nth sample

In [810]:
[label for label in tokenized_datasets[ ]['labels'] if label != 0]

[2, 2, 2, 2, 2, 2]

In [811]:
i_cur = 1482

cur_sample = tokenized_datasets[i_cur]

cur_pred_ids = list_pred_words[i_cur]
cur_text = cur_sample['full_text']
cur_label_words = cur_sample['word_labels']
cur_pred_words = [model.config.id2label[e] for e in cur_pred_ids]

cur_words = cur_sample['words']


#For comparisons only
cur_tokens = cur_sample['tokens']
cur_token_ids = np_label_ids[i_cur][:len(cur_tokens)]

cur_word_ids = cur_sample['word_ids']

# cur_group = list_groups[i_cur]
cur_word_indices = list_word_indices[i_cur]
cur_pred_incomp = list_pred_incomp[i_cur]


doc = nlp(cur_text)


In [812]:
cur_tokens = cur_sample['tokens']
cur_token_ids = np_label_ids[i_cur][:len(cur_tokens)]
cur_token_labels = [model.config.id2label[label_id] for label_id in cur_token_ids]

nlp = spacy.load("en_core_web_sm")
doc = Doc(nlp.vocab, words=cur_tokens)

In [813]:
visualize_label(nlp, doc, cur_tokens, cur_token_labels, options = options_pii)

In [814]:
doc = nlp(cur_text)
visualize_label(nlp, doc, cur_words, cur_pred_words, options = options_pii)

In [815]:
visualize_label(nlp, doc, cur_words, cur_label_words, options = options_pii)

In [527]:
ds_word_ids = tokenized_datasets['word_ids']

In [518]:
np.where(cur_token_ids != 0)

(array([  1,   2, 433, 434], dtype=int64),)

In [534]:
for i, word_ids in enumerate(ds_word_ids):
    if len(set(word_ids)) == len(list_pred_incomp[i]):
        continue
    else: 
        print(f'{len(set(word_ids))} | {len(list_pred_incomp[i])} ')
        print(i)
        break

739 | 275 
6794


In [552]:
cur_tokens[:10]

['[CLS]',
 '▁Reporting',
 '▁process',
 '▁by',
 '▁Gilberto',
 '▁Gamb',
 'oa',
 '▁Challenge',
 '▁I',
 '▁received']

In [560]:
row_boundaries[1]

1212

In [689]:
list_groups[row_boundaries[1]:row_boundaries[1] + 15]

[array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0, 2], dtype=int64),
 array([2], dtype=int64),
 array([2], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64),
 array([0], dtype=int64)]

In [553]:
cur_word_ids[:10]

[None, 0, 1, 3, 4, 5, 5, 7, 9, 10]

In [555]:
cur_pred_words[:10]

['O', 'O', 'O', 'O', 'Name', 'Name', 'O', 'O', 'O', 'O']

In [None]:
len(set(tokenized_datasets['word_ids'][0]))

## Token Level

In [317]:
cur_tokens = cur_sample['tokens']
cur_token_ids = np_label_ids[i_cur][:len(cur_tokens)]
cur_token_labels = [model.config.id2label[label_id] for label_id in cur_token_ids]

nlp = spacy.load("en_core_web_sm")
doc = Doc(nlp.vocab, words=cur_tokens)

In [476]:
visualize_label(nlp, doc, cur_tokens, cur_token_labels, options = options_pii)

In [311]:
nlp = spacy.blank("en")

# Example tokens and their labels
# tokens = ["Washington", "is", "the", "capital", "of", "the", "United", "States", "."]
# labels = ["B-GPE", "O", "O", "O", "O", "O", "B-GPE", "I-GPE", "O"]

tokens = cur_tokens
labels = [model.config.id2label[label_id] for label_id in cur_token_ids]

# Create a Doc from the list of tokens
doc = Doc(nlp.vocab, words=tokens)

# Create a list to hold the entity spans
ents = []

# Iterate over tokens and labels to create an entity for each token
for index, (token, label) in enumerate(zip(tokens, labels)):
    # For this visualization, every token is considered an entity. The label is used as the entity label.
    # Here, we simply mark each token as a start of its own entity span.
    ent = Span(doc, index, index+1, label=label)
    ents.append(ent)

# Set the entities in doc.ents
doc.ents = ents

# Visualize entities
# Since every token is treated as an entity, all labels, including "O", are included.
options = {"ents": [label for label in set(labels)], "colors": {label: "lightgray" for label in set(labels) if label != "O"}}
displacy.render(doc, style="ent", options=options)

# Tests

In [576]:
test_word_ids = np.random.randint(0, 70, size=(100,100), dtype=int)
test_word_ids = np.sort(test_word_ids,axis = 1)
test_word_ids

array([[ 0,  0,  0, ..., 65, 65, 67],
       [ 0,  1,  1, ..., 65, 66, 68],
       [ 0,  1,  1, ..., 68, 68, 69],
       ...,
       [ 0,  1,  1, ..., 65, 69, 69],
       [ 1,  3,  3, ..., 68, 69, 69],
       [ 0,  0,  0, ..., 66, 67, 69]])

In [577]:
test_pred_tokens = np.random.randint(0, 17, size=(100,100), dtype=int)

In [610]:
# test_diff_array = np.diff(np_test[:,1:], axis=1)
test_diff_array = np.diff(np_test[:,:], axis=1)
test_diff_array.shape

(100, 99)

In [639]:
test_non_zero = np.where(test_diff_array != 0, 1, 0) 
# test_group_ids = np.cumsum(test_non_zero, axis=1) + 1 # + 1 because x diffs = x+1 groups
test_group_ids = np.cumsum(test_non_zero, axis=1)
# test_row_boundaries = np.cumsum(test_group_ids.max(axis = 1))
test_row_boundaries = np.cumsum(test_group_ids.max(axis = 1)) + 1

In [628]:
test_row_boundaries.shape

(100,)

In [629]:
test_split_indices =  np.where(test_diff_array.ravel() != 0)[0] + 1

In [643]:
# test_list_groups = np.split(test_pred_tokens[:,1:].ravel(), test_split_indices)
test_list_groups = np.split(test_pred_tokens[:,:].ravel(), test_split_indices)
test_list_groups = [group[:10] if len(group) >= 10 else group for group in test_list_groups]

In [644]:
test_word_labels = word_labels = [Counter(arr.tolist()).most_common(1)[0][0] if len(arr) > 0 else None for arr in test_list_groups]

In [645]:
test_np_word_labels = np.array(test_word_labels)
test_list_pred_incomp = np.split(test_np_word_labels,test_row_boundaries)

In [646]:
test_list_word_indices = []
for i in range(test_word_ids.shape[0]):
    word_ids = test_word_ids[i]
    indices = set(word_ids)
    indices.discard(None)
    test_list_word_indices.append(np.array(list(indices)))

In [647]:
len(test_list_pred_incomp)

101

In [649]:
test_list_pred_incomp[-2]

array([ 1,  7, 14, 11, 14, 12,  3, 15, 10, 14,  1,  6,  5,  0,  7, 16,  7])

In [651]:
# test_list_pred_words = [np.full((len(word_ids)), model_adapter.O_id, dtype='int8') for word_ids in test_list_word_indices]
test_list_pred_words = [np.full((100), model_adapter.O_id, dtype='int8') for word_ids in test_list_word_indices]

In [652]:
#200ms Align the processed word-level predictions to initialized array
for pred_words, word_indices, pred_incomp in zip(test_list_pred_words, test_list_word_indices, test_list_pred_incomp):
    pred_words[word_indices] = pred_incomp[:-1]
#     pred_words[word_indices]

ValueError: shape mismatch: value array of shape (17,) could not be broadcast to indexing result of shape (48,)

In [None]:
pred_tokens = np_label_ids
    
#200ms Align the processed word-level predictions to initialized array
for pred_words, word_indices, pred_incomp in zip(list_pred_words, list_word_indices, list_pred_incomp):
    pred_words[word_indices] = pred_incomp[:-1]

# Word-Level Metrics

# Bin

In [None]:
pred_tokens = np_label_ids

#Exclude beginning [CLS] Token 
#Also excludes end padding tokens by not appending row size (?)
diff_array = np.diff(word_ids_padded[:,1:], axis=1)

#Calculate row boundaries
non_zero = np.where(diff_array != 0, 1, 0) # Orig
# non_zero = np.where(diff_array > 0, 1, 0)  # Changed
# group_ids = np.cumsum(non_zero, axis=1) # Orig
group_ids = np.cumsum(non_zero, axis=1) + 1 # + 1 because x diffs = x+1 groups
row_boundaries = np.cumsum(group_ids.max(axis = 1))
# row_boundaries = np.cumsum(group_ids.max(axis = 1)) + 1

#Calculate splits, split into groups, Truncate > 10
split_indices =  np.where(diff_array.ravel() != 0)[0] + 1 # Orig
# split_indices =  np.where(diff_array.ravel() > 0)[0] + 1
# list_groups = np.split(pred_tokens.ravel(), split_indices)
list_groups = np.split(pred_tokens[:,1:].ravel(), split_indices)
list_groups = [group[:10] if len(group) >= 10 else group for group in list_groups]

#30s Calculate modes per group
word_labels = [Counter(arr.tolist()).most_common(1)[0][0] if len(arr) > 0 else None for arr in list_groups]

#Fast (split into rows)
np_word_labels = np.array(word_labels)
list_pred_incomp = np.split(np_word_labels, row_boundaries)

#10s Initialize row arrays for word-level predictions
list_pred_words = [np.full((len(words)), model_adapter.O_id, dtype='int8') for words in tokenized_datasets['words']]

#5s #Get word_indices not skipped in word_ids
list_word_indices = []
for word_ids in tokenized_datasets['word_ids']:
    indices = set(word_ids)
    indices.discard(None)
    list_word_indices.append(np.array(list(indices)))
    
#200ms Align the processed word-level predictions to initialized array
for pred_words, word_indices, pred_incomp in zip(list_pred_words, list_word_indices, list_pred_incomp):
    pred_words[word_indices] = pred_incomp[:-1]

In [702]:
i = 0

for pred_words, word_indices, pred_incomp in zip(list_pred_words, list_word_indices, list_pred_incomp):
    try:
#         pred_words[word_indices] = pred_incomp[:-1]
        pred_words[word_indices] = pred_incomp[1:-1]
        i+=1
    except Exception as e:
        print(i)
        raise e

0


ValueError: shape mismatch: value array of shape (691,) could not be broadcast to indexing result of shape (690,)