# Imports

In [2]:
from datasets import load_dataset, load_from_disk
from collections import Counter
import functools

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# %load_ext autoreload
# %autoreload 2

In [4]:
%run PII_Util.py

# Initialize Model Adapter

In [5]:
model_adapter = Yanis_Adapter(threshold = 0.01, config_only=True)

# Data

In [6]:
%%time
preprocessed_dataset = load_from_disk("./in/tokenized_datasets")
# preprocessed_dataset = load_from_disk("./in/preprocessed_dataset_2")

CPU times: total: 0 ns
Wall time: 2.52 s


In [7]:
%%time
loaded_predictions = np.load("./in/preds01.npy")

CPU times: total: 1.39 s
Wall time: 20.7 s


In [8]:
temp_labels = preprocessed_dataset[0]['labels']

In [9]:
np.unique(temp_labels, return_counts=True)

(array([0, 2]), array([709,   9], dtype=int64))

# Compute Metrics

## Label Names

In [30]:
def col_id2label(col_name):
    """Replace a trailing numeric label id in a metric column name with its label name.

    The name is split on ``'_'``. If the last piece is not made of digits the
    name is returned unchanged. Otherwise the id is looked up in
    ``model_adapter.config.id2label`` when the first piece is ``'token'`` and in
    ``id2classes`` for every other prefix, and the pieces are joined back with
    ``'_'``.

    Reads the notebook-level globals ``model_adapter`` and ``id2classes``.

    Args:
        col_name: Column name such as ``'token_2'`` or ``'word_total_metrics'``.

    Returns:
        The column name with the numeric suffix replaced by a label name, or
        ``col_name`` itself when there is no numeric suffix.
    """
    parts = col_name.split('_')
    suffix = parts[-1]

    # Aggregate columns (e.g. '..._total_metrics') carry no label id.
    if not suffix.isdigit():
        return col_name

    label_id = int(suffix)
    # Token-level ids and word-level ids use different id -> name tables.
    id_to_name = model_adapter.config.id2label if parts[0] == 'token' else id2classes
    parts[-1] = id_to_name[label_id]
    return '_'.join(parts)

In [11]:
model_adapter.labels_irrelevant

[1, 3, 5, 6, 7, 9, 11, 13, 15, 16, 17]

In [12]:
labels_padded = pad_lists(preprocessed_dataset['labels'], -100, loaded_predictions.shape[1])

In [21]:
%%time
%run PII_Util.py
# Threshold handed to the model adapter below.
threshold = 0.18
# model_adapter = Yanis_Adapter(threshold = 0.01, config_only=True)
model_adapter = Yanis_Adapter(threshold = threshold, config_only=True, will_bio_tokens=False, will_bio_words=True)
# Bind the fixed arguments so compute_metrics only needs eval_preds; return_processed=True
# makes it return (dict_scores, dict_processed).
# NOTE(review): the adapter above is built with threshold=0.18, but compute_metrics_base is bound
# with threshold=0.01 here, whereas the later debug cell binds threshold=threshold. Confirm that
# the 0.01 / 0.18 mismatch in this cell is intentional.
compute_metrics = functools.partial(compute_metrics_base, preprocessed_dataset=preprocessed_dataset, model_adapter=model_adapter, threshold=0.01, return_processed=True)

CPU times: total: 484 ms
Wall time: 980 ms


In [22]:
%%time
eval_preds = (loaded_predictions, labels_padded, None)
dict_all_scores, dict_processed = compute_metrics(eval_preds)

CPU times: total: 1min 1s
Wall time: 1min 1s


In [24]:
del loaded_predictions, labels_padded

In [26]:
import gc
gc.collect()

1384

In [28]:
flat_label_ids = dict_processed['flat_label_ids']
flat_true_labels_id =  dict_processed['flat_true_labels_id']
flat_word_label_ids =  dict_processed['flat_word_label_ids']
flat_word_true_labels_id =  dict_processed['flat_word_true_labels_id']
token_confusion_matrix = dict_all_scores.pop('token_confusion_matrix')
word_confusion_matrix = dict_all_scores.pop('word_confusion_matrix')

In [31]:
%%time
df_all_metrics = pd.DataFrame(dict_all_scores)
df_all_metrics.columns = df_all_metrics.columns.map(col_id2label)
df_all_metrics = df_all_metrics.T
df_all_metrics

CPU times: total: 31.2 ms
Wall time: 538 ms


Unnamed: 0,precision,recall,f_beta,tp,fp,fn
token_total_metrics,0.067975,0.877603,0.601879,3413.0,46797.0,476.0
word_total_metrics,0.10627,0.892296,0.694675,2444.0,20554.0,295.0
token_O,0.99997,0.990187,0.99056,4688006.0,139.0,46460.0
token_Racial or ethnic information,0.0,0.0,0.0,0.0,0.0,0.0
token_Name,0.082514,0.972301,0.687261,3089.0,34347.0,88.0
token_Sexual orientation,0.0,0.0,0.0,0.0,0.0,0.0
token_Phone Numbers,0.027374,0.98,0.419079,49.0,1741.0,1.0
token_Health insurance information,0.0,0.0,0.0,0.0,0.0,0.0
token_Religious beliefs,0.0,0.0,0.0,0.0,0.0,0.0
token_Political opinions,0.0,0.0,0.0,0.0,0.0,0.0


## Debug

In [20]:
def compute_metrics_base(eval_preds, preprocessed_dataset, model_adapter, threshold = 0.1, return_processed=False):
    """Compute token-level and word-level micro metrics for one set of predictions.

    Token predictions come from ``get_tokens_thresholding``; word predictions
    come from ``get_word_preds`` applied to the token predictions and the padded
    word ids. Both are scored with ``compute_micro_metrics`` (called with
    ``beta=5``), using the ``'token_'`` and ``'word_'`` prefixes respectively.

    Reads the notebook-level globals ``classes2id``, ``classes`` and
    ``classes_pos_id``, and writes the globals ``flat_label_ids`` and
    ``list_word_true_labels`` as a side effect.

    Args:
        eval_preds: 3-tuple ``(np_probs, true_labels_id, inputs)``; the third
            element is unpacked but not used.
        preprocessed_dataset: Indexable by ``'word_ids'`` and ``'word_labels'``.
        model_adapter: Provides ``n_token_labels`` and ``label_pos_ids`` and is
            forwarded to the helper functions.
        threshold: Forwarded to ``get_tokens_thresholding``.
        return_processed: When true, also return the flattened prediction and
            label arrays.

    Returns:
        ``dict_scores`` holding the two confusion matrices, the two total-metric
        entries and the per-class entries returned by ``compute_micro_metrics``.
        When ``return_processed`` is true, the tuple
        ``(dict_scores, dict_processed)`` is returned instead.
    """
    # Kept as module globals: a later notebook cell reads flat_label_ids after
    # calling this function without return_processed.
    global flat_label_ids, list_word_true_labels

    np_probs, true_labels_id, _unused_inputs = eval_preds

    # ---- Token predictions
    np_label_ids, flat_true_labels_id, flat_label_ids = get_tokens_thresholding(
        np_probs, true_labels_id, threshold, model_adapter
    )

    # ---- Word predictions
    # Tokens without a word id (None) get the sentinel -100, which is also the pad value.
    list_word_ids = []
    for word_ids in preprocessed_dataset['word_ids']:
        list_word_ids.append([-100 if word_id is None else word_id for word_id in word_ids])
    word_ids_padded = pad_lists(list_word_ids, -100, np_probs.shape[1])

    # Per the original author's note, BIO overriding is implemented inside get_word_preds.
    list_pred_words = get_word_preds(np_label_ids, word_ids_padded, model_adapter, preprocessed_dataset)
    flat_word_label_ids = np.concatenate(list_pred_words)

    # Flatten the per-record word labels and map each label name to its class id.
    list_word_true_labels = preprocessed_dataset['word_labels']
    word_true_ids = []
    for list_words in list_word_true_labels:
        for word in list_words:
            word_true_ids.append(classes2id[word])
    flat_word_true_labels_id = np.array(word_true_ids, dtype='int8')

    # ---- Scoring
    token_total, token_by_class, token_confusion = compute_micro_metrics(
        predictions=flat_label_ids,
        labels=flat_true_labels_id,
        num_classes=model_adapter.n_token_labels,
        pos_labels=model_adapter.label_pos_ids,
        beta=5,
        prefix='token_',
    )
    word_total, word_by_class, word_confusion = compute_micro_metrics(
        predictions=flat_word_label_ids,
        labels=flat_word_true_labels_id,
        num_classes=len(classes),
        pos_labels=classes_pos_id,
        beta=5,
        prefix='word_',
    )

    # Insertion order matters: it becomes the column order of the metrics DataFrame.
    dict_scores = {
        'token_confusion_matrix': token_confusion,
        'word_confusion_matrix': word_confusion,
        'token_total_metrics': token_total,
        'word_total_metrics': word_total,
    }
    for per_class_metrics in (token_by_class, word_by_class):
        dict_scores.update(per_class_metrics)

    if not return_processed:
        return dict_scores

    dict_processed = {
        'flat_label_ids': flat_label_ids,
        'flat_true_labels_id': flat_true_labels_id,
        'flat_word_label_ids': flat_word_label_ids,
        'flat_word_true_labels_id': flat_word_true_labels_id,
    }
    return dict_scores, dict_processed
    
    

In [139]:
%%time

threshold = 0.18
model_adapter = Yanis_Adapter(threshold = threshold, config_only=True, will_bio_tokens=False, will_bio_words=True)
# model_adapter = Yanis_Adapter(threshold = threshold, config_only=True, will_bio_tokens=True, will_bio_words=True)
compute_metrics = functools.partial(compute_metrics_base, preprocessed_dataset=preprocessed_dataset, model_adapter=model_adapter, threshold=threshold)

CPU times: total: 453 ms
Wall time: 964 ms


In [140]:
%%time
eval_preds = (loaded_predictions, labels_padded, None)
dict_all_scores = compute_metrics(eval_preds)

CPU times: total: 1min 1s
Wall time: 1min 1s


In [141]:
temp_counts = np.unique(flat_label_ids, return_counts=True)
temp_counts

(array([ 0,  2,  4,  8, 10, 12, 14], dtype=int64),
 array([4718157,   12811,     900,     122,     130,    2946,    3289],
       dtype=int64))

In [142]:
token_confusion_matrix = dict_all_scores.pop('token_confusion_matrix')
word_confusion_matrix = dict_all_scores.pop('word_confusion_matrix')

### will_bio_tokens=False, will_bio_words=True

In [143]:
def col_id2label(col_name):
    """Turn a metric column name ending in a numeric id into one ending in a label name.

    This is the same function that is defined earlier in the notebook, repeated here.

    Names whose last ``'_'``-separated piece is not numeric are returned as is.
    For the others, ``model_adapter.config.id2label`` supplies the name when the
    first piece is ``'token'``; ``id2classes`` supplies it otherwise. Both are
    notebook-level globals.

    Args:
        col_name: Column name, e.g. ``'token_2'`` or ``'word_total_metrics'``.

    Returns:
        The renamed column, or ``col_name`` unchanged.
    """
    *head, tail = col_name.split('_')

    if tail.isdigit():
        label_id = int(tail)
        # A lone numeric name has no prefix, so it can never be a 'token' column.
        is_token_column = bool(head) and head[0] == 'token'
        if is_token_column:
            label_name = model_adapter.config.id2label[label_id]
        else:
            label_name = id2classes[label_id]
        return '_'.join(head + [label_name])

    return col_name

In [144]:
%%time
df_all_metrics = pd.DataFrame(dict_all_scores)
df_all_metrics.columns = df_all_metrics.columns.map(col_id2label)
df_all_metrics = df_all_metrics.T
df_all_metrics

CPU times: total: 15.6 ms
Wall time: 4 ms


Unnamed: 0,precision,recall,f_beta,tp,fp,fn
token_total_metrics,0.165115,0.857547,0.738441,3335.0,16863.0,554.0
word_total_metrics,0.20341,0.871121,0.773468,2386.0,9344.0,353.0
token_O,0.99995,0.996506,0.996638,4717922.0,235.0,16544.0
token_Racial or ethnic information,0.0,0.0,0.0,0.0,0.0,0.0
token_Name,0.235735,0.950582,0.851295,3020.0,9791.0,157.0
token_Sexual orientation,0.0,0.0,0.0,0.0,0.0,0.0
token_Phone Numbers,0.046667,0.84,0.507907,42.0,858.0,8.0
token_Health insurance information,0.0,0.0,0.0,0.0,0.0,0.0
token_Religious beliefs,0.0,0.0,0.0,0.0,0.0,0.0
token_Political opinions,0.0,0.0,0.0,0.0,0.0,0.0


#### Dup

In [136]:
%%time
df_all_metrics = pd.DataFrame(dict_all_scores)
df_all_metrics.columns = df_all_metrics.columns.map(col_id2label)
df_all_metrics = df_all_metrics.T
df_all_metrics

CPU times: total: 31.2 ms
Wall time: 3 ms


Unnamed: 0,precision,recall,f_beta,tp,fp,fn
token_total_metrics,0.165115,0.857547,0.738441,3335.0,16863.0,554.0
word_total_metrics,0.20341,0.871121,0.773468,2386.0,9344.0,353.0
token_O,0.99995,0.996506,0.996638,4717922.0,235.0,16544.0
token_Racial or ethnic information,0.0,0.0,0.0,0.0,0.0,0.0
token_Name,0.235735,0.950582,0.851295,3020.0,9791.0,157.0
token_Sexual orientation,0.0,0.0,0.0,0.0,0.0,0.0
token_Phone Numbers,0.046667,0.84,0.507907,42.0,858.0,8.0
token_Health insurance information,0.0,0.0,0.0,0.0,0.0,0.0
token_Religious beliefs,0.0,0.0,0.0,0.0,0.0,0.0
token_Political opinions,0.0,0.0,0.0,0.0,0.0,0.0


### will_bio_tokens=True, will_bio_words=True

In [107]:
%%time
df_all_metrics = pd.DataFrame(dict_all_scores)
df_all_metrics.columns = df_all_metrics.columns.map(col_id2label)
df_all_metrics = df_all_metrics.T
df_all_metrics

CPU times: total: 0 ns
Wall time: 3 ms


Unnamed: 0,precision,recall,f_beta,tp,fp,fn
token_total_metrics,0.145163,0.503175,0.459581,2932.0,17266.0,2895.0
word_total_metrics,0.20341,0.871121,0.773468,2386.0,9344.0,353.0
token_O,0.999755,0.99672,0.996836,4717003.0,1154.0,15525.0
token_B-EMAIL,0.076923,0.961977,0.666869,253.0,3036.0,10.0
token_B-ID_NUM,0.020942,0.010753,0.010958,4.0,187.0,368.0
token_B-NAME_STUDENT,0.189873,0.726795,0.655502,1245.0,5312.0,468.0
token_B-PHONE_NUM,0.013746,0.363636,0.183746,4.0,287.0,7.0
token_B-STREET_ADDRESS,0.0,0.0,0.0,0.0,1620.0,2.0
token_B-URL_PERSONAL,0.0,0.0,0.0,0.0,0.0,1911.0
token_B-USERNAME,0.0,0.0,0.0,0.0,0.0,20.0


# Record Search

## List Approach

### Preprocessing

In [229]:
%%time
# Number of words in each record; the running total marks where each record
# ends inside the flat word-level arrays.
record_lens = list(map(len, preprocessed_dataset['word_labels']))
split_indices = np.cumsum(record_lens)

# Cut both flat arrays back into per-record pieces at the same boundaries.
# NOTE(review): if the flat arrays hold exactly sum(record_lens) items, the final
# boundary equals the array length and np.split appends one empty trailing piece.
list_word_label_ids, list_word_true_label_ids = (
    np.split(flat_ids, split_indices)
    for flat_ids in (flat_word_label_ids, flat_word_true_labels_id)
)

### Build

In [254]:
def build_record_meta(preds, labels):
    """Build the per-word mismatch table for a single record.

    Row ``i`` of the result is ``[-1, -1]`` when ``preds[i] == labels[i]`` and
    ``[preds[i], labels[i]]`` otherwise, so column 0 holds the predicted id and
    column 1 the true id of every mismatched word.

    Args:
        preds: 1-D sequence of predicted class ids for the record.
        labels: 1-D sequence of true class ids, same length as ``preds``.

    Returns:
        ``int8`` array of shape ``(len(preds), 2)``.

    Raises:
        ValueError: If ``preds`` and ``labels`` differ in shape. The earlier
            loop-based version silently left rows uninitialised in that case.
    """
    preds_arr = np.asarray(preds)
    labels_arr = np.asarray(labels)
    if preds_arr.shape != labels_arr.shape:
        raise ValueError(
            f"preds and labels must have the same shape, got {preds_arr.shape} and {labels_arr.shape}"
        )

    # Start from "everything matches" and overwrite only the mismatched rows.
    np_record_meta = np.full((len(preds_arr), 2), -1, dtype='int8')
    mismatch = preds_arr != labels_arr
    np_record_meta[mismatch, 0] = preds_arr[mismatch]
    np_record_meta[mismatch, 1] = labels_arr[mismatch]

    return np_record_meta
            
def get_undetected_records(list_record_meta, true_id):
    """Return the indices of records with a mismatched word whose column-0 id is ``true_id``.

    Args:
        list_record_meta: Sequence of per-record ``(n_words, 2)`` arrays as
            produced by ``build_record_meta``.
        true_id: Class id compared against column 0 of each record table.

    Returns:
        List of record indices, in ascending order.

    NOTE(review): ``build_record_meta`` stores the *predicted* id in column 0 and
    the true id in column 1, so this selects records where ``true_id`` was
    predicted for a word whose true label differs. If the intent is "records
    whose true label ``true_id`` was missed", column 1 is the one to test.
    Confirm before relying on the name.
    """
    return [
        i
        for i, np_record_meta in enumerate(list_record_meta)
        if (np_record_meta[:, 0] == true_id).any()
    ]

def get_mismatched_records(list_records, pred_id, true_id):
    """Return the indices of records containing a word predicted as ``pred_id`` whose true id is ``true_id``.

    Args:
        list_records: Sequence of per-record ``(n_words, 2)`` arrays as produced
            by ``build_record_meta`` (column 0 = predicted id, column 1 = true id
            for mismatched words).
        pred_id: Predicted class id to look for in column 0.
        true_id: True class id to look for in column 1 of the same row.

    Returns:
        List of record indices, in ascending order.
    """
    # Iterate the argument itself; the earlier version ignored it and read the
    # notebook global ``list_record_meta`` instead.
    return [
        i
        for i, np_record_meta in enumerate(list_records)
        if ((np_record_meta[:, 0] == pred_id) & (np_record_meta[:, 1] == true_id)).any()
    ]
    


In [232]:
%%time
list_record_meta = []

for preds, labels in zip(list_word_label_ids, list_word_true_label_ids):
    list_record_meta.append(build_record_meta(preds, labels))


CPU times: total: 5.81 s
Wall time: 6.01 s


In [None]:
for i, (rec_len, word_label_ids, word_true_label_ids) in enumerate(zip(record_lens,list_word_label_ids,list_word_true_label_ids)):
    if rec_len != len(word_label_ids) or rec_len != len(word_true_label_ids):
        print(i)
        break

In [122]:
record_lens[1]

563

In [124]:
len(list_word_label_ids[1])

0

### Test get_undetected_records

In [150]:
np_record_meta = list_record_meta[0]

(np_record_meta[:,0] == 3).any()

True

In [151]:
classes2id

{'O': 0,
 'B-EMAIL': 1,
 'B-ID_NUM': 2,
 'B-NAME_STUDENT': 3,
 'B-PHONE_NUM': 4,
 'B-STREET_ADDRESS': 5,
 'B-URL_PERSONAL': 6,
 'B-USERNAME': 7,
 'I-ID_NUM': 8,
 'I-NAME_STUDENT': 9,
 'I-PHONE_NUM': 10,
 'I-STREET_ADDRESS': 11,
 'I-URL_PERSONAL': 12}

In [233]:
%%time
list_und_name = get_undetected_records(list_record_meta, 3)

CPU times: total: 46.9 ms
Wall time: 111 ms


In [172]:
len(list_und_name)

3406

### Test get_mismatched_records

In [None]:
df_preprocessed = preprocessed_dataset.to_pandas()
df_preprocessed.head()

In [152]:
target_doc_id = 7308
df_query = df_preprocessed[df_preprocessed.document == target_doc_id]
df_query

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,input_ids,attention_mask,words,word_labels,word_ids
440,7308,Md Carrillo\n\nlisarose@gmail.com\n\ndiazkrist...,"[[CLS], ▁Md, ▁Carrillo, ▁lisa, rose, @, gmail,...","[True, False, False, False, False, False, Fals...","[0, 2, 2, 14, 14, 14, 14, 14, 14, 14, 14, 14, ...","[1, 25884, 87905, 97971, 21220, 1683, 13007, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[Md, Carrillo, \n\n, lisarose@gmail.com, \n\n,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, B-EMAIL, O...","[nan, 0.0, 1.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, ..."


In [154]:
np.unique(df_query.word_labels.values[0], return_index=True)

(array(['B-EMAIL', 'B-NAME_STUDENT', 'B-URL_PERSONAL', 'I-NAME_STUDENT',
        'O'], dtype=object),
 array([3, 0, 7, 1, 2], dtype=int64))

In [161]:
np_record_meta = list_record_meta[440]

In [163]:
np_record_meta[7]

array([1, 6], dtype=int8)

In [142]:
np.unique(np_record_meta)

array([-1,  0,  1,  3,  6], dtype=int8)

In [165]:
np.where(np_record_meta == 6)

(array([  7, 384], dtype=int64), array([1, 1], dtype=int64))

In [164]:
np.where(np_record_meta == 1)

(array([  7, 384], dtype=int64), array([0, 0], dtype=int64))

In [167]:
((np_record_meta[:,0] == 1) & (np_record_meta[:,1] == 6)).any()

True

In [234]:
%%time
list_mis_1_6 = get_mismatched_records(list_record_meta, 1,6)

CPU times: total: 125 ms
Wall time: 141 ms


In [178]:
list_mis_1_6

[17,
 24,
 49,
 67,
 79,
 81,
 103,
 117,
 127,
 134,
 146,
 186,
 201,
 247,
 296,
 320,
 326,
 348,
 351,
 363,
 367,
 380,
 392,
 395,
 400,
 401,
 414,
 440,
 447,
 466,
 471,
 498,
 503,
 506,
 528,
 535,
 555,
 641,
 754,
 831,
 861,
 890,
 908,
 977,
 1010,
 1077,
 1103,
 1123,
 1179,
 1194,
 1385,
 1489,
 1494,
 1525,
 1642,
 1693,
 1743,
 1828,
 1879,
 2083,
 2100,
 2183,
 2188,
 2238,
 2287,
 2378,
 2383,
 2428,
 2531,
 2691,
 2790]

## Vectorized

### Preprocessing

In [226]:
%%time
# Number of words in each record; the running total marks where each record
# ends inside the flat word-level arrays.
record_lens = [len(record) for record in preprocessed_dataset['word_labels']]
split_indices = np.cumsum(record_lens)

# Longest record. This must be computed before the pad_lists calls below use it;
# previously it was assigned after them, so the cell only ran when max_len was
# left over from an earlier execution.
max_len = max(record_lens)

# NOTE(review): if the flat arrays hold exactly sum(record_lens) items, the final
# boundary equals the array length and np.split appends one empty trailing piece,
# which then becomes an all-padding row in the padded arrays.
list_word_label_ids = np.split(flat_word_label_ids, split_indices)
list_word_true_label_ids = np.split(flat_word_true_labels_id, split_indices)

# Predictions and true labels are both padded with 0 out to max_len.
np_word_label_ids = pad_lists(list_word_label_ids, 0, max_len)
np_word_true_label_ids = pad_lists(list_word_true_label_ids, 0, max_len)

CPU times: total: 9.69 s
Wall time: 10.2 s


In [263]:
def build_records_meta_vect(np_preds, np_labels):
    """Build the mismatch table for all records at once.

    For every position ``(record, word)`` the result holds ``[-1, -1]`` when the
    prediction equals the label and ``[prediction, label]`` otherwise.

    Args:
        np_preds: 2-D array of predicted class ids, one row per record.
        np_labels: 2-D array of true class ids with the same shape.

    Returns:
        ``int8`` array of shape ``(n_records, n_words, 2)``; index 0 of the last
        axis is the predicted id and index 1 the true id of mismatched words.
    """
    n_records, n_words = np_preds.shape[0], np_preds.shape[1]

    # True wherever the prediction disagrees with the label.
    differs = np.logical_not(np_preds == np_labels)

    # Every position starts as "match" (-1, -1); only disagreements are overwritten.
    # A 2-D boolean mask followed by an index on the last axis is valid NumPy indexing.
    records_meta = np.full((n_records, n_words, 2), -1, dtype='int8')
    records_meta[differs, 0] = np_preds[differs]
    records_meta[differs, 1] = np_labels[differs]

    return records_meta


def get_undetected_records(list_record_meta, true_id):
    """Return the indices of records with a mismatched word whose index-0 id is ``true_id``.

    The mismatch table is taken from the argument; the earlier version ignored
    it and always read the notebook global ``np_records_meta``.

    Args:
        list_record_meta: Either the 3-D ``(n_records, n_words, 2)`` array built
            by ``build_records_meta_vect`` (vectorized path), or a sequence of
            per-record ``(n_words, 2)`` arrays (handled record by record so that
            existing calls passing the list form keep working).
        true_id: Class id compared against index 0 of the last axis.

    Returns:
        1-D integer array of record indices, in ascending order.

    NOTE(review): ``build_records_meta_vect`` stores the *predicted* id at index
    0 and the true id at index 1, so this selects records where ``true_id`` was
    predicted for a word whose true label differs. Confirm that this is the
    intended meaning of "undetected".
    """
    if isinstance(list_record_meta, np.ndarray) and list_record_meta.ndim == 3:
        return np.where((list_record_meta[:, :, 0] == true_id).any(axis=-1))[0]

    # Ragged input: one (n_words, 2) table per record.
    hits = [
        i
        for i, record_meta in enumerate(list_record_meta)
        if (record_meta[:, 0] == true_id).any()
    ]
    return np.array(hits, dtype=np.intp)


def get_mismatched_records(list_records, pred_id, true_id):
    """Return the indices of records containing a word predicted as ``pred_id`` whose true id is ``true_id``.

    The mismatch table is taken from the argument; the earlier version ignored
    it and always read the notebook global ``np_records_meta``.

    Args:
        list_records: Either the 3-D ``(n_records, n_words, 2)`` array built by
            ``build_records_meta_vect`` (vectorized path), or a sequence of
            per-record ``(n_words, 2)`` arrays (handled record by record).
        pred_id: Predicted class id to look for at index 0 of the last axis.
        true_id: True class id to look for at index 1 of the same position.

    Returns:
        1-D integer array of record indices, in ascending order.
    """
    if isinstance(list_records, np.ndarray) and list_records.ndim == 3:
        hit = (list_records[:, :, 0] == pred_id) & (list_records[:, :, 1] == true_id)
        return np.where(hit.any(axis=1))[0]

    # Ragged input: one (n_words, 2) table per record.
    matches = [
        i
        for i, record_meta in enumerate(list_records)
        if ((record_meta[:, 0] == pred_id) & (record_meta[:, 1] == true_id)).any()
    ]
    return np.array(matches, dtype=np.intp)



In [220]:
%%time
np_records_meta = build_records_meta_vect(np_word_label_ids,np_word_true_label_ids)

# for preds, labels in zip(list_word_label_ids, list_word_true_label_ids):
#     list_record_meta.append(build_record_meta(preds, labels))

CPU times: total: 969 ms
Wall time: 961 ms


In [221]:
np_records_meta.shape

(6808, 3298, 2)

### Test get_undetected_records

In [205]:
len((np_records_meta[:,:,0] == 3).any(axis=(-1)))

6808

In [256]:
%%time

# Pass the 3-D mismatch array built by build_records_meta_vect, not the
# per-record list from the list approach, so the vectorized data is what gets timed.
np_und_name = get_undetected_records(np_records_meta, 3)

CPU times: total: 46.9 ms
Wall time: 31 ms


### Test get_mismatched_records

In [218]:
np.unique(np_records_meta[440])

array([-1,  0,  6], dtype=int8)

In [222]:
temp_results = ((np_records_meta[:,:,0] == 1) & (np_records_meta[:,:,1] == 6)).any(axis=(1))

In [223]:
len(temp_results)

6808

In [225]:
np.where(temp_results == True)

(array([  17,   24,   49,   67,   79,   81,  103,  117,  127,  134,  146,
         186,  201,  247,  296,  320,  326,  348,  351,  363,  367,  380,
         392,  395,  400,  401,  414,  440,  447,  466,  471,  498,  503,
         506,  528,  535,  555,  641,  754,  831,  861,  890,  908,  977,
        1010, 1077, 1103, 1123, 1179, 1194, 1385, 1489, 1494, 1525, 1642,
        1693, 1743, 1828, 1879, 2083, 2100, 2183, 2188, 2238, 2287, 2378,
        2383, 2428, 2531, 2691, 2790], dtype=int64),)

In [224]:
temp_results[440]

True

In [257]:
%%time

temp_results = ((np_records_meta[:,:,0] == 1) & (np_records_meta[:,:,1] == 6)).any(axis=(1))

CPU times: total: 46.9 ms
Wall time: 58 ms


In [264]:
np_mis_1_6 = get_mismatched_records(np_records_meta,1,6)

In [265]:
temp_results

array([False, False, False, ..., False, False, False])

### Compare results

In [269]:
list_und_name == np_und_name.tolist()

True

In [270]:
list_mis_1_6 == np_mis_1_6.tolist()

True

In [240]:
for i, (l,r) in enumerate(zip(list_und_name, temp_results.tolist())):
    if l != r:
        print(i)
        break

1


In [242]:
list_und_name[:10]

[0, 3, 5, 9, 10, 11, 12, 13, 15, 16]

In [243]:
temp_results[:10]

array([False, False, False, False, False, False, False, False, False,
       False])

In [241]:
list_und_name[1]

3

# Postprocessing