In [2]:
import json
import os
import pandas as pd
import re

In [3]:
import spacy
from spacy import displacy
from spacy.tokens import Span, Doc

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
from scipy import stats
import numpy as np
from sklearn.metrics import fbeta_score

In [6]:
# import tensorflow as tf
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import Trainer, TrainingArguments, DataCollatorForTokenClassification
from transformers import pipeline
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from transformers import Trainer, TrainingArguments, DataCollatorForTokenClassification

In [85]:
%run PII_Util.py

# Load Data

In [28]:
path_train = './in/train.json'
path_test = './in/test.json'

In [29]:
train_json = json.load(open(path_train))
df_train = pd.json_normalize(train_json)

test_json = json.load(open(path_test))
df_test = pd.json_normalize(test_json)

In [30]:
train_json[0].keys()

dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])

In [31]:
df_train.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment:¬† Visualization¬†Reflection¬† Submitt...,"[Assignment, :, ¬† , Visualization, ¬†, Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [32]:
text_test = """Meet Jane Doe, a brilliant student at XYZ University. She can be reached at jane.doe@email.com or through her phone number +1234567890. Jane resides at 123 Main Street, Cityville. Her student ID is 987654 and her personal website is www.janedoe.com. Connect with her on social media using the username @janedoe.

Meanwhile, John Smith, another outstanding student, can be contacted at john.smith@email.com or at +9876543210. John lives at 456 Oak Avenue, Townsville. His student ID is 123456, and you can visit his personal blog at www.johnsmithblog.com. Follow him on Twitter with the handle @johnsmith123.

For any inquiries about the university's programs, you can contact the administration office at admin@xyzuniversity.edu or call +5551234567. The office is located at 789 University Boulevard.

Visit our official website at www.xyzuniversity.edu for more information on courses and admission procedures.

"""


In [33]:
text_test

"Meet Jane Doe, a brilliant student at XYZ University. She can be reached at jane.doe@email.com or through her phone number +1234567890. Jane resides at 123 Main Street, Cityville. Her student ID is 987654 and her personal website is www.janedoe.com. Connect with her on social media using the username @janedoe.\n\nMeanwhile, John Smith, another outstanding student, can be contacted at john.smith@email.com or at +9876543210. John lives at 456 Oak Avenue, Townsville. His student ID is 123456, and you can visit his personal blog at www.johnsmithblog.com. Follow him on Twitter with the handle @johnsmith123.\n\nFor any inquiries about the university's programs, you can contact the administration office at admin@xyzuniversity.edu or call +5551234567. The office is located at 789 University Boulevard.\n\nVisit our official website at www.xyzuniversity.edu for more information on courses and admission procedures.\n\n"

# Dataset Specs

In [34]:
classes = ['O',
'B-EMAIL',
'B-ID_NUM',
'B-NAME_STUDENT',
'B-PHONE_NUM',
'B-STREET_ADDRESS',
'B-URL_PERSONAL',
'B-USERNAME',
'I-ID_NUM',
'I-NAME_STUDENT',
'I-PHONE_NUM',
'I-STREET_ADDRESS',
'I-URL_PERSONAL',
]

classes2id = {clas:i for i, clas in enumerate(classes)}
id2classes = {i:clas for i, clas in enumerate(classes)}
classes_pos = classes[1:]
classes_pos_id = [classes2id[label] for label in classes_pos]

# Model

## Yanis

In [35]:
class PII_Adapter():
    def __init__(self, model_name):
        self.model_name = model_name
        
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)


#Model not in BIO Format
class Yanis_Adapter(PII_Adapter):
    def __init__(self, threshold = 0.1):
        super().__init__("Yanis/microsoft-deberta-v3-large_ner_conll2003-anonimization_TRY_1")
        
        self.O_id = self.model.config.label2id['O']
            
        self.threshold = threshold
        self.model_id2cur_label = yanis_to_cur = {
            0:'O', 
            1:'O',
            2:'NAME_STUDENT',
            3:'O',
            4:'PHONE_NUM',
            5:'O',
            6:'O',
            7:'O',
            8:'ID_NUM',
            9:'O',
            10:'ID_NUM',
            11:'O',
            12:'STREET_ADDRESS',
            13:'O',
            14:'EMAIL',
            15:'O',
            16:'O',
            17:'O'}
        
        self.labels_irrelevant = [key for key,value in self.model_id2cur_label.items() if value == 'O' and key != self.O_id]
        
        rev_model_id2cur_label = {value:key for key,value in  self.model_id2cur_label.items()}
        rev_model_id2cur_label['O'] = self.O_id
        
        cur_label2model_id = {'O': self.O_id}
        for label in classes:
            if label == 'O':  
                continue
            else:
                entity = label.split('-')[1]

                if entity in rev_model_id2cur_label:
                    cur_label2model_id[label] = rev_model_id2cur_label[entity]        
                else:
                    cur_label2model_id[label] = rev_model_id2cur_label['O']
                    
        self.cur_label2model_id = cur_label2model_id
                    
                    
#         self.cur_label2model_id = {label: rev_model_id2cur_label(label) if in self.model.config.labels2id.keys() else  'O' for label in self.model.config.labels2id.keys()}       
#         self.label2id = { id2classes: for label in self.model.labels}

    

# Utils

## Align token to words

In [36]:
def align_tokens(tokens, word_ids, labels):
    
    #Tokens to word
    
    word_subword_mapping = {}
    for i, word_id in enumerate(word_ids):
        if word_id is not None:
            if word_id not in word_subword_mapping:
                word_subword_mapping[word_id] = []
            word_subword_mapping[word_id].append(i)

    # Step 5: Iterate through pairs of words and subwords to count the majority label
    word_labels = []
    for i, word in enumerate(words):
        if i in word_subword_mapping:
            subword_labels = pred_labels[word_subword_mapping[i]]
            majority_label = get_majority(subword_labels)
            word_labels.append((word, majority_label))
        else:
            word_labels.append((word,'O'))
            
            
    return word_labels

In [37]:
def align_words(tokens, words, word_ids, labels):
    
#     global word_ids_cur
#     global words_cur
#     global labels_cur
    
#     words_cur = words
#     word_ids_cur = word_ids
#     labels_cur = labels
    
    #word to tokens
    token_labels = []
    prev_entity = None
    prev_ent_type = None
    
    # Step 5: Iterate through pairs of words and subwords to count the majority label
    for i, (word_id, token) in enumerate(zip(word_ids, tokens)):
        if word_id is None:
            token_labels.append('O')
#             continue
        else:
            
            try:
                token_label = labels[word_id]
            except Exception as e:
                print(word_id)
                print(len(labels))
                
                raise(e)
            #Outside
            if token_label == 'O':
                token_labels.append('O')
                ent_type = 'O'
                
            else:
                prefix, ent_type = token_label.split('-')
            
                #Same entity: B-ent:B-ent, I-ent,I-ent, B-ent,I-ent
                if prev_entity == token_label or f'I-{prev_ent_type}' == token_label:
                    token_labels.append(f'I-{prev_ent_type}')

                #New Entity: I-ent: B-ent, x-ent1: x-ent2\
                else:
                    token_labels.append(f'B-{ent_type}')

            prev_entity = token_labels
            prev_ent_type = ent_type
                
            
    return token_labels

# tokens = inputs.tokens()
# token_labels = align_words(tokens, df_train.loc[0].tokens,word_ids, labels_true)

In [38]:
# def pos_threshold(tensor_probs, neg_index,  threshold=0.5):
    
#     indices_max = np.argmax(tensor_probs, axis = 1,2)
    
    
    
# #     highest_non_O_labels = []

# #     for token, probabilities in probabilities:
# #         non_O_labels = []
# #         non_O_probabilities = []

# #         # Iterate through predicted label probabilities
# #         for label_idx, probability in enumerate(probabilities):
# #             label_name = id2label[label_idx]
# #             # Exclude 'O' labels and labels with probabilities below the threshold
# #             if label_name != 'O' and probability >= threshold:
# #                 non_O_labels.append(label_name)
# #                 non_O_probabilities.append(probability)

# #         # If there are non-'O' labels above the threshold, select the one with the highest probability
# #         if non_O_labels:
# #             highest_non_O_label = non_O_labels[non_O_probabilities.index(max(non_O_probabilities))]
# #             highest_non_O_labels.append((token, highest_non_O_label))
# #         else:
# #             # If all labels are 'O' or below the threshold, consider it as non-entity
# #             highest_non_O_labels.append((token, 'O'))

#     return highest_non_O_labels

In [39]:
def pos_threshold_label(tokens, label_probabilities, id2label, threshold=0.5):
    highest_non_O_labels = []

    for token, probabilities in zip(tokens, label_probabilities):
        non_O_labels = []
        non_O_probabilities = []

        # Iterate through predicted label probabilities
        for label_idx, probability in enumerate(probabilities):
            label_name = id2label[label_idx]
            # Exclude 'O' labels and labels with probabilities below the threshold
            if label_name != 'O' and probability >= threshold:
                non_O_labels.append(label_name)
                non_O_probabilities.append(probability)

        # If there are non-'O' labels above the threshold, select the one with the highest probability
        if non_O_labels:
            highest_non_O_label = non_O_labels[non_O_probabilities.index(max(non_O_probabilities))]
            highest_non_O_labels.append((token, highest_non_O_label))
        else:
            # If all labels are 'O' or below the threshold, consider it as non-entity
            highest_non_O_labels.append((token, 'O'))

    return highest_non_O_labels

In [40]:
def get_majority(arr):
    unique_elements, counts = np.unique(arr, return_counts=True)
    max_count_index = np.argmax(counts)
    return unique_elements[max_count_index]

In [41]:
color_map_pii = {'B-EMAIL': '#2fc3da',
             'B-ID_NUM': '#1c0cfa',
             'B-NAME_STUDENT':'#e01d82',
             'B-PHONE_NUM': '#ebe70e',
             'B-STREET_ADDRESS':'#f2860a',
             'B-URL_PERSONAL': '#c9f211',
             'B-USERNAME': '#0eebb7',
             'I-ID_NUM': '#8e87ed',
             'I-NAME_STUDENT':'#eb83b9',
             'I-PHONE_NUM': '#e6e4a1',
             'I-STREET_ADDRESS': '#f2c694',
             'I-URL_PERSONAL':'#e5f2ac'}
options_pii = {'colors': color_map_pii}

def visualize_label(nlp, doc, tokens, labels, options = None):
    global span_infos, doc_spans
    
    start_pos = -1
    span_infos = []
    for label_index, label in enumerate(labels):
        if label!= 'O':
            start_pos = label_index
            end_pos = start_pos + 1
            span_dict = {'start_pos':start_pos, 'end_pos':end_pos, 'label':label}
            span_infos.append(span_dict)

    doc_spans = []
    doc = Doc(nlp.vocab, words=tokens)
    
    for span_info in span_infos:
        _span = Span(doc, span_info['start_pos'], span_info['end_pos'], span_info['label'])
        doc_spans.append(_span)

    doc.spans['sc'] = doc_spans
    displacy.render(doc, style = 'span', options = options)
#     displacy.render(doc, style = 'span')

In [42]:
def convert_to_bio(tokens, labels):
    bio_labels = []
    current_entity = None
    
    for token, label in zip(tokens, labels):
        if label == 'O':
            bio_labels.append(label)
            current_entity = None
        else:
            if current_entity == label:
                bio_labels.append('I-' + label)
            else:
                bio_labels.append('B-' + label)
            current_entity = label
    
    return bio_labels

# Pipeline

In [24]:
%%time
model_adapter = Yanis_Adapter()
# text = text_test

CPU times: total: 8.48 s
Wall time: 15.5 s


In [25]:
text = df_train.loc[0].full_text
labels_true = df_train.loc[0].labels
threshold = 0.1

## Inference (single)

In [None]:
def run_inference(text, labels, model_adapter, threshold=0.1):
    
    global pred_bio, word_ids, word_labels, token_v, pred_v, token_labels
    
    model_name = model_adapter.model_name
    model_id2cur_label = model_adapter.model_id2cur_label

    tokenizer = model_adapter.tokenizer
    model = model_adapter.model

    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    # Step 1: Tokenize the text using spaCy for words
    words = [token.text for token in doc]

    # Step 2: Tokenize the text using the model's tokenizer and get word to subword mapping
    inputs = tokenizer(words, return_tensors='pt', return_offsets_mapping=False, is_split_into_words=True)
    word_ids = inputs.word_ids()

    # Step 3: Run inference
    with torch.no_grad():
        outputs = model(**inputs).logits

    label_probabilities = torch.softmax(outputs, dim=2)[0].cpu().numpy()
    tokens = inputs.tokens()

    # Get highest non-'O' labels for each token with thresholding
    token_labels = pos_threshold_label(tokens, label_probabilities, model.config.id2label, threshold)

    pred_labels = np.array([label for token,label in token_labels])

    # Step 4: Create a list of which tokens or subwords correspond to a word using the word_ids variable
    word_subword_mapping = {}
    for i, word_id in enumerate(word_ids):
        if word_id is not None:
            if word_id not in word_subword_mapping:
                word_subword_mapping[word_id] = []
            word_subword_mapping[word_id].append(i)

    # Step 5: Iterate through pairs of words and subwords to count the majority label
    word_labels = []
    for i, word in enumerate(words):
        if i in word_subword_mapping:
            subword_labels = pred_labels[word_subword_mapping[i]]
            majority_label = get_majority(subword_labels)
            word_labels.append((word, majority_label))
        else:
            word_labels.append((word,'O'))


    token_v, pred_v = zip(*word_labels)

    pred_conv = [model_id2cur_label[model.config.label2id[pred]] for pred in pred_v]
    pred_bio = convert_to_bio(token_v, pred_conv)

    f_beta = fbeta_score(labels, pred_bio, labels = classes_pos,beta=5, average='micro')
    print("F-beta score:", f_beta)
    
    return f_beta

In [111]:
token_labels

[('[CLS]', 'O'),
 ('‚ñÅDesign', 'O'),
 ('‚ñÅThinking', 'O'),
 ('‚ñÅfor', 'O'),
 ('‚ñÅinnovation', 'O'),
 ('‚ñÅreflex', 'O'),
 ('ion', 'O'),
 ('‚ñÅ-', 'O'),
 ('‚ñÅAvril', 'O'),
 ('‚ñÅ2021', 'O'),
 ('‚ñÅ-', 'O'),
 ('‚ñÅNathalie', 'Name'),
 ('‚ñÅS', 'Name'),
 ('ylla', 'Name'),
 ('‚ñÅChallenge', 'O'),
 ('‚ñÅ&', 'O'),
 ('‚ñÅselection', 'O'),
 ('‚ñÅThe', 'O'),
 ('‚ñÅtool', 'O'),
 ('‚ñÅI', 'O'),
 ('‚ñÅuse', 'O'),
 ('‚ñÅto', 'O'),
 ('‚ñÅhelp', 'O'),
 ('‚ñÅall', 'O'),
 ('‚ñÅstakeholders', 'O'),
 ('‚ñÅfinding', 'O'),
 ('‚ñÅtheir', 'O'),
 ('‚ñÅway', 'O'),
 ('‚ñÅthrough', 'O'),
 ('‚ñÅthe', 'O'),
 ('‚ñÅcomplexity', 'O'),
 ('‚ñÅof', 'O'),
 ('‚ñÅa', 'O'),
 ('‚ñÅproject', 'O'),
 ('‚ñÅis', 'O'),
 ('‚ñÅthe', 'O'),
 ('‚ñÅmind', 'O'),
 ('‚ñÅmap', 'O'),
 ('‚ñÅ.', 'O'),
 ('‚ñÅWhat', 'O'),
 ('‚ñÅexactly', 'O'),
 ('‚ñÅis', 'O'),
 ('‚ñÅa', 'O'),
 ('‚ñÅmind', 'O'),
 ('‚ñÅmap', 'O'),
 ('‚ñÅ?', 'O'),
 ('‚ñÅAccording', 'O'),
 ('‚ñÅto', 'O'),
 ('‚ñÅthe', 'O'),
 ('‚ñÅdefinition', 'O'),
 ('‚ñÅof', 'O'),
 ('‚ñÅBuz', '

In [25]:
%%time

run_inference(text, labels_true, model_adapter, threshold=0.1)

F-beta score: 0.968944099378882
CPU times: total: 50.3 s
Wall time: 9.68 s


0.968944099378882

In [30]:
%%time
run_inference(text, labels_true, model_adapter, threshold=0.95)

NameError: name 'get_labels' is not defined

In [25]:
%%time

run_inference(text, labels_true, model_adapter)

F-beta score: 0.968944099378882
CPU times: total: 6min 49s
Wall time: 4min


0.968944099378882

### dEBUG

In [28]:
import en_core_web_sm

nlp = en_core_web_sm.load()

In [29]:
%%time
model_name = model_adapter.model_name
to_cur = model_adapter.to_cur

tokenizer = model_adapter.tokenizer
model = model_adapter.model

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

CPU times: total: 1.09 s
Wall time: 1.31 s


In [30]:
%%time


# Step 1: Tokenize the text using spaCy for words
words = [token.text for token in doc]

# Step 2: Tokenize the text using the model's tokenizer and get word to subword mapping
inputs = tokenizer(words, return_tensors='pt', return_offsets_mapping=False, is_split_into_words=True)
word_ids = inputs.word_ids()


CPU times: total: 62.5 ms
Wall time: 79.1 ms


In [31]:
%%time

# Step 3: Run inference
with torch.no_grad():
    outputs = model(**inputs).logits

CPU times: total: 44.4 s
Wall time: 8.5 s


### Test scoring

In [25]:
pred_bio[52]

'B-NAME_STUDENT'

In [26]:
labels_true[52]

'O'

In [27]:
for i, (pred,label) in enumerate(zip(pred_bio,labels_true)):
    if pred != label:
        print(i, pred,label)

52 B-NAME_STUDENT O
53 I-NAME_STUDENT O
54 I-NAME_STUDENT O
55 I-NAME_STUDENT O
56 I-NAME_STUDENT O


In [28]:
pred_comp, labels_true_comp = zip(*[(pred,label) for pred,label in zip(pred_bio,labels_true) if (label != 'O' or pred != 'O' or label != pred)])

In [29]:
pred_comp[:]

('B-NAME_STUDENT',
 'I-NAME_STUDENT',
 'B-NAME_STUDENT',
 'I-NAME_STUDENT',
 'I-NAME_STUDENT',
 'I-NAME_STUDENT',
 'I-NAME_STUDENT',
 'B-NAME_STUDENT',
 'I-NAME_STUDENT',
 'B-NAME_STUDENT',
 'I-NAME_STUDENT')

In [30]:
labels_true_comp[:]

('B-NAME_STUDENT',
 'I-NAME_STUDENT',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-NAME_STUDENT',
 'I-NAME_STUDENT',
 'B-NAME_STUDENT',
 'I-NAME_STUDENT')

In [33]:
classes_pos

['B-EMAIL',
 'B-ID_NUM',
 'B-NAME_STUDENT',
 'B-PHONE_NUM',
 'B-STREET_ADDRESS',
 'B-URL_PERSONAL',
 'B-USERNAME',
 'I-ID_NUM',
 'I-NAME_STUDENT',
 'I-PHONE_NUM',
 'I-STREET_ADDRESS',
 'I-URL_PERSONAL']

In [35]:
fbeta_score(labels_true_comp, pred_comp , labels = classes_pos,beta=5, average='micro')

0.968944099378882

In [36]:
fbeta_score(labels_true, pred_bio, labels = classes_pos,beta=5, average='micro')

0.968944099378882

In [38]:
def get_fbeta_score_multiclass(y_true, y_pred, beta=5):
    '''
    Parameters:
    - y_true: ground truth labels (shape: (batch_size))
    - y_pred: predicted labels (shape: (batch_size))
    - beta: beta value for F-beta score

    Returns:
    - float: micro F-beta multiclass score disregarding the background class (class 0)
    '''
    # Calculate FNFP both
    FNFP = torch.sum((y_pred != 0) & (y_true != 0) & (y_pred != y_true))
    # Calculate FP
    FP = torch.sum((y_pred != 0) & (y_true == 0)) + FNFP
    # Calculate FN
    FN = torch.sum((y_pred == 0) & (y_true != 0)) + FNFP
    # Calculate TP
    TP = torch.sum((y_pred != 0) & (y_true != 0) & (y_pred == y_true))
    
    # Calculate F-beta score
    fbeta = (1 + beta**2) * TP / ((1 + beta**2) * TP + beta**2 * FN + FP)
    return fbeta

In [42]:
labels_torch = torch.Tensor([classes2id[label] for label in labels_true])
pred_torch = torch.Tensor([classes2id[pred] for pred in pred_bio])

labels_comp_t = torch.Tensor([classes2id[label] for label in labels_true_comp])
pred_comp_t = torch.Tensor([classes2id[pred] for pred in pred_comp])

In [41]:
get_fbeta_score_multiclass(labels_torch, pred_torch)

tensor(0.9689)

In [43]:
get_fbeta_score_multiclass(labels_comp_t, pred_comp_t)

tensor(0.9689)

## Inference (Batch)

In [1]:
#Postponed

# Trainer API

## Instantiate

In [43]:
%%time
model_adapter = Yanis_Adapter(threshold = 0.1)
# text = text_test
# text = df_train.loc[0].full_text
# labels_true = df_train.loc[0].labels
# threshold = 0.1


CPU times: total: 5.33 s
Wall time: 2.37 s


In [44]:
model = model_adapter.model

## Dataset

In [45]:
import pandas as pd
from datasets import Dataset

# Convert the Pandas DataFrame to a Hugging Face dataset
# hf_dataset = Dataset.from_pandas(df_train.iloc[:100])
hf_dataset = Dataset.from_pandas(df_train)


In [46]:
hf_dataset

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'],
    num_rows: 6807
})

## Preprocessing

In [47]:
# def convert_to_features(sample):
#     return {
#         "input_ids": tokenizer(sample["text"], return_tensors="pt")["input_ids"].squeeze(),
#         "attention_mask": tokenizer(sample["text"], return_tensors="pt")["attention_mask"].squeeze(),
#         "labels": torch.tensor(sample["labels"])
#     }

# # Convert dataset to features
# features = [convert_to_features(sample) for sample in hf_dataset]

### Tokenize

In [48]:
%%time
def tokenize_function(example, model_adapter):
    global labels_aligned
    
    tokenizer = model_adapter.tokenizer
    cur_label2model_id = model_adapter.cur_label2model_id
    
    inputs = tokenizer(example["tokens"], return_tensors='pt', return_offsets_mapping=False, is_split_into_words=True)
    tokens = inputs.tokens()
    word_ids = inputs.word_ids()
    
    
    
#     return {
#         "input_ids": inputs["input_ids"].squeeze(),
#         "attention_mask": inputs["attention_mask"].squeeze(),
#         "tokens": tokens,
#         "words": example["tokens"],
#         "word_labels": example["labels"],
#         "word_ids": inputs.word_ids()
#     }

    labels = align_words(tokens, example["tokens"], word_ids, example["labels"])
    label_ids = [cur_label2model_id[label] for label in labels]
    labels_aligned = labels
    
    return {
        "input_ids": inputs["input_ids"].squeeze(),
        "attention_mask": inputs["attention_mask"].squeeze(),
        "tokens": tokens,
        "words": example["tokens"],
        "word_labels": example["labels"],
        "word_ids": inputs.word_ids(),
        "labels": torch.tensor(label_ids),
    }

tokenized_datasets = hf_dataset.map(tokenize_function, fn_kwargs={"model_adapter": model_adapter}, batched=False)


# def tokenize_function(examples):
#     return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

# tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6807/6807 [01:34<00:00, 71.65 examples/s]

CPU times: total: 1min 36s
Wall time: 1min 48s





### Split

In [49]:
train_size = int(0.8 * len(hf_dataset))
eval_size = len(hf_dataset) - train_size
# train_dataset, eval_dataset = hf_dataset.train_test_split(test_size=eval_size)
# list_remove = ['labels']
# dataset_filtered = tokenized_datasets.remove_columns(list_remove)
dataset_filtered = tokenized_datasets

split_datasets = dataset_filtered.train_test_split(test_size=eval_size, seed = 42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

In [50]:
train_dataset

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'input_ids', 'attention_mask', 'words', 'word_labels', 'word_ids'],
    num_rows: 5445
})

In [51]:
eval_dataset

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'input_ids', 'attention_mask', 'words', 'word_labels', 'word_ids'],
    num_rows: 1362
})

## Data Collator

In [52]:
data_collator = DataCollatorForTokenClassification(model_adapter.tokenizer)

## Metric and Postprocessing

In [53]:
def preprocess_logits(logits, label):
    np_probs = torch.softmax(logits, axis=-1) 
    
    return np_probs

In [54]:
import functools
from datasets import load_metric

In [55]:
METRIC_F1 = load_metric("f1",  beta=5)
METRIC_PRECISION = load_metric("precision")
METRIC_RECALL = load_metric("recall")

  METRIC_F1 = load_metric("f1",  beta=5)
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [56]:
    ## (Vectorized) Get highest non-'O' labels for each token with thresholding
    # Get 'O' index of model
    # Get argsort
    # Get First max
    # Get == 'O' Mask
    # Get threshold mask
    # Get final indices

In [57]:
def compute_metrics_base(eval_preds, model_adapter, threshold = 0.1):
    
    global eval_preds_copy
    
    
    eval_preds_copy = eval_preds
    
    np_probs, true_labels_id, inputs = eval_preds
#     logits, true_labels_id = eval_preds
#     np_probs = np.argmax(logits, axis=-1)
#     tokens = inputs.tokens()


    label2id =  model_adapter.model.config.label2id
    o_index = label2id['O']

    np_sorted_indices = np.argsort(np_probs)
    np_max_indices = np_sorted_indices[:,:, -1]
    # np_max = np_sorted_indices[np.arange(np_max_indices.shape[0]), np_max_indices]
    np_max_prob = np.take_along_axis(np_probs, np_max_indices[:, :, np.newaxis], axis=2).squeeze()

    np_2nd_max_indices = np_sorted_indices[:, :, -2]
    # np_2nd_max = np_sorted_indices[np.arange(np_sorted_indices.shape[0]), np_max_indices]
    np_2nd_max_prob = np.take_along_axis(np_probs, np_2nd_max_indices[:, :, np.newaxis], axis=2).squeeze()

    np_O_mask = np_max_indices == o_index
    np_threshold_mask = np_2nd_max_prob > threshold

    np_replace_mask = np_threshold_mask & np_O_mask

    np_label_ids = np.where(np_replace_mask, np_2nd_max_indices, np_max_indices)

    flat_label_ids = np_label_ids.flatten()
    flat_true_labels_id = true_labels_id.flatten()

    # Postprocess labels, convert irrelevant labels to 'O'
    np_labels_irrelevant = np.array(model_adapter.labels_irrelevant)
    flat_label_ids_mask = np.isin(flat_label_ids, np_labels_irrelevant)
    flat_label_ids[flat_label_ids_mask] = o_index

    # Also remove paddings
    mask_padding_inv = flat_true_labels_id != -100
    flat_true_labels_id = flat_true_labels_id[mask_padding_inv]
    flat_label_ids = flat_label_ids[mask_padding_inv]

    dict_scores = {}

    f1_score = METRIC_F1.compute(predictions=flat_label_ids, references=flat_true_labels_id, labels=classes_pos_id, average ='micro')
    f_beta_score = fbeta_score(y_true = flat_true_labels_id, y_pred = flat_label_ids , labels = classes_pos_id,beta=5, average='micro')
    precision = METRIC_PRECISION.compute(predictions=flat_label_ids, references=flat_true_labels_id,labels=classes_pos_id, average ='micro')
    recall = METRIC_RECALL.compute(predictions=flat_label_ids, references=flat_true_labels_id,labels=classes_pos_id, average ='micro')

    for score in [f1_score, precision, recall]:
        dict_scores.update(score) 

    dict_scores['f_beta'] = f_beta_score

    return dict_scores


#     #Test above first
    
    
#     ##Compute competition metrics
    
#     # Step 4: Create a list of which tokens or subwords correspond to a word using the word_ids variable
#     word_subword_mapping = {}
#     for i, word_id in enumerate(word_ids):
#         if word_id is not None:
#             if word_id not in word_subword_mapping:
#                 word_subword_mapping[word_id] = []
#             word_subword_mapping[word_id].append(i)

#     # Step 5: Iterate through pairs of words and subwords to count the majority label
#     word_labels = []
#     for i, word in enumerate(words):
#         if i in word_subword_mapping:
#             subword_labels = pred_labels[word_subword_mapping[i]]
#             majority_label = get_majority(subword_labels)
#             word_labels.append((word, majority_label))
#         else:
#             word_labels.append((word,'O'))


#     token_v, pred_v = zip(*word_labels)

#     pred_conv = [model_id2cur_label[model.config.label2id[pred]] for pred in pred_v]
#     pred_bio = convert_to_bio(token_v, pred_conv)

#     f_beta = fbeta_score(labels, pred_bio, labels = classes_pos,beta=5, average='micro')
    
    
    
#     return metric.compute(predictions=predictions, references=labels)


    #Threshold + postprocessing
    #
    #Compute precision, recall, f1_beta from preds and labels
    
    #
    
    #Get 
    #Get

In [58]:
compute_metrics = functools.partial(compute_metrics_base, model_adapter=model_adapter, threshold=0.1)

## Trainer API

In [59]:
# %load_ext autoreload
# %autoreload 2

In [60]:
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch
from datasets import load_metric

In [82]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    per_device_eval_batch_size=4,    # batch size per device during evaluation
    logging_dir='./logs',            # directory for storing logs
#     do_eval=True                     # Perform evaluation
    include_inputs_for_metrics = True
)

# Initialize Trainer with the evaluation only mode
trainer = Trainer(
    model=model_adapter.model,       # the instantiated ü§ó Transformers model to be trained
    args=training_args,              # training arguments, defined above
    eval_dataset=eval_dataset,       # evaluation dataset
    data_collator=data_collator,     # data collator for evaluation
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics = preprocess_logits
)

In [61]:
# # Evaluate the model
# eval_results = trainer.evaluate()

# print(eval_results)

NameError: name 'inputs' is not defined

In [None]:
eval_dataset.select(range(2))

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'input_ids', 'attention_mask', 'words', 'word_labels', 'word_ids'],
    num_rows: 2
})

In [164]:
# %%time
# eval_results = trainer.evaluate(eval_dataset.select(range(2)))

# print(eval_results)

------------------ compute_metrics called ------------------
<class 'transformers.trainer_utils.EvalPrediction'>
<transformers...001E5DA7C87A0>
{'eval_loss': 8.329302787780762, 'eval_runtime': 72.6745, 'eval_samples_per_second': 0.028, 'eval_steps_per_second': 0.014}
CPU times: total: 7min 4s
Wall time: 1min 12s


In [83]:
%%time
pred_results = trainer.predict(eval_dataset.select(range(20)))

CPU times: total: 26min 18s
Wall time: 4min 54s


In [58]:
tokenized_datasets

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'input_ids', 'attention_mask', 'words', 'word_labels', 'word_ids'],
    num_rows: 100
})

### Whole dataset

In [62]:
tokenized_datasets

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'input_ids', 'attention_mask', 'words', 'word_labels', 'word_ids'],
    num_rows: 6807
})

In [80]:
cur_start, cur_stop = (0,20)
cur_dataset = tokenized_datasets.select(range(cur_start,cur_stop))

#### Batch size 

In [None]:
#### Batch size 8 = 13m 16s
#### Batch size 4 = 4min 54s
#### Batch size 2 = 7min 41s
#### Batch size 1 = 5min 28s

In [81]:
%%time
pred_results = trainer.predict(cur_dataset)

CPU times: total: 41min 1s
Wall time: 7min 41s


In [73]:
pred_results.predictions.shape

(20, 1929, 18)

In [65]:
import pickle

In [66]:
with open(f'./results/preds_240227_{cur_start}_{cur_stop}', 'wb') as file:
    # Use pickle to dump the object into the file
    pickle.dump(pred_results, file)

In [None]:
print('dsad')

In [50]:
pred_results.metrics

{'test_loss': 0.05136679485440254,
 'test_f1': 0.6239316239316239,
 'test_precision': 0.47096774193548385,
 'test_recall': 0.9240506329113924,
 'test_f_beta': 0.8910798122065727,
 'test_runtime': 289.1268,
 'test_samples_per_second': 0.069,
 'test_steps_per_second': 0.01}

### Old Results

In [46]:
pred_results.predictions.shape

(2, 648, 18)

In [51]:
compute_metrics((pred_results.predictions, pred_results.label_ids, None))

{'f1': 0.6239316239316239,
 'precision': 0.47096774193548385,
 'recall': 0.9240506329113924,
 'f_beta': 0.8910798122065727}

In [55]:
import pickle

In [53]:
type(pred_results)

transformers.trainer_utils.PredictionOutput

In [164]:
pred_results.metrics

{'test_loss': 0.0730685442686081,
 'test_f1': 0.4098939929328622,
 'test_precision': 0.2636363636363636,
 'test_recall': 0.9206349206349206,
 'test_f_beta': 0.8401114206128134,
 'test_runtime': 352.2587,
 'test_samples_per_second': 0.057,
 'test_steps_per_second': 0.009}

In [149]:
pred_results.metrics

{'test_loss': 0.0730685442686081,
 'test_f1': 0.017912291537986413,
 'test_precision': 0.009044129112739748,
 'test_recall': 0.9206349206349206,
 'test_runtime': 305.392,
 'test_samples_per_second': 0.065,
 'test_steps_per_second': 0.01}

In [42]:
pred_results.metrics

{'test_loss': 0.040113478899002075,
 'test_f1': 0.028469750889679714,
 'test_precision': 0.01444043321299639,
 'test_recall': 1.0,
 'test_runtime': 30.6181,
 'test_samples_per_second': 0.065,
 'test_steps_per_second': 0.033}

In [324]:
pred_results.metrics

{'test_loss': 10.082626342773438,
 'test_f1': 0.06493506493506493,
 'test_precision': 0.03355704697986577,
 'test_recall': 1.0,
 'test_runtime': 33.6083,
 'test_samples_per_second': 0.06,
 'test_steps_per_second': 0.03}

In [210]:
A = np.array([1, 2, 3, 4, 5])

# Create Array B (boolean mask)
B = np.array([True, False, True, False, True])

# Use boolean mask to select values from Array A where the mask is True
selected_values = A[B]

In [211]:
selected_values

array([1, 3, 5])

In [96]:
%%time

pred_results = trainer.predict(eval_dataset)

CPU times: total: 26min 36s
Wall time: 4min 36s


In [52]:
trainer.evaluate(eval_dataset.select(range(2)))

{'eval_loss': 0.07608494907617569,
 'eval_f1': 0.7619047619047619,
 'eval_precision': 0.6153846153846154,
 'eval_recall': 1.0,
 'eval_f_beta': 0.9765258215962441,
 'eval_runtime': 17.8009,
 'eval_samples_per_second': 0.112,
 'eval_steps_per_second': 0.056}

In [99]:
pred_results.metrics

{'test_runtime': 276.4769,
 'test_samples_per_second': 0.072,
 'test_steps_per_second': 0.011}

In [73]:
pred_results.metrics

{'test_runtime': 260.9532,
 'test_samples_per_second': 0.077,
 'test_steps_per_second': 0.011}

In [64]:
type(eval_results)

dict

In [39]:
type(eval_dataset)

str

In [21]:
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch
from datasets import load_metric

# Define your custom dataset class
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

# Sample dataset
dataset = [
    {"text": "This is a sample text.", "labels": 0},
    {"text": "Another example here.", "labels": 1}
    # Add more samples as needed
]

# Initialize your custom dataset
custom_dataset = CustomDataset(dataset)

# Define a function to convert samples to features
def convert_to_features(sample):
    return {
        "input_ids": tokenizer(sample["text"], return_tensors="pt")["input_ids"].squeeze(),
        "attention_mask": tokenizer(sample["text"], return_tensors="pt")["attention_mask"].squeeze(),
        "labels": torch.tensor(sample["labels"])
    }

# Initialize tokenizer
tokenizer = YourTokenizer.from_pretrained("tokenizer_name")

# Convert dataset to features
features = [convert_to_features(sample) for sample in custom_dataset]

# Split the dataset into train and eval
train_size = int(0.8 * len(features))
eval_size = len(features) - train_size
train_dataset, eval_dataset = torch.utils.data.random_split(features, [train_size, eval_size])

# Initialize data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    per_device_eval_batch_size=8,    # batch size per device during evaluation
    logging_dir='./logs',            # directory for storing logs
    do_eval=True                     # Perform evaluation
)

# Initialize Trainer with the evaluation only mode
trainer = Trainer(
    model=model,                     # the instantiated ü§ó Transformers model to be trained
    args=training_args,              # training arguments, defined above
    eval_dataset=eval_dataset,       # evaluation dataset
    data_collator=data_collator      # data collator for evaluation
)

# Evaluate the model
eval_results = trainer.evaluate()

print(eval_results)




# Experiments

## Metric experiments

In [None]:
#Metrics:
#Precision
#Recall
#f1_beta 
#f1_beta word-level

In [129]:
import reprlib

### Callback

In [161]:
def test_callback(eval_preds):
    print('------------------ compute_metrics called ------------------')
    print(type(eval_preds))
    print(reprlib.repr(eval_preds))
    
    return {}

In [None]:
def test_metric(eval_preds):
    
    logits, labels = eval_preds
    probabilities = np.argmax(logits, axis=-1)
    tokens = inputs.tokens()
    
    # Get highest non-'O' labels for each token with thresholding
    token_labels = get_labels(tokens, label_probabilities, model.config.id2label, threshold)
    pred_labels = np.array([label for token,label in token_labels])

    dict_scores = {}
    
    f1_score = metric_f1.compute(predictions=preds_id, references=true_labels_id, labels=labels_pos_id, average ='micro')
    precision = metric_f1.compute(predictions=preds_id, references=true_labels_id)
    recall = metric_f1.compute(predictions=preds_id, references=true_labels_id)
    
    for score in [f1_score, precision, recall]:
        dict_scores.update(score) 

    return dict_scores

In [177]:
from datasets import load_metric

### Load Metric and testing

In [178]:
# metric_f1 = load_metric("leslyarun/fbeta_score", beta=5)
METRIC_F1 = load_metric("f1",  beta=5)
METRIC_PRECISION = load_metric("precision")
METRIC_RECALL = load_metric("recall")

  METRIC_F1 = load_metric("f1",  beta=5)
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [205]:
preds_id = [classes2id[pred] for pred in pred_bio]

In [204]:
true_labels_id =  [classes2id[label] for label in labels_true]

In [184]:
labels_pos_id

NameError: name 'labels_pos_id' is not defined

In [189]:
labels_pos_id

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [206]:
METRIC_F1.compute(predictions=preds_id, references=true_labels_id, labels=labels_pos_id, average ='micro')

{'f1': 0.7058823529411765}

In [169]:
from datasets import list_metrics

In [173]:
[name for name in list_metrics if 'f' in name]

['chrf',
 'confusion_matrix',
 'f1',
 'frugalscore',
 'BucketHeadP65/confusion_matrix',
 'DarrenChensformer/eval_keyphrase',
 'DarrenChensformer/relation_extraction',
 'LuckiestOne/valid_efficiency_score',
 'Muennighoff/code_eval_octopack',
 'SpfIo/wer_checker',
 'Viona/fuzzy_reordering',
 'Viona/infolm',
 'angelina-wang/directional_bias_amplification',
 'aryopg/roc_auc_skip_uniform_labels',
 'bstrai/classification_report',
 'danieldux/hierarchical_softmax_loss',
 'dgfh76564/accents_unplugged_eval',
 'erntkn/dice_coefficient',
 'fnvls/bleu1234',
 'fnvls/bleu_1234',
 'franzi2505/detection_metric',
 'fschlatt/ner_eval',
 'gjacob/chrf',
 'gorkaartola/metric_for_tp_fp_samples',
 'kashif/mape',
 'leslyarun/fbeta_score',
 'maksymdolgikh/seqeval_with_fbeta',
 'mfumanelli/geometric_mean',
 'mgfrantz/roc_auc_macro',
 'mtc/fragments',
 'nevikw39/specificity',
 'omidf/squad_precision_recall',
 'red1bluelost/evaluate_genericify_cpp',
 'ronaldahmed/nwentfaithfulness',
 'transformersegmentation/segm

In [170]:
list_metrics()

  list_metrics()


['accuracy',
 'bertscore',
 'bleu',
 'bleurt',
 'brier_score',
 'cer',
 'character',
 'charcut_mt',
 'chrf',
 'code_eval',
 'comet',
 'competition_math',
 'confusion_matrix',
 'coval',
 'cuad',
 'exact_match',
 'f1',
 'frugalscore',
 'glue',
 'google_bleu',
 'indic_glue',
 'mae',
 'mahalanobis',
 'mape',
 'mase',
 'matthews_correlation',
 'mauve',
 'mean_iou',
 'meteor',
 'mse',
 'nist_mt',
 'pearsonr',
 'perplexity',
 'poseval',
 'precision',
 'r_squared',
 'recall',
 'rl_reliability',
 'roc_auc',
 'rouge',
 'sacrebleu',
 'sari',
 'seqeval',
 'smape',
 'spearmanr',
 'squad',
 'squad_v2',
 'super_glue',
 'ter',
 'trec_eval',
 'wer',
 'wiki_split',
 'xnli',
 'xtreme_s',
 'Aledade/extraction_evaluation',
 'AlhitawiMohammed22/CER_Hu-Evaluation-Metrics',
 'BucketHeadP65/confusion_matrix',
 'BucketHeadP65/roc_curve',
 'CZLC/rouge_raw',
 'DaliaCaRo/accents_unplugged_eval',
 'DarrenChensformer/eval_keyphrase',
 'DarrenChensformer/relation_extraction',
 'DoctorSlimm/bangalore_score',
 'DoctorS

In [22]:
arr = np.array([[1, 2, 3],
                [4, 5, 6],
                [7, 8, 9]])

# Define an equality condition
condition = arr == 5

# Use np.where() to get the indices where the condition is True
indices = np.where(condition)

# Print the indices
print(indices)

(array([1], dtype=int64), array([1], dtype=int64))


In [34]:
arr = np.array([[3, 7, 1],
                [5, 2, 4]])

# Perform argsort along axis 1 (sorting each row independently)
sorted_indices = np.argsort(arr, axis=1)

# Get the index of the nth smallest element along axis 1
n = -1  # Change n to the desired value
nth_smallest_index = sorted_indices[:, n]

# Use fancy indexing to get the nth smallest element along axis 1
nth_smallest_elements = arr[np.arange(arr.shape[0]), nth_smallest_index]
nth_smallest_elements

array([7, 5])

In [31]:
arr = np.array([[3, 2, 4],
                [1, 6, 5]])

# Perform argsort along an axis (axis=1 in this example)
sorted_indices = np.argsort(arr, axis=1)

# Get the nth element along the axis
n = 1  # Change this to the desired index
nth_element = arr[np.arange(arr.shape[0])[:, np.newaxis], sorted_indices][:, n]

print(nth_element)

[3 5]


In [161]:
array.shape

(2, 2, 3)

In [162]:

array[:, :, np.newaxis].shape

(2, 2, 1, 3)

In [30]:
nth_smallest_elements

array([3, 4])

In [28]:
np.where(arr == 3)

(array([0], dtype=int64), array([2], dtype=int64))

In [23]:
arr == 5

array([[False, False, False],
       [False,  True, False],
       [False, False, False]])

In [None]:
datasets.

In [None]:
metric_f1.c

### Vectorized

In [42]:
# def compute_metrics_base(eval_preds, model_adapter, threshold = 0.1):
    
#     global eval_preds_copy
#     eval_preds_copy = eval_preds
    
#     logits, true_labels_id = eval_preds
#     np_probs = np.argmax(logits, axis=-1)
#     tokens = inputs.tokens()
    
    
    
    
#     ## (Vectorized) Get highest non-'O' labels for each token with thresholding
#     # Get 'O' index of model
#     # Get argsort
#     # Get First max
#     # Get == 'O' Mask
#     # Get threshold mask
#     # Get final indices
    
#     label2id =  model_adapter.model.config.label2id
#     o_index = label2id['O']
    
#     np_sorted_indices = np.argsort(np_probs)
#     np_max_indices = np_sorted[:, -1]
#     np_max = np_sorted[np.arange(arr.shape[0]), np_max_indices]
    
#     np_2nd_max_indices = p_sorted[:, -2]
#     np_2nd_max = np_sorted[np.arange(arr.shape[0]), np_max_indices]
    
#     np_O_mask = np_max_indices == o_index
#     np_threshold_mask = np_2nd_max > threshold
    
#     np_replace_mask = np_threshold_mask & np_O_mask
    
    
#     #np_max_indices[np_replace_mask] = np_2nd_max_indices
#     np_label_ids = np.where(np_replace_mask, np_max, np_2nd_max)
    
#     ##
#     #Compute model metrics
    
#     #Define labels_pos_id in model_adapter base class (exclude O)
    
    
#     dict_scores = {}
    
#     f1_score = metric_f1.compute(predictions=np_label_ids, references=true_labels_id, labels=labels_pos_id, average ='micro')
#     precision = metric_f1.compute(predictions=np_label_ids, references=true_labels_id)
#     recall = metric_f1.compute(predictions=np_label_ids, references=true_labels_id)
    
#     for score in [f1_score, precision, recall]:
#         dict_scores.update(score) 

        

#     return dict_scores


# #     #Test above first
    
    
# #     ##Compute competition metrics
    
# #     # Step 4: Create a list of which tokens or subwords correspond to a word using the word_ids variable
# #     word_subword_mapping = {}
# #     for i, word_id in enumerate(word_ids):
# #         if word_id is not None:
# #             if word_id not in word_subword_mapping:
# #                 word_subword_mapping[word_id] = []
# #             word_subword_mapping[word_id].append(i)

# #     # Step 5: Iterate through pairs of words and subwords to count the majority label
# #     word_labels = []
# #     for i, word in enumerate(words):
# #         if i in word_subword_mapping:
# #             subword_labels = pred_labels[word_subword_mapping[i]]
# #             majority_label = get_majority(subword_labels)
# #             word_labels.append((word, majority_label))
# #         else:
# #             word_labels.append((word,'O'))


# #     token_v, pred_v = zip(*word_labels)

# #     pred_conv = [model_id2cur_label[model.config.label2id[pred]] for pred in pred_v]
# #     pred_bio = convert_to_bio(token_v, pred_conv)

# #     f_beta = fbeta_score(labels, pred_bio, labels = classes_pos,beta=5, average='micro')
    
    
    
# #     return metric.compute(predictions=predictions, references=labels)


#     #Threshold + postprocessing
#     #
#     #Compute precision, recall, f1_beta from preds and labels
    
#     #
    
#     #Get 
#     #Get

In [41]:
import functools
compute_metrics = functools.partial(compute_metrics_base, model_adapter=model_adapter, threshold=0.1)

#### Experiment

In [45]:
eval_preds_copy.label_ids[0][2]

0

In [77]:
eval_preds_copy.inputs

array([[    1, 51146,   377, ...,     0,     0,     0],
       [    1, 69528,   463, ...,  1105,  1084,     2]], dtype=int64)

In [63]:
def preprocess_logits(logits, label):
    np_probs = torch.softmax(logits, axis=-1) 
    
    return np_probs

In [None]:
np_probs = np.softmax(logits, axis=-1)

In [78]:
inputs, logits, true_labels_id = eval_preds_copy

In [244]:
eval_preds_copy.id

AttributeError: 'EvalPrediction' object has no attribute 'id'

In [243]:
eval_preds_copy.np_probs.start_index

AttributeError: 'EvalPrediction' object has no attribute 'np_probs'

In [147]:
hreshold=0.5

np_probs, true_labels_id, inputs = eval_preds_copy

# tokens = inputs.tokens()

## (Vectorized) Get highest non-'O' labels for each token with thresholding
# Get 'O' index of model
# Get argsort
# Get First max
# Get == 'O' Mask
# Get threshold mask
# Get final indices

label2id =  model_adapter.model.config.label2id
o_index = label2id['O']

np_sorted_indices = np.argsort(np_probs)
np_max_indices = np_sorted_indices[:,:, -1]
# np_max = np_sorted_indices[np.arange(np_max_indices.shape[0]), np_max_indices]
np_max_prob = np.take_along_axis(np_probs, np_max_indices[:, :, np.newaxis], axis=2).squeeze()

np_2nd_max_indices = np_sorted_indices[:, :, -2]
# np_2nd_max = np_sorted_indices[np.arange(np_sorted_indices.shape[0]), np_max_indices]
np_2nd_max_prob = np.take_along_axis(np_probs, np_2nd_max_indices[:, :, np.newaxis], axis=2).squeeze()

np_O_mask = np_max_indices == o_index
np_threshold_mask = np_2nd_max_prob > threshold

np_replace_mask = np_threshold_mask & np_O_mask

np_label_ids = np.where(np_replace_mask, np_2nd_max_indices, np_max_indices)

flat_label_ids = np_label_ids.flatten()
flat_true_labels_id = true_labels_id.flatten()

# Postprocess labels, convert irrelevant labels to 'O'
np_labels_irrelevant = np.array(model_adapter.labels_irrelevant)
flat_label_ids_mask = np.isin(flat_label_ids, np_labels_irrelevant)
flat_label_ids[flat_label_ids_mask] = o_index

# Also remove paddings
mask_padding_inv = flat_true_labels_id != -100
flat_true_labels_id = flat_true_labels_id[mask_padding_inv]
flat_label_ids = flat_label_ids[mask_padding_inv]

dict_scores = {}

f1_score = METRIC_F1.compute(predictions=flat_label_ids, references=flat_true_labels_id, labels=classes_pos_id, average ='micro')
f_beta_score = fbeta_score(y_true = flat_true_labels_id, y_pred = flat_label_ids , labels = classes_pos_id,beta=5, average='micro')
precision = METRIC_PRECISION.compute(predictions=flat_label_ids, references=flat_true_labels_id,labels=classes_pos_id, average ='micro')
recall = METRIC_RECALL.compute(predictions=flat_label_ids, references=flat_true_labels_id,labels=classes_pos_id, average ='micro')

input_id = eval_dataset['input_id']
word_ids = eval_dataset['word_ids]
                        





for score in [f1_score, precision, recall]:
    dict_scores.update(score) 
    
dict_scores['f_beta'] = f_beta_score


dict_scores

{'f1': 0.5333333333333333,
 'precision': 0.36363636363636365,
 'recall': 1.0,
 'f_beta': 0.9369369369369369}

In [135]:
flat_true_labels_id[flat_true_labels_id == -100]

array([], dtype=int64)

In [136]:
pred_comp, labels_true_comp = zip(*[(pred,label) for pred,label in zip(list(flat_label_ids),list(flat_true_labels_id)) if (label != 0 or pred != 0 or label != pred)])

In [140]:
fbeta_score(labels_true_comp, pred_comp , labels = classes_pos_id,beta=0, average='micro')

0.36363636363636365

In [141]:
fbeta_score(labels_true_comp, pred_comp , labels = classes_pos_id,beta=5, average='micro')

0.9369369369369369

In [138]:
zip_comp = list(zip(pred_comp,labels_true_comp))

In [139]:
zip_comp

[(2, 2),
 (2, 2),
 (2, 0),
 (2, 0),
 (12, 0),
 (12, 0),
 (12, 0),
 (12, 0),
 (2, 2),
 (2, 2),
 (2, 0)]

In [76]:
zip_comp[1][0]

0

In [103]:
zip_comp[2][0] == zip_comp[2][1]

True

In [104]:
zip_comp[2]

(2, 2)

In [105]:
zip_comp[2][0] == zip_comp[2][1]

True

In [339]:
true_labels_id.shape

(2, 648)

In [338]:
np_probs.shape

(2, 648, 18)

In [302]:
dict_scores

{'f1': 0.06493506493506493,
 'precision': 0.0021240441801189465,
 'recall': 0.0021240441801189465}

In [260]:
model_adapter.O_id


0

In [304]:
pred_results.metrics

{'test_loss': 10.082626342773438,
 'test_f1': 0.06493506493506493,
 'test_precision': 0.03355704697986577,
 'test_recall': 1.0,
 'test_runtime': 34.472,
 'test_samples_per_second': 0.058,
 'test_steps_per_second': 0.029}

In [259]:
o_index

0

In [249]:
 dict_scores

{'f1': 0.05847953216374269,
 'precision': 0.0021240441801189465,
 'recall': 0.0021240441801189465}

In [245]:
np.array([11,2])

array([11,  2])

In [230]:
tokens[0]

NameError: name 'tokens' is not defined

In [345]:
flat_true_labels_id[0]

0

In [344]:
flat_label_ids[0]

2

In [310]:
np_probs, true_labels_id, inputs = eval_preds_copy

In [342]:
true_labels_id[0][0]

0

In [343]:
flat_true_labels_id[0]

0

In [227]:
dict_scores

{'f1': 0.05847953216374269,
 'precision': 0.0021240441801189465,
 'recall': 0.0021240441801189465}

In [210]:
true_labels_id.shape

(2, 1177)

In [209]:
np_label_ids.shape

(2, 1177)

In [190]:
classes_pos_id

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [188]:
labels_pos_id

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [168]:
np_label_ids.shape

(2, 1177)

In [140]:
np_probs[np.arange(np_probs.shape[0])[:, np.newaxis], np_sorted_indices][:, -1]

IndexError: shape mismatch: indexing arrays could not be broadcast together with shapes (2,1) (2,1177,18) 

In [146]:
np_sorted_indices.shape

(2, 1177, 18)

In [145]:
np_probs[:, np_sorted_indices].shape

(2, 2, 1177, 18, 18)

In [151]:
def nth_largest_along_z(array, n):
    # Step 1: Use argsort along the z-axis
    sorted_indices = np.argsort(array, axis=2)
    
    # Step 2: Extract the nth largest index along the z-axis
    nth_largest_index = sorted_indices[:, :, -n]
    
    # Step 3: Use the obtained index to extract the corresponding value
    nth_largest_values = array[np.arange(array.shape[0])[:, None, None], 
                               np.arange(array.shape[1])[None, :, None], 
                               nth_largest_index]
    
    return nth_largest_values

# Example usage:
array = np.array([[[1, 2, 3],
                   [4, 5, 6]],
                  [[7, 8, 9],
                   [10, 11, 12]]])

# Get the 2nd largest value along the z-axis
nth_largest = nth_largest_along_z(array, 2)
print(nth_largest)

[[[ 2  2]
  [ 5  5]]

 [[ 8  8]
  [11 11]]]


In [158]:
def nth_largest_along_z(array, n):
    global nth_largest_index
    # Step 1: Use argsort along the z-axis
    sorted_indices = np.argsort(array, axis=2)
    
    # Step 2: Extract the nth largest index along the z-axis
    nth_largest_index = sorted_indices[:, :, -n]
    
    # Step 3: Use the obtained index to extract the corresponding value
    nth_largest_values = np.take_along_axis(array, nth_largest_index[:, :, np.newaxis], axis=2)
    
    # Reshape to 2 dimensions
    nth_largest_values = nth_largest_values.squeeze(axis=2)
    
    return nth_largest_values

# Example usage:
array = np.array([[[1, 2, 3],
                   [4, 5, 6]],
                  [[7, 8, 9],
                   [10, 11, 12]]])

# Get the 2nd largest value along the z-axis
nth_largest = nth_largest_along_z(array, 2)
print(nth_largest)

[[ 2  5]
 [ 8 11]]


In [164]:
nth_largest_index.shape

(2, 2)

In [159]:
nth_largest_index[:, :, np.newaxis].shape

(2, 2, 1)

In [156]:
nth_largest

array([[ 2,  5],
       [ 8, 11]])

In [153]:
array.shape

(2, 2, 3)

In [154]:
nth_largest.shape

(2, 2, 1)

In [142]:
np.arange(np_probs.shape[0])[:, np.newaxis].shape

(2, 1)

In [138]:
np_probs[:,np_max_indices].shape


(2, 2, 1177, 18)

In [126]:
np_sorted_indices[np.arange(np_max_indices.shape[0]), np_max_indices]

IndexError: shape mismatch: indexing arrays could not be broadcast together with shapes (2,) (2,1177) 

In [135]:
np_probs[np.arange(np_probs.shape[0]), np_max_indices]

IndexError: shape mismatch: indexing arrays could not be broadcast together with shapes (2,) (2,1177) 

In [109]:
np_max_indices = np_sorted_indices[:,:, 18]

IndexError: too many indices for array: array is 2-dimensional, but 3 were indexed

In [120]:
np_sorted_indices.shape

(2, 1177, 18)

In [118]:
np_max_indices = np_sorted_indices[:, -1]

In [124]:
# np_max_indices = np_sorted_indices[:,:,-1]
np_max_indices = np_sorted_indices[:,:,-1]
np_max_indices.shape

(2, 1177)

In [123]:
np_max_indices

array([[ 8, 10, 11,  7, 14,  6, 15, 17,  0, 13, 12,  5, 16,  3,  4,  1,
         9,  2],
       [10,  8, 14, 17, 11,  6,  7, 12, 13, 15,  5,  3,  4,  1, 16,  9,
         2,  0]], dtype=int64)

In [111]:
np_sorted_indices.shape

(2, 1177, 18)

In [112]:
np_max_indices.shape

(2, 18)

In [89]:
np_sorted_indices.shape

(2, 1177)

In [87]:
np_max

array([ 251, 1155], dtype=int64)

In [86]:
np_label_ids

array([ 251, 1155], dtype=int64)

In [59]:
np_sorted_indices.shape

(2, 1177)

In [58]:
true_labels_id.shape

(2, 1177)

In [57]:
logits.shape

(2, 1177, 18)

In [56]:
np_label_ids

array([ 251, 1155], dtype=int64)

In [None]:
compute_metrics 

In [17]:
df = pd.DataFrame({'A': [1, 2, 3],
                   'B': [4, 5, 6],
                   'C': [7, 8, 9]})

filtered_df = df[df['A'] > 1] 

In [18]:
filtered_df

Unnamed: 0,A,B,C
1,2,5,8
2,3,6,9


In [37]:
arr = np.array([[1, 2, 3],
                [4, 5, 6],
                [7, 8, 9]])

# Create a boolean mask selecting elements greater than 3
mask = arr > 3

# Use the mask to select elements from the original array
result = arr[mask]

print("Original array:")
print(arr)
print("\nBoolean mask:")
print(mask)
print("\nResulting array:")
print(result)

Original array:
[[1 2 3]
 [4 5 6]
 [7 8 9]]

Boolean mask:
[[False False False]
 [ True  True  True]
 [ True  True  True]]

Resulting array:
[4 5 6 7 8 9]


In [39]:
# Create an example array
arr = np.array([[1, 2, 3],
                [4, 5, 6],
                [7, 8, 9]])

# Create a boolean mask selecting elements greater than 4
mask = arr > 4

# Use the mask to select elements from the original array
result = arr[mask]

print("Original array:")
print(arr)
print("\nBoolean mask:")
print(mask)
print("\nResulting array:")
print(result)

Original array:
[[1 2 3]
 [4 5 6]
 [7 8 9]]

Boolean mask:
[[False False False]
 [False  True  True]
 [ True  True  True]]

Resulting array:
[5 6 7 8 9]


In [40]:
import numpy as np

# Create an example array
arr = np.array([[1, 2, 3],
                [4, 5, 6],
                [7, 8, 9]])

# Create a boolean mask selecting elements from the second row and the last column
mask = np.array([[False, False, False],
                 [False, True, False],
                 [False, False, True]])

# Use the mask to select elements from the original array
result = arr[mask]

print("Original array:")
print(arr)
print("\nBoolean mask:")
print(mask)
print("\nResulting array:")
print(result)

Original array:
[[1 2 3]
 [4 5 6]
 [7 8 9]]

Boolean mask:
[[False False False]
 [False  True False]
 [False False  True]]

Resulting array:
[5 9]


In [41]:
import numpy as np

# Create an example array
arr = np.array([[1, 2, 3],
                [4, 5, 6],
                [7, 8, 9]])

# Create a boolean mask selecting elements from the second and third row
mask = np.array([False, True, True])

# Use the mask to select rows from the original array
result = arr[mask]

print("Original array:")
print(arr)
print("\nBoolean mask:")
print(mask)
print("\nResulting array:")
print(result)

Original array:
[[1 2 3]
 [4 5 6]
 [7 8 9]]

Boolean mask:
[False  True  True]

Resulting array:
[[4 5 6]
 [7 8 9]]


In [49]:
arr = np.array([[1, 2, 3],
                [4, 5, 6],
                [7, 8, 9]])

# Create a boolean mask selecting elements greater than 4 along the horizontal axis (axis=1)
mask = arr > 4

# Use the mask to select elements from the original array along the horizontal axis
result = arr[:, mask.any(axis=1)]

print("Original array:")
print(arr)
print("\nBoolean mask:")
print(mask)
print("\nResulting array:")
print(result)

Original array:
[[1 2 3]
 [4 5 6]
 [7 8 9]]

Boolean mask:
[[False False False]
 [False  True  True]
 [ True  True  True]]

Resulting array:
[[2 3]
 [5 6]
 [8 9]]


In [50]:
mask.any(axis=1)

array([False,  True,  True])

In [36]:
arr = np.array([[1, 2, 3],
                [4, 5, 6],
                [7, 8, 9]])

# 2D boolean mask with the same shape as arr
mask_2d = np.array([[True, False, True],
                    [False, True, False],
                    [True, False, True]])

result_2d = arr[mask_2d]
print(result_2d)

[1 3 5 7 9]


In [21]:
condition = lambda x: x['A'] > 1
df.filter(['A'],condition, axis=0)

TypeError: Keyword arguments `items`, `like`, or `regex` are mutually exclusive

# Profile

In [53]:
import cProfile

In [54]:
results = cProfile.run('run_inference(text, labels_true, model_adapter)')

F-beta score: 0.968944099378882
         1778966 function calls (1738293 primitive calls) in 14.654 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      690    0.004    0.000    0.041    0.000 1039821328.py:1(get_majority)
        1    0.009    0.009   14.654   14.654 1497379277.py:1(run_inference)
        1    0.001    0.001    0.001    0.001 1497379277.py:13(<listcomp>)
        1    0.000    0.000    0.000    0.000 1497379277.py:29(<listcomp>)
        1    0.001    0.001    0.002    0.002 1497379277.py:52(<listcomp>)
        1    0.000    0.000    0.000    0.000 2159492214.py:1(convert_to_bio)
        1    0.057    0.057    0.078    0.078 2791921769.py:1(get_labels)
       14    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:100(acquire)
        9    0.000    0.000    0.008    0.001 <frozen importlib._bootstrap>:1022(_find_and_load)
        5    0.000    0.000    0.000    0.000 <frozen importlib._bootstra