# Import

In [13]:
import json
import os
import pandas as pd
import re

In [14]:
import spacy
from spacy import displacy
from spacy.tokens import Span

In [15]:
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForTokenClassification
from transformers import pipeline

In [17]:
from transformers import TFDebertaForTokenClassification, DebertaTokenizer

# Load Data

In [6]:
path_train = './in/train.json'
path_test = './in/test.json'

In [7]:
train_json = json.load(open(path_train))
df_train = pd.json_normalize(train_json)

test_json = json.load(open(path_test))
df_test = pd.json_normalize(test_json)

In [8]:
train_json[0].keys()

dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])

In [9]:
df_train.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


# Spacy

In [10]:
import spacy

# Load spaCy English NER model
nlp = spacy.load("en_core_web_sm")

# Text containing PII
text = "John Doe's email is john.doe@email.com, and his phone number is +1 (555) 123-4567."

# Process the text with spaCy NER
doc = nlp(text)

# Print identified entities
for ent in doc.ents:
    print(f"Entity: {ent.text}, Type: {ent.label_}")

Entity: John Doe's, Type: PERSON
Entity: john.doe@email.com, Type: ORG
Entity: 555, Type: CARDINAL
Entity: 123-4567, Type: CARDINAL


In [11]:
import spacy
from spacy import displacy

# Step 4: Add custom labels
labels = ["LABEL1", "LABEL2", ...]  # Replace with your actual labels

for token, label in zip(doc, labels):
    token._.label = label

# Step 5: Visualize the document with labels
displacy.serve(doc, style="ent")

# Optional: Visualize labels beneath the text
def visualize_with_labels(doc):
    for token in doc:
        print(f"{token.text}\t{token._.label}")

visualize_with_labels(doc)

AttributeError: [E047] Can't assign a value to unregistered extension attribute 'label'. Did you forget to call the `set_extension` method?

In [20]:
nlp = spacy.load("en_core_web_sm")

# Input sentence
sentence = "Don't you love 🤗 Transformers? We sure do."

# Process the sentence using SpaCy
doc = nlp(sentence)

# Tokenize the sentence
tokens = [token.text for token in doc]
tokens

['Do', "n't", 'you', 'love', '🤗', 'Transformers', '?', 'We', 'sure', 'do', '.']

['SpaCy', 'is', 'a', 'powerful', 'NLP', 'library', 'for', 'Python', '.']

## Chunking

In [50]:
len(predicted_tokens_classes)

835

In [36]:
tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))
tokens

['[CLS]',
 '▁Design',
 '▁Thinking',
 '▁for',
 '▁innovation',
 '▁reflex',
 'ion',
 '-',
 'Av',
 'ril',
 '▁2021',
 '-',
 'N',
 'atha',
 'lie',
 '▁S',
 'ylla',
 '▁Challenge',
 '▁&',
 '▁selection',
 '▁The',
 '▁tool',
 '▁I',
 '▁use',
 '▁to',
 '▁help',
 '▁all',
 '▁stakeholders',
 '▁finding',
 '▁their',
 '▁way',
 '▁through',
 '▁the',
 '▁complexity',
 '▁of',
 '▁a',
 '▁project',
 '▁is',
 '▁the',
 '▁mind',
 '▁map',
 '.',
 '▁What',
 '▁exactly',
 '▁is',
 '▁a',
 '▁mind',
 '▁map',
 '?',
 '▁According',
 '▁to',
 '▁the',
 '▁definition',
 '▁of',
 '▁Buz',
 'an',
 '▁T',
 '.',
 '▁and',
 '▁Buz',
 'an',
 '▁B',
 '.',
 '▁(',
 '1999',
 ',',
 '▁Des',
 's',
 'ine',
 '-',
 'moi',
 '▁l',
 "'",
 'intelligence',
 '.',
 '▁Paris',
 ':',
 '▁Les',
 '▁É',
 'dition',
 's',
 '▁d',
 "'",
 'Organ',
 'isation',
 '.',
 ')',
 ',',
 '▁the',
 '▁mind',
 '▁map',
 '▁(',
 'or',
 '▁heuristic',
 '▁diagram',
 ')',
 '▁is',
 '▁a',
 '▁graphic',
 '▁representation',
 '▁technique',
 '▁that',
 '▁follows',
 '▁the',
 '▁natural',
 '▁functioning',


In [None]:
displacy.render(doc, style = 'span')

## Run

In [None]:
import spacy

# Load spaCy English NER model
nlp = spacy.load("en_core_web_sm")

# Text containing PII
text = df_train.loc[0].full_text

# Process the text with spaCy NER
doc = nlp(text)

# Print identified entities
for ent in doc.ents:
    print(f"Entity: {ent.text}, Type: {ent.label_}")

## Vis Debug

In [None]:
document = df_train.loc[0].full_text

labels = [
    {"start": 1, "end": 2, "label": "ORG"},
    {"start": 3, "end": 4, "label": "PERSON"},
    # Add more label entries as needed
]

nlp = spacy.load("en_core_web_sm")
doc = nlp(document)
entities = [(ent["start"], ent["end"], ent["label"]) for ent in labels]
spans = [spacy.tokens.Span(doc, start, end, label=label) for start, end, label in entities]


doc.spans['sc'] = spans
displacy.render(doc, style="span", jupyter=True)

# doc.ents = spans
# displacy.render(doc, style="ent", jupyter=True)

In [58]:
[spacy.tokens.Span(doc, start, end, label=label) for start, end, label in entities]

[, ]

In [55]:
doc.ents 

()

In [53]:
for token in doc:
    print(f"{token.text}: {token.idx} - {token.idx + len(token.text)}")

Your: 0 - 4
long: 5 - 9
document: 10 - 18
goes: 19 - 23
here: 24 - 28
.: 28 - 29


In [7]:
print( df_train.loc[0].full_text)

Design Thinking for innovation reflexion-Avril 2021-Nathalie Sylla

Challenge & selection

The tool I use to help all stakeholders finding their way through the complexity of a project is the  mind map.

What exactly is a mind map? According to the definition of Buzan T. and Buzan B. (1999, Dessine-moi  l'intelligence. Paris: Les Éditions d'Organisation.), the mind map (or heuristic diagram) is a graphic  representation technique that follows the natural functioning of the mind and allows the brain's  potential to be released. Cf Annex1

This tool has many advantages:

•  It is accessible to all and does not require significant material investment and can be done  quickly

•  It is scalable

•  It allows categorization and linking of information

•  It can be applied to any type of situation: notetaking, problem solving, analysis, creation of  new ideas

•  It is suitable for all people and is easy to learn

•  It is fun and encourages exchanges

•  It makes visible the dimension of pr

# BERT (Tensorflow)

In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForTokenClassification
from transformers import pipeline

## Debug

In [None]:
# Print confidence scores for each label
confidence_scores = tf.nn.softmax(outputs, axis=-1).numpy()[0]
for token, label, confidence_score in zip(tokens, predicted_labels, confidence_scores):
    confidence = confidence_score[tokenizer.convert_tokens_to_ids(label)]
    print(f"Token: {token}, Label: {label}, Confidence: {confidence}")


## Run

In [15]:
# Sample text with PII

full_text = df_train.loc[0].full_text
text = full_text

# Load pre-trained BERT model and tokenizer
model_name = "dslim/bert-large-NER" 
# model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"  # You can use other BERT models from Hugging Face
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForTokenClassification.from_pretrained(model_name)

# Tokenize the input text
inputs = tokenizer.encode_plus(text, return_tensors="tf", padding=True, truncation=True)
# inputs = tokenizer.encode(text, return_tensors="tf")
tokens_full = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))

tokens = tokens_full[:500]


# print("Input Text:", text)
# print("Tokenized Input:", tokens)
# print("Input IDs:", inputs)


# Perform token classification (NER)
outputs = model(inputs["input_ids"])

# print("Raw Outputs:", outputs)

predictions = tf.argmax(outputs.logits, axis=2)

# Decode the predicted labels
# predicted_labels = [tokenizer.convert_ids_to_tokens(prediction) for prediction in predictions.numpy()[0]]
predicted_labels = tokenizer.convert_ids_to_tokens(predictions.numpy()[0]) 

# print("Predicted Labels:", predicted_labels)

entities = []
current_entity = ""
for token, label in zip(inputs["input_ids"].numpy()[0], predicted_labels):
    token = tokenizer.decode(token)
    if label.startswith('B'):
        if current_entity:
            entities.append(current_entity.strip())
        current_entity = token
    elif label.startswith('I'):
        current_entity += ' ' + token
    else:
        if current_entity:
            entities.append(current_entity.strip())
        current_entity = ""

# Add the last entity if any
if current_entity:
    entities.append(current_entity.strip())

# Display the identified entities
# print("Identified Entities:")
# for entity in entities:
#     print(entity)

All PyTorch model weights were used when initializing TFBertForTokenClassification.

All the weights of TFBertForTokenClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.
Token indices sequence length is longer than the specified maximum sequence length for this model (778 > 512). Running this sequence through the model will result in indexing errors


## Vis Debug

In [None]:
print(text)
print(tokens)
print(predicted_labels)

In [None]:
from spacy.tokens import Doc

In [None]:
nlp = spacy.blank("en")

# # Create spaCy Doc object from the original text
doc = nlp(text)
doc = Doc(doc.vocab, words=tokens)

# Initialize spans to store entity positions
spans = []

# Initialize variables to track current entity
current_start = None
current_end = None
current_label = None

# Iterate through tokens and predicted labels
for i, (token, label) in enumerate(zip(tokens, predicted_labels)):
    if label != '[PAD]':
        if label.startswith('B') or label.startswith('I') or label.startswith('[unused'):
            # Start or continuation of an entity
            if current_start is not None:
                spans.append((current_start, current_end, current_label))
            current_start = i
            current_end = i + 1
            current_label = label[2:] if label.startswith('B') or label.startswith('I') else label  # Removing the 'B-' or 'I-' prefix
        else:
            # Outside of any entity
            if current_start is not None:
                spans.append((current_start, current_end, current_label))
                current_start = None
                current_end = None
                current_label = None

# Add the last entity if any
if current_start is not None:
    spans.append((current_start, current_end, current_label))

# Create a list of entities with start and end positions
entities = [{"start": start, "end": end, "label": label} for start, end, label in spans]

# Create Span objects and set them in the Doc
for ent in entities:
    start, end, label = ent["start"], ent["end"], ent["label"]
    span = Span(doc, start, end, label=label)
    doc.ents = list(doc.ents)

# Prepare data for displacy visualization
options = {"ents": [ent["label"] for ent in entities], "colors": {}}
for ent in entities:
    options["colors"][ent["label"]] = "yellow"  # You can change the color

# Visualize using displacy
displacy.render(doc, style="ent", options=options, jupyter=True)

In [None]:
# Create Span objects and set them in the Doc

spans_2 = []

for ent in entities:
    start, end, label = ent["start"], ent["end"], ent["label"]
    span = Span(doc, start, end, label=label)
    spans_2.append(span)
    
doc.ents = spans_2

# Prepare data for displacy visualization
options = {"ents": [ent["label"] for ent in entities], "colors": {}}
for ent in entities:
    options["colors"][ent["label"]] = "yellow"  # You can change the color

# Visualize using displacy
displacy.render(doc, style="ent", options=options, jupyter=True)

In [None]:
entities

In [None]:
def visualize(full_text,tokens,pred):
    
    nlp = spacy.blank("en")
    # # Create spaCy Doc object from the original text
    doc = nlp(full_text)
    doc = Doc(doc.vocab, words=tokens)

    # Initialize spans to store entity positions
    spans = []

    # Initialize variables to track current entity
    current_start = None
    current_end = None
    current_label = None

    # Iterate through tokens and predicted labels
    for i, (token, label) in enumerate(zip(tokens, predicted_labels)):
        if label != '[PAD]':
            if label.startswith('B') or label.startswith('I') or label.startswith('[unused'):
                # Start or continuation of an entity
                if current_start is not None:
                    spans.append((current_start, current_end, current_label))
                current_start = i
                current_end = i + 1
                current_label = label[2:] if label.startswith('B') or label.startswith('I') else label  # Removing the 'B-' or 'I-' prefix
            else:
                # Outside of any entity
                if current_start is not None:
                    spans.append((current_start, current_end, current_label))
                    current_start = None
                    current_end = None
                    current_label = None

    # Add the last entity if any
    if current_start is not None:
        spans.append((current_start, current_end, current_label))
        
    spans = [Span(doc, start, end, label=label) for start, end, label in spans]
    
    doc.spans["sc"] = spans


    # Create a list of entities with start and end positions
#     entities = [{"start": start, "end": end, "label": label} for start, end, label in spans]

#     # Create Span objects and set them in the Doc
#     for ent in entities:
#         start, end, label = ent["start"], ent["end"], ent["label"]
#         span = Span(doc, start, end, label=label)
#         doc.ents = list(doc.ents)

    # Prepare data for displacy visualization
    options = {"ents": [ent["label"] for ent in entities], "colors": {}}
    for ent in entities:
        options["colors"][ent["label"]] = "yellow"  # You can change the color

    # Visualize using displacy
    displacy.render(doc, style="span", options=options, jupyter=True)


In [None]:
print(text)
print(tokens)
print(predicted_labels)

In [None]:
visualize(text,tokens,predicted_labels)

# DeBERTa

In [16]:
# Sample text with PII

full_text = df_train.loc[0].full_text
text = full_text

# Load pre-trained BERT model and tokenizer
# model_name = "knowledgator/UTC-DeBERTa-large"
model_name = "geckos/deberta-base-fine-tuned-ner"  # You can use other BERT models from Hugging Face
tokenizer = DebertaTokenizer.from_pretrained(model_name)
model = TFDebertaForTokenClassification.from_pretrained(model_name)

# Tokenize the input text
inputs = tokenizer.encode_plus(text, return_tensors="tf", padding=True, truncation=True)
# inputs = tokenizer.encode(text, return_tensors="tf")
tokens_full = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))

tokens = tokens_full[:500]


# print("Input Text:", text)
# print("Tokenized Input:", tokens)
# print("Input IDs:", inputs)


# Perform token classification (NER)
outputs = model(inputs["input_ids"])

# print("Raw Outputs:", outputs)

predictions = tf.argmax(outputs.logits, axis=2)

# Decode the predicted labels
# predicted_labels = [tokenizer.convert_ids_to_tokens(prediction) for prediction in predictions.numpy()[0]]
predicted_labels = tokenizer.convert_ids_to_tokens(predictions.numpy()[0]) 

# print("Predicted Labels:", predicted_labels)

entities = []
current_entity = ""
for token, label in zip(inputs["input_ids"].numpy()[0], predicted_labels):
    token = tokenizer.decode(token)
    if label.startswith('B'):
        if current_entity:
            entities.append(current_entity.strip())
        current_entity = token
    elif label.startswith('I'):
        current_entity += ' ' + token
    else:
        if current_entity:
            entities.append(current_entity.strip())
        current_entity = ""

# Add the last entity if any
if current_entity:
    entities.append(current_entity.strip())

# Display the identified entities
# print("Identified Entities:")
# for entity in entities:
#     print(entity)

tokenizer_config.json: 100%|██████████████████████████████████████████████████████| 1.18k/1.18k [00:00<00:00, 1.18MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
vocab.json: 100%|███████████████████████████████████████████████████████████████████| 798k/798k [00:00<00:00, 3.30MB/s]
merges.txt: 100%|███████████████████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 10.1MB/s]
special_tokens_map.json: 100%|████████████████████████████████████████████████████████████████| 778/778 [00:00<?, ?B/s]
tokenizer.json: 100%|█████████████████████████████████████████████████████████████| 1.36M/1.36M [00:01<00:00, 1.27MB/s]
config.json: 100%|████████████████████████████████████████████████████████████████████████| 1.13k/1.13k [00:00<?, ?B/s]


OSError: geckos/deberta-base-fine-tuned-ner does not appear to have a file named tf_model.h5 but there is a file for PyTorch weights. Use `from_pt=True` to load this model from those weights.

In [1]:
import torch

## Debug

In [43]:
from transformers import AutoTokenizer, DebertaForTokenClassification, DebertaV2ForTokenClassification
import torch


text = "John Doe's email is john.doe@email.com, and his phone number is +1 (555) 123-4567."

# model_name = "geckos/deberta-base-fine-tuned-ner" 
# model_name = "knowledgator/UTC-DeBERTa-large"
# model_name = "Gladiator/microsoft-deberta-v3-large_ner_conll2003"
model_name = "Yanis/microsoft-deberta-v3-large_ner_conll2003-anonimization_TRY_1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = DebertaV2ForTokenClassification.from_pretrained(model_name)
# model = DebertaV2ForTokenClassification.from_pretrained(model_name)

# inputs = tokenizer(text, add_special_tokens=False, return_tensors="pt")

inputs = tokenizer(text, add_special_tokens=True, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_token_class_ids = logits.argmax(-1)

# Note that tokens are classified rather then input words which means that
# there might be more predicted token classes than words.
# Multiple token classes might account for the same word
predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]

labels = predicted_token_class_ids
loss = model(**inputs, labels=labels).loss
predicted_tokens_classes

In [44]:
predicted_token_class_ids

tensor([[ 9,  2,  2,  2,  2,  0,  0, 14, 14, 14, 14, 14, 14, 14, 14,  0,  0,  0,
          0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  4,  0,  2]])

In [45]:
predicted_tokens_classes

['Civil state',
 'Name',
 'Name',
 'Name',
 'Name',
 'O',
 'O',
 'Email address',
 'Email address',
 'Email address',
 'Email address',
 'Email address',
 'Email address',
 'Email address',
 'Email address',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'Phone Numbers',
 'Phone Numbers',
 'Phone Numbers',
 'Phone Numbers',
 'Phone Numbers',
 'Phone Numbers',
 'Phone Numbers',
 'Phone Numbers',
 'Phone Numbers',
 'O',
 'Name']

In [42]:
tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))
tokens

['[CLS]',
 '▁John',
 '▁Doe',
 "'",
 's',
 '▁email',
 '▁is',
 '▁john',
 '.',
 'do',
 'e',
 '@',
 'email',
 '.',
 'com',
 ',',
 '▁and',
 '▁his',
 '▁phone',
 '▁number',
 '▁is',
 '▁+',
 '1',
 '▁(',
 '555',
 ')',
 '▁123',
 '-',
 '45',
 '67',
 '.',
 '[SEP]']

In [6]:
inputs

{'input_ids': tensor([[  610, 28484,    18,  1047,    16, 41906,     4,   417,  3540,  1039,
         10555,     4,   175,     6,     8,    39,  1028,   346,    16,  2055,
           134,    36, 33772,    43, 17072,    12,  1898,  4111,     4]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]])}

In [5]:
labels

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0]])

## Run  (Yanis)

In [46]:
from transformers import AutoTokenizer, DebertaForTokenClassification, DebertaV2ForTokenClassification
import torch

full_text = df_train.loc[0].full_text
text = full_text
# model_name = "geckos/deberta-base-fine-tuned-ner" 
# model_name = "knowledgator/UTC-DeBERTa-large"
# model_name = "Gladiator/microsoft-deberta-v3-large_ner_conll2003"
model_name = "Yanis/microsoft-deberta-v3-large_ner_conll2003-anonimization_TRY_1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = DebertaV2ForTokenClassification.from_pretrained(model_name)
# model = DebertaV2ForTokenClassification.from_pretrained(model_name)

# inputs = tokenizer(text, add_special_tokens=False, return_tensors="pt")

inputs = tokenizer(text, add_special_tokens=True, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_token_class_ids = logits.argmax(-1)

# Note that tokens are classified rather then input words which means that
# there might be more predicted token classes than words.
# Multiple token classes might account for the same word
predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]

labels = predicted_token_class_ids
loss = model(**inputs, labels=labels).loss

predicted_tokens_classes

['Name',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'Name',
 'Name',
 'Name',
 'Name',
 'Name',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'Name',
 'Name',
 'Name',
 'Name',
 'O',
 'Name',
 'Name',
 'Name',
 'Name',
 'O',
 'O',
 'O',
 'Religious beliefs',
 'Religious beliefs',
 'Physical addresses',
 'Religious beliefs',
 'Religious beliefs',
 'Physical addresses',
 'Phone Numbers',
 'Religious beliefs',
 'O',
 'Physical addresses',
 'O',
 'Health insurance information',
 'Health insurance information',
 'Health insurance information',
 'Health insurance information',
 'Health insurance information',
 'Health insurance information',
 'Health insurance information',
 'Health insurance information',
 'Health insurance information',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 '

## Run (lakshyakh93/deberta_finetuned_pii)

In [18]:
from transformers import AutoTokenizer, DebertaForTokenClassification, DebertaV2ForTokenClassification
import torch

full_text = df_train.loc[0].full_text
text = full_text
model_name = "lakshyakh93/deberta_finetuned_pii"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = DebertaForTokenClassification.from_pretrained(model_name)
# model = DebertaV2ForTokenClassification.from_pretrained(model_name)

# inputs = tokenizer(text, add_special_tokens=False, return_tensors="pt")

inputs = tokenizer(text, add_special_tokens=True, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_token_class_ids = logits.argmax(-1)

# Note that tokens are classified rather then input words which means that
# there might be more predicted token classes than words.
# Multiple token classes might account for the same word
predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]

labels = predicted_token_class_ids
loss = model(**inputs, labels=labels).loss

predicted_tokens_classes

  return self.fget.__get__(instance, owner)()
Token indices sequence length is longer than the specified maximum sequence length for this model (835 > 512). Running this sequence through the model will result in indexing errors


['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O'

In [50]:
len(predicted_tokens_classes)

835

In [36]:
tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))
tokens

['[CLS]',
 '▁Design',
 '▁Thinking',
 '▁for',
 '▁innovation',
 '▁reflex',
 'ion',
 '-',
 'Av',
 'ril',
 '▁2021',
 '-',
 'N',
 'atha',
 'lie',
 '▁S',
 'ylla',
 '▁Challenge',
 '▁&',
 '▁selection',
 '▁The',
 '▁tool',
 '▁I',
 '▁use',
 '▁to',
 '▁help',
 '▁all',
 '▁stakeholders',
 '▁finding',
 '▁their',
 '▁way',
 '▁through',
 '▁the',
 '▁complexity',
 '▁of',
 '▁a',
 '▁project',
 '▁is',
 '▁the',
 '▁mind',
 '▁map',
 '.',
 '▁What',
 '▁exactly',
 '▁is',
 '▁a',
 '▁mind',
 '▁map',
 '?',
 '▁According',
 '▁to',
 '▁the',
 '▁definition',
 '▁of',
 '▁Buz',
 'an',
 '▁T',
 '.',
 '▁and',
 '▁Buz',
 'an',
 '▁B',
 '.',
 '▁(',
 '1999',
 ',',
 '▁Des',
 's',
 'ine',
 '-',
 'moi',
 '▁l',
 "'",
 'intelligence',
 '.',
 '▁Paris',
 ':',
 '▁Les',
 '▁É',
 'dition',
 's',
 '▁d',
 "'",
 'Organ',
 'isation',
 '.',
 ')',
 ',',
 '▁the',
 '▁mind',
 '▁map',
 '▁(',
 'or',
 '▁heuristic',
 '▁diagram',
 ')',
 '▁is',
 '▁a',
 '▁graphic',
 '▁representation',
 '▁technique',
 '▁that',
 '▁follows',
 '▁the',
 '▁natural',
 '▁functioning',


# Validation Pipeline

## Token Alignment

In [17]:
def tokenize(example, tokenizer):
    text = []
    token_map = []
    
    idx = 0
    
    for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
        
        text.append(t)
        token_map.extend([idx]*len(t))
        if ws:
            text.append(" ")
            token_map.append(-1)
            
        idx += 1
        
        
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=INFERENCE_MAX_LENGTH)
    
        
    return {
        **tokenized,
        "token_map": token_map,
    }

In [20]:
INFERENCE_MAX_LENGTH = 2048

In [21]:
temp_tokens = tokenize(df_train.loc[0], tokenizer)

In [24]:
temp_tokens.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'token_map'])

In [None]:
temp_tokens

In [28]:
input_ids = temp_tokens['input_ids']
tokens_temp = tokenizer.convert_ids_to_tokens(input_ids)
tokens_temp

['[CLS]',
 'ĠDesign',
 'ĠThinking',
 'Ġfor',
 'Ġinnovation',
 'Ġreflex',
 'ion',
 '-',
 'Av',
 'ril',
 'Ġ2021',
 '-',
 'N',
 'ath',
 'al',
 'ie',
 'ĠSy',
 'lla',
 'Ċ',
 'Ċ',
 'Chall',
 'enge',
 'Ġ&',
 'Ġselection',
 'Ċ',
 'Ċ',
 'The',
 'Ġtool',
 'ĠI',
 'Ġuse',
 'Ġto',
 'Ġhelp',
 'Ġall',
 'Ġstakeholders',
 'Ġfinding',
 'Ġtheir',
 'Ġway',
 'Ġthrough',
 'Ġthe',
 'Ġcomplexity',
 'Ġof',
 'Ġa',
 'Ġproject',
 'Ġis',
 'Ġthe',
 'Ġ',
 'Ġmind',
 'Ġmap',
 '.',
 'Ċ',
 'Ċ',
 'What',
 'Ġexactly',
 'Ġis',
 'Ġa',
 'Ġmind',
 'Ġmap',
 '?',
 'ĠAccording',
 'Ġto',
 'Ġthe',
 'Ġdefinition',
 'Ġof',
 'ĠBu',
 'zan',
 'ĠT',
 '.',
 'Ġand',
 'ĠBu',
 'zan',
 'ĠB',
 '.',
 'Ġ(',
 '1999',
 ',',
 'ĠD',
 'ess',
 'ine',
 '-',
 'mo',
 'i',
 'Ġ',
 'Ġl',
 "'",
 'intelligence',
 '.',
 'ĠParis',
 ':',
 'ĠLes',
 'ĠÃī',
 'd',
 'itions',
 'Ġd',
 "'",
 'Organ',
 'isation',
 '.),',
 'Ġthe',
 'Ġmind',
 'Ġmap',
 'Ġ(',
 'or',
 'Ġhe',
 'uristic',
 'Ġdiagram',
 ')',
 'Ġis',
 'Ġa',
 'Ġgraphic',
 'Ġ',
 'Ġrepresentation',
 'Ġtechnique',


In [29]:
len(tokens_temp)

835

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

def chunk_and_process(input_text, tokenizer, model, max_chunk_length=512):
    # Tokenize the input text
    tokens = tokenizer(input_text, max_length=max_chunk_length, return_overflowing_tokens=True, return_offsets_mapping=True, padding="max_length", truncation=True)

    # Get the chunks
    overflow_to_sample_mapping = tokens.pop("overflow_to_sample_mapping")
    offsets = tokens.pop("offset_mapping")

    # Process each chunk independently
    all_predictions = []
    for i, offset in enumerate(offsets):
        # Select the relevant chunk based on offsets
        start, end = offset[0], offset[-1]
        chunk_input = input_text[start:end]

        # Tokenize the chunk
        chunk_tokens = tokenizer(chunk_input, max_length=max_chunk_length, padding="max_length", truncation=True, return_tensors="pt")

        # Process the chunk through the model
        with torch.no_grad():
            chunk_predictions = model(**chunk_tokens).logits  # Assuming model returns logits for token classification

        all_predictions.append(chunk_predictions)

    # Combine or aggregate final predictions based on your task
    # (This might involve handling overlaps between chunks for token classification)
    final_predictions = process_final_predictions(all_predictions)

    return final_predictions

# Example usage within a validation loop
def validate(model, dataloader, tokenizer):
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            # Assuming your batch contains 'input_text' and 'labels'
            input_text = batch['input_text']
            labels = batch['labels']

            # Chunk and process each input text
            for text, label in zip(input_text, labels):
                predictions = chunk_and_process(text, tokenizer, model)
                all_predictions.append(predictions)
                all_labels.append(label)

    # Evaluate your predictions and labels as needed
    # ...

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("your_model_name")
model = AutoModelForTokenClassification.from_pretrained("your_model_name")

# Assuming you have a validation dataloader
validate(model, validation_dataloader, tokenizer)

In [12]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import torch

class TokenClassificationDataset(Dataset):
    def __init__(self, data, labels, tokenizer, max_length=512):
        self.data = data
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        label = self.labels[idx]

        # Tokenize the input text
        encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")

        # Convert label to tensor (adjust this based on your label format)
        label_tensor = torch.tensor(label)

        return {'input_ids': encoding['input_ids'].squeeze(),
                'attention_mask': encoding['attention_mask'].squeeze(),
                'labels': label_tensor}

# Example usage:
tokenizer = AutoTokenizer.from_pretrained("your_model_name")

# Example data and labels
data = ["This is the first document.", "Another example document."]
labels = [[1, 0, 0, 2, 0, 0], [1, 0, 0, 0, 0, 2]]

# Create the custom dataset
dataset = TokenClassificationDataset(data, labels, tokenizer)

# Specify batch size and create the DataLoader
batch_size = 2
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Iterate over the dataloader
for batch in dataloader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    # Your training or validation logic here


OSError: your_model_name is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`