# Setup 

In [1]:
SYS_INPUT_DIR = '/kaggle/input/pii-detection-removal-from-educational-data'

In [2]:
import os
import numpy as np
import warnings
import pandas as pd

warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# Read train data

In [3]:
import json
train_json = json.load(open(os.path.join(SYS_INPUT_DIR, "train.json")))
data = pd.json_normalize(train_json)

In [4]:
# Check train has one row per document
assert data['document'].nunique() == data.shape[0]

In [5]:
from sklearn.model_selection import train_test_split
train, val_test = train_test_split(data, test_size=0.4, random_state=42)
val, test = train_test_split(val_test, test_size=0.5, random_state=42)

In [6]:
full_ner_labels = [
    'B-NAME_STUDENT', 'I-NAME_STUDENT',
    'B-URL_PERSONAL', 'I-URL_PERSONAL',
    'B-ID_NUM', 'I-ID_NUM',
    'B-EMAIL', 'I-EMAIL',
    'B-STREET_ADDRESS', 'I-STREET_ADDRESS',
    'B-PHONE_NUM', 'I-PHONE_NUM',
    'B-USERNAME', 'I-USERNAME'
]

In [None]:
!pip install datasets

In [7]:
labeldict = {
    'O':0,
    'B-NAME_STUDENT':1, 'I-NAME_STUDENT':2,
    'B-URL_PERSONAL':3, 'I-URL_PERSONAL':4,
    'B-ID_NUM':5, 'I-ID_NUM':6,
    'B-EMAIL':7, 'I-EMAIL':8,
    'B-STREET_ADDRESS':9, 'I-STREET_ADDRESS':10,
    'B-PHONE_NUM':11, 'I-PHONE_NUM':12,
    'B-USERNAME':13, 'I-USERNAME':14
}
from datasets import Features,ClassLabel,Sequence,Value,Dataset

def chunkdf(df):# expects document, tokens and labels
    n_tokens = 400
    df_chunked = pd.DataFrame(columns=df.columns)
    for index,row in df.iterrows():
        l_token = [];l_label = [];l_docnum = [];last = 0
        n_loop=len(df.tokens[index])//n_tokens+1
        for i in range(n_loop-1):
            l_token=df.tokens[index][n_tokens*i:n_tokens*(i+1)]
            l_label=df.labels[index][n_tokens*i:n_tokens*(i+1)]
            l_docnum=str(df.document[index])+'|'+str(i)
            sub = {"document":l_docnum,"tokens":l_token,"labels":l_label}
            df_chunked = df_chunked._append(sub,ignore_index=True)
            last = i+1
        #assert last!=0,print(f"failed for {index}")
        l_token = df.tokens[index][n_tokens*last:]
        if len(l_token) >0:
          l_label = df.labels[index][n_tokens*last:]
          l_docnum = str(df.document[index])+'|'+str(last)
          sub = {"document":l_docnum,"tokens":l_token,"labels":l_label}
          df_chunked = df_chunked._append(sub,ignore_index=True)
    return df_chunked
def changeFormat (df):
    df_sub = chunkdf(df.loc[:,["document","tokens","labels"]])
    df_sub.labels=df_sub.labels.apply(lambda r : [labeldict[e] for e in r])

    l=['O']
    l.extend(full_ner_labels)
    cl=ClassLabel(names=l)
    # Sequence(feature=cl)
    # d_train = Dataset.from_pandas(df_sub.loc[:,"labels"],features=Features({"labels":Sequence(feature=cl),"__index_level_0__":Value("string")}))
    d_train = Dataset.from_pandas(df_sub.loc[:,["labels"]],features=Features({"labels":Sequence(feature=cl)}))
    # d_train = Dataset.from_pandas(train.loc[:,["labels"]])
    # d_train=d_train.remove_columns('__index_level_0__')
    d_train=d_train.rename_column("labels","ner_tags")
    d_train=d_train.add_column('id',[str(s) for s in df_sub.document])
    d_train=d_train.add_column('tokens',df_sub.tokens)
    return d_train

In [27]:
df=train
df_sub = chunkdf(df.loc[:,["document","tokens","labels"]])
#len(df.tokens[4879])
# df_sub.loc[198,:]
# len(df[df.document==21272].tokens)
# (df[df.document==21272].tokens)
# len(df.loc[6160,:].tokens)
#len(df[df.document==21272].tokens)
# for index,row in df_sub.iterrows():
#   #print(len(df_sub.tokens[index]))
#   assert len(df_sub.tokens[index])>0,print(f"Lower length limit failed for {index}")
#   assert len(df_sub.tokens[index])<=400,print(f"Upper length limit failed for {index}")
#   assert len(df_sub.tokens[index]) == len(df_sub.labels[index]),print(f"Same length assertion failed for {index}")

In [33]:
for index,row in df_sub.iterrows():
    if(len(df_sub.tokens[index])==0 or len(df_sub.tokens[index])>400 or (len(df_sub.labels[index])!=len(df_sub.tokens[index]))):
        df_sub = df_sub.drop(index)


In [34]:
for index,row in df_sub.iterrows():
  #print(len(df_sub.tokens[index]))
  assert len(df_sub.tokens[index])>0,print(f"Lower length limit failed for {index}")
  assert len(df_sub.tokens[index])<=400,print(f"Upper length limit failed for {index}")
  assert len(df_sub.tokens[index]) == len(df_sub.labels[index]),print(f"Same length assertion failed for {index}")

In [35]:
from datasets import Dataset, DatasetDict
import pandas as pd
col_names = ['id', 'tokens', 'ner_tags']
wnut_bert = DatasetDict()
wnut_bert["train"] = changeFormat(train)
wnut_bert["test"] = changeFormat(test)
wnut_bert["validation"] = changeFormat(val)

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [36]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [37]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [38]:
tokenized_wnut = wnut_bert.map(tokenize_and_align_labels,batched = True)

  0%|          | 0/13 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [39]:
# --- Pytorch
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [41]:
!pip install evaluate
!pip install seqeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Obtaining dependency information for evaluate from https://files.pythonhosted.org/packages/70/63/7644a1eb7b0297e585a6adec98ed9e575309bb973c33b394dae66bc35c69/evaluate-0.4.1-py3-none-any.whl.metadata
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.1


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=d28303f16263fd943c81338b884e144bf847fa28caddb598109094fb2edc9164
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [42]:
import evaluate
seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [43]:
import numpy as np
#example = wnut_bert["train"][0]
#labels = [full_ner_labels[i] for i in example[f"ner_tags"]]
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [44]:
id2label = {
    0: "O",
    1: "B-NAME_STUDENT",
    2: "I-NAME_STUDENT",
    3: "B-URL_PERSONAL",
    4: "I-URL_PERSONAL",
    5: "B-ID_NUM",
    6: "I-ID_NUM",
    7: "B-EMAIL",
    8: "I-EMAIL",
    9: "B-STREET_ADDRESS",
    10: "I-STREET_ADDRESS",
    11: "B-PHONE_NUM",
    12: "I-PHONE_NUM",
    13: "B-USERNAME",
    14: "I-USERNAME"
}
label2id = {
    "O":0,
    "B-NAME_STUDENT":1,
    "I-NAME_STUDENT":2,
    "B-URL_PERSONAL":3,
    "I-URL_PERSONAL":4,
    "B-ID_NUM":5,
    "I-ID_NUM":6,
    "B-EMAIL":7,
    "I-EMAIL":8,
    "B-STREET_ADDRESS":9,
    "I-STREET_ADDRESS":10,
    "B-PHONE_NUM":11,
    "I-PHONE_NUM":12,
    "B-USERNAME":13,
    "I-USERNAME":14
}

In [45]:
# -- Pytorch
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=15, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install transformers[torch]
! pip install -U accelerate
# restart kernel after this installation

In [None]:
# --- Pytorch
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
