In [None]:
! pip install transformers datasets

In [None]:
import transformers
import torch
print(transformers.__version__)

4.11.0


In [None]:
task = "ner" # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "dslim/bert-base-NER" #'DistilBertForTokenClassification' #
batch_size = 16

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Loading the dataset

We will use the gs bucket with annotated data and get the metric we need to use for evaluation (to compare our model to the benchmark). This can be easily done with the functions `load_dataset` and `load_metric`.

In [None]:
from google.colab import auth
auth.authenticate_user()

In [None]:
# !curl https://sdk.cloud.google.com | bash
# !gcloud init

In [None]:
# Get the data from GS
!gsutil cp gs://label-studio-object/labelled_tasks/*.json .
!gsutil cp gs://label-studio-object/labelling_tasks/*.txt .

In [None]:
# Check one of the items
import json

with open('100.json') as f:
  data = json.load(f)

## Preprocessing the data

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# from transformers import DistilBertTokenizer, DistilBertForTokenClassification
# tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)

In [None]:
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [None]:
tokenizer("Hello, this is one sentence!", padding = True)

In [None]:
tokenizer(["Hello", ",", "this", "is", "one", "sentence", "split", "into", "words", "."], is_split_into_words=True, padding = True, truncation = True)

Create the data here. Pull from the labelling txt file and the data labels from the labelled data.

In [None]:
# Function to return list of tokenized text with a corresponding labels
import json

def getLabelsandTextfromJSON(jsonFile):
  with open(jsonFile) as f:
    labels = json.load(f)
    textFile = labels['task']['data']['text'].split('/')[-1]
  with open(textFile.format(i)) as a:
    text = a.read().strip()
  # print(len(text))
  labelsList = labels['result']
  startEndList = []
  for item in labelsList:
    start = item["value"]['start']
    end = item["value"]['end']
    lbl = item["value"]['labels'][0]
    startEndList.append((start, end, lbl))
  finalList = []
  labelIDList = []
  lastEndChar = 0
  if len(startEndList) != 0:
    # print(startEndList)
    for token in startEndList:
      firstChar = token[0]
      if firstChar == lastEndChar + 1:
        finalList.append(text[token[0]:token[1]])
        labelIDList.append(token[2])
        lastEndChar = token[1]
      else:
        # add the gap
        unlabeledString = text[lastEndChar:token[0]]
        # print(unlabeledString)
        tokenizedList = unlabeledString.strip().split(" ")
        tokenizedList = [item.strip() for item in tokenizedList]
        finalList.extend(tokenizedList)
        labelList = ["O"] * len(tokenizedList)
        labelIDList.extend(labelList)
        # add the present token
        finalList.append(text[token[0]:token[1]])
        labelIDList.append(token[2])
        lastEndChar = token[1]
  else:
    finalList.extend(text.strip().split(" "))
    labelList = ["O"] * len(finalList)
    labelIDList.extend(labelList)
  return (finalList, labelIDList, len(startEndList))


In [None]:
import pandas as pd
columns = ['tokens','numlabels','ner_tags']
df = pd.DataFrame(columns = columns)

for i in range(4,200):
  fileName = '{}.json'.format(i)
  datum = getLabelsandTextfromJSON(fileName)
  # print("File: {}, lengthToksList: {}, Length labels {}, Num Labels: {}".format(fileName,len(datum[0]),len(datum[1]),datum[2]))
  df2 = pd.DataFrame({'tokens': [datum[0]], 'numlabels': [datum[2]], 'ner_tags': [datum[1]]})
  df = df.append(df2, ignore_index=True)
  # print(df)

<!--  -->

In [None]:
# Put into pandas df
df.head()
df.describe()

Unnamed: 0,tokens,numlabels,ner_tags
count,196,196,196
unique,185,6,74
top,[Done],0,"[O, O, O, O]"
freq,4,133,14


In [None]:
# Add column of one hot encoded Labels
complete_labels_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
def encode_labels(labels_list):
  return [complete_labels_list.index(tok) for tok in labels_list]

In [None]:
df['labels_as_ids'] = df['ner_tags'].apply(encode_labels)

In [None]:
df

Unnamed: 0,tokens,numlabels,ner_tags,labels_as_ids
0,[text],0,[O],[0]
1,"[Make, edits.]",0,"[O, O]","[0, 0]"
2,"[""Battery, Ran, out,, finish, at, milk, market]",2,"[O, O, O, O, O, B-LOC, I-LOC]","[0, 0, 0, 0, 0, 5, 6]"
3,"[Send, writing, samples, to, Lela]",1,"[O, O, O, O, B-PER]","[0, 0, 0, 0, 1]"
4,"[Create, a, project, workflow, chart, in, mind...",1,"[O, O, O, O, O, O, B-ORG]","[0, 0, 0, 0, 0, 0, 3]"
...,...,...,...,...
191,"[Finish, Elevations]",0,"[O, O]","[0, 0]"
192,"[Do, the, 1/2-car, and, 3rd-car, grg, options]",0,"[O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0]"
193,"[See, how, far, you, get.]",0,"[O, O, O, O, O]","[0, 0, 0, 0, 0]"
194,"[Get, Alex]",1,"[O, B-PER]","[0, 1]"


In [None]:
label_all_tokens = True
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"].tolist(), truncation=True, is_split_into_words=True, padding = True ) #
    # tokenized_labels = tokenizer(examples["labels_as_ids"].tolist(), truncation=True, is_split_into_words=True, padding = True )
    labels = []
    for i, label in enumerate(examples[f"labels_as_ids"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
tokenized_df = tokenize_and_align_labels(df)
print(tokenized_df)

{'input_ids': [[101, 3087, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 7102, 14609, 1116, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 107, 11537, 16890, 1179, 1149, 117, 3146, 1120, 6831, 2319, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# tokenized_df['labels']

In [None]:
# print(df["labels_as_ids"].tolist())
# tokenized_labels = tokenizer(df["labels_as_ids"].tolist(), truncation=True, is_split_into_words=True, padding = True )

In [None]:
df_test = pd.DataFrame({'input_ids': tokenized_df['input_ids'], 'attention_mask': tokenized_df['attention_mask'], 'labels': tokenized_df['labels']})

In [None]:
df_test

Unnamed: 0,input_ids,attention_mask,labels
0,"[101, 3087, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 0, -100, -100, -100, -100, -100, -100, ..."
1,"[101, 7102, 14609, 1116, 119, 102, 0, 0, 0, 0,...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 0, 0, 0, 0, -100, -100, -100, -100, -10..."
2,"[101, 107, 11537, 16890, 1179, 1149, 117, 3146...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, -100, -10..."
3,"[101, 25929, 2269, 8025, 1106, 3180, 1742, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 0, 0, 0, 0, 1, 1, -100, -100, -100, -10..."
4,"[101, 140, 15998, 170, 1933, 1250, 12712, 3481...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, -100, -10..."
...,...,...,...
191,"[101, 19140, 2944, 2896, 12853, 6126, 102, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 0, 0, 0, 0, 0, -100, -100, -100, -100, ..."
192,"[101, 2091, 1103, 122, 120, 123, 118, 1610, 11...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
193,"[101, 3969, 1293, 1677, 1128, 1243, 119, 102, ...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 0, 0, 0, 0, 0, 0, -100, -100, -100, -10..."
194,"[101, 3949, 3230, 102, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 0, 1, -100, -100, -100, -100, -100, -10..."


I need to tokenize all words. And I need to convert all lists into sentences.

We want to output a list of the words, with a corresponding list of the labels.

In [None]:
example = df["tokens"][3]
print(example)

['Send', 'writing', 'samples', 'to', 'Lela']


In [None]:
# tokenized_input = tokenizer(example, is_split_into_words=True, padding= True, truncation=True)
# tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
# print(tokens)
# print(tokenized_input.word_ids())

In [None]:
# label_all_tokens = True
# def tokenize_and_align_labels(examples, labelsIDs):
#     tokenized_inputs = tokenizer(examples, truncation=True, is_split_into_words=True, padding=True)
#     # print(examples)
#     label_ids = []
#     for i, label in enumerate(labelsIDs):
#         word_ids = tokenized_inputs.word_ids() #(batch_index=i)
#         # print(word_ids)
#         previous_word_idx = None
#         for word_idx in word_ids:
#             # Special tokens have a word id that is None. We set the label to -100 so they are automatically
#             # ignored in the loss function.
#             if word_idx is None:
#                 label_ids.append(-100)
#             # We set the label for the first token of each word.
#             elif word_idx != previous_word_idx:
#                 label_ids.append(word_idx)
#             # For the other tokens in a word, we set the label to either the current label or -100, depending on
#             # the label_all_tokens flag.
#             else:
#                 label_ids.append(word_idx if label_all_tokens else -100)
#             previous_word_idx = word_idx
#             # print(label_ids)


#     tokenized_inputs["labels"] = label_ids
#     return tokenized_inputs

In [None]:
# tokenized_dataset = []
# for index, row in df.iterrows():
#   # print(row['labels_as_ids'])
#   _result = tokenize_and_align_labels(row['tokens'], row['labels_as_ids'])
#   # print(_result)
#   tokenized_dataset.append(_result)
#   # print(_result)
#   # print(_result['input_ids'])
#   # df['input_ids'] = _result['input_ids']
#   # df['attention_mask'] = _result['attention_mask']
#   # df['labels'] = _result['labels']

In [None]:
# tokenized_df = pd.DataFrame(tokenized_dataset)
# tokenized_df['tokens'] = df['tokens']
# tokenized_df['ner_tags'] = df['ner_tags']
# tokenized_df['id'] = tokenized_df.index

In [None]:
from datasets import Dataset

In [None]:
traindf=df_test.sample(frac=0.8,random_state=200) #random state is a seed value
testdf=df_test.drop(traindf.index)

In [None]:
print(len(traindf))
print(len(testdf))

157
39


In [None]:
train_dataset = Dataset.from_pandas(traindf)
val_dataset = Dataset.from_pandas(testdf)

In [None]:
columns_to_return = ['input_ids', 'labels', 'attention_mask']
val_dataset.set_format(type='torch', columns=columns_to_return)
train_dataset.set_format(type='torch', columns=columns_to_return)
print(val_dataset)
print(train_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels', '__index_level_0__'],
    num_rows: 39
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels', '__index_level_0__'],
    num_rows: 157
})


## Fine-tuning the model

In [None]:
complete_labels_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(complete_labels_list)).to(device)

loading configuration file https://huggingface.co/dslim/bert-base-NER/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a5ff16a1d557b5ad480f50b1d454448475c644d08df9ce8fccabea7745bebd9f.a61836f2236a3ff1a0827544e2d7c512cbb8cd26ed7b32d643526bebb5d7f92e
  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
Model config BertConfig {
  "_num_labels": 9,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-MISC",
    "2": "I-MISC",
    "3": "B-PER",
    "4": "I-PER",
    "5": "B-ORG",
    "6": "I-ORG",
    "7": "B-LOC",
    "8": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 7,
    "B-MISC": 1,
    "B-ORG": 5,
    "B-PER": 3,
    "I-LOC": 8,
    "I-MISC": 2,
    "I-ORG": 6,


Downloading:   0%|          | 0.00/413M [00:00<?, ?B/s]

storing https://huggingface.co/dslim/bert-base-NER/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/3ca763a5697d51432247d711b6aae51030a05f5b0c9a59cb83b20255eabb7ff4.aeec53fbb8d04bbdb0c84621a6f18491499bffc49a246808de99e63e7684ad79
creating metadata file for /root/.cache/huggingface/transformers/3ca763a5697d51432247d711b6aae51030a05f5b0c9a59cb83b20255eabb7ff4.aeec53fbb8d04bbdb0c84621a6f18491499bffc49a246808de99e63e7684ad79
loading weights file https://huggingface.co/dslim/bert-base-NER/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/3ca763a5697d51432247d711b6aae51030a05f5b0c9a59cb83b20255eabb7ff4.aeec53fbb8d04bbdb0c84621a6f18491499bffc49a246808de99e63e7684ad79
All model checkpoint weights were used when initializing BertForTokenClassification.

All the weights of BertForTokenClassification were initialized from the model checkpoint at dslim/bert-base-NER.
If your task is similar to the task the model of the checkpoint was

In [None]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"test-ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    # push_to_hub_model_id=f"{model_name}-finetuned-ner",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer) #, max_length=200, padding='longest'

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # data_collator=data_collator,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics
)

In [None]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: __index_level_0__.
***** Running training *****
  Num examples = 157
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 30
  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


Epoch,Training Loss,Validation Loss
1,No log,0.283381
2,No log,0.229778
3,No log,0.220431


The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 39
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 39
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 39
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=30, training_loss=0.24062229792277018, metrics={'train_runtime': 19.7837, 'train_samples_per_second': 23.808, 'train_steps_per_second': 1.516, 'total_flos': 26923435251552.0, 'train_loss': 0.24062229792277018, 'epoch': 3.0})

In [None]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 39
  Batch size = 16


{'epoch': 3.0,
 'eval_loss': 0.22043052315711975,
 'eval_runtime': 0.6489,
 'eval_samples_per_second': 60.105,
 'eval_steps_per_second': 4.623}

In [None]:
len(tokenizer.vocab)

28996

##Save fine-tuned model

In [None]:
model.save_pretrained('custom-ner')
tokenizer.save_pretrained('custom-ner')

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
Configuration saved in custom-ner/config.json
Model weights saved in custom-ner/pytorch_model.bin
tokenizer config file saved in custom-ner/tokenizer_config.json
Special tokens file saved in custom-ner/special_tokens_map.json


('custom-ner/tokenizer_config.json',
 'custom-ner/special_tokens_map.json',
 'custom-ner/vocab.txt',
 'custom-ner/added_tokens.json',
 'custom-ner/tokenizer.json')

In [None]:
# from transformers import AutoModel

# tokenizerPT = AutoTokenizer.from_pretrained(save_directory)
# modelPT = AutoModel.from_pretrained(save_directory).to(device)

##Load and Inference Model

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

save_directory = 'custom-ner'

In [None]:
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForTokenClassification.from_pretrained(save_directory)

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


In [None]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
model.to('cpu')
ner_results = nlp("Send writing samples to Ethan's and Barton's, Jonny project on spacy.")

In [None]:
print(ner_results)

[{'entity': 'B-PER', 'score': 0.5210645, 'index': 5, 'word': 'Ethan', 'start': 24, 'end': 29}, {'entity': 'B-PER', 'score': 0.36045, 'index': 9, 'word': 'Barton', 'start': 36, 'end': 42}, {'entity': 'I-ORG', 'score': 0.47312313, 'index': 10, 'word': "'", 'start': 42, 'end': 43}, {'entity': 'I-ORG', 'score': 0.47989675, 'index': 11, 'word': 's', 'start': 43, 'end': 44}, {'entity': 'B-PER', 'score': 0.7987663, 'index': 13, 'word': 'Jon', 'start': 46, 'end': 49}, {'entity': 'B-PER', 'score': 0.58755285, 'index': 14, 'word': '##ny', 'start': 49, 'end': 51}]
