# Install Dependencies

In [None]:
!pip3 install transformers[torch]

In [68]:
!pip install spacy

Collecting spacy
  Downloading spacy-3.6.0-cp39-cp39-win_amd64.whl (12.3 MB)
                                              0.0/12.3 MB ? eta -:--:--
                                              0.2/12.3 MB 6.3 MB/s eta 0:00:02
     -                                        0.6/12.3 MB 7.1 MB/s eta 0:00:02
     ---                                      0.9/12.3 MB 7.5 MB/s eta 0:00:02
     ----                                     1.4/12.3 MB 8.0 MB/s eta 0:00:02
     ------                                   1.9/12.3 MB 8.6 MB/s eta 0:00:02
     ------                                   2.1/12.3 MB 7.8 MB/s eta 0:00:02
     -------                                  2.4/12.3 MB 7.7 MB/s eta 0:00:02
     ---------                                2.9/12.3 MB 8.1 MB/s eta 0:00:02
     -----------                              3.4/12.3 MB 8.4 MB/s eta 0:00:02
     ------------                             3.9/12.3 MB 8.5 MB/s eta 0:00:01
     --------------                           4.4/12.3 MB 8.7

In [None]:
!pip install evaluate

In [None]:
!pip3 install seqeval

In [None]:
!pip3 install accelerate -U

In [None]:
!pip3 install jsonl-to-conll

In [None]:
!pip3 install -U scikit-learn

In [None]:
!pip3 install doccano-transformer

In [None]:
!pip3 install transformers datasets tokenizers segeval -q 

# Converting JSONL to ConnL

In [None]:
# Importing to convert docanno dataset into ConLL
from doccano_transformer.datasets import NERDataset
from doccano_transformer.utils import read_jsonl

In [1]:
import os

In [2]:
import json

In [None]:
# Get the current working directory
current_dir = os.getcwd()
print("Current Directory:", current_dir)
data_file = os.path.join(current_dir,'Labelled Data','labelled_data.jsonl')
print(data_file)

In [None]:
dataset = read_jsonl(filepath=data_file, dataset=NERDataset, encoding='utf-8')
conll_data = dataset.to_conll2003(tokenizer=str.split)

In [None]:
BOI_list = []

In [None]:
for line in conll_data:
    lines = line['data'].strip().split('\n')
    lines = [line for line in lines if line != '-DOCSTART- -X- -X- O' and line.strip() != '']
    
    result = []
    for line in lines:
        line = line.replace("_ _ ", "")
        result.append(line)
    cleaned_data = '\n'.join(result)
    
    BOI_list.append(cleaned_data)

In [None]:
print(BOI_list)

# Saving data in 5 cross validation

In [None]:
# k = 5

# kf = KFold(n_splits=k)

# i = 0
# for train_index, test_index in kf.split(BOI_list):
#     train_data = [BOI_list[i] for i in train_index]
#     test_data = [BOI_list[i] for i in test_index]
#     data = {'train': train_data, 'test': test_data}
    
#     # Convert the dictionary to a JSON string
#     json_data = json.dumps(data)
    
#     file_path = os.path.join(current_dir,'Labelled Data',f'dataset{i}.json')
#     # Write the JSON string to a file
#     with open(file_path, 'w') as file:
#         file.write(json_data)
    
#     i += 1
    

In [3]:
# Read the JSON file

i = 0
current_dir = os.getcwd()
file_path = os.path.join(current_dir,'Labelled Data',f'dataset{i}.json')
with open(file_path, 'r') as file:
    json_data = file.read()

# Parse the JSON string into a dictionary
data = json.loads(json_data)

# Retrieve the lists from the dictionary
train_data = data['train']
test_data = data['test']

In [4]:
len(train_data)

596

# Preparing ConLL data for the dataloader

In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import DataLoader
from datasets import Dataset, DatasetDict, load_dataset, load_metric

In [6]:
def get_sentences_and_labels(BOI_list):
    # Initialize empty lists for sentences and word labels
    sentences = []
    word_labels = []

    # Process each input string
    for input_string in BOI_list:
        lines = input_string.split("\n")
        sentence = []
        labels = []
        for line in lines:
            if line.strip() != "":
                word, label = line.split(" ")
                sentence.append(word)
                labels.append(label)
        sentences.append(" ".join(sentence))
        word_labels.append(",".join(labels))

    # Create pandas DataFrame
    data_style = {"sentence": sentences, "word_labels": word_labels}
    data = pd.DataFrame(data_style)
    
    return sentences, word_labels, data


In [7]:
sentences, word_labels, data = get_sentences_and_labels(train_data)

In [8]:
len(sentences)

596

In [9]:
data.head()

Unnamed: 0,sentence,word_labels
0,Practices like CI/CD and automation have becom...,"O,O,B-Development_Scalability,O,B-Development_..."
1,"Research, on behalf of Atlassian, conducted an...","O,O,O,O,B-Company_Name,O,O,O,O,O,B-Internal_Or..."
2,On call improves the work product of developer...,"O,O,O,O,O,O,O,B-Internal_Organization,I-Intern..."
3,"Traditionally, many organizations have dedicat...","O,O,O,O,O,B-Internal_Organization,I-Internal_O..."
4,"A relatively new role popularized by Google, s...","O,O,O,O,O,O,B-Company_Name,B-Internal_Organiza..."


In [10]:
all_labels = set()
for input_string in train_data:
    lines = input_string.split("\n")
    for line in lines:
        if line.strip() != "":
            word, label = line.split(" ")
            all_labels.add(label)

In [11]:
labels_to_ids = {k: v for v, k in enumerate(all_labels)}
labels_to_ids

{'I-Data_Scalability': 0,
 'I-Company_Name': 1,
 'I-Internal_Organization': 2,
 'I-Userbase_Information': 3,
 'B-Transaction_Scalability': 4,
 'I-Development_Scalability': 5,
 'B-Userbase_Information': 6,
 'I-Software_Purpose': 7,
 'O': 8,
 'B-Software_Name': 9,
 'I-Transaction_Scalability': 10,
 'B-Software_Purpose': 11,
 'B-Development_Scalability': 12,
 'B-Company_Name': 13,
 'B-Data_Scalability': 14,
 'I-Software_Name': 15,
 'B-Internal_Organization': 16}

In [12]:
ids_to_labels = {id: tag for tag, id in labels_to_ids.items()}
ids_to_labels

{0: 'I-Data_Scalability',
 1: 'I-Company_Name',
 2: 'I-Internal_Organization',
 3: 'I-Userbase_Information',
 4: 'B-Transaction_Scalability',
 5: 'I-Development_Scalability',
 6: 'B-Userbase_Information',
 7: 'I-Software_Purpose',
 8: 'O',
 9: 'B-Software_Name',
 10: 'I-Transaction_Scalability',
 11: 'B-Software_Purpose',
 12: 'B-Development_Scalability',
 13: 'B-Company_Name',
 14: 'B-Data_Scalability',
 15: 'I-Software_Name',
 16: 'B-Internal_Organization'}

In [13]:
labels_to_ids = {'B-Development_Scalability': 0,
 'B-Company_Name': 1,
 'I-Internal_Organization': 2,
 'O': 3,
 'I-Software_Name': 4,
 'B-Transaction_Scalability': 5,
 'I-Company_Name': 6,
 'I-Data_Scalability': 7,
 'I-Software_Purpose': 8,
 'I-Transaction_Scalability': 9,
 'I-Userbase_Information': 10,
 'B-Software_Purpose': 11,
 'I-Development_Scalability': 12,
 'B-Userbase_Information': 13,
 'B-Software_Name': 14,
 'B-Data_Scalability': 15,
 'B-Internal_Organization': 16}

In [14]:
ids_to_labels = {0: 'B-Development_Scalability',
 1: 'B-Company_Name',
 2: 'I-Internal_Organization',
 3: 'O',
 4: 'I-Software_Name',
 5: 'B-Transaction_Scalability',
 6: 'I-Company_Name',
 7: 'I-Data_Scalability',
 8: 'I-Software_Purpose',
 9: 'I-Transaction_Scalability',
 10: 'I-Userbase_Information',
 11: 'B-Software_Purpose',
 12: 'I-Development_Scalability',
 13: 'B-Userbase_Information',
 14: 'B-Software_Name',
 15: 'B-Data_Scalability',
 16: 'B-Internal_Organization'}

In [15]:
label_list = ['B-Development_Scalability',
 'B-Company_Name',
 'I-Internal_Organization',
 'O',
 'I-Software_Name',
 'B-Transaction_Scalability',
 'I-Company_Name',
 'I-Data_Scalability',
 'I-Software_Purpose',
 'I-Transaction_Scalability',
 'I-Userbase_Information',
 'B-Software_Purpose',
 'I-Development_Scalability',
 'B-Userbase_Information',
 'B-Software_Name',
 'B-Data_Scalability',
 'B-Internal_Organization']

In [16]:
def get_texts_and_tags(BOI_list):
    # Initialize empty lists for sentences and word labels
    texts = []
    tags = []

    # Process each input string
    for input_string in BOI_list:
        lines = input_string.split("\n")
        sentence = []
        labels = []
        for line in lines:
            if line.strip() != "":
                word, label = line.split(" ")
                sentence.append(word)
                labels.append(labels_to_ids[label])
        texts.append(sentence)
        tags.append(labels)
    
    return texts, tags


In [17]:
texts, tags = get_texts_and_tags(train_data)

In [18]:
test_texts, test_tags = get_texts_and_tags(test_data)

In [19]:
# Splitting into training and validation data
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.1)

In [20]:
len(train_texts)

536

In [21]:
len(val_texts)

60

In [22]:
def construct_dataset(texts, tags):
    data = {}
    data["id"] = [x for x in range(0,len(texts))]
    data["tokens"] = texts
    data["ner_tags"] = tags
    return Dataset.from_dict(data)

In [23]:
train_dataset = construct_dataset(train_texts, train_tags)
val_dataset = construct_dataset(val_texts, val_tags)
test_dataset = construct_dataset(test_texts, test_tags)

In [24]:
datasets = DatasetDict({"train": train_dataset, "validation": val_dataset, "test": test_dataset})

In [25]:
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 536
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 60
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 149
    })
})

In [26]:
# The rest of the tokenization is based on this source
# https://huggingface.co/docs/transformers/tasks/token_classification

# Preparing the dataset and dataloader

In [27]:
from transformers import AutoTokenizer

In [28]:
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")

In [29]:
example = datasets["train"][1]
print(example["tokens"])

['AWS', 'implements', 'formal,', 'documented', 'policies', 'and', 'procedures', 'that', 'provide', 'guidance', 'for', 'operations', 'and', 'information', 'security', 'within', 'the', 'organization', 'and', 'the', 'supporting', 'AWS', 'environments.', 'Policies', 'address', 'purpose,', 'scope,', 'roles,', 'responsibilities', 'and', 'management', 'commitment.', 'All', 'policies', 'are', 'maintained', 'in', 'a', 'centralized', 'location', 'that', 'is', 'accessible', 'by', 'employees.']


In [30]:
print(example["ner_tags"])

[14, 4, 3, 0, 12, 3, 0, 12, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 12, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 16]


In [31]:
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', '▁a', 'ws', '▁implement', 's', '▁formal', ',', '▁documented', '▁policies', '▁and', '▁procedures', '▁that', '▁provide', '▁guidance', '▁for', '▁operations', '▁and', '▁information', '▁security', '▁within', '▁the', '▁organization', '▁and', '▁the', '▁supporting', '▁a', 'ws', '▁environments', '.', '▁policies', '▁address', '▁purpose', ',', '▁scope', ',', '▁roles', ',', '▁', 'responsibilities', '▁and', '▁management', '▁commitment', '.', '▁all', '▁policies', '▁are', '▁maintained', '▁in', '▁a', '▁centralized', '▁location', '▁that', '▁is', '▁accessible', '▁by', '▁employees', '.', '[SEP]']


In [32]:
word_ids = tokenized_input.word_ids()
aligned_labels = [-100 if i is None else example["ner_tags"][i] for i in word_ids]
print(len(aligned_labels), len(tokenized_input["input_ids"]))

58 58


In [33]:
print(word_ids)

[None, 0, 0, 1, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 21, 22, 22, 23, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 30, 31, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 44, None]


In [34]:
aligned_labels

[-100,
 14,
 14,
 4,
 4,
 3,
 3,
 0,
 12,
 3,
 0,
 12,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 0,
 0,
 12,
 12,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 16,
 16,
 -100]

In [35]:
label_all_tokens = True

In [36]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [37]:
tokenize_and_align_labels(datasets['train'][:5])

{'input_ids': [[2, 158, 4823, 3501, 21, 2592, 4024, 128, 14, 10058, 15, 1895, 173, 15, 5282, 118, 19174, 11747, 15, 17, 2455, 329, 11747, 9, 3], [2, 21, 10268, 8713, 18, 3337, 15, 8926, 4845, 17, 8876, 30, 1181, 8193, 26, 1311, 17, 676, 1221, 363, 14, 1165, 17, 14, 3134, 21, 10268, 11246, 9, 4845, 3218, 2131, 15, 9914, 15, 2954, 15, 13, 10525, 17, 1097, 6578, 9, 65, 4845, 50, 3926, 19, 21, 25176, 1474, 30, 25, 7342, 34, 3716, 9, 3], [2, 11817, 18, 6127, 50, 3638, 71, 20, 21, 4496, 16, 331, 13, 11872, 16, 600, 8582, 726, 416, 4851, 13, 5, 3970, 14, 4851, 1, 18, 375, 4326, 6, 9, 3], [2, 32, 1927, 1516, 8, 23702, 17, 10719, 3186, 579, 2301, 133, 3108, 79, 1880, 85, 8, 17601, 1603, 8674, 145, 28, 7610, 8738, 68, 15, 6018, 18161, 15, 7331, 68, 17, 10119, 18, 9, 3], [2, 7974, 414, 8330, 129, 4077, 4721, 5350, 7951, 20, 953, 17, 3343, 7951, 9, 1086, 15, 931, 2443, 1880, 414, 20, 21, 64, 8, 1898, 28804, 491, 44, 21, 9857, 274, 19, 14, 704, 876, 76, 5350, 1238, 13328, 50, 117, 28, 12760, 2985, 

In [38]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/536 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/149 [00:00<?, ? examples/s]

# Importing the Albert Model

In [39]:
# Imports
from transformers import AlbertForTokenClassification 
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
import torch

In [40]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU")

GPU is available


In [41]:
print(f"Is CUDA available: {torch.cuda.is_available()}")
# True
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
# Tesla T4

Is CUDA available: True
CUDA device: NVIDIA GeForce GTX 1650 with Max-Q Design


In [42]:

print(torch.__version__)

2.0.1+cu117


In [43]:
model_checkpoint = "albert-base-v2"

In [44]:
# Importing the model
model = AlbertForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForTokenClassification: ['predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.bias', 'predictions.dense.weight', 'predictions.bias', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-st

In [45]:
model.to(device)

AlbertForTokenClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bi

In [46]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    push_to_hub=True,
    logging_steps = 20,
    logging_dir='./logs',  # Directory for storing training logs
    
)

In [47]:
data_collator = DataCollatorForTokenClassification(tokenizer)

# Compute metrics and Training

In [48]:
import evaluate
import json
from transformers import pipeline, AutoModelForTokenClassification

In [49]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [50]:
# metric = evaluate.load('exact_match')

In [51]:
labels = [label_list[i] for i in example["ner_tags"]]
metric.compute(predictions=[labels], references=[labels])

{'Development_Scalability': {'precision': 1.0,
  'recall': 1.0,
  'f1': 1.0,
  'number': 3},
 'Internal_Organization': {'precision': 1.0,
  'recall': 1.0,
  'f1': 1.0,
  'number': 1},
 'Software_Name': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [54]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [55]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Cloning https://huggingface.co/athuln/albert-base-v2-finetuned-ner into local empty directory.


Download file pytorch_model.bin:   0%|          | 16.5k/42.4M [00:00<?, ?B/s]

Download file training_args.bin: 100%|##########| 3.81k/3.81k [00:00<?, ?B/s]

Clean file training_args.bin:  26%|##6       | 1.00k/3.81k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/42.4M [00:00<?, ?B/s]

In [56]:
trainer.train()

You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.1758,1.026851,0.196891,0.114114,0.144487,0.71185
2,0.8015,0.764348,0.327189,0.213213,0.258182,0.753826
3,0.5537,0.64217,0.393491,0.399399,0.396423,0.803673
4,0.4523,0.593323,0.431085,0.441441,0.436202,0.816791
5,0.385,0.625691,0.40458,0.477477,0.438017,0.805422
6,0.2843,0.610857,0.468391,0.489489,0.478708,0.814167
7,0.2254,0.631398,0.494318,0.522523,0.508029,0.821163
8,0.1635,0.661203,0.490196,0.525526,0.507246,0.816353
9,0.1412,0.682776,0.48062,0.558559,0.516667,0.814604
10,0.0856,0.694013,0.482667,0.543544,0.511299,0.814604


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=670, training_loss=0.45554525371807725, metrics={'train_runtime': 227.9132, 'train_samples_per_second': 23.518, 'train_steps_per_second': 2.94, 'total_flos': 19548914001552.0, 'train_loss': 0.45554525371807725, 'epoch': 10.0})

In [62]:
version = "all_entites_1"
save_name = "ner_model_"+ version

In [66]:
log_data = trainer.state.log_history
log_data

[{'loss': 1.6129,
  'learning_rate': 1.9402985074626868e-05,
  'epoch': 0.3,
  'step': 20},
 {'loss': 1.2785,
  'learning_rate': 1.8805970149253735e-05,
  'epoch': 0.6,
  'step': 40},
 {'loss': 1.1758,
  'learning_rate': 1.8208955223880598e-05,
  'epoch': 0.9,
  'step': 60},
 {'eval_loss': 1.0268510580062866,
  'eval_precision': 0.19689119170984457,
  'eval_recall': 0.11411411411411411,
  'eval_f1': 0.1444866920152091,
  'eval_accuracy': 0.7118495846086577,
  'eval_runtime': 0.7669,
  'eval_samples_per_second': 78.237,
  'eval_steps_per_second': 10.432,
  'epoch': 1.0,
  'step': 67},
 {'loss': 1.001,
  'learning_rate': 1.7611940298507464e-05,
  'epoch': 1.19,
  'step': 80},
 {'loss': 0.8433,
  'learning_rate': 1.701492537313433e-05,
  'epoch': 1.49,
  'step': 100},
 {'loss': 0.8015,
  'learning_rate': 1.6417910447761197e-05,
  'epoch': 1.79,
  'step': 120},
 {'eval_loss': 0.7643482685089111,
  'eval_precision': 0.3271889400921659,
  'eval_recall': 0.2132132132132132,
  'eval_f1': 0.258

In [67]:
with open(f'log/{save_name}.json', 'w') as f:
    json.dump(log_data, f)

In [63]:
# Saving Model
model.save_pretrained(save_name)

In [65]:
tokenizer.save_pretrained("tokenizer")

('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\tokenizer.json')

In [None]:
id2label = {
    str(i): label for i,label in enumerate(label_list)
}
label2id = {
    label: str(i) for i,label in enumerate(label_list)
}

In [None]:
config = json.load(open(f"{save_name}/config.json"))

In [None]:
config["id2label"] = id2label
config["label2id"] = label2id

In [None]:
json.dump(config, open(f"{save_name}/config.json","w"))

In [None]:
model_fine_tuned = AutoModelForTokenClassification.from_pretrained(save_name)

In [None]:
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)


example = "Google has 100 users for their cloud platform."

ner_results = nlp(example)

print(ner_results)