# Install Dependencies

In [None]:
!pip3 install transformers[torch]

In [68]:
!pip install spacy

Collecting spacy
  Downloading spacy-3.6.0-cp39-cp39-win_amd64.whl (12.3 MB)
                                              0.0/12.3 MB ? eta -:--:--
                                              0.2/12.3 MB 6.3 MB/s eta 0:00:02
     -                                        0.6/12.3 MB 7.1 MB/s eta 0:00:02
     ---                                      0.9/12.3 MB 7.5 MB/s eta 0:00:02
     ----                                     1.4/12.3 MB 8.0 MB/s eta 0:00:02
     ------                                   1.9/12.3 MB 8.6 MB/s eta 0:00:02
     ------                                   2.1/12.3 MB 7.8 MB/s eta 0:00:02
     -------                                  2.4/12.3 MB 7.7 MB/s eta 0:00:02
     ---------                                2.9/12.3 MB 8.1 MB/s eta 0:00:02
     -----------                              3.4/12.3 MB 8.4 MB/s eta 0:00:02
     ------------                             3.9/12.3 MB 8.5 MB/s eta 0:00:01
     --------------                           4.4/12.3 MB 8.7

In [None]:
!pip install evaluate

In [None]:
!pip3 install seqeval

In [None]:
!pip3 install accelerate -U

In [None]:
!pip3 install jsonl-to-conll

In [None]:
!pip3 install -U scikit-learn

In [39]:
!pip3 install doccano-transformer

Collecting doccano-transformer
  Using cached doccano_transformer-1.0.2-py3-none-any.whl (6.4 kB)
Installing collected packages: doccano-transformer
Successfully installed doccano-transformer-1.0.2


In [None]:
!pip3 install transformers datasets tokenizers segeval -q 

# Converting JSONL to CoNLL

In [41]:
# Import helpers to convert the Doccano dataset into CoNLL format
from doccano_transformer.datasets import NERDataset
from doccano_transformer.utils import read_jsonl

In [42]:
import os

In [50]:
import json

In [44]:
# Get the current working directory
current_dir = os.getcwd()
print("Current Directory:", current_dir)
data_file = os.path.join(current_dir,'Labelled Data','labelled_data.jsonl')
print(data_file)

Current Directory: D:\UvA\Thesis\SE-Project\NER notebook
D:\UvA\Thesis\SE-Project\NER notebook\Labelled Data\labelled_data.jsonl


In [45]:
# Load the Doccano JSONL export as an NERDataset (UTF-8 encoded file).
dataset = read_jsonl(filepath=data_file, dataset=NERDataset, encoding='utf-8')
# Convert to CoNLL-2003 style records; str.split tokenizes on whitespace.
conll_data = dataset.to_conll2003(tokenizer=str.split)

In [46]:
BOI_list = []

In [47]:
# Rebuild the list from scratch so re-running this cell never appends duplicates.
BOI_list = []

for document in conll_data:
    # Drop the CoNLL document marker and blank separator lines.
    token_lines = [
        row for row in document['data'].strip().split('\n')
        if row != '-DOCSTART- -X- -X- O' and row.strip() != ''
    ]
    # Strip the "_ _ " filler columns so each row reads "<token> <tag>".
    BOI_list.append('\n'.join(row.replace("_ _ ", "") for row in token_lines))

In [48]:
print(BOI_list)

['ALLDATA B-CS_Name\nprovides O\ninnovative O\nsoftware O\nsolutions O\nthat O\nconnect B-Software_Purpose\nautomotive I-Software_Purpose\nrepair I-Software_Purpose\ntechnicians I-Software_Purpose\nwith O\nthe O\ndiagnostic B-Software_Purpose\nand I-Software_Purpose\nrepair B-Software_Purpose\ninformation I-Software_Purpose\nthey O\nneed O\nfrom O\noriginal O\nequipment O\nmanufacturers O\n(OEMs). O', 'At O\nAct-On, B-CS_Name\nwe’re O\nnot O\nshy O\nabout O\nsaying O\nthat O\nour B-Internal_Organization\nDeliverability I-Internal_Organization\nTeam I-Internal_Organization\nis O\nthe O\nbest O\nin O\nthe O\nbusiness. O', 'After O\nsome O\nanxiety-fueled O\nGoogle O\nsearches, O\nhe O\ncontacted O\nhis O\ncustomer B-CS_Name\nsuccess I-CS_Name\nmanager, I-CS_Name\nwho O\nimmediately O\nset O\nup O\na O\ncall O\nwith O\nAct-On’s B-CS_Name\nsecret O\nweapon: O\nthe O\nDeliverability B-Internal_Organization\nTeam. I-Internal_Organization', "Act-On's B-CS_Name\nemail B-Internal_Organization\n

# Saving data as 5-fold cross-validation splits

In [51]:
from sklearn.model_selection import KFold

In [52]:
k = 5

kf = KFold(n_splits=k)

# Write one JSON file per fold: dataset0.json ... dataset{k-1}.json.
for fold_idx, (train_index, test_index) in enumerate(kf.split(BOI_list)):
    train_data = [BOI_list[idx] for idx in train_index]
    test_data = [BOI_list[idx] for idx in test_index]
    data = {'train': train_data, 'test': test_data}

    file_path = os.path.join(current_dir, 'Labelled Data', f'dataset{fold_idx}.json')
    # Explicit encoding keeps the files portable across platforms.
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file)
    

In [53]:
# Load one of the saved folds back from disk (fold 0 here).
i = 0
current_dir = os.getcwd()
file_path = os.path.join(current_dir, 'Labelled Data', f'dataset{i}.json')

with open(file_path, 'r') as file:
    json_data = file.read()

# Decode the JSON text and unpack the two splits in one step.
data = json.loads(json_data)
train_data, test_data = data['train'], data['test']

In [54]:
len(train_data)

596

# Preparing CoNLL data for the dataloader

In [55]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import DataLoader
from datasets import Dataset, DatasetDict, load_dataset, load_metric

In [56]:
def get_sentences_and_labels(BOI_list):
    """Turn "<token> <tag>" blocks into sentence strings and tag strings.

    Each item of ``BOI_list`` is a newline-separated block with one
    space-separated ``token tag`` pair per line; blank lines are ignored.

    Returns a 3-tuple ``(sentences, word_labels, data)``: the space-joined
    tokens per block, the comma-joined tags per block, and a pandas DataFrame
    holding those two lists as the ``sentence`` and ``word_labels`` columns.
    """
    sentences = []
    word_labels = []

    for block in BOI_list:
        tokens = []
        tag_names = []
        for row in block.split("\n"):
            if row.strip() == "":
                continue
            # Unpacking requires exactly two space-separated fields per row.
            token, tag = row.split(" ")
            tokens.append(token)
            tag_names.append(tag)
        sentences.append(" ".join(tokens))
        word_labels.append(",".join(tag_names))

    data = pd.DataFrame({"sentence": sentences, "word_labels": word_labels})
    return sentences, word_labels, data


In [57]:
sentences, word_labels, data = get_sentences_and_labels(train_data)

In [58]:
len(sentences)

596

In [59]:
data.head()

Unnamed: 0,sentence,word_labels
0,Practices like CI/CD and automation have becom...,"O,O,B-Development_Scalability,O,B-Development_..."
1,"Research, on behalf of Atlassian, conducted an...","O,O,O,O,B-CS_Name,O,O,O,O,O,B-Internal_Organiz..."
2,On call improves the work product of developer...,"O,O,O,O,O,O,O,B-Internal_Organization,I-Intern..."
3,"Traditionally, many organizations have dedicat...","O,O,O,O,O,B-Internal_Organization,I-Internal_O..."
4,"A relatively new role popularized by Google, s...","O,O,O,O,O,O,B-CS_Name,B-Internal_Organization,..."


In [60]:
# Collect every distinct tag that occurs in the training split.
all_labels = {
    tag
    for block in train_data
    for row in block.split("\n")
    if row.strip() != ""
    # Unpacking a one-element list keeps the "exactly two fields" check per row.
    for _token, tag in [row.split(" ")]
}

In [61]:
# Assign an integer id to every tag. NOTE: all_labels is a set, so the
# enumeration order (and therefore the ids) can differ between interpreter
# runs; a later cell assigns labels_to_ids explicitly.
labels_to_ids = {k: v for v, k in enumerate(all_labels)}
labels_to_ids

{'B-Transaction_Scalability': 0,
 'I-Transaction_Scalability': 1,
 'I-Software_Purpose': 2,
 'B-Development_Scalability': 3,
 'I-Development_Scalability': 4,
 'B-Userbase_Information': 5,
 'I-Userbase_Information': 6,
 'B-Data_Scalability': 7,
 'I-CS_Name': 8,
 'B-Software_Purpose': 9,
 'B-Internal_Organization': 10,
 'I-Internal_Organization': 11,
 'I-Data_Scalability': 12,
 'B-CS_Name': 13,
 'O': 14}

In [62]:
ids_to_labels = {id: tag for tag, id in labels_to_ids.items()}
ids_to_labels

{0: 'B-Transaction_Scalability',
 1: 'I-Transaction_Scalability',
 2: 'I-Software_Purpose',
 3: 'B-Development_Scalability',
 4: 'I-Development_Scalability',
 5: 'B-Userbase_Information',
 6: 'I-Userbase_Information',
 7: 'B-Data_Scalability',
 8: 'I-CS_Name',
 9: 'B-Software_Purpose',
 10: 'B-Internal_Organization',
 11: 'I-Internal_Organization',
 12: 'I-Data_Scalability',
 13: 'B-CS_Name',
 14: 'O'}

In [63]:
labels_to_ids = {'B-Transaction_Scalability': 0,
 'I-Transaction_Scalability': 1,
 'I-Software_Purpose': 2,
 'B-Development_Scalability': 3,
 'I-Development_Scalability': 4,
 'B-Userbase_Information': 5,
 'I-Userbase_Information': 6,
 'B-Data_Scalability': 7,
 'I-CS_Name': 8,
 'B-Software_Purpose': 9,
 'B-Internal_Organization': 10,
 'I-Internal_Organization': 11,
 'I-Data_Scalability': 12,
 'B-CS_Name': 13,
 'O': 14}

In [64]:
# Derive the inverse mapping from labels_to_ids instead of repeating the
# literal, so the two dictionaries can never drift apart.
ids_to_labels = {idx: tag for tag, idx in labels_to_ids.items()}

In [66]:
# Order the tag names by their id so that label_list[i] is the tag with id i;
# deriving it from labels_to_ids keeps a single source of truth.
label_list = sorted(labels_to_ids, key=labels_to_ids.get)

In [67]:
list(labels_to_ids.keys())

['B-Transaction_Scalability',
 'I-Transaction_Scalability',
 'I-Software_Purpose',
 'B-Development_Scalability',
 'I-Development_Scalability',
 'B-Userbase_Information',
 'I-Userbase_Information',
 'B-Data_Scalability',
 'I-CS_Name',
 'B-Software_Purpose',
 'B-Internal_Organization',
 'I-Internal_Organization',
 'I-Data_Scalability',
 'B-CS_Name',
 'O']

In [68]:
def get_texts_and_tags(BOI_list, label_map=None):
    """Split "<token> <tag>" blocks into token lists and tag-id lists.

    Args:
        BOI_list: iterable of newline-separated blocks, one space-separated
            ``token tag`` pair per line; blank lines are ignored.
        label_map: optional mapping from tag name to integer id. When omitted,
            the module-level ``labels_to_ids`` is used, as before.

    Returns:
        ``(texts, tags)`` where ``texts[i]`` is the token list of block ``i``
        and ``tags[i]`` the matching list of tag ids.

    Raises:
        ValueError: if a non-blank line does not hold exactly two
            space-separated fields.
        KeyError: if a tag is missing from the label mapping.
    """
    if label_map is None:
        # Fall back to the notebook-wide mapping to stay backward compatible.
        label_map = labels_to_ids

    texts = []
    tags = []
    for block in BOI_list:
        tokens = []
        tag_ids = []
        for row in block.split("\n"):
            if row.strip() == "":
                continue
            token, tag = row.split(" ")
            tokens.append(token)
            tag_ids.append(label_map[tag])
        texts.append(tokens)
        tags.append(tag_ids)

    return texts, tags


In [69]:
texts, tags = get_texts_and_tags(train_data)

In [70]:
test_texts, test_tags = get_texts_and_tags(test_data)

In [71]:
# Splitting into training and validation data
from sklearn.model_selection import train_test_split

# Fixed seed so the 90/10 train/validation split is reproducible across runs.
train_texts, val_texts, train_tags, val_tags = train_test_split(
    texts, tags, test_size=.1, random_state=42
)

In [72]:
len(train_texts)

536

In [73]:
len(val_texts)

60

In [74]:
def construct_dataset(texts, tags):
    """Wrap parallel token and tag lists in a ``Dataset``.

    The resulting columns are ``id`` (running index starting at 0),
    ``tokens`` (``texts``) and ``ner_tags`` (``tags``).
    """
    columns = {
        "id": list(range(len(texts))),
        "tokens": texts,
        "ner_tags": tags,
    }
    return Dataset.from_dict(columns)

In [75]:
train_dataset = construct_dataset(train_texts, train_tags)
val_dataset = construct_dataset(val_texts, val_tags)
test_dataset = construct_dataset(test_texts, test_tags)

In [76]:
datasets = DatasetDict({"train": train_dataset, "validation": val_dataset, "test": test_dataset})

In [77]:
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 536
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 60
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 149
    })
})

In [78]:
# The rest of the tokenization is based on this source
# https://huggingface.co/docs/transformers/tasks/token_classification

# Preparing the dataset and dataloader

In [79]:
from transformers import AutoTokenizer

In [80]:
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")

In [81]:
example = datasets["train"][1]
print(example["tokens"])

['For', 'every', 'DB', 'engine', 'except', 'RDS', 'for', 'SQL', 'Server,', 'you', 'can', 'provision', 'additional', 'IOPS', 'and', 'storage', 'throughput', 'when', 'storage', 'size', 'is', 'at', 'or', 'above', 'the', 'threshold', 'value.', 'For', 'RDS', 'for', 'SQL', 'Server,', 'you', 'can', 'provision', 'additional', 'IOPS', 'and', 'storage', 'throughput', 'for', 'any', 'available', 'storage', 'size.', 'For', 'all', 'DB', 'engines,', 'you', 'pay', 'for', 'only', 'the', 'additional', 'provisioned', 'storage', 'performance.']


In [82]:
print(example["ner_tags"])

[14, 14, 13, 8, 14, 7, 12, 7, 12, 14, 14, 14, 14, 7, 12, 7, 0, 1, 7, 12, 14, 14, 14, 14, 14, 7, 12, 14, 7, 12, 7, 12, 14, 14, 14, 14, 7, 12, 7, 12, 0, 14, 14, 7, 12, 14, 14, 13, 8, 14, 14, 14, 14, 14, 14, 7, 12, 12]


In [83]:
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', '▁for', '▁every', '▁', 'db', '▁engine', '▁except', '▁', 'rd', 's', '▁for', '▁sq', 'l', '▁server', ',', '▁you', '▁can', '▁provision', '▁additional', '▁i', 'ops', '▁and', '▁storage', '▁through', 'put', '▁when', '▁storage', '▁size', '▁is', '▁at', '▁or', '▁above', '▁the', '▁threshold', '▁value', '.', '▁for', '▁', 'rd', 's', '▁for', '▁sq', 'l', '▁server', ',', '▁you', '▁can', '▁provision', '▁additional', '▁i', 'ops', '▁and', '▁storage', '▁through', 'put', '▁for', '▁any', '▁available', '▁storage', '▁size', '.', '▁for', '▁all', '▁', 'db', '▁engines', ',', '▁you', '▁pay', '▁for', '▁only', '▁the', '▁additional', '▁provision', 'ed', '▁storage', '▁performance', '.', '[SEP]']


In [84]:
word_ids = tokenized_input.word_ids()
aligned_labels = [-100 if i is None else example["ner_tags"][i] for i in word_ids]
print(example["ner_tags"])
print(len(aligned_labels), len(tokenized_input["input_ids"]))

[14, 14, 13, 8, 14, 7, 12, 7, 12, 14, 14, 14, 14, 7, 12, 7, 0, 1, 7, 12, 14, 14, 14, 14, 14, 7, 12, 14, 7, 12, 7, 12, 14, 14, 14, 14, 7, 12, 7, 12, 0, 14, 14, 7, 12, 14, 14, 13, 8, 14, 14, 14, 14, 14, 14, 7, 12, 12]
79 79


In [85]:
print(word_ids)

[None, 0, 1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 28, 28, 29, 30, 30, 31, 31, 32, 33, 34, 35, 36, 36, 37, 38, 39, 39, 40, 41, 42, 43, 44, 44, 45, 46, 47, 47, 48, 48, 49, 50, 51, 52, 53, 54, 55, 55, 56, 57, 57, None]


In [86]:
aligned_labels

[-100,
 14,
 14,
 13,
 13,
 8,
 14,
 7,
 7,
 7,
 12,
 7,
 7,
 12,
 12,
 14,
 14,
 14,
 14,
 7,
 7,
 12,
 7,
 0,
 0,
 1,
 7,
 12,
 14,
 14,
 14,
 14,
 14,
 7,
 12,
 12,
 14,
 7,
 7,
 7,
 12,
 7,
 7,
 12,
 12,
 14,
 14,
 14,
 14,
 7,
 7,
 12,
 7,
 12,
 12,
 0,
 14,
 14,
 7,
 12,
 12,
 14,
 14,
 13,
 13,
 8,
 8,
 14,
 14,
 14,
 14,
 14,
 14,
 7,
 7,
 12,
 12,
 12,
 -100]

In [87]:
label_all_tokens = True

In [88]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and expand word-level tags to token level.

    Reads ``examples["tokens"]`` and ``examples["ner_tags"]``, runs the
    module-level ``tokenizer`` with truncation, and stores the aligned tag ids
    under ``"labels"`` in the returned tokenizer output.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(word_tags, word_ids):
        # Map one example's word-level tags onto its token positions.
        aligned = []
        previous = None
        for current in word_ids:
            if current is None:
                # Special tokens have no source word: -100 so the loss ignores them.
                aligned.append(-100)
            elif current != previous or label_all_tokens:
                # First token of a word, or any token when label_all_tokens is set.
                aligned.append(word_tags[current])
            else:
                aligned.append(-100)
            previous = current
        return aligned

    encoded["labels"] = [
        align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [89]:
tokenize_and_align_labels(datasets['train'][:5])

{'input_ids': [[2, 8059, 4492, 9007, 13, 5, 1410, 3521, 5474, 220, 13, 21618, 6, 25, 21, 1512, 15, 18957, 579, 15, 2157, 904, 15, 17, 2337, 1471, 4492, 6018, 365, 30, 6747, 3521, 5474, 220, 170, 8294, 18, 9, 3], [2, 26, 352, 13, 9007, 1406, 1613, 13, 897, 18, 26, 4444, 255, 8128, 15, 42, 92, 8738, 1351, 31, 11314, 17, 4326, 120, 4881, 76, 4326, 1072, 25, 35, 54, 784, 14, 11361, 1923, 9, 26, 13, 897, 18, 26, 4444, 255, 8128, 15, 42, 92, 8738, 1351, 31, 11314, 17, 4326, 120, 4881, 26, 186, 904, 4326, 1072, 9, 26, 65, 13, 9007, 4016, 15, 42, 1372, 26, 104, 14, 1351, 8738, 69, 4326, 956, 9, 3], [2, 8059, 4865, 9990, 7499, 1957, 11435, 18, 1054, 19, 2422, 15, 17, 2965, 3878, 20, 15436, 1054, 37, 8964, 84, 14, 317, 2620, 26, 18194, 18, 3767, 30, 50, 5547, 29, 21, 10268, 932, 6258, 15, 7974, 6587, 20, 1221, 8791, 9, 3], [2, 235, 1, 18, 132, 16, 14, 151, 14795, 95, 577, 120, 29, 14, 3228, 79, 1105, 2884, 1406, 15, 76, 13785, 15, 1313, 7798, 68, 15, 10353, 17, 6605, 125, 318, 2447, 5853, 9, 3],

In [90]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/536 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/149 [00:00<?, ? examples/s]

# Importing the ALBERT Model

In [91]:
# Imports
from transformers import AlbertForTokenClassification 
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
import torch

In [92]:
# Prefer the GPU when CUDA is usable, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU is available" if device.type == "cuda" else "GPU is not available, using CPU")

GPU is available


In [93]:
print(f"Is CUDA available: {torch.cuda.is_available()}")
# Only query the device name when CUDA is present; on a CPU-only machine the
# lookup would raise instead of falling back to the CPU.
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

Is CUDA available: True
CUDA device: NVIDIA GeForce GTX 1650 with Max-Q Design


In [94]:

print(torch.__version__)

2.0.1+cu117


In [95]:
model_checkpoint = "albert-base-v2"

In [96]:
# Importing the model
model = AlbertForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForTokenClassification: ['predictions.decoder.bias', 'predictions.LayerNorm.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.dense.weight', 'predictions.dense.bias']
- This IS expected if you are initializing AlbertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-st

In [97]:
model.to(device)

AlbertForTokenClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bi

In [98]:
# Name the output directory after the checkpoint (text after the last "/").
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-ner",
    evaluation_strategy = "epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    push_to_hub=True,  # NOTE(review): uploads to the Hugging Face Hub; presumably needs a logged-in account — confirm
    logging_steps = 20,
    logging_dir='./logs',  # Directory for storing training logs
    
)

In [99]:
data_collator = DataCollatorForTokenClassification(tokenizer)

# Compute metrics and Training

In [100]:
import evaluate
import json
from transformers import pipeline, AutoModelForTokenClassification

In [101]:
# evaluate.load supersedes datasets.load_metric, which emits a deprecation warning.
metric = evaluate.load("seqeval")

  metric = load_metric("seqeval")


In [102]:
# metric = evaluate.load('exact_match')

In [103]:
labels = [label_list[i] for i in example["ner_tags"]]
metric.compute(predictions=[labels], references=[labels])

{'CS_Name': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'Data_Scalability': {'precision': 1.0,
  'recall': 1.0,
  'f1': 1.0,
  'number': 12},
 'Transaction_Scalability': {'precision': 1.0,
  'recall': 1.0,
  'f1': 1.0,
  'number': 2},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [104]:
def compute_metrics(p):
    """Score token-level predictions with the module-level ``metric``.

    ``p`` unpacks into ``(predictions, labels)``. The predictions are
    arg-maxed over axis 2, positions whose label is -100 are dropped, and the
    remaining ids are mapped to tag names through ``label_list``.

    Returns a dict with ``precision``, ``recall``, ``f1`` and ``accuracy``
    taken from the ``overall_*`` entries of the metric result.
    """
    scores_per_class, gold_ids = p
    predicted_ids = np.argmax(scores_per_class, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions whose gold id is not the ignore index (-100).
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

In [106]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

D:\UvA\Thesis\SE-Project\NER notebook\albert-base-v2-finetuned-ner is already a clone of https://huggingface.co/athuln/albert-base-v2-finetuned-ner. Make sure you pull the latest changes with `repo.git_pull()`.


In [107]:
trainer.train()

You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.1079,1.007859,0.165414,0.068111,0.096491,0.7161
2,0.87,0.839534,0.247387,0.219814,0.232787,0.747235
3,0.6389,0.734065,0.311594,0.266254,0.287145,0.775502
4,0.4741,0.72585,0.340694,0.334365,0.3375,0.783286
5,0.3834,0.774274,0.329897,0.396285,0.360056,0.764031
6,0.2704,0.808178,0.394231,0.380805,0.387402,0.782057
7,0.2247,0.829968,0.394495,0.399381,0.396923,0.785334
8,0.1698,0.861513,0.39939,0.405573,0.402458,0.789021
9,0.1623,0.870591,0.383099,0.421053,0.40118,0.784924
10,0.1329,0.903929,0.39939,0.405573,0.402458,0.787382


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=670, training_loss=0.45084716900071103, metrics={'train_runtime': 229.0258, 'train_samples_per_second': 23.403, 'train_steps_per_second': 2.925, 'total_flos': 19458007935360.0, 'train_loss': 0.45084716900071103, 'epoch': 10.0})

In [108]:
version = "all_entites_1"
save_name = "ner_model_"+ version

In [109]:
log_data = trainer.state.log_history
log_data

[{'loss': 1.5639,
  'learning_rate': 1.9402985074626868e-05,
  'epoch': 0.3,
  'step': 20},
 {'loss': 1.2316,
  'learning_rate': 1.8805970149253735e-05,
  'epoch': 0.6,
  'step': 40},
 {'loss': 1.1079,
  'learning_rate': 1.8208955223880598e-05,
  'epoch': 0.9,
  'step': 60},
 {'eval_loss': 1.0078587532043457,
  'eval_precision': 0.16541353383458646,
  'eval_recall': 0.06811145510835913,
  'eval_f1': 0.09649122807017542,
  'eval_accuracy': 0.7160999590331831,
  'eval_runtime': 1.0172,
  'eval_samples_per_second': 58.984,
  'eval_steps_per_second': 7.865,
  'epoch': 1.0,
  'step': 67},
 {'loss': 0.947,
  'learning_rate': 1.7611940298507464e-05,
  'epoch': 1.19,
  'step': 80},
 {'loss': 0.8124,
  'learning_rate': 1.701492537313433e-05,
  'epoch': 1.49,
  'step': 100},
 {'loss': 0.87,
  'learning_rate': 1.6417910447761197e-05,
  'epoch': 1.79,
  'step': 120},
 {'eval_loss': 0.8395339250564575,
  'eval_precision': 0.24738675958188153,
  'eval_recall': 0.21981424148606812,
  'eval_f1': 0.232

In [110]:
# Create the target directory first; open() does not create missing folders.
os.makedirs('log', exist_ok=True)
with open(f'log/{save_name}.json', 'w') as f:
    json.dump(log_data, f)

In [111]:
# Saving Model
model.save_pretrained(save_name)

In [112]:
tokenizer.save_pretrained("tokenizer")

('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\tokenizer.json')

In [None]:
# JSON object keys are always strings, so id2label uses string keys;
# label2id maps each tag name to its integer class id (not a string).
id2label = {str(idx): tag for idx, tag in enumerate(label_list)}
label2id = {tag: idx for idx, tag in enumerate(label_list)}

In [None]:
# Read the saved model config; the context manager closes the file handle.
with open(f"{save_name}/config.json") as config_file:
    config = json.load(config_file)

In [None]:
config["id2label"] = id2label
config["label2id"] = label2id

In [None]:
# Write the patched config back; leaving the with-block closes the file, so the
# contents are flushed to disk before anything reads config.json again.
with open(f"{save_name}/config.json", "w") as config_file:
    json.dump(config, config_file)

In [None]:
model_fine_tuned = AutoModelForTokenClassification.from_pretrained(save_name)

In [None]:
# Build a "ner" pipeline around the reloaded model and the notebook's tokenizer.
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)


# Smoke-test sentence; note this rebinds `example`, which earlier held a dataset row.
example = "Google has 100 users for their cloud platform."

ner_results = nlp(example)

print(ner_results)