In [None]:
import pandas as pd

In [None]:
# Parse the file at 'data' into one list of whitespace-separated items per line.
# NOTE(review): every loader cell in this notebook opens the same path 'data' —
# confirm which file is meant to feed train_tags.
with open('data', 'r') as train_file_tags:
    # str.split() with no arguments already drops the trailing newline and any
    # surrounding whitespace, so no separate rstrip() pass is needed.
    train_tags = [line.split() for line in train_file_tags]

In [None]:
# Parse the file at 'data' into one list of whitespace-separated items per line.
# NOTE(review): every loader cell in this notebook opens the same path 'data' —
# confirm which file is meant to feed train_tokens.
with open('data', 'r') as train_file_tokens:
    # str.split() with no arguments already drops the trailing newline and any
    # surrounding whitespace, so no separate rstrip() pass is needed.
    train_tokens = [line.split() for line in train_file_tokens]

In [None]:
# Sanity check: the tag list and the token list should have the same number of rows.
# (The original compared len(train_tags) with itself, so train_tokens was never checked.)
len(train_tags), len(train_tokens)

In [None]:
# Parse the file at 'data' into one list of whitespace-separated items per line.
# NOTE(review): every loader cell in this notebook opens the same path 'data' —
# confirm which file is meant to feed test_tags.
with open('data', 'r') as test_file_tags:
    # str.split() with no arguments already drops the trailing newline and any
    # surrounding whitespace, so no separate rstrip() pass is needed.
    test_tags = [line.split() for line in test_file_tags]

In [None]:
# Parse the file at 'data' into one list of whitespace-separated items per line.
# NOTE(review): every loader cell in this notebook opens the same path 'data' —
# confirm which file is meant to feed test_tokens.
with open('data', 'r') as test_file_tokens:
    # str.split() with no arguments already drops the trailing newline and any
    # surrounding whitespace, so no separate rstrip() pass is needed.
    test_tokens = [line.split() for line in test_file_tokens]

In [None]:
# Sanity check: the tag list and the token list should have the same number of rows.
# (The original compared len(test_tags) with itself, so test_tokens was never checked.)
len(test_tags), len(test_tokens)

In [None]:
from datasets import Dataset, DatasetDict

# Training split: pair each row of tokens with its row of string tags.
df = pd.DataFrame({'tokens': train_tokens, 'ner_tags_str': train_tags})
train = Dataset.from_pandas(df)

# Test split, built the same way (df is reused for the intermediate frame).
df = pd.DataFrame({'tokens': test_tokens, 'ner_tags_str': test_tags})
test = Dataset.from_pandas(df)

# Register the splits in the same order as before.
# NOTE(review): 'validation' is the very same Dataset object as 'test', so any
# evaluation on 'validation' is an evaluation on the test data — confirm intended.
dataset = DatasetDict()
dataset['train'] = train
dataset['test'] = test
dataset['validation'] = test

dataset

In [None]:
# Gather every distinct tag string that appears in the training split.
unique_tags = {tag for tags in dataset['train']['ner_tags_str'] for tag in tags}

unique_tags

In [None]:
# Reduce the training tags to their bare entity types: skip 'O' and drop the first
# two characters of every other tag (the part that is re-added as 'B-'/'I-' when
# tag2index is built).
#
# Two fixes over the original list(set(...)) applied to the previous unique_tags:
#   * sorted() gives the same order on every kernel start; set iteration order for
#     strings is not stable across Python processes, so the resulting label ids
#     could change from run to run.
#   * the types are derived from the dataset rather than from unique_tags itself,
#     so re-running this cell no longer strips another two characters each time.
unique_tags = sorted({
    tag[2:]
    for tags in dataset['train']['ner_tags_str']
    for tag in tags
    if tag != 'O'
})
unique_tags

In [None]:
# Build the tag -> integer id mapping. "O" is fixed at 0; every entity type then
# receives two consecutive ids, first for its 'B-' tag and then for its 'I-' tag.
# Using len(tag2index) as the next id keeps ids dense and in insertion order.
tag2index = {"O": 0}
# Iterate in sorted order so the ids do not depend on the (possibly set-derived)
# order of unique_tags; the unused enumerate() index has been dropped.
for tag in sorted(unique_tags):
    tag2index[f'B-{tag}'] = len(tag2index)
    tag2index[f'I-{tag}'] = len(tag2index)

tag2index

In [None]:
# Peek at the first example of the training split.
dataset['train'][0]

In [None]:
# Reverse lookup of tag2index: integer id -> tag string.
index2tag = dict(zip(tag2index.values(), tag2index.keys()))
index2tag

In [None]:
# Add an integer 'ner_tags' column to every split by looking each string tag of
# 'ner_tags_str' up in tag2index (one example at a time).
dataset = dataset.map(
    lambda row: {"ner_tags": [tag2index[name] for name in row['ner_tags_str']]}
)
dataset

In [None]:
from transformers import AutoTokenizer

In [None]:
# Checkpoint identifier/path shared by the tokenizer and, later, the model.
# NOTE(review): "data" is also the path used for the input files and the saved
# model in this notebook — confirm the intended checkpoint.
model_ckpt = "data"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach per-token label ids.

    Args:
        examples: batch mapping with 'tokens' (one word list per sentence) and
            'ner_tags' (one integer label list per sentence, indexed by word).

    Returns:
        The tokenizer output for the batch with an extra 'labels' entry holding
        one label-id list per sentence, aligned to the produced tokens.

    Alignment rule, per token:
        * tokens with no word id (word id is None) get -100;
        * the first token of each word gets that word's label;
        * further tokens of the same word get -100.
    -100 is presumably the value the loss ignores — confirm against the model.
    """
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    def _align(word_labels, word_ids):
        # Walk the tokens once, remembering the word id of the previous token so
        # only the first token of every word receives a real label.
        aligned = []
        previous_word_idx = None
        for word_idx in word_ids:
            starts_new_word = word_idx is not None and word_idx != previous_word_idx
            aligned.append(word_labels[word_idx] if starts_new_word else -100)
            previous_word_idx = word_idx
        return aligned

    tokenized_inputs['labels'] = [
        _align(word_labels, tokenized_inputs.word_ids(batch_index=row))
        for row, word_labels in enumerate(examples['ner_tags'])
    ]

    return tokenized_inputs
    


In [None]:
# Run tokenize_and_align_labels over the dataset in batches (batched=True means the
# function receives lists of examples rather than a single example).
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
tokenized_dataset

In [None]:
# Inspect one tokenized training example (index 2).
tokenized_dataset['train'][2]

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator built from the tokenizer; it is passed to the Trainer as data_collator.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
import evaluate
import numpy as np

# seqeval metric object used by compute_metrics.
metric = evaluate.load('seqeval')
# Tag strings in dict insertion order. Position i holds the tag with id i only
# because tag2index was filled with consecutive ids starting at 0.
label_names = list(tag2index.keys())

def compute_metrics(eval_preds):
    """Turn raw evaluation outputs into overall seqeval scores.

    Args:
        eval_preds: pair ``(logits, labels)``; the predicted id for each position
            is the argmax of ``logits`` over its last axis.

    Returns:
        dict with 'precision', 'recall', 'f1' and 'accuracy', taken from the
        'overall_*' entries of the metric result.

    Positions whose label is -100 are dropped from both references and
    predictions before scoring.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Reference tag strings, skipping the -100 positions.
    true_labels = []
    for label_row in labels:
        true_labels.append([label_names[l] for l in label_row if l != -100])

    # Predicted tag strings at exactly the positions kept above.
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        true_predictions.append(
            [label_names[p] for p, l in zip(pred_row, label_row) if l != -100]
        )

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = all_metrics['overall_precision']
    summary['recall'] = all_metrics['overall_recall']
    summary['f1'] = all_metrics['overall_f1']
    summary['accuracy'] = all_metrics['overall_accuracy']
    return summary

In [None]:
from transformers import AutoModelForTokenClassification

# Load a token-classification model from model_ckpt and hand it both label
# mappings (id -> tag and tag -> id).
model = AutoModelForTokenClassification.from_pretrained(model_ckpt, id2label=index2tag, label2id=tag2index)

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: outputs go to "finetuned-ner"; evaluation and checkpoint
# saving both happen once per epoch.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    output_dir="finetuned-ner",
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [None]:
# Assemble the Trainer from the model, the training arguments, the tokenized
# train/validation splits, the collator and the tokenizer.
trainer = Trainer(model=model, args=args,
                  train_dataset=tokenized_dataset['train'],
                  eval_dataset=tokenized_dataset['validation'],
                  data_collator=data_collator,
                  # Previously commented out, which left compute_metrics (and the
                  # loaded seqeval metric) unused and made each evaluation report
                  # only the loss. Enabled so precision/recall/F1/accuracy are
                  # reported at every evaluation.
                  compute_metrics=compute_metrics,
                  tokenizer=tokenizer)

In [None]:
# Start fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Save the trained model under the path "data".
# NOTE(review): the same string is used for the input files and model_ckpt —
# confirm the intended output location.
trainer.save_model("data")

In [None]:
from transformers import pipeline

# Build a token-classification pipeline from the path the model was saved to.
checkpoint = "data"
# aggregation_strategy='simple' presumably merges token-level predictions into
# entity spans — confirm against the pipeline documentation.
pipe = pipeline('token-classification', model=checkpoint, aggregation_strategy='simple')

In [None]:
# Try the pipeline on a sample sentence.
# NOTE(review): "shushi" looks like a misspelling of "sushi"; left untouched because
# it is runtime input and may be deliberate.
pipe("which restaurant serves the best shushi in new york?")