In [None]:
from transformers import BertForSequenceClassification, BertConfig, CharacterBertModel, \
CharacterBertForPreTraining, CharacterBertConfig, CharacterBertTokenizer
import torch
import os

In [None]:
file_path = "Bangla-NER-Splitted-Dataset.json"

In [None]:
import json
  
# Read the dataset file. The context manager closes the handle as soon as
# parsing is done instead of leaving it open for the lifetime of the kernel.
with open(file_path, mode="r", encoding="utf-8") as f:
    # json.load returns the parsed document; the cells below index it by
    # split name ('train', 'validation', 'test').
    json_data = json.load(f)

In [None]:
json_data.keys()

In [None]:
train_json_data = json_data['train']
val_json_data = json_data['validation']

In [None]:
test_json_data = json_data['test']

In [None]:
import pandas as pd
import re
import numpy as np

train_df = pd.DataFrame(columns=['ner_tags', 'tokens'])
val_df = pd.DataFrame(columns=['ner_tags', 'tokens'])

In [None]:
test_df = pd.DataFrame(columns=['ner_tags', 'tokens'])

In [None]:
def read_json_file(json_file, df):
    """Collect tokens and IOB tags from parsed documents into a DataFrame.

    Parameters
    ----------
    json_file : iterable of dict
        Every item must provide the keys ``'sentence'`` (stored as the tokens)
        and ``'iob_tags'`` (stored as the tags).
    df : pandas.DataFrame
        Template frame. Only its column order is used; the frame itself is
        not modified and rows it already holds are not copied to the result.

    Returns
    -------
    tuple
        ``(frame, token_docs, tag_docs)``: a new DataFrame with one row per
        document (columns ``'ner_tags'`` and ``'tokens'``), the list of
        per-document tokens, and the list of per-document tags.

    Raises
    ------
    KeyError
        If a document lacks ``'sentence'`` or ``'iob_tags'``.
    """
    token_docs = []
    tag_docs = []

    # Single pass so json_file may be any iterable, including a generator.
    for doc in json_file:
        token_docs.append(doc['sentence'])
        tag_docs.append(doc['iob_tags'])

    # Build the frame once. Enlarging it row by row through df.loc copies the
    # data on every insertion and mutates the caller's frame in place.
    frame = pd.DataFrame({
        'ner_tags': pd.Series(tag_docs, dtype=object),
        'tokens': pd.Series(token_docs, dtype=object),
    })

    # Follow the template's column layout when it declares one.
    if len(df.columns):
        frame = frame.reindex(columns=df.columns)

    return frame, token_docs, tag_docs

In [None]:
train_df,_,tag_docs = read_json_file(train_json_data, train_df)

In [None]:
val_df,_,_ = read_json_file(val_json_data, val_df)

In [None]:
# Build the test frame from its own template (test_df), not from val_df, so
# the validation and test splits stay independent of each other.
test_df,_,_ = read_json_file(test_json_data, test_df)

In [None]:
train_df[:1]

In [None]:
val_df[:1]

In [None]:
test_df[:1]

In [None]:
from datasets import Dataset, DatasetDict

train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

In [None]:
# Bundle the three splits under the names used throughout the notebook.
datasets = DatasetDict({
    'train': train_ds,
    'validation': val_ds,
    'test': test_ds,
})

In [None]:
datasets["train"][1]["tokens"]

In [None]:
datasets["train"][1]["ner_tags"]

In [None]:
# Flatten the per-sentence tag lists into one flat list of tags.
expanded_tag_docs = [tag for tags in tag_docs for tag in tags]

In [None]:
unique_tags = set(expanded_tag_docs)
unique_tags

In [None]:
def assign_label(examples):
    """Add integer label ids for the IOB tags stored in ``examples``.

    Reads ``examples["ner_tags"]``, looks every tag up in a fixed tag-to-id
    table, stores the resulting ids under ``examples["ner_labels"]`` and
    returns the same ``examples`` object. A tag missing from the table raises
    ``KeyError``.
    """
    tag_to_id = {
        'B-LOC': 0,
        'B-OBJ': 1,
        'B-ORG': 2,
        'B-PER': 3,
        'I-LOC': 4,
        'I-OBJ': 5,
        'I-ORG': 6,
        'I-PER': 7,
        'O': 8,
    }
    examples["ner_labels"] = [tag_to_id[tag] for tag in examples["ner_tags"]]
    return examples

In [None]:
datasets = datasets.map(assign_label)

In [None]:
from transformers import AutoTokenizer

tokenizer = CharacterBertTokenizer(strip_accents=None, do_lower_case=None)

In [None]:
inputs = tokenizer(' '.join(datasets["train"][0]["tokens"]))
# inputs = tokenizer('ত্রাণ ও সমাজকল্যাণ সম্পাদক সুজিত রায় নন্দী রমুখ সংবাদ সম্মেলনে উপস্থিত ছিলেন')

In [None]:
datasets["train"][0]["tokens"]

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per entry of ``word_ids``.

    A ``None`` word id (special token) maps to -100; any other word id ``w``
    maps to ``labels[w]``. Returns the new list; the inputs are not modified.
    """
    return [-100 if word_id is None else labels[word_id] for word_id in word_ids]

In [None]:
def get_word_ids(input_tokens):
    """Give every non-special entry of ``input_tokens`` a running word index.

    Entries equal to one of the tokenizer's pad/unk/cls/mask/sep ids map to
    ``None``; every other entry gets the next integer, starting at 0. Uses the
    notebook-level ``tokenizer``.
    """
    # Kept as a list so membership is decided by ==, which also works when the
    # entries are not hashable.
    # NOTE(review): the unk id is treated as special, so an unknown word gets
    # no word index -- confirm that this is intended.
    special_ids = [
        tokenizer.pad_token_id,
        tokenizer.unk_token_id,
        tokenizer.cls_token_id,
        tokenizer.mask_token_id,
        tokenizer.sep_token_id,
    ]

    word_ids = []
    next_word = 0
    for token in input_tokens:
        if token in special_ids:
            word_ids.append(None)
            continue
        word_ids.append(next_word)
        next_word += 1
    return word_ids

In [None]:
labels = datasets["train"][0]["ner_labels"]
word_ids = get_word_ids(inputs['input_ids'])
print(word_ids)
print(labels)
print(align_labels_with_tokens(labels, word_ids))

In [None]:
datasets["train"][0]['ner_tags']

In [None]:
# Tag -> id table (the same ids assign_label uses).
mapping = {'B-LOC':0, 'B-OBJ':1, 'B-ORG':2, 'B-PER':3, 'I-LOC':4, 'I-OBJ':5, 'I-ORG':6, 'I-PER':7, 'O':8}

# Order the names by their id so that label_names[i] is always the tag with
# id i, rather than relying on the insertion order of the dict literal.
label_names = sorted(mapping, key=mapping.get)

label_names

In [None]:
# def tokenize_and_align_labels(examples):
#     temp_examples = [' '.join(example) for example in examples["tokens"]]
#     #print(temp_examples)
#     tokenized_inputs = tokenizer(
#         temp_examples
#     )
#     all_labels = examples["ner_labels"]
#     new_labels = []
#     for i, labels in enumerate(all_labels):
#         word_ids = get_word_ids(tokenized_inputs['input_ids'][i])
#         print(len(word_ids))
#         new_labels.append(align_labels_with_tokens(labels, word_ids))
#         print(len(labels))

#     tokenized_inputs["labels"] = new_labels
#     return tokenized_inputs

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of sentences and attach one label list per sentence.

    Parameters
    ----------
    examples : mapping
        Batch with ``"tokens"`` (list of word lists) and ``"ner_labels"``
        (list of integer label lists, one label per word).

    Returns
    -------
    The tokenizer output with an added ``"labels"`` entry. Each label list is
    ``[-100] + word labels + [-100]`` and is never longer than the matching
    ``input_ids`` row.
    """
    sentences = [' '.join(words) for words in examples["tokens"]]
    # max_length only takes effect together with truncation, so request it
    # explicitly instead of relying on the tokenizer's fallback behaviour.
    tokenized_inputs = tokenizer(
        sentences, max_length=128, truncation=True
    )

    new_labels = []
    for i, labels in enumerate(examples["ner_labels"]):
        n_tokens = len(tokenized_inputs["input_ids"][i])
        # Two positions are reserved for the -100 placeholders at both ends;
        # cut the word labels the same way the input was truncated so labels
        # and inputs keep the same length.
        kept = labels[: max(n_tokens - 2, 0)]
        # NOTE(review): if the tokenizer splits one whitespace-separated word
        # into several tokens, the labels after it are shifted -- TODO compare
        # against tokenizer.tokenize output.
        new_labels.append([-100] + kept + [-100])

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [None]:
samples = datasets["train"][:2]

In [None]:
samples['tokens'][0]

In [None]:
tokenize_and_align_labels(samples)

In [None]:
tokenized_datasets = datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

In [None]:
tokenized_datasets

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

In [None]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

In [None]:
import evaluate

metric = evaluate.load("seqeval")

In [None]:
# !pip install seqeval

In [None]:
labels = datasets["train"][0]["ner_labels"]
labels = [label_names[i] for i in labels]
labels

In [None]:
predictions = labels.copy()
predictions[2] = 'B-PER'
metric.compute(predictions=[predictions], references=[labels])

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """Turn raw evaluation output into overall precision/recall/F1/accuracy.

    ``eval_preds`` unpacks into ``(logits, labels)``. The predicted id is the
    argmax over the last axis of ``logits``. Positions whose reference label
    is -100 are dropped, the remaining ids are converted to tag names through
    ``label_names``, and the notebook-level ``metric`` scores them.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # Reference tag names, skipping the ignored (-100) positions.
    true_labels = []
    for gold_row in labels:
        true_labels.append([label_names[gold] for gold in gold_row if gold != -100])

    # Predicted tag names at exactly the positions that were kept above.
    true_predictions = []
    for pred_row, gold_row in zip(predicted_ids, labels):
        kept = []
        for pred, gold in zip(pred_row, gold_row):
            if gold != -100:
                kept.append(label_names[pred])
        true_predictions.append(kept)

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    precision = scores["overall_precision"]
    recall = scores["overall_recall"]
    f1 = scores["overall_f1"]
    accuracy = scores["overall_accuracy"]
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }

In [None]:
# Lookup tables in both directions between integer ids and tag names.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
id2label

In [None]:
label2id

In [None]:
#### LOADING BERT FOR CLASSIFICATION ####

from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-uncased',
    id2label=id2label,
    label2id=label2id,
)

In [None]:
#### REPLACING BERT WITH CHARACTER_BERT ####

# Raw string literal: the Windows path contains backslashes that are not valid
# escape sequences in a normal string. The runtime value is unchanged.
# NOTE(review): this is an absolute, machine-specific path -- consider moving
# it to a configurable location.
character_bert_model = CharacterBertModel.from_pretrained(
    r"E:\Documents\Character Bert\Question Classification\character-bert")
# Swap the encoder of the token-classification model for the loaded one.
model.bert = character_bert_model

In [None]:
model.bert.embeddings.word_embeddings  # wordpieces are replaced with a CharacterCNN

In [None]:
model.config.num_labels

In [None]:
#  disable weights and biases logging
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from transformers import TrainingArguments
# Used for both the training and the evaluation batch size.
batch_size = 32

args = TrainingArguments(
    "models/ner",
    # The string "none" is what switches reporting integrations off;
    # passing None leaves the library default in place.
    report_to = "none",
    logging_dir= None,
    # No checkpoints are written during training.
    save_strategy="no",
    # Evaluate once at the end of every epoch.
    evaluation_strategy="epoch",
    #save_strategy="epoch",
    #learning_rate=2e-5,
    learning_rate=3e-5,
    #num_train_epochs=4,
    num_train_epochs=3,
    warmup_ratio = 0.06,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [None]:
import transformers
trainer.remove_callback(transformers.integrations.TensorBoardCallback)

In [None]:
trainer.evaluate()

In [None]:
trainer.train()

In [None]:
# trainer.save_model()

In [None]:
trainer.evaluate(tokenized_datasets["test"])

In [None]:
y_preds, y_true, _ = trainer.predict(tokenized_datasets["test"])

In [None]:
predictions = np.argmax(y_preds, axis=-1)

In [None]:
# Remove ignored index (special tokens) and convert to labels
true_labels = [[label_names[l] for l in label if l != -100] for label in y_true]

In [None]:
true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, y_true)
    ]

In [None]:
misclassified = [i for i in range(len(true_predictions)) if (true_predictions[i] != true_labels[i])]

In [None]:
misclassified

In [None]:
temp = test_ds.select(misclassified)

In [None]:
# search_query = 'বাংলাদেশ'
# search_query = 'শান্ত'
search_query = 'উত্তর-পশ্চিমাঞ্চলের'
# Reset both results first, so a search without a hit cannot silently reuse a
# found_index left over from an earlier run of this cell.
found_index = None
found_example = ""
# temp holds the misclassified rows in the same order as `misclassified`, so
# zipping pairs every row with its position in the test set.
for index, example in zip(misclassified, temp['tokens']):
    # Membership test on the row's tokens: the query has to equal one token.
    if search_query in example:
        print(index)
        print(example)
        found_index = index
        found_example = " ".join(example)
        break

In [None]:
tokenizer.tokenize(found_example)

In [None]:
true_labels[found_index]

In [None]:
true_predictions[found_index]