In [1]:
file_path = "Bangla-NER-Splitted-Dataset.json"

In [2]:
import json
  
# Opening JSON file
f = open(file_path, mode="r", encoding="utf-8")
  
# returns JSON object as 
# a dictionary
json_data = json.load(f)

In [3]:
json_data.keys()

dict_keys(['test', 'train', 'validation'])

In [4]:
train_json_data = json_data['train']
val_json_data = json_data['validation']
test_json_data = json_data['test']

In [5]:
import pandas as pd
import re
import numpy as np

train_df = pd.DataFrame(columns=['ner_tags', 'tokens'])
val_df = pd.DataFrame(columns=['ner_tags', 'tokens'])
test_df = pd.DataFrame(columns=['ner_tags', 'tokens'])

In [6]:
def read_json_file(json_file, df):
    token_docs = []
    tag_docs = []

    for idx,doc in enumerate(json_file):
        tokens = doc['sentence']
        tags = doc['iob_tags']
        token_docs.append(tokens)
        tag_docs.append(tags)
        df.loc[idx] = pd.Series({'ner_tags':tags, 'tokens':tokens})
    
    return df, token_docs, tag_docs

In [7]:
train_df,_,tag_docs = read_json_file(train_json_data, train_df)

In [8]:
val_df,_,_ = read_json_file(val_json_data, val_df)

In [9]:
test_df,_,_ = read_json_file(test_json_data, val_df)

In [10]:
train_df[:1]

Unnamed: 0,ner_tags,tokens
0,"[O, O, O, B-PER, B-PER, I-PER, I-PER, O, O, O,...","[ত্রাণ, ও, সমাজকল্যাণ, সম্পাদক, সুজিত, রায়, নন..."


In [11]:
val_df[:1]

Unnamed: 0,ner_tags,tokens
0,"[O, O, O, B-LOC, O, O, O, O]","[৫%, তার, চাইতে, পশ্চিমোরে, এর, সাক্ষরতার, হার..."


In [12]:
test_df[:1]

Unnamed: 0,ner_tags,tokens
0,"[O, O, O, B-LOC, O, O, O, O]","[৫%, তার, চাইতে, পশ্চিমোরে, এর, সাক্ষরতার, হার..."


In [13]:
from datasets import Dataset, DatasetDict

train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

In [14]:
datasets = DatasetDict()
datasets['train'] = train_ds
datasets['validation'] = val_ds
datasets['test'] = test_ds

In [15]:
datasets["train"][3]["tokens"]

['তিনি',
 'বলছিলেন',
 'সবচেয়ে',
 'বড়',
 'কথা',
 'উনি',
 'খুব',
 'ভালো',
 'মানুষ',
 'ছিলেন']

In [16]:
datasets["train"][3]["ner_tags"]

['B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O']

In [17]:
# create set from list
expanded_tag_docs = []

for tags in tag_docs:
    for tag in tags:
        expanded_tag_docs.append(tag)

In [18]:
unique_tags = set(expanded_tag_docs)
unique_tags

{'B-LOC', 'B-OBJ', 'B-ORG', 'B-PER', 'I-LOC', 'I-OBJ', 'I-ORG', 'I-PER', 'O'}

In [19]:
def assign_label(examples):
    mapping = {'B-LOC':0, 'B-OBJ':1, 'B-ORG':2, 'B-PER':3, 'I-LOC':4, 'I-OBJ':5, 'I-ORG':6, 'I-PER':7, 'O':8}
    ner_labels = []
    for example in examples["ner_tags"]:
        ner_labels.append(mapping[example])
    examples["ner_labels"] = ner_labels
    return examples

In [20]:
datasets = datasets.map(assign_label)

  0%|          | 0/64155 [00:00<?, ?ex/s]

  0%|          | 0/3565 [00:00<?, ?ex/s]

  0%|          | 0/3565 [00:00<?, ?ex/s]

In [21]:
from transformers import AutoTokenizer

model_checkpoint = "../Bengali Pretraining/models/unigram/unigram-long-text"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [22]:
inputs = tokenizer(datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 '▁ত্রাণ',
 '▁',
 'ও',
 '▁সমাজকল্যাণ',
 '▁সম্পাদক',
 '▁সুজিত',
 '▁রায়',
 '▁নন্দী',
 '▁প্রমুখ',
 '▁সংবাদ',
 '▁সম্মেলনে',
 '▁উপস্থিত',
 '▁ছিলেন',
 '[SEP]']

In [23]:
inputs.word_ids()

[None, 0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, None]

In [24]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [25]:
labels = datasets["train"][0]["ner_labels"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[8, 8, 8, 3, 3, 7, 7, 8, 8, 8, 8, 8]
[-100, 8, 8, 8, 8, 3, 3, 7, 7, 8, 8, 8, 8, 8, -100]


In [26]:
datasets["train"][1]

{'ner_tags': ['O',
  'O',
  'B-PER',
  'B-OBJ',
  'O',
  'O',
  'B-PER',
  'O',
  'O',
  'B-PER',
  'O',
  'O',
  'B-PER',
  'O',
  'B-OBJ',
  'O'],
 'tokens': ['পরিকল্পনা',
  'অনুযায়ী',
  'তারা',
  'বাসায়',
  'ঢুকে',
  'দুই',
  'অতিথিকে',
  'নগ্ন',
  'করে',
  'তাদের',
  'মাঝখানে',
  'এক',
  'ছাত্রীকে',
  'বসিয়ে',
  'ছবি',
  'তোলেন'],
 '__index_level_0__': 1,
 'ner_labels': [8, 8, 3, 1, 8, 8, 3, 8, 8, 3, 8, 8, 3, 8, 1, 8]}

In [27]:
mapping = {'B-LOC':0, 'B-OBJ':1, 'B-ORG':2, 'B-PER':3, 'I-LOC':4, 'I-OBJ':5, 'I-ORG':6, 'I-PER':7, 'O':8}
label_names = list()
for k, v in mapping.items():
    label_names.append(k)

label_names

['B-LOC', 'B-OBJ', 'B-ORG', 'B-PER', 'I-LOC', 'I-OBJ', 'I-ORG', 'I-PER', 'O']

In [28]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, max_length=128
    )
    all_labels = examples["ner_labels"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [29]:
tokenized_datasets = datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

  0%|          | 0/65 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [30]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 64155
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3565
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3565
    })
})

In [31]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [32]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    8,    8,    8,    8,    3,    3,    7,    7,    8,    8,    8,
            8,    8, -100, -100, -100, -100, -100],
        [-100,    8,    8,    3,    1,    8,    8,    3,    4,    8,    8,    3,
            8,    8,    3,    8,    1,    8, -100]])

In [33]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 8, 8, 8, 8, 3, 3, 7, 7, 8, 8, 8, 8, 8, -100]
[-100, 8, 8, 3, 1, 8, 8, 3, 4, 8, 8, 3, 8, 8, 3, 8, 1, 8, -100]


In [34]:
import evaluate

metric = evaluate.load("seqeval")

In [35]:
labels = datasets["train"][0]["ner_labels"]
labels = [label_names[i] for i in labels]
labels

['O', 'O', 'O', 'B-PER', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O']

In [36]:
predictions = labels.copy()
predictions[2] = 'B-PER'
metric.compute(predictions=[predictions], references=[labels])

{'PER': {'precision': 0.6666666666666666,
  'recall': 1.0,
  'f1': 0.8,
  'number': 2},
 'overall_precision': 0.6666666666666666,
 'overall_recall': 1.0,
 'overall_f1': 0.8,
 'overall_accuracy': 0.9166666666666666}

In [37]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [38]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [39]:
id2label

{0: 'B-LOC',
 1: 'B-OBJ',
 2: 'B-ORG',
 3: 'B-PER',
 4: 'I-LOC',
 5: 'I-OBJ',
 6: 'I-ORG',
 7: 'I-PER',
 8: 'O'}

In [40]:
label2id

{'B-LOC': 0,
 'B-OBJ': 1,
 'B-ORG': 2,
 'B-PER': 3,
 'I-LOC': 4,
 'I-OBJ': 5,
 'I-ORG': 6,
 'I-PER': 7,
 'O': 8}

In [41]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at ../Bengali Pretraining/models/unigram/unigram-long-text were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at .

In [42]:
model.config.num_labels

9

In [43]:
#  disable weights and biases logging
import os
os.environ["WANDB_DISABLED"] = "true"

In [44]:
from transformers import TrainingArguments
# batch_size = 16
batch_size = 32

args = TrainingArguments(
    "models/ner",
    report_to = None,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    #save_strategy="epoch",
    #learning_rate=2e-5,
    learning_rate=3e-5,
    #num_train_epochs=3,
    num_train_epochs=4,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [45]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [46]:
import transformers
trainer.remove_callback(transformers.integrations.TensorBoardCallback)

In [47]:
trainer.evaluate()

{'eval_loss': 2.4056737422943115,
 'eval_precision': 0.025315185095960813,
 'eval_recall': 0.12236778262979137,
 'eval_f1': 0.041951527854018,
 'eval_accuracy': 0.07304556872293538,
 'eval_runtime': 9.6206,
 'eval_samples_per_second': 370.559,
 'eval_steps_per_second': 11.642}

In [48]:
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2749,0.271741,0.70765,0.694808,0.70117,0.900787
2,0.2198,0.261115,0.726106,0.697428,0.711478,0.906181
3,0.1729,0.27241,0.706528,0.718389,0.712409,0.904116
4,0.1398,0.301497,0.700453,0.720912,0.710535,0.903746


TrainOutput(global_step=8020, training_loss=0.21027263786430073, metrics={'train_runtime': 1414.251, 'train_samples_per_second': 181.453, 'train_steps_per_second': 5.671, 'total_flos': 5117780019313440.0, 'train_loss': 0.21027263786430073, 'epoch': 4.0})

In [51]:
trainer.save_model()

In [50]:
# trainer.args.num_train_epochs = trainer.args.num_train_epochs + 1
# trainer.train("models/ner/checkpoint-2005")

In [49]:
import gc, torch
gc.collect()
torch.cuda.empty_cache()

In [50]:
trainer.evaluate(tokenized_datasets["test"])

{'eval_loss': 0.2724098563194275,
 'eval_precision': 0.7065279633517847,
 'eval_recall': 0.7183891314895682,
 'eval_f1': 0.712409180580282,
 'eval_accuracy': 0.904116133207995,
 'eval_runtime': 6.36,
 'eval_samples_per_second': 560.537,
 'eval_steps_per_second': 17.61,
 'epoch': 4.0}

In [77]:
y_preds, y_true, _ = trainer.predict(tokenized_datasets["test"])

In [78]:
predictions = np.argmax(y_preds, axis=-1)

In [79]:
# Remove ignored index (special tokens) and convert to labels
true_labels = [[label_names[l] for l in label if l != -100] for label in y_true]

In [81]:
true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, y_true)
    ]

In [84]:
misclassified = [i for i in range(len(true_predictions)) if (true_predictions[i] != true_labels[i])]

In [89]:
misclassified

[1,
 2,
 6,
 7,
 8,
 11,
 12,
 13,
 15,
 21,
 25,
 28,
 29,
 33,
 34,
 35,
 36,
 38,
 39,
 44,
 47,
 51,
 53,
 56,
 57,
 61,
 62,
 65,
 66,
 67,
 68,
 70,
 72,
 73,
 74,
 75,
 78,
 79,
 81,
 82,
 85,
 87,
 88,
 90,
 91,
 92,
 95,
 96,
 98,
 99,
 102,
 106,
 107,
 108,
 109,
 113,
 114,
 115,
 116,
 117,
 118,
 120,
 121,
 123,
 126,
 127,
 128,
 130,
 131,
 133,
 135,
 136,
 139,
 142,
 144,
 146,
 148,
 152,
 153,
 154,
 155,
 159,
 160,
 164,
 165,
 167,
 168,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 180,
 182,
 183,
 184,
 186,
 188,
 189,
 190,
 191,
 194,
 195,
 197,
 198,
 199,
 200,
 202,
 203,
 207,
 209,
 211,
 212,
 214,
 215,
 217,
 218,
 219,
 220,
 222,
 227,
 228,
 230,
 232,
 235,
 236,
 237,
 240,
 242,
 246,
 250,
 253,
 256,
 259,
 260,
 262,
 264,
 265,
 266,
 267,
 271,
 273,
 274,
 275,
 277,
 278,
 280,
 283,
 287,
 289,
 290,
 293,
 294,
 296,
 298,
 300,
 301,
 303,
 307,
 308,
 311,
 312,
 313,
 315,
 316,
 317,
 320,
 321,
 323,
 325,
 329,
 330,
 331,
 332,

In [95]:
temp = test_ds.select(misclassified)

In [161]:
temp[:]['tokens']

[['গত',
  '২০১৫',
  'সালের',
  '৫',
  'আগস্ট',
  'সকাল',
  'সাড়ে',
  '৮টার',
  'দিকে',
  'ভাড়া',
  'বাসায়',
  'আনিছুর',
  'রহমান',
  'ধারালো',
  'বটি',
  'দিয়ে',
  'কুপিয়ে',
  'স্ত্রী',
  'মৌসুমিকে',
  'হত্যা',
  'করে'],
 ['জেলা', 'ক্রীড়া', 'সংস্থার', 'সাধারণ', 'সম্পাদক', 'ওবায়দুর', 'রহমান', 'খান'],
 ['এই',
  'ফ্লাইটটি',
  'পরিচালনা',
  'করে',
  'ব্রিটিশ',
  'ইউরোপীয়ান',
  'এয়ারওয়েজে',
  'একটি',
  'এলিজাবেথীয়',
  'শ্রেণীর',
  'এয়ারস্পীড',
  'অ্যাম্বাসেডর',
  'চার্টার',
  'বিমান',
  'জি-এএলজেডইউ',
  'G-ALZU',
  'লর্ড',
  'বার্ঘলি'],
 ['টুর্নামেন্টে',
  'নিজেদের',
  'প্রথম',
  'ম্যাচে',
  'উরুগুয়ের',
  'বিপক্ষে',
  'হারলেও',
  'পরের',
  'দুই',
  'ম্যাচে',
  'ঘুরে',
  'দাঁড়িয়ে',
  'সেমিফাইনালে',
  'পৌঁছে',
  'গেছে',
  'বাংলাদেশ',
  'ইউনিফাইড',
  'দল'],
 ['তাদের',
  'ক্রমবর্ধমান',
  'ক্ষমতা',
  'ও',
  'ধনসম্পদের',
  'পরিচয়',
  'পাওয়া',
  'যায়',
  'তাদের',
  'বিশদ',
  'মাসতাবা',
  'সমাধি',
  'এবং',
  'আবিদোসের',
  'সমাধিমন্দিরগুলিতে',
  'যেখানে',
  'ফারাওদের',
  'মৃত্যুর',
  'পর',
 

In [1]:
search_query = 'বাংলাদেশ'
# search_query = 'শান্ত'
# search_query = 'উত্তর-পশ্চিমাঞ্চলের'
found_example = ""
for index, example in zip(misclassified, temp['tokens']):
    if search_query in example:
        print(index)
        print(example)
        found_index = index
        found_example = " ".join(example)
        break

NameError: name 'misclassified' is not defined

In [177]:
tokenizer.tokenize(found_example)

['▁বাঙালি',
 '▁নদী',
 '▁বাংলাদেশের',
 '▁উত্তর',
 '-',
 'পশ্চিম',
 'াঞ্চল',
 'ের',
 '▁গাইবান্ধা',
 '▁বগুড়া',
 '▁এবং',
 '▁সিরাজগঞ্জ',
 '▁জেলার',
 '▁একটি',
 '▁নদী']

In [178]:
true_labels[found_index]

['B-LOC',
 'I-LOC',
 'B-LOC',
 'B-LOC',
 'B-LOC',
 'B-LOC',
 'B-LOC',
 'B-LOC',
 'B-LOC',
 'B-LOC',
 'O',
 'B-LOC',
 'O',
 'O',
 'B-LOC']

In [179]:
true_predictions[found_index]

['B-LOC',
 'I-LOC',
 'B-LOC',
 'O',
 'B-LOC',
 'O',
 'O',
 'O',
 'B-LOC',
 'B-LOC',
 'O',
 'B-LOC',
 'I-LOC',
 'O',
 'B-LOC']