In [1]:
from transformers import BertForSequenceClassification, BertConfig, CharacterBertModel, \
CharacterBertForPreTraining, CharacterBertConfig, CharacterBertTokenizer
import torch
import os

In [2]:
file_path = "Bangla-NER-Splitted-Dataset.json"

In [3]:
import json
  
# Opening JSON file
f = open(file_path, mode="r", encoding="utf-8")
  
# returns JSON object as 
# a dictionary
json_data = json.load(f)

In [4]:
json_data.keys()

dict_keys(['test', 'train', 'validation'])

In [5]:
train_json_data = json_data['train']
val_json_data = json_data['validation']

In [6]:
test_json_data = json_data['test']

In [7]:
import pandas as pd
import re
import numpy as np

train_df = pd.DataFrame(columns=['ner_tags', 'tokens'])
val_df = pd.DataFrame(columns=['ner_tags', 'tokens'])

In [8]:
test_df = pd.DataFrame(columns=['ner_tags', 'tokens'])

In [9]:
def read_json_file(json_file, df):
    token_docs = []
    tag_docs = []

    for idx,doc in enumerate(json_file):
        tokens = doc['sentence']
        tags = doc['iob_tags']
        token_docs.append(tokens)
        tag_docs.append(tags)
        df.loc[idx] = pd.Series({'ner_tags':tags, 'tokens':tokens})
    
    return df, token_docs, tag_docs

In [10]:
train_df,_,tag_docs = read_json_file(train_json_data, train_df)

In [11]:
val_df,_,_ = read_json_file(val_json_data, val_df)

In [12]:
test_df,_,_ = read_json_file(test_json_data, val_df)

In [13]:
train_df[:1]

Unnamed: 0,ner_tags,tokens
0,"[O, O, O, B-PER, B-PER, I-PER, I-PER, O, O, O,...","[ত্রাণ, ও, সমাজকল্যাণ, সম্পাদক, সুজিত, রায়, নন..."


In [14]:
val_df[:1]

Unnamed: 0,ner_tags,tokens
0,"[O, O, O, B-LOC, O, O, O, O]","[৫%, তার, চাইতে, পশ্চিমোরে, এর, সাক্ষরতার, হার..."


In [15]:
test_df[:1]

Unnamed: 0,ner_tags,tokens
0,"[O, O, O, B-LOC, O, O, O, O]","[৫%, তার, চাইতে, পশ্চিমোরে, এর, সাক্ষরতার, হার..."


In [62]:
from datasets import Dataset, DatasetDict

train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

In [63]:
datasets = DatasetDict()
datasets['train'] = train_ds
datasets['validation'] = val_ds
datasets['test'] = test_ds

In [64]:
datasets["train"][1]["tokens"]

['পরিকল্পনা',
 'অনুযায়ী',
 'তারা',
 'বাসায়',
 'ঢুকে',
 'দুই',
 'অতিথিকে',
 'নগ্ন',
 'করে',
 'তাদের',
 'মাঝখানে',
 'এক',
 'ছাত্রীকে',
 'বসিয়ে',
 'ছবি',
 'তোলেন']

In [65]:
datasets["train"][1]["ner_tags"]

['O',
 'O',
 'B-PER',
 'B-OBJ',
 'O',
 'O',
 'B-PER',
 'O',
 'O',
 'B-PER',
 'O',
 'O',
 'B-PER',
 'O',
 'B-OBJ',
 'O']

In [66]:
# create set from list
expanded_tag_docs = []

for tags in tag_docs:
    for tag in tags:
        expanded_tag_docs.append(tag)

In [67]:
unique_tags = set(expanded_tag_docs)
unique_tags

{'B-LOC', 'B-OBJ', 'B-ORG', 'B-PER', 'I-LOC', 'I-OBJ', 'I-ORG', 'I-PER', 'O'}

In [68]:
def assign_label(examples):
    mapping = {'B-LOC':0, 'B-OBJ':1, 'B-ORG':2, 'B-PER':3, 'I-LOC':4, 'I-OBJ':5, 'I-ORG':6, 'I-PER':7, 'O':8}
    ner_labels = []
    for example in examples["ner_tags"]:
        ner_labels.append(mapping[example])
    examples["ner_labels"] = ner_labels
    return examples

In [69]:
datasets = datasets.map(assign_label)

Map:   0%|          | 0/64155 [00:00<?, ? examples/s]

Map:   0%|          | 0/3565 [00:00<?, ? examples/s]

Map:   0%|          | 0/3565 [00:00<?, ? examples/s]

In [70]:
from transformers import AutoTokenizer

tokenizer = CharacterBertTokenizer(strip_accents=None, do_lower_case=None)

In [71]:
inputs = tokenizer(' '.join(datasets["train"][0]["tokens"]))
# inputs = tokenizer('ত্রাণ ও সমাজকল্যাণ সম্পাদক সুজিত রায় নন্দী রমুখ সংবাদ সম্মেলনে উপস্থিত ছিলেন')

In [72]:
datasets["train"][0]["tokens"]

['ত্রাণ',
 'ও',
 'সমাজকল্যাণ',
 'সম্পাদক',
 'সুজিত',
 'রায়',
 'নন্দী',
 'প্রমুখ',
 'সংবাদ',
 'সম্মেলনে',
 'উপস্থিত',
 'ছিলেন']

In [73]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    for word_id in word_ids:
        if word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            new_labels.append(labels[word_id])
    return new_labels

In [74]:
def get_word_ids(input_tokens):
    word_ids = []
    count = 0
    special_tokens_list = [tokenizer.pad_token_id, tokenizer.unk_token_id, tokenizer.cls_token_id, tokenizer.mask_token_id,\
                       tokenizer.unk_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id]
    for input_token in input_tokens:
        if input_token in special_tokens_list:
            word_id = None
            word_ids.append(word_id)
        else:
            word_id = count
            word_ids.append(word_id)
            count += 1
    return word_ids

In [75]:
labels = datasets["train"][0]["ner_labels"]
word_ids = get_word_ids(inputs['input_ids'])
print(word_ids)
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, None]
[8, 8, 8, 3, 3, 7, 7, 8, 8, 8, 8, 8]
[-100, 8, 8, 8, 3, 3, 7, 7, 8, 8, 8, 8, 8, -100]


In [76]:
datasets["train"][0]['ner_tags']

['O', 'O', 'O', 'B-PER', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O']

In [77]:
mapping = {'B-LOC':0, 'B-OBJ':1, 'B-ORG':2, 'B-PER':3, 'I-LOC':4, 'I-OBJ':5, 'I-ORG':6, 'I-PER':7, 'O':8}
label_names = list()
for k, v in mapping.items():
    label_names.append(k)

label_names

['B-LOC', 'B-OBJ', 'B-ORG', 'B-PER', 'I-LOC', 'I-OBJ', 'I-ORG', 'I-PER', 'O']

In [78]:
# def tokenize_and_align_labels(examples):
#     temp_examples = [' '.join(example) for example in examples["tokens"]]
#     #print(temp_examples)
#     tokenized_inputs = tokenizer(
#         temp_examples
#     )
#     all_labels = examples["ner_labels"]
#     new_labels = []
#     for i, labels in enumerate(all_labels):
#         word_ids = get_word_ids(tokenized_inputs['input_ids'][i])
#         print(len(word_ids))
#         new_labels.append(align_labels_with_tokens(labels, word_ids))
#         print(len(labels))

#     tokenized_inputs["labels"] = new_labels
#     return tokenized_inputs

In [79]:
def tokenize_and_align_labels(examples):
    temp_examples = [' '.join(example) for example in examples["tokens"]]
    #print(temp_examples)
    tokenized_inputs = tokenizer(
        temp_examples, max_length=128
    )
    
    all_labels = examples["ner_labels"]
    new_labels = []
    
    for i, labels in enumerate(all_labels):
        new_labels.append([-100]+labels+[-100])
    
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [80]:
samples = datasets["train"][:2]

In [81]:
samples['tokens'][0]

['ত্রাণ',
 'ও',
 'সমাজকল্যাণ',
 'সম্পাদক',
 'সুজিত',
 'রায়',
 'নন্দী',
 'প্রমুখ',
 'সংবাদ',
 'সম্মেলনে',
 'উপস্থিত',
 'ছিলেন']

In [82]:
tokenize_and_align_labels(samples)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'input_ids': [[[259, 257, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261], [259, 225, 167, 165, 225, 168, 142, 225, 167, 177, 225, 167, 191, 225, 167, 164, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261], [259, 225, 167, 148, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261], [259, 225, 167, 185, 225, 167, 175, 225, 167, 191, 225, 167, 157, 225, 167, 150, 225, 167, 179, 225, 168, 142, 225, 167, 176, 225, 167, 191, 225, 167, 164, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 26

In [83]:
tokenized_datasets = datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

Map:   0%|          | 0/64155 [00:00<?, ? examples/s]

Map:   0%|          | 0/3565 [00:00<?, ? examples/s]

Map:   0%|          | 0/3565 [00:00<?, ? examples/s]

In [84]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 64155
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3565
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3565
    })
})

In [85]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [86]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    8,    8,    8,    3,    3,    7,    7,    8,    8,    8,    8,
            8, -100, -100, -100, -100, -100],
        [-100,    8,    8,    3,    1,    8,    8,    3,    8,    8,    3,    8,
            8,    3,    8,    1,    8, -100]])

In [87]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 8, 8, 8, 3, 3, 7, 7, 8, 8, 8, 8, 8, -100]
[-100, 8, 8, 3, 1, 8, 8, 3, 8, 8, 3, 8, 8, 3, 8, 1, 8, -100]


In [88]:
import evaluate

metric = evaluate.load("seqeval")

In [89]:
# !pip install seqeval

In [90]:
labels = datasets["train"][0]["ner_labels"]
labels = [label_names[i] for i in labels]
labels

['O', 'O', 'O', 'B-PER', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O']

In [91]:
predictions = labels.copy()
predictions[2] = 'B-PER'
metric.compute(predictions=[predictions], references=[labels])

{'PER': {'precision': 0.6666666666666666,
  'recall': 1.0,
  'f1': 0.8,
  'number': 2},
 'overall_precision': 0.6666666666666666,
 'overall_recall': 1.0,
 'overall_f1': 0.8,
 'overall_accuracy': 0.9166666666666666}

In [92]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [93]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [94]:
id2label

{0: 'B-LOC',
 1: 'B-OBJ',
 2: 'B-ORG',
 3: 'B-PER',
 4: 'I-LOC',
 5: 'I-OBJ',
 6: 'I-ORG',
 7: 'I-PER',
 8: 'O'}

In [95]:
label2id

{'B-LOC': 0,
 'B-OBJ': 1,
 'B-ORG': 2,
 'B-PER': 3,
 'I-LOC': 4,
 'I-OBJ': 5,
 'I-ORG': 6,
 'I-PER': 7,
 'O': 8}

In [96]:
#### LOADING BERT FOR CLASSIFICATION ####

from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-uncased',
    id2label=id2label,
    label2id=label2id,
)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at C:\Users\arifa/.cache\huggingface\transformers\3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "B-LOC",
    "1": "B-OBJ",
    "2": "B-ORG",
    "3": "B-PER",
    "4": "I-LOC",
    "5": "I-OBJ",
    "6": "I-ORG",
    "7": "I-PER",
    "8": "O"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 0,
    "B-OBJ": 1,
    "B-ORG": 2,
    "B-PER": 3,
    "I-LOC": 4,
    "I-OBJ": 5,
    "I-ORG": 6,
    "I-PER": 7,
    "O": 8
  },
  "layer_norm_eps": 1e-12,

In [97]:
#### REPLACING BERT WITH CHARACTER_BERT ####

character_bert_model = CharacterBertModel.from_pretrained(
    "E:\Documents\Character Bert\Question Classification\character-bert")
model.bert = character_bert_model

loading configuration file E:\Documents\Character Bert\Question Classification\character-bert\config.json
Model config CharacterBertConfig {
  "_name_or_path": "helboukkouri/character-bert",
  "architectures": [
    "CharacterBertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "auto_map": {
    "AutoConfig": "configuration_character_bert.CharacterBertConfig",
    "AutoModel": "modeling_character_bert.CharacterBertForPreTraining",
    "AutoModelForMaskedLM": "modeling_character_bert.CharacterBertForMaskedLM"
  },
  "character_embeddings_dim": 16,
  "cnn_activation": "relu",
  "cnn_filters": [
    [
      1,
      32
    ],
    [
      2,
      32
    ],
    [
      3,
      64
    ],
    [
      4,
      128
    ],
    [
      5,
      256
    ],
    [
      6,
      512
    ],
    [
      7,
      1024
    ]
  ],
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,


In [98]:
model.bert.embeddings.word_embeddings  # wordpieces are replaced with a CharacterCNN

CharacterCnn(
  (char_conv_0): Conv1d(16, 32, kernel_size=(1,), stride=(1,))
  (char_conv_1): Conv1d(16, 32, kernel_size=(2,), stride=(1,))
  (char_conv_2): Conv1d(16, 64, kernel_size=(3,), stride=(1,))
  (char_conv_3): Conv1d(16, 128, kernel_size=(4,), stride=(1,))
  (char_conv_4): Conv1d(16, 256, kernel_size=(5,), stride=(1,))
  (char_conv_5): Conv1d(16, 512, kernel_size=(6,), stride=(1,))
  (char_conv_6): Conv1d(16, 1024, kernel_size=(7,), stride=(1,))
  (_highways): Highway(
    (_layers): ModuleList(
      (0-1): 2 x Linear(in_features=2048, out_features=4096, bias=True)
    )
  )
  (_projection): Linear(in_features=2048, out_features=768, bias=True)
)

In [99]:
model.config.num_labels

9

In [100]:
#  disable weights and biases logging
import os
os.environ["WANDB_DISABLED"] = "true"

In [101]:
from transformers import TrainingArguments
batch_size = 32

args = TrainingArguments(
    "models/ner",
    report_to = None,
    logging_dir= None,
    save_strategy="no",
    evaluation_strategy="epoch",
    #save_strategy="epoch",
    #learning_rate=2e-5,
    learning_rate=3e-5,
    #num_train_epochs=4,
    num_train_epochs=3,
    warmup_ratio = 0.06,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [102]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [103]:
import transformers
trainer.remove_callback(transformers.integrations.TensorBoardCallback)

In [104]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 3565
  Batch size = 32


{'eval_loss': 2.238935708999634,
 'eval_precision': 0.029888284494355735,
 'eval_recall': 0.13650327233872045,
 'eval_f1': 0.04903913053909455,
 'eval_accuracy': 0.15092304247827237,
 'eval_runtime': 9.3275,
 'eval_samples_per_second': 382.204,
 'eval_steps_per_second': 12.008}

In [105]:
trainer.train()

***** Running training *****
  Num examples = 64155
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 6015


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2843,0.275357,0.684292,0.646454,0.664835,0.902648
2,0.2363,0.247792,0.708327,0.655536,0.68091,0.908612
3,0.1958,0.240454,0.702036,0.695472,0.698739,0.912866


***** Running Evaluation *****
  Num examples = 3565
  Batch size = 32
***** Running Evaluation *****
  Num examples = 3565
  Batch size = 32
***** Running Evaluation *****
  Num examples = 3565
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=6015, training_loss=0.2646684057991998, metrics={'train_runtime': 1318.1278, 'train_samples_per_second': 146.014, 'train_steps_per_second': 4.563, 'total_flos': 1.6044202762080662e+17, 'train_loss': 0.2646684057991998, 'epoch': 3.0})

In [60]:
# trainer.save_model()

In [106]:
trainer.evaluate(tokenized_datasets["test"])

***** Running Evaluation *****
  Num examples = 3565
  Batch size = 32


{'eval_loss': 0.24045395851135254,
 'eval_precision': 0.7020358635566941,
 'eval_recall': 0.6954721517296647,
 'eval_f1': 0.6987385936661299,
 'eval_accuracy': 0.9128656042010136,
 'eval_runtime': 9.7401,
 'eval_samples_per_second': 366.012,
 'eval_steps_per_second': 11.499,
 'epoch': 3.0}

In [63]:
y_preds, y_true, _ = trainer.predict(tokenized_datasets["test"])

***** Running Prediction *****
  Num examples = 3565
  Batch size = 32


In [64]:
predictions = np.argmax(y_preds, axis=-1)

In [65]:
# Remove ignored index (special tokens) and convert to labels
true_labels = [[label_names[l] for l in label if l != -100] for label in y_true]

In [66]:
true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, y_true)
    ]

In [67]:
misclassified = [i for i in range(len(true_predictions)) if (true_predictions[i] != true_labels[i])]

In [68]:
misclassified

[1,
 2,
 6,
 7,
 8,
 12,
 13,
 15,
 17,
 19,
 21,
 23,
 25,
 28,
 29,
 32,
 34,
 35,
 36,
 38,
 39,
 44,
 47,
 49,
 53,
 56,
 57,
 61,
 62,
 63,
 65,
 66,
 67,
 68,
 70,
 72,
 73,
 74,
 75,
 76,
 78,
 79,
 81,
 83,
 85,
 86,
 87,
 88,
 90,
 91,
 92,
 95,
 96,
 98,
 99,
 101,
 102,
 104,
 106,
 107,
 108,
 109,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 120,
 121,
 122,
 123,
 126,
 127,
 128,
 130,
 131,
 133,
 134,
 135,
 138,
 139,
 142,
 143,
 144,
 146,
 147,
 148,
 152,
 153,
 154,
 155,
 159,
 160,
 164,
 165,
 167,
 168,
 169,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 180,
 181,
 182,
 183,
 184,
 188,
 190,
 191,
 194,
 195,
 197,
 198,
 199,
 200,
 202,
 203,
 207,
 209,
 210,
 211,
 212,
 214,
 216,
 217,
 218,
 219,
 220,
 222,
 227,
 228,
 230,
 232,
 235,
 236,
 237,
 240,
 241,
 242,
 244,
 246,
 248,
 250,
 256,
 258,
 260,
 261,
 262,
 264,
 265,
 266,
 267,
 271,
 273,
 274,
 275,
 277,
 278,
 280,
 283,
 287,
 289,
 293,
 294,
 296,
 298,
 300,
 301,
 303,
 305,
 307

In [69]:
temp = test_ds.select(misclassified)

In [76]:
# search_query = 'বাংলাদেশ'
# search_query = 'শান্ত'
search_query = 'উত্তর-পশ্চিমাঞ্চলের'
found_example = ""
for index, example in zip(misclassified, temp['tokens']):
    if search_query in example:
        print(index)
        print(example)
        found_index = index
        found_example = " ".join(example)
        break

3210
['বাঙালি', 'নদী', 'বাংলাদেশের', 'উত্তর-পশ্চিমাঞ্চলের', 'গাইবান্ধা', 'বগুড়া', 'এবং', 'সিরাজগঞ্জ', 'জেলার', 'একটি', 'নদী']


In [77]:
tokenizer.tokenize(found_example)

['বাঙালি',
 'নদী',
 'বাংলাদেশের',
 'উত্তর',
 '-',
 'পশ্চিমাঞ্চলের',
 'গাইবান্ধা',
 'বগুড়া',
 'এবং',
 'সিরাজগঞ্জ',
 'জেলার',
 'একটি',
 'নদী']

In [78]:
true_labels[found_index]

['B-LOC',
 'I-LOC',
 'B-LOC',
 'B-LOC',
 'B-LOC',
 'B-LOC',
 'O',
 'B-LOC',
 'O',
 'O',
 'B-LOC']

In [79]:
true_predictions[found_index]

['B-LOC',
 'I-LOC',
 'B-LOC',
 'B-LOC',
 'I-LOC',
 'I-LOC',
 'B-LOC',
 'B-LOC',
 'B-LOC',
 'O',
 'B-LOC']