In [1]:
# !pip install pytorch_lightning 

In [1]:
import conllu

In [2]:
import pandas as pd
import re

train_df = pd.DataFrame(columns=['id', 'ner_tags', 'tokens'])
val_df = pd.DataFrame(columns=['id', 'ner_tags', 'tokens'])

In [3]:
def read_conll_file(file_path, df):
    data = open(file_path, mode="r", encoding="utf-8")
    raw_text = data.read().strip()

    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []

    for idx,doc in enumerate(raw_docs):
        tokens = []
        tags = []
        example_id = ''
        for line in doc.split('\n'):
            if line.startswith('#'):
                example_id = line
                continue
            token, tag = line.split('_ _')
            tokens.append(token.strip())
            tags.append(tag.strip())
    #         print(token)
    #         print(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)

        df.loc[idx] = pd.Series({'id':example_id,'ner_tags':tags, 'tokens':tokens})
    return df, token_docs, tag_docs

In [4]:
train_df,_,tag_docs = read_conll_file("../datasets/BN-Bangla/bn_train.conll", train_df)
val_df, _,_ = read_conll_file("../datasets/BN-Bangla/bn_dev.conll", val_df)

In [5]:
from datasets import Dataset, DatasetDict

train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)

In [6]:
datasets = DatasetDict()
datasets['train'] = train_ds
datasets['validation'] = val_ds

In [7]:
train_df['tokens'][0]

['২০১৮',
 'এর',
 'সেরা',
 '(বর্ণানুক্রমিকভাবে',
 'তালিকাভুক্ত,',
 'র\u200d্যাঙ্ক',
 'করা',
 'হয়নি),',
 'এনপিআর']

In [8]:
# from datasets import Dataset
# dataset = Dataset.from_pandas(train_df)

In [9]:
datasets["train"][0]["tokens"]

['২০১৮',
 'এর',
 'সেরা',
 '(বর্ণানুক্রমিকভাবে',
 'তালিকাভুক্ত,',
 'র\u200d্যাঙ্ক',
 'করা',
 'হয়নি),',
 'এনপিআর']

In [10]:
datasets["train"][0]["ner_tags"]

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-CORP']

In [11]:
datasets["train"][0]["id"]

'# id 5e80f5c9-1196-4906-826b-4cbdcaec7b6f\tdomain=train'

In [12]:
# create set from list
expanded_tag_docs = []

for tags in tag_docs:
    for tag in tags:
        expanded_tag_docs.append(tag)

In [13]:
unique_tags = set(expanded_tag_docs)
unique_tags

{'B-CORP',
 'B-CW',
 'B-GRP',
 'B-LOC',
 'B-PER',
 'B-PROD',
 'I-CORP',
 'I-CW',
 'I-GRP',
 'I-LOC',
 'I-PER',
 'I-PROD',
 'O'}

In [14]:
def assign_label(examples):
    mapping = {'B-CORP':0,'B-CW':1,'B-GRP':2, 'B-LOC':3, 'B-PER':4, 'B-PROD':5, 'I-CORP':6, 'I-CW':7, 'I-GRP':8,\
               'I-LOC':9,'I-PER':10,'I-PROD':11,'O':12}
    ner_labels = []
    for example in examples["ner_tags"]:
        ner_labels.append(mapping[example])
    examples["ner_labels"] = ner_labels
    return examples

In [15]:
# mapping = {' B-CORP':0,' B-CW':1,' B-GRP':2, ' B-LOC':3, ' B-PER':4, ' B-PROD':5, ' I-CORP':6, ' I-CW':7, ' I-GRP':8,\
#                ' I-LOC':9,' I-PER':10,' I-PROD':11,' O':12}
# ner_labels = list()
# examples = dataset[0]
# for example in examples["ner_tags"]:
#     print(example)
#     ner_labels.append(mapping[example])
# examples["ner_labels"] = ner_labels

In [16]:
# examples

In [17]:
datasets = datasets.map(assign_label)

  0%|          | 0/15300 [00:00<?, ?ex/s]

  0%|          | 0/800 [00:00<?, ?ex/s]

In [18]:
from transformers import AutoTokenizer

# model_checkpoint = "../Bengali Pretraining/models/unigram/bert-base-pretrained-bengali"
model_checkpoint = "../Bengali Pretraining/models/unigram/unigram-long-text"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [19]:
inputs = tokenizer(datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 '▁',
 '২০১৮',
 '▁এর',
 '▁সেরা',
 '▁',
 '(',
 'বর্ণ',
 'ানু',
 'ক্র',
 'মিক',
 'ভাবে',
 '▁তালিকাভুক্ত',
 ',',
 '▁',
 'র\u200d্যা',
 'ঙ্ক',
 '▁করা',
 '▁হয়নি',
 '),',
 '▁এ',
 'ন',
 'পিআর',
 '[SEP]']

In [20]:
inputs.word_ids()

[None, 0, 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 5, 5, 5, 6, 7, 7, 8, 8, 8, None]

In [21]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [22]:
labels = datasets["train"][0]["ner_labels"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[12, 12, 12, 12, 12, 12, 12, 12, 0]
[-100, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 0, 0, 0, -100]


In [23]:
datasets["train"][0]

{'id': '# id 5e80f5c9-1196-4906-826b-4cbdcaec7b6f\tdomain=train',
 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-CORP'],
 'tokens': ['২০১৮',
  'এর',
  'সেরা',
  '(বর্ণানুক্রমিকভাবে',
  'তালিকাভুক্ত,',
  'র\u200d্যাঙ্ক',
  'করা',
  'হয়নি),',
  'এনপিআর'],
 '__index_level_0__': 0,
 'ner_labels': [12, 12, 12, 12, 12, 12, 12, 12, 0]}

In [24]:
mapping = {'B-CORP':0,'B-CW':1,'B-GRP':2, 'B-LOC':3, 'B-PER':4, 'B-PROD':5, 'I-CORP':6, 'I-CW':7, 'I-GRP':8,\
               'I-LOC':9,'I-PER':10,'I-PROD':11,'O':12}
label_names = list()
for k, v in mapping.items():
    label_names.append(k)

label_names

['B-CORP',
 'B-CW',
 'B-GRP',
 'B-LOC',
 'B-PER',
 'B-PROD',
 'I-CORP',
 'I-CW',
 'I-GRP',
 'I-LOC',
 'I-PER',
 'I-PROD',
 'O']

In [25]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_labels"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [26]:
tokenized_datasets = datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [27]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 15300
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
})

In [28]:
temp = tokenized_datasets.filter(lambda x:x if 0 in x["input_ids"] else None)

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [29]:
temp

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 9530
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 530
    })
})

In [30]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [31]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,   12,   12,   12,   12,   12,   12,   12,   12,   12,   12,   12,
           12,   12,   12,   12,   12,   12,   12,   12,    0,    0,    0, -100],
        [-100,   12,   12,   12,   12,   12,   12,   12,   12,   12,   12,   12,
            2,    8,   12,   12, -100, -100, -100, -100, -100, -100, -100, -100]])

In [33]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 0, 0, 0, -100]
[-100, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 2, 8, 12, 12, -100]


In [34]:
# !pip install seqeval

In [35]:
import evaluate

metric = evaluate.load("seqeval")

In [36]:
labels = datasets["train"][0]["ner_labels"]
labels = [label_names[i] for i in labels]
labels

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-CORP']

In [37]:
predictions = labels.copy()
predictions[2] = 'B-CORP'
metric.compute(predictions=[predictions], references=[labels])

{'CORP': {'precision': 0.5,
  'recall': 1.0,
  'f1': 0.6666666666666666,
  'number': 1},
 'overall_precision': 0.5,
 'overall_recall': 1.0,
 'overall_f1': 0.6666666666666666,
 'overall_accuracy': 0.8888888888888888}

In [38]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [39]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [40]:
id2label

{0: 'B-CORP',
 1: 'B-CW',
 2: 'B-GRP',
 3: 'B-LOC',
 4: 'B-PER',
 5: 'B-PROD',
 6: 'I-CORP',
 7: 'I-CW',
 8: 'I-GRP',
 9: 'I-LOC',
 10: 'I-PER',
 11: 'I-PROD',
 12: 'O'}

In [41]:
label2id

{'B-CORP': 0,
 'B-CW': 1,
 'B-GRP': 2,
 'B-LOC': 3,
 'B-PER': 4,
 'B-PROD': 5,
 'I-CORP': 6,
 'I-CW': 7,
 'I-GRP': 8,
 'I-LOC': 9,
 'I-PER': 10,
 'I-PROD': 11,
 'O': 12}

In [42]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at ../Bengali Pretraining/models/unigram/unigram-long-text were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at .

In [43]:
model.config.num_labels

13

In [44]:
from transformers import TrainingArguments

args = TrainingArguments(
    "models/ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [45]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [46]:
trainer.evaluate()

[34m[1mwandb[0m: Currently logged in as: [33mashahri1[0m. Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 2.643169641494751,
 'eval_precision': 0.008607872290476564,
 'eval_recall': 0.07119741100323625,
 'eval_f1': 0.015358838313320305,
 'eval_accuracy': 0.07768871075484302,
 'eval_runtime': 4.7965,
 'eval_samples_per_second': 166.788,
 'eval_steps_per_second': 20.849}

In [47]:
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.306,0.26099,0.555341,0.558576,0.556954,0.918838
2,0.1686,0.219278,0.635645,0.676375,0.655378,0.933868
3,0.0956,0.240201,0.672874,0.701618,0.686946,0.938878


TrainOutput(global_step=5739, training_loss=0.2064756565256845, metrics={'train_runtime': 461.6532, 'train_samples_per_second': 99.425, 'train_steps_per_second': 12.431, 'total_flos': 755261516740392.0, 'train_loss': 0.2064756565256845, 'epoch': 3.0})