In [None]:
!pip3 install datasets
!pip3 install sentencepiece
!pip3 install seqeval
!pip install transformers[torch]
!pip install accelerate -U

In [37]:
import datasets
import pandas as pd

In [38]:
# Let's download the Naamapadam (Indic NER) dataset
from datasets import ClassLabel, load_dataset, load_metric, DownloadMode

lang = "hi"

# Load the three splits of the chosen language configuration in one pass.
# Only the first 20,000 training rows are requested; validation and test are
# loaded in full.
train_dataset, validation_dataset, test_dataset = (
    load_dataset("ai4bharat/naamapadam", lang, split=split_spec)
    for split_spec in ("train[:20000]", "validation", "test")
)



In [39]:
# Column names used by the tokenization / alignment step.
text_column_name = "tokens"
label_column_name = "ner_tags"

# The dataset schema; this name was previously used without ever being
# defined, which raised NameError on a fresh kernel.
features = train_dataset.features

# Tag names in class-id order, as declared by the label column's ClassLabel feature.
label_list = features[label_column_name].feature.names

# Tag name -> integer class id.
label_to_id = {
    label: features[label_column_name].feature.str2int(label)
    for label in label_list
}

num_labels = len(label_list)

In [40]:
# Import Models and Tokenizer
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy
import numpy as np

# Build the configuration once and actually hand it to the model (it used to be
# created and then ignored). The label maps are recorded in the config so the
# saved checkpoint carries the real tag names.
config = AutoConfig.from_pretrained(
    'ai4bharat/indic-bert',
    num_labels=num_labels,
    finetuning_task='ner',
    id2label=dict(enumerate(label_list)),
    label2id=label_to_id,
)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
# The token-classification head is newly initialised and must be fine-tuned
# (see the warning printed below this cell).
model = AutoModelForTokenClassification.from_pretrained('ai4bharat/indic-bert', config=config)

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
# Tokenize all texts and align the labels with them.
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word-level tags to sub-tokens.

    Parameters
    ----------
    examples : mapping
        A batch with a list of word lists under ``text_column_name`` and a
        parallel list of tag-id lists under ``label_column_name``.

    Returns
    -------
    The tokenizer output with an extra ``"labels"`` entry. For every sequence,
    the first sub-token of each word carries that word's tag id; special
    tokens and the remaining sub-tokens of a word get -100.
    """
    encoding = tokenizer(
        examples[text_column_name],
        padding="max_length",
        truncation=True,
        max_length=256,
        # The inputs are already lists of words (one tag per word).
        is_split_into_words=True,
    )

    aligned = []
    for row, word_tags in enumerate(examples[label_column_name]):
        last_word = None
        row_labels = []
        for word_id in encoding.word_ids(batch_index=row):
            # Only the first sub-token of a real word is labelled; everything
            # else (word id None, or a continuation piece) is masked with -100.
            opens_word = word_id is not None and word_id != last_word
            row_labels.append(word_tags[word_id] if opens_word else -100)
            last_word = word_id
        aligned.append(row_labels)

    encoding["labels"] = aligned
    return encoding

In [None]:
# Smoke-test the alignment function on a one-row batch taken from the training split.
correct_output = tokenize_and_align_labels(train_dataset[:1])
# Map the ids of that single sequence back to sub-token strings for inspection.
tokens = tokenizer.convert_ids_to_tokens(ids=correct_output["input_ids"][0])

# `correct_output` now holds the input ids and labels to pass to the model.

In [46]:
# Tokenize the training split batch-wise and attach the aligned labels.
tokenized_train_dataset = train_dataset.map(function=tokenize_and_align_labels, batched=True)
print(tokenized_train_dataset)

  0%|          | 0/20 [00:00<?, ?ba/s]

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 20000
})


In [44]:
# Lengths for row 0 after tokenization: the word-level `ner_tags`, then the
# aligned `labels` and the `input_ids` (the latter two should match each other).
print(
    *(len(tokenized_train_dataset[0][column]) for column in ("ner_tags", "labels", "input_ids")),
    sep="\n",
)

47
256
256


In [45]:
# Tokenize the test split batch-wise and attach the aligned labels.
tokenized_test_dataset = test_dataset.map(function=tokenize_and_align_labels, batched=True)
print(tokenized_test_dataset)

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 867
})


In [47]:
# Tokenize the validation split batch-wise and attach the aligned labels.
tokenized_validation_dataset = validation_dataset.map(function=tokenize_and_align_labels, batched=True)
print(tokenized_validation_dataset)

  0%|          | 0/14 [00:00<?, ?ba/s]

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 13460
})


In [48]:
# Collator that batches the tokenized examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [49]:
# seqeval is the metric used to score the NER model.
metric = load_metric("seqeval")

In [51]:
# Demonstrate seqeval on one training example: scoring the gold tags against
# themselves (the output below shows every score equal to 1.0).
example_text = train_dataset[0]
label_list = train_dataset.features["ner_tags"].feature.names

# Convert the integer tag ids of the example into their string names.
labels = [label_list[tag_id] for tag_id in example_text["ner_tags"]]

metric.compute(predictions=[labels], references=[labels])

{'PER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [52]:
# Metrics
# Loads seqeval again; an earlier cell already binds `metric` to the same metric.
metric = load_metric("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into flat seqeval scores.

    Parameters
    ----------
    p : pair
        ``(predictions, labels)``; the predictions are arg-maxed over axis 2
        to obtain one class id per token.

    Returns
    -------
    dict
        The seqeval results with nested per-entity dictionaries flattened
        into ``"<entity>_<score>"`` keys; top-level scores are kept as-is.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions, true_labels = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Positions whose gold label is -100 are ignored for scoring.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Flatten one level of nesting so every reported value has its own key.
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            final_results.update({f"{key}_{n}": v for n, v in value.items()})
        else:
            final_results[key] = value
    return final_results

In [56]:
# Training configuration; checkpoints go to model/upos and evaluation runs on a step schedule.
args = TrainingArguments(
    output_dir="model/upos",
    overwrite_output_dir=True,
    save_total_limit=1,
    evaluation_strategy=IntervalStrategy.STEPS,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
)

In [57]:
# Wire the model, data, collator and metric function into a Trainer.
# Training uses the tokenized train split; evaluation uses the validation split.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    compute_metrics=compute_metrics,
)

In [58]:
# Run fine-tuning and keep the object returned by the training call.
train_result = trainer.train()
# NOTE(review): the name `metrics` is reassigned by the later `trainer.predict(...)` cells.
metrics = train_result.metrics

Step,Training Loss,Validation Loss,Loc Precision,Loc Recall,Loc F1,Loc Number,Org Precision,Org Recall,Org F1,Org Number,Per Precision,Per Recall,Per F1,Per Number,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
500,0.3201,0.346491,0.708628,0.539606,0.612674,10213,0.482123,0.398222,0.436174,9786,0.652571,0.575322,0.611516,10568,0.615165,0.50669,0.555683,0.897141
1000,0.3242,0.3181,0.702113,0.598649,0.646266,10213,0.503783,0.442264,0.471024,9786,0.706839,0.542771,0.614034,10568,0.636578,0.529264,0.577982,0.902196
1500,0.3057,0.293586,0.67213,0.682659,0.677354,10213,0.5504,0.47149,0.507898,9786,0.665309,0.664743,0.665026,10568,0.634863,0.608859,0.621589,0.910497
2000,0.2794,0.286097,0.697576,0.707138,0.702324,10213,0.546242,0.51543,0.530389,9786,0.719745,0.630867,0.672382,10568,0.656256,0.619393,0.637292,0.91334
2500,0.279,0.283199,0.740949,0.645256,0.6898,10213,0.598902,0.434703,0.50376,9786,0.681157,0.655375,0.668017,10568,0.679152,0.581346,0.626454,0.913304
3000,0.2341,0.273426,0.71946,0.694311,0.706662,10213,0.535803,0.551298,0.54344,9786,0.690946,0.705526,0.69816,10568,0.649238,0.652403,0.650817,0.915677
3500,0.2379,0.269328,0.722036,0.716734,0.719375,10213,0.588584,0.507868,0.545255,9786,0.691003,0.709311,0.700037,10568,0.672307,0.647299,0.659566,0.918582
4000,0.2237,0.266063,0.674947,0.752864,0.71178,10213,0.59789,0.521153,0.55689,9786,0.743174,0.672218,0.705917,10568,0.674774,0.6508,0.66257,0.91967
4500,0.2208,0.257876,0.716366,0.732008,0.724103,10213,0.562789,0.571531,0.567126,9786,0.72542,0.714232,0.719783,10568,0.66984,0.674486,0.672155,0.921668
5000,0.2196,0.25099,0.724527,0.742191,0.733253,10213,0.593237,0.557531,0.57483,9786,0.74136,0.704391,0.722403,10568,0.689562,0.670004,0.679642,0.922897


Checkpoint destination directory model/upos/checkpoint-5000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


In [59]:
# Persist the fine-tuned model to the `model_indic_bert_finetuned` directory.
trainer.save_model(output_dir="model_indic_bert_finetuned")

In [60]:
import zipfile
import os
from IPython.display import FileLink

def zip_dir(directory=os.curdir, file_name='directory.zip'):
    """
    Zip every file under a directory into a single archive.

    The archive is created inside ``directory`` and entries are stored with
    paths relative to ``directory``. The process working directory is left
    untouched, and the archive is closed before the link is returned so the
    file on disk is complete.

    Parameters
    ----------
    directory : str
        Directory to archive; defaults to the current working directory.
    file_name : str
        Name of the archive (including ``.zip``); defaults to 'directory.zip'.

    Returns
    -------
    FileLink
        A hyperlink that can be used to download the archive.
    """
    archive_path = os.path.normpath(os.path.join(directory, file_name))
    archive_abspath = os.path.abspath(archive_path)

    # The context manager guarantees the archive is finalised and closed.
    with zipfile.ZipFile(archive_path, mode='w') as zip_ref:
        for folder, _, files in os.walk(directory):
            for file in files:
                file_path = os.path.join(folder, file)
                # Skip only the archive being written, not every file whose
                # name happens to contain `file_name`.
                if os.path.abspath(file_path) == archive_abspath:
                    continue
                zip_ref.write(file_path, arcname=os.path.relpath(file_path, directory))

    return FileLink(archive_path)
# Call with the defaults: archives the current directory into 'directory.zip'.
# As the cell's last expression, the returned FileLink is the cell output.
zip_dir()

In [61]:
# Evaluate the model on the Trainer's evaluation dataset and log the results.
Final_metrics = trainer.evaluate()
trainer.log_metrics(split="eval", metrics=Final_metrics)

***** eval metrics *****
  epoch                   =        3.0
  eval_LOC_f1             =     0.7406
  eval_LOC_number         =      10213
  eval_LOC_precision      =     0.7376
  eval_LOC_recall         =     0.7436
  eval_ORG_f1             =     0.5739
  eval_ORG_number         =       9786
  eval_ORG_precision      =     0.5785
  eval_ORG_recall         =     0.5695
  eval_PER_f1             =     0.7254
  eval_PER_number         =      10568
  eval_PER_precision      =     0.7282
  eval_PER_recall         =     0.7227
  eval_loss               =     0.2557
  eval_overall_accuracy   =     0.9237
  eval_overall_f1         =     0.6823
  eval_overall_precision  =      0.684
  eval_overall_recall     =     0.6806
  eval_runtime            = 0:02:11.01
  eval_samples_per_second =    102.738
  eval_steps_per_second   =     12.846


# Testing on Training Dataset

In [63]:
# Score the fine-tuned model on the tokenized training split.
predictions, labels, metrics = trainer.predict(test_dataset=tokenized_train_dataset)
trainer.log_metrics(split="train", metrics=metrics)

# Macro F1: unweighted mean of the three per-entity F1 scores.
sum_of_f1_scores = sum(metrics[f"test_{entity}_f1"] for entity in ("LOC", "ORG", "PER"))
macro_f1 = sum_of_f1_scores / 3
print('\n Macro f1 score::', macro_f1)

***** train metrics *****
  test_LOC_f1             =      0.832
  test_LOC_number         =      14841
  test_LOC_precision      =     0.8228
  test_LOC_recall         =     0.8413
  test_ORG_f1             =     0.7059
  test_ORG_number         =      14082
  test_ORG_precision      =     0.7151
  test_ORG_recall         =      0.697
  test_PER_f1             =     0.8359
  test_PER_number         =      15614
  test_PER_precision      =     0.8364
  test_PER_recall         =     0.8354
  test_loss               =     0.1506
  test_overall_accuracy   =      0.955
  test_overall_f1         =      0.794
  test_overall_precision  =     0.7943
  test_overall_recall     =     0.7936
  test_runtime            = 0:03:15.09
  test_samples_per_second =    102.516
  test_steps_per_second   =     12.814

 Macro f1 score:: 0.791260325081025


# Testing on the Held-out Test Split

In [64]:
from datasets import ClassLabel, load_dataset, load_metric, DownloadMode
# Load the Hindi test split again and tokenize it with the same alignment function.
lang = 'hi'
test_dataset = load_dataset(path='ai4bharat/naamapadam', name=lang, split='test')
tokenized_test_dataset = test_dataset.map(function=tokenize_and_align_labels, batched=True)
print(tokenized_test_dataset)

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 867
})


In [65]:
# Score the fine-tuned model on the tokenized test split.
predictions, labels, metrics = trainer.predict(test_dataset=tokenized_test_dataset)
trainer.log_metrics(split="Test", metrics=metrics)

# Macro F1: unweighted mean of the three per-entity F1 scores.
sum_of_f1_scores = sum(metrics[f"test_{entity}_f1"] for entity in ("LOC", "ORG", "PER"))
macro_f1 = sum_of_f1_scores / 3
print('\n Macro f1 score::', macro_f1)

***** Test metrics *****
  test_LOC_f1             =     0.7012
  test_LOC_number         =        614
  test_LOC_precision      =     0.7088
  test_LOC_recall         =     0.6938
  test_ORG_f1             =     0.6392
  test_ORG_number         =        525
  test_ORG_precision      =     0.6155
  test_ORG_recall         =     0.6648
  test_PER_f1             =     0.7462
  test_PER_number         =        790
  test_PER_precision      =     0.7443
  test_PER_recall         =     0.7481
  test_loss               =     0.2144
  test_overall_accuracy   =     0.9343
  test_overall_f1         =     0.7021
  test_overall_precision  =     0.6962
  test_overall_recall     =     0.7081
  test_runtime            = 0:00:08.44
  test_samples_per_second =    102.675
  test_steps_per_second   =     12.908

 Macro f1 score:: 0.6955469427691648
