In [1]:
!pip3 install datasets
!pip3 install sentencepiece
!pip3 install seqeval
!pip install transformers[torch]
!pip install accelerate -U

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=4a70bb68e7d2439116af214c9d8a46c23483526a041b6f5fd20853e250bea855
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling col

In [32]:
import datasets
import pandas as pd

In [4]:
# Download the Naamapadam (Indic NER) dataset for the selected language.
from datasets import ClassLabel, load_dataset, load_metric, DownloadMode

lang = 'hi'

# Training uses only the first 20,000 sentences; validation and test are loaded whole.
train_dataset, validation_dataset, test_dataset = (
    load_dataset(path='ai4bharat/naamapadam', name=lang, split=split_spec)
    for split_spec in ('train[:20000]', 'validation', 'test')
)



In [5]:
# Column schema of the training split; kept in `features` because later cells read
# the label names from it.
features = train_dataset.features
print(features)

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)}


In [8]:
# Names of the dataset columns holding the words and their NER tags.
text_column_name = "tokens"
label_column_name = "ner_tags"

# The tag vocabulary is read from the label column's feature definition.
label_feature = features[label_column_name].feature
label_list = label_feature.names
num_labels = len(label_list)

# Map every tag name to the integer id the dataset uses for it.
label_to_id = {tag: label_feature.str2int(tag) for tag in label_list}

print(label_to_id)


{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}


In [9]:
from transformers import (
    AutoModelForTokenClassification,
    AutoConfig,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback,
    IntervalStrategy,
)
import numpy as np

# One name for the checkpoint so config, tokenizer and model cannot drift apart.
model_checkpoint = 'ai4bharat/indic-bert'

# Record the tag names in the config so the saved model reports 'B-PER', 'I-LOC', ...
# instead of generic LABEL_<n> names.
config = AutoConfig.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    finetuning_task='ner',
    id2label={tag_id: tag for tag, tag_id in label_to_id.items()},
    label2id=label_to_id,
)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Pass the config explicitly; previously it was built but never handed to the model.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, config=config)

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Tokenize all texts and align the labels with them.
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word-level tags to sub-word tokens.

    Parameters
    ----------
    examples : mapping
        A batch holding, under ``text_column_name``, a list of word lists and, under
        ``label_column_name``, the matching list of per-word label ids.

    Returns
    -------
    The tokenizer output for the batch with an extra ``"labels"`` entry: one list per
    sentence, as long as that sentence's token sequence. The first token of every word
    carries the word's label; special/padding tokens and the remaining pieces of a
    word carry -100 so the loss function ignores them.
    """
    encoded = tokenizer(
        examples[text_column_name],
        # The inputs are already lists of words (one label per word).
        is_split_into_words=True,
        truncation=True,
        max_length=256,
        padding="max_length",
    )

    aligned_labels = []
    for row, word_labels in enumerate(examples[label_column_name]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            # A word id of None marks a special token; a repeated word id marks a
            # continuation piece. Only the first piece of a word is labelled.
            opens_new_word = word is not None and word != last_word
            row_labels.append(word_labels[word] if opens_new_word else -100)
            last_word = word
        aligned_labels.append(row_labels)

    encoded["labels"] = aligned_labels
    return encoded

In [11]:
# Run the alignment on a one-example batch and look at the resulting sub-word tokens.
correct_output = tokenize_and_align_labels(train_dataset[:1])
first_input_ids = correct_output["input_ids"][0]
tokens = tokenizer.convert_ids_to_tokens(first_input_ids)
print(tokens)

# correct_output now holds both the input ids and the aligned labels for the model.

['[CLS]', '▁सक', 'टर', '▁55', '/', '56', '▁क', '▁एसएचओ', '▁अर', 'वद', '▁कमर', '▁न', '▁बत', 'य', '▁क', '▁इस', '▁म', 'मल', '▁म', '▁आई', 'पस', '▁क', '▁धर', '▁376', '▁-', '▁ड', '▁(', '▁ग', 'गर', 'प', '▁)', '▁क', '▁तहत', '▁म', 'मल', '▁दर', 'ज', '▁कर', '▁लय', '▁ग', 'य', '▁ह', '▁।', '[SEP]', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<p

In [None]:
# Show every sub-word token of the first example next to its aligned label id.
sample_tokens = tokenizer.convert_ids_to_tokens(correct_output["input_ids"][0])
sample_labels = correct_output["labels"][0]
for piece, tag_id in zip(sample_tokens, sample_labels):
    print(f"{piece:_<40}{tag_id}")


In [13]:
# Tokenize and align the training split, batch by batch.
tokenized_train_dataset = train_dataset.map(function=tokenize_and_align_labels, batched=True)
print(tokenized_train_dataset)

  0%|          | 0/20 [00:00<?, ?ba/s]

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 20000
})


In [14]:
# After alignment, `labels` is as long as `input_ids` (the padded sequence length),
# while `ner_tags` still holds one tag per original word.
first_example = tokenized_train_dataset[0]
for column in ("ner_tags", "labels", "input_ids"):
    print(len(first_example[column]))

30
256
256


In [15]:
# Tokenize and align the test split, batch by batch.
tokenized_test_dataset = test_dataset.map(function=tokenize_and_align_labels, batched=True)
print(tokenized_test_dataset)

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 867
})


In [16]:
# Tokenize and align the validation split, batch by batch.
tokenized_validation_dataset = validation_dataset.map(function=tokenize_and_align_labels, batched=True)
print(tokenized_validation_dataset)

  0%|          | 0/14 [00:00<?, ?ba/s]

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 13460
})


In [17]:
# Collator that assembles tokenized examples into batches for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [18]:
# Metric used to judge model quality: for NER we score predictions with seqeval.
# NOTE(review): `load_metric` may be deprecated or removed in newer `datasets`
# releases — confirm against the installed version.
metric = datasets.load_metric("seqeval")

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [19]:
# Demonstrate seqeval on one training example: scoring the gold tags against
# themselves must give a perfect result.
label_list = train_dataset.features["ner_tags"].feature.names
example_text = train_dataset[0]

# Convert the integer tag ids into their string names, as seqeval expects tag strings.
labels = [label_list[tag_id] for tag_id in example_text["ner_tags"]]
print(labels)

metric.compute(predictions=[labels], references=[labels])

['O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


{'PER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [20]:
# Metrics
# NOTE(review): this re-loads the same "seqeval" metric that an earlier cell already
# bound to `metric` via `datasets.load_metric`; one of the two loads looks redundant.
metric = load_metric("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into a flat dictionary of seqeval scores.

    Parameters
    ----------
    p : pair
        ``(predictions, labels)`` where ``predictions`` holds per-token class scores
        (the class axis is axis 2) and ``labels`` holds the gold label ids, with -100
        marking positions to ignore.

    Returns
    -------
    dict
        The result of ``metric.compute`` with nested dictionaries flattened into
        ``"<outer key>_<inner key>"`` entries; non-dictionary values are kept under
        their own key.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions, true_labels = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold):
        # Drop every position whose gold label is the ignore index (-100).
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(predicted_row, gold_row) if gold_id != -100]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Flatten one level of nesting so every reported value has a single string key.
    flat_results = {}
    for name, score in results.items():
        if isinstance(score, dict):
            flat_results.update({f"{name}_{sub_name}": sub_score for sub_name, sub_score in score.items()})
        else:
            flat_results[name] = score
    return flat_results

In [55]:
# Training configuration.
args = TrainingArguments(
    # Output location and checkpoint retention.
    output_dir="model/upos",
    overwrite_output_dir=True,
    save_total_limit=1,
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.001,
    # Batch sizes per device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate at step intervals (the recorded log shows one evaluation every 500 steps).
    evaluation_strategy="steps",
)

In [56]:
# Wire the model, training arguments, datasets and metric function together.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    compute_metrics=compute_metrics,
)

In [57]:
# Fine-tune the model, then keep the summary metrics reported by the training run.
train_result = trainer.train()
metrics = train_result.metrics

Step,Training Loss,Validation Loss,Loc Precision,Loc Recall,Loc F1,Loc Number,Org Precision,Org Recall,Org F1,Org Number,Per Precision,Per Recall,Per F1,Per Number,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
500,0.15,0.317825,0.70741,0.687947,0.697543,10213,0.523925,0.528101,0.526005,9786,0.717409,0.655091,0.684835,10568,0.649222,0.625413,0.637095,0.912453
1000,0.2005,0.28506,0.683075,0.725546,0.70367,10213,0.557036,0.508482,0.531652,9786,0.681676,0.70193,0.691655,10568,0.64586,0.647888,0.646872,0.91383
1500,0.1916,0.29231,0.722901,0.694801,0.708573,10213,0.533052,0.541386,0.537186,9786,0.724052,0.663418,0.69241,10568,0.659182,0.634835,0.646779,0.915969
2000,0.1868,0.293712,0.701358,0.717909,0.709537,10213,0.529124,0.564378,0.546183,9786,0.72637,0.671177,0.697684,10568,0.650683,0.652599,0.65164,0.914431
2500,0.1899,0.292779,0.695077,0.735435,0.714687,10213,0.619295,0.488044,0.545891,9786,0.700524,0.696347,0.698429,10568,0.676911,0.642719,0.659372,0.91733
3000,0.1475,0.295989,0.70648,0.727994,0.717076,10213,0.514359,0.572859,0.542035,9786,0.690361,0.704864,0.697537,10568,0.636079,0.670331,0.652756,0.914211
3500,0.1465,0.303666,0.720578,0.712817,0.716677,10213,0.564136,0.524014,0.543335,9786,0.669922,0.714042,0.691279,10568,0.655153,0.652795,0.653972,0.916321
4000,0.1394,0.300514,0.707261,0.726721,0.716859,10213,0.54602,0.555896,0.550914,9786,0.709468,0.692752,0.70101,10568,0.655803,0.660287,0.658038,0.916817
4500,0.1383,0.299948,0.719504,0.715559,0.717526,10213,0.541564,0.56121,0.551212,9786,0.70649,0.692184,0.699264,10568,0.656238,0.658063,0.657149,0.916696
5000,0.1428,0.29751,0.713223,0.723,0.718078,10213,0.553495,0.551911,0.552702,9786,0.705402,0.694455,0.699886,10568,0.659479,0.658357,0.658918,0.917212


In [24]:
# NOTE(review): the recorded output for this cell reports 'epoch': 3.0, while the
# visible TrainingArguments set num_train_epochs=2 — the output appears to come from
# an earlier run; re-run the notebook top to bottom to confirm.
print(metrics)

{'train_runtime': 2388.4194, 'train_samples_per_second': 25.121, 'train_steps_per_second': 1.57, 'total_flos': 662868449280000.0, 'train_loss': 0.3149069254557292, 'epoch': 3.0}


In [25]:
# Macro F1 = unweighted mean of the per-entity F1 scores.
# NOTE(review): these three values are typed in by hand and do not appear in any
# recorded output of this notebook — confirm which run they were copied from.
per_entity_f1 = (0.776407, 0.624282, 0.762044)

sumoff1score = 0.0
for entity_f1 in per_entity_f1:
    sumoff1score += entity_f1

macrof1 = sumoff1score / len(per_entity_f1)
print(macrof1)

0.720911


In [26]:
# Persist the fine-tuned model to a local directory.
# NOTE(review): the directory name mentions "5Epoch" and "one_lakh", but the visible
# configuration trains for 2 epochs on a 20,000-sentence slice — confirm the name.
trainer.save_model(output_dir="model_indic_ner_5Epoch_one_lakh")

In [None]:
import zipfile
import os
from IPython.display import FileLink

def zip_dir(directory = os.curdir, file_name = 'directory.zip'):
    """
    Zip all the files under a directory and return a download link to the archive.

    Parameters
    ___
    directory: str
        directory to be zipped, default is the current working directory

    file_name: str
        the name of the zipped file (including .zip), default is 'directory.zip';
        a relative name is created inside `directory`

    Returns
    ___
    A FileLink to the archive, which can be used to download the zip file.
    Entries inside the archive are stored with paths relative to `directory`.
    """
    # Build the archive path explicitly instead of calling os.chdir(): changing the
    # process-wide working directory broke relative `directory` values (the later
    # os.walk(directory) was resolved from inside the directory itself) and leaked
    # the new cwd to every cell that ran afterwards.
    zip_path = os.path.normpath(os.path.join(directory, file_name))
    zip_abspath = os.path.abspath(zip_path)

    # The context manager guarantees the archive is finalised (central directory
    # written) before the link is handed back.
    with zipfile.ZipFile(zip_path, mode='w') as zip_ref:
        for folder, _, files in os.walk(directory):
            for file in files:
                file_path = os.path.join(folder, file)
                # Skip only the archive being written; a substring match on the
                # file name would also drop unrelated files with similar names.
                if os.path.abspath(file_path) == zip_abspath:
                    continue
                zip_ref.write(file_path, arcname=os.path.relpath(file_path, directory))

    return FileLink(zip_path)

In [None]:
# Zip the current working directory into 'directory.zip' (the function's defaults).
zip_dir()

In [37]:
# Print every metric on its own line as "name: value".
# NOTE(review): the recorded output shows test_* metrics, so this cell appears to
# have been run after `metrics` was rebound by a later prediction cell.
for metric_name in metrics:
    print(f"{metric_name}: {metrics[metric_name]}")

test_loss: 0.2399081140756607
test_LOC_precision: 0.6621848739495798
test_LOC_recall: 0.6416938110749185
test_LOC_f1: 0.6517783291976841
test_LOC_number: 614
test_ORG_precision: 0.5900383141762452
test_ORG_recall: 0.5866666666666667
test_ORG_f1: 0.5883476599808979
test_ORG_number: 525
test_PER_precision: 0.7523302263648469
test_PER_recall: 0.7151898734177216
test_PER_f1: 0.7332900713822195
test_PER_number: 790
test_overall_precision: 0.6782655246252677
test_overall_recall: 0.6568170036288232
test_overall_f1: 0.6673689755069792
test_overall_accuracy: 0.9253003569094657
test_runtime: 8.246
test_samples_per_second: 105.142
test_steps_per_second: 6.67


# Testing on 20% Testing Dataset

In [60]:
from datasets import ClassLabel, load_dataset, load_metric, DownloadMode

lang = 'hi'

# Reload the test split and run it through the same tokenization/alignment step.
test_dataset = load_dataset(path='ai4bharat/naamapadam', name=lang, split='test')
tokenized_test_dataset = test_dataset.map(function=tokenize_and_align_labels, batched=True)
print(tokenized_test_dataset)

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 867
})


In [61]:
# Evaluate the fine-tuned model on the tokenized test split and log the scores.
test_output = trainer.predict(tokenized_test_dataset)
predictions, labels, metrics = test_output
trainer.log_metrics("Test", metrics)

# Macro F1 = unweighted mean of the per-entity F1 scores (LOC, ORG, PER).
f1_keys = ('test_LOC_f1', 'test_ORG_f1', 'test_PER_f1')
sum_of_f1_scores = 0.0
for f1_key in f1_keys:
    sum_of_f1_scores += metrics[f1_key]
macro_f1 = sum_of_f1_scores / len(f1_keys)
print('\n Macro f1 score::', macro_f1)

***** Test metrics *****
  test_LOC_f1             =     0.6716
  test_LOC_number         =        614
  test_LOC_precision      =     0.6806
  test_LOC_recall         =     0.6629
  test_ORG_f1             =     0.6393
  test_ORG_number         =        525
  test_ORG_precision      =      0.614
  test_ORG_recall         =     0.6667
  test_PER_f1             =     0.7289
  test_PER_number         =        790
  test_PER_precision      =     0.7364
  test_PER_recall         =     0.7215
  test_loss               =     0.2455
  test_overall_accuracy   =     0.9296
  test_overall_f1         =     0.6856
  test_overall_precision  =     0.6833
  test_overall_recall     =     0.6879
  test_runtime            = 0:00:08.37
  test_samples_per_second =    103.545
  test_steps_per_second   =     13.018

 Macro f1 score:: 0.6799289412877805
