# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import re

# Loading Dataset and Cleaning

In [2]:
file =  open("hi_train.conll",encoding="utf-8")
text = file.read()
text = re.sub(r'[^\w\s]', '', text)

Spliting each sentences into words and tags

In [3]:
words,tags = [],[]
for line in text.split('\n'):
  if(line == '' or line[0] == '#'):
    continue
  else:
      line = line.split()
      if line[0] !='id':
        words.append(line[0])
        tags.append(line[-1])

In [4]:
df = pd.DataFrame({"sentence":words,"word_labels":tags})
df

Unnamed: 0,sentence,word_labels
0,जयरजयन,O
1,०१९,O
2,जपज,O
3,_,O
4,जयरजयन,O
...,...,...
244561,कवलकम,BPROD
244562,सनपडरगन,IPROD
244563,_,O
244564,क,O


Finding Unique labels and assign them ids for our finetuning process


In [5]:
labels_to_ids = {k: v for v, k in enumerate(df.word_labels.unique())}
ids_to_labels = {v: k for v, k in enumerate(df.word_labels.unique())}
labels_to_ids
ids_to_labels

{0: 'O',
 1: 'BCW',
 2: 'BPROD',
 3: 'BPER',
 4: 'IPER',
 5: 'IPROD',
 6: 'BLOC',
 7: 'BCORP',
 8: 'ICORP',
 9: 'BGRP',
 10: 'IGRP',
 11: 'ICW',
 12: 'ILOC'}

In [6]:
df = df[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
df.head()

Unnamed: 0,sentence,word_labels
0,जयरजयन,O
1,०१९,O
2,जपज,O
3,_,O
4,आध,BCW


In [7]:
df['word_labels'].unique()

array(['O', 'BCW', 'BPROD', 'BPER', 'IPER', 'IPROD', 'BLOC', 'BCORP',
       'ICORP', 'BGRP', 'IGRP', 'ICW', 'ILOC'], dtype=object)

Training Parameters


Ref: https://indicnlp.ai4bharat.org/indic-bert/

In [8]:
MAX_LEN = 20
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 2
LEARNING_RATE = 6*1e-03
MAX_GRAD_NORM = 10
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
from transformers import AutoModel, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')

Reference: https://huggingface.co/docs/transformers/training

In [9]:
class dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        #sentence and word labels for the given index
        sentence = self.data.sentence[index].strip()
        word_labels = self.data.word_labels[index].split(",") 

        #using tokenizer to encode sentence
        encoding = self.tokenizer(sentence,
                                  return_offsets_mapping=True, 
                                  padding='max_length',
                                  truncation=True, 
                                  max_length=self.max_len)
        
        #create token labels only for first word pieces of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels] 

        #turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(labels[0])
        
        return item

  def __len__(self):
        return self.len

In [10]:
train_size = 0.8
#val_size = 0.5

train_dataset = df.sample(frac=train_size,random_state=200)
val_dataset = df.drop(train_dataset.index)

train_dataset = train_dataset.reset_index(drop=True)
val_dataset = val_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
validation_set = dataset(val_dataset, tokenizer, MAX_LEN)

FULL Dataset: (26891, 2)
TRAIN Dataset: (21513, 2)


In [11]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **test_params)

In [12]:
import torch
import torch.nn as nn

# Training

Defining Forward Pass for Finetuning

In [13]:
class finetune(nn.Module):
  
  def __init__(self,freeze_bert=False):
     super(finetune, self).__init__()
     self.auto =  AutoModel.from_pretrained('ai4bharat/indic-bert')
     self.classifier =  nn.Sequential(nn.Linear(768, 150), nn.ReLU(),nn.Linear(150, 80),nn.ReLU(),nn.Dropout(0.15),nn.Linear(80, 45),nn.ReLU(),nn.Dropout(0.15),nn.Linear(45,13))
     if freeze_bert:
            for param in self.auto.parameters():
                param.requires_grad = False
  def forward(self,ids,mask):
    output = self.auto(input_ids=ids,attention_mask=mask)
    hidden_stat = output[0][:, 0, :]
    
    logits = self.classifier(hidden_stat)
    return logits




Model Parameters

In [14]:
device=torch.cuda.current_device()
loss_fn = nn.CrossEntropyLoss()
bertt = finetune(True)
bertt.to(device)

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertModel: ['predictions.dense.bias', 'sop_classifier.classifier.weight', 'predictions.LayerNorm.bias', 'predictions.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.weight', 'predictions.dense.weight', 'sop_classifier.classifier.bias', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


finetune(
  (auto): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(200000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
            

Defining Training Funtion

In [15]:
def train(epochs):

  tr_loss, tr_accuracy = 0, 0
  nb_tr_examples, nb_tr_steps = 0, 0
  tr_preds, tr_labels = [], []
  
  optimizer = torch.optim.Adam(params=bertt.parameters(), lr=LEARNING_RATE)
  bertt.train()
  for idx, batch in enumerate(training_loader):
    ids = batch['input_ids'].to(device, dtype = torch.long)
    mask = batch['attention_mask'].to(device, dtype = torch.long)
    labels = batch['labels'].to(device, dtype = torch.long)
    
    logits = bertt(ids,mask)
    loss = loss_fn(logits, labels)
    tr_loss += loss.item()
  
    nb_tr_steps += 1
    nb_tr_examples += labels.size(0)
    
    if idx % 100==0:
        loss_step = tr_loss/nb_tr_steps
        print(f"Training loss per 100 training steps: {loss_step}")

    flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
    active_logits = logits.view(-1,13) # shape (batch_size * seq_len, num_labels)
    flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len)

    active_accuracy = labels.view(-1) != -100

    labels = torch.masked_select(flattened_targets, active_accuracy)
    predictions = torch.masked_select(flattened_predictions, active_accuracy)
    tr_labels.extend(labels)
    tr_preds.extend(predictions)

    tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
    tr_accuracy += tmp_tr_accuracy
  
    torch.nn.utils.clip_grad_norm_(
        parameters=bertt.parameters(), max_norm=MAX_GRAD_NORM
    )
        
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  epoch_loss = tr_loss / nb_tr_steps
  tr_accuracy= tr_accuracy / nb_tr_steps
  print(f"Training loss epoch: {epoch_loss}")
  print(f"Training accuracy epoch: {tr_accuracy}")

Fine Tunning

In [16]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 2.556023120880127
Training loss per 100 training steps: 2.275794744491577
Training loss per 100 training steps: 2.204578047368064
Training loss per 100 training steps: 2.167753530103107
Training loss per 100 training steps: 2.1541588704782235
Training loss per 100 training steps: 2.141475921857381
Training loss per 100 training steps: 2.138277149438461
Training loss per 100 training steps: 2.133813806335188
Training loss per 100 training steps: 2.132943584826704
Training loss per 100 training steps: 2.1258546338229545
Training loss per 100 training steps: 2.1284458022136667
Training loss per 100 training steps: 2.1267987729526454
Training loss per 100 training steps: 2.125574550114504
Training loss per 100 training steps: 2.12871920260716
Training loss per 100 training steps: 2.1277516125186184
Training loss per 100 training steps: 2.127253579704226
Training loss per 100 training steps: 2.12929603902345
Training loss per 100 train

Defining Evaluation Function

In [17]:
def evaluate(bertt,loader):
   
    # For each batch in our validation set...
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
  
    
    bertt.eval()
    with torch.no_grad():
       for idx, batch in enumerate(loader):
             ids = batch['input_ids'].to(device, dtype = torch.long)
             mask = batch['attention_mask'].to(device, dtype = torch.long)
             labels = batch['labels'].to(device, dtype = torch.long)
    
             logits = bertt(ids,mask)
             loss = loss_fn(logits, labels)
             tr_loss += loss.item()
  
             nb_tr_steps += 1
             nb_tr_examples += labels.size(0)
    
             if idx % 100==0:
                loss_step = tr_loss/nb_tr_steps
                print(f"Training loss per 100 training steps: {loss_step}")

             flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
             active_logits = logits.view(-1,13) # shape (batch_size * seq_len, num_labels)
             flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len)

             active_accuracy = labels.view(-1) != -100

             labels = torch.masked_select(flattened_targets, active_accuracy)
             predictions = torch.masked_select(flattened_predictions, active_accuracy)
          
             tr_labels.extend(labels)
             tr_preds.extend(predictions)
            
             tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
             tr_accuracy += tmp_eval_accuracy
  
    labels = [ids_to_labels[id.item()] for id in tr_labels]
    predictions = [ids_to_labels[id.item()] for id in tr_preds]
    
    eval_loss = tr_loss / nb_tr_steps 
    eval_accuracy = tr_accuracy / nb_tr_steps 
    print(f"Validation Loss: { eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy }")

    return labels, predictions

In [18]:
x = evaluate(bertt,validation_loader)

Training loss per 100 training steps: 2.3116698265075684
Training loss per 100 training steps: 2.109209692714238
Training loss per 100 training steps: 2.10298116201192
Training loss per 100 training steps: 2.1166473924123568
Training loss per 100 training steps: 2.0996899139554124
Training loss per 100 training steps: 2.096250448160305
Training loss per 100 training steps: 2.1158464110631514
Training loss per 100 training steps: 2.121652326298168
Training loss per 100 training steps: 2.114468197027842
Training loss per 100 training steps: 2.1089759940047905
Training loss per 100 training steps: 2.108643101466881
Training loss per 100 training steps: 2.1046007540310434
Training loss per 100 training steps: 2.106894886315018
Training loss per 100 training steps: 2.105998875451583
Validation Loss: 2.1033085860284286
Validation Accuracy: 0.42044609665427507


# Testing

In [19]:
file_folder = "hi_dev.conll"
file =  open(file_folder,encoding="utf-8")
text = file.read()


In [20]:
import re
text = re.sub(r'[^\w\s]', '', text)

In [21]:
s_cout = 0
l1 = []
l2 = []
l3 = []
for word in text.split('\n'):
  if(word == ''):
    s_cout =  s_cout + 1
  elif(word[0] == '#'):
    continue
    
  else:
      word = word.split(" ")
      k  = "sent_"+ str(s_cout)
      l1.append(k)
      l2.append(word[0])
      l3.append(word[-1])

In [22]:
df.head()

Unnamed: 0,sentence,word_labels
0,जयरजयन,O
1,०१९,O
2,जपज,O
3,_,O
4,आध,BCW


In [23]:
df = pd.DataFrame({"sent":l1,"word":l2,"tag":l3})
df.head()

Unnamed: 0,sent,word,tag
0,sent_0,,bb5d9ab998a342828dbdc1107808c6c6\tdomaindev
1,sent_0,१४९२,O
2,sent_0,म,O
3,sent_0,एक,O
4,sent_0,चरटर,O


In [24]:
lis = []
for i in range(df.shape[0]):
  if(df.iloc[i,1] != ''):
    lis.append(i)
df = df.iloc[lis,:]

In [25]:
lis = []
for i in range(df.shape[0]):
  if(df.iloc[i,2] != ''):
    lis.append(i)
df = df.iloc[lis,:]

In [26]:
df['sentence'] =  df['word']
df['word_labels']  = df['tag']
df.head()

Unnamed: 0,sent,word,tag,sentence,word_labels
1,sent_0,१४९२,O,१४९२,O
2,sent_0,म,O,म,O
3,sent_0,एक,O,एक,O
4,sent_0,चरटर,O,चरटर,O
5,sent_0,क,O,क,O


In [27]:
labels_to_ids = {k: v for v, k in enumerate(df.word_labels.unique())}
ids_to_labels = {v: k for v, k in enumerate(df.word_labels.unique())}
labels_to_ids
ids_to_labels

{0: 'O',
 1: 'BLOC',
 2: 'BPROD',
 3: 'BCORP',
 4: 'ICORP',
 5: 'BGRP',
 6: 'IGRP',
 7: 'BCW',
 8: 'BPER',
 9: 'IPER',
 10: 'ICW',
 11: 'ILOC',
 12: 'IPROD'}

In [28]:
from collections import Counter
Counter(df.word_labels)

Counter({'BCORP': 134,
         'BCW': 113,
         'BGRP': 148,
         'BLOC': 131,
         'BPER': 133,
         'BPROD': 169,
         'ICORP': 138,
         'ICW': 150,
         'IGRP': 296,
         'ILOC': 77,
         'IPER': 165,
         'IPROD': 107,
         'O': 10676})

In [29]:
df = df[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
df.head()

Unnamed: 0,sentence,word_labels
0,१४९२,O
1,म,O
2,एक,O
3,चरटर,O
4,क,O


In [30]:
testing_set = dataset(df, tokenizer, MAX_LEN)

In [31]:
test_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }


testing_loader = DataLoader(testing_set, **test_params)

Testing Validation Accuracy

In [32]:
x = evaluate(bertt,testing_loader)

Training loss per 100 training steps: 2.048029661178589
Training loss per 100 training steps: 1.7715914851368064
Training loss per 100 training steps: 1.750408106182345
Training loss per 100 training steps: 1.7534855881402658
Training loss per 100 training steps: 1.744390343787367
Validation Loss: 1.7444436985712786
Validation Accuracy: 0.6128917378917379


Testing F1 Score

In [33]:
from sklearn.metrics import f1_score
f1_score(x[0], x[1], average='micro')

0.6127739176910743