In [1]:
# pip install transformers
# !pip install seqeval
import sys
import os
sys.path.append('../')

In [18]:
import pandas as pd
import os
import torch 
import numpy as np
from transformers import AutoModelForTokenClassification, AutoTokenizer
from datasets import Dataset
from tqdm import tqdm
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
import random
from transformers import DataCollatorForTokenClassification
import evaluate
from model import CRF
from dataloader import PreDataCollator
from util.eval import get_tag_mappings
os.environ["WANDB_DISABLED"] = "true"

### Env Setup

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

### Seed all

SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

### Preparing data

In [4]:
LANG = 'bn' # use None for all lang

In [5]:
# Load data as pandas dataframe

df = pd.read_csv('./Dataset/train.csv')
train_df, dev_df = train_test_split(df, test_size=0.2, random_state=SEED)


if LANG!=None:
    train_df = train_df[train_df['lang']==LANG]
    dev_df = dev_df[dev_df['lang']==LANG]

In [6]:
## Transform into hugginface dataset

train_data = Dataset.from_pandas(train_df)
dev_data = Dataset.from_pandas(dev_df)


In [7]:
# Check random data item

print(train_data[0]['sent'])
print(train_data[0]['labels'])

ইহুদিরা  হিজরত  বইটিতে  তাওরাত  বইতে  বর্ণিত  হিসাবে  দাসত্ব  থেকে  তাদের  পালানোর  স্মরণে  এই  ছুটি  উদযাপন  করে। 
 O  B-WrittenWork  O  B-WrittenWork  O  O  O  O  O  O  O  O  O  O  O  O


### Tokenization

In [8]:
tags_to_ids, ids_to_tags = get_tag_mappings()
number_of_labels = len(tags_to_ids)

In [9]:
MAX_LEN = 256
TOKENIZER_NAME = 'xlm-roberta-base'

In [10]:
## load appropiate tokenizer for pre-trained models

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)

In [19]:
collator = PreDataCollator(tokenizer=tokenizer, max_len=MAX_LEN, tags_to_ids = tags_to_ids)

In [20]:
train_tokenized = train_data.map(collator, remove_columns=train_data.column_names, batch_size=4, num_proc=4, batched=True)
dev_tokenized = dev_data.map(collator, remove_columns=dev_data.column_names, batch_size=4, num_proc=4, batched=True)

     

#0:   0%|          | 0/489 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/489 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/489 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/488 [00:00<?, ?ba/s]

     

#0:   0%|          | 0/119 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/119 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/119 [00:00<?, ?ba/s]

#3:   0%|          | 0/119 [00:00<?, ?ba/s]

### Training

In [15]:
MODEL_NAME = 'distilbert-base-uncased'
model = CRF(MODEL_NAME,ids_to_tags,number_of_labels,device=device)
model = model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
EPOCHS = 7
LEARNING_RATE = 1e-04
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
SAVE_STEPS = 500
EVAL_STEPS = 500
SAVE_LIMIT = 2
WARMUP_STEPS = 100

In [17]:
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors='pt')

In [18]:
metric_acc = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")

def compute_metrics(pred):
    
    
    pred_logits = pred.predictions
    pred_ids = torch.tensor(pred_logits)

    tr_active_acc = torch.from_numpy(pred.label_ids != -100)
    pr_active_acc = torch.from_numpy(pred_logits != -100)

    train_tags = torch.masked_select(torch.from_numpy(pred.label_ids), tr_active_acc)
    train_predicts = torch.masked_select(pred_ids, pr_active_acc)
    
#     print(tr_active_acc)
    
#     print(train_tags.size())
#     print(train_predicts.size())

    acc = metric_acc.compute(predictions=train_predicts, references=train_tags)
    f1 = metric_f1.compute(predictions=train_predicts, references=train_tags, average='macro')
    
    return {'accuracy': acc['accuracy'], 'f1':f1['f1']}


In [19]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir=f"./output/{MODEL_NAME}-{LANG}",
  group_by_length=True,
  per_device_train_batch_size=TRAIN_BATCH_SIZE,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=EPOCHS,
  fp16=False,
  save_steps=SAVE_STEPS,
  eval_steps=EVAL_STEPS,
  logging_steps=EVAL_STEPS,
  learning_rate=LEARNING_RATE,
  warmup_steps=WARMUP_STEPS,
  save_total_limit=SAVE_LIMIT,
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [20]:
from transformers import Trainer


trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_tokenized,
    eval_dataset=dev_tokenized,
    tokenizer=tokenizer
)

In [21]:
trainer.train()

***** Running training *****
  Num examples = 13416
  Num Epochs = 7
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 5866


Step,Training Loss,Validation Loss,Accuracy,F1
500,10.8459,2.542761,0.894256,0.339035
1000,1.5328,1.053723,0.911364,0.529765
1500,-0.1121,0.583365,0.915592,0.586536
2000,-1.3645,0.392314,0.920613,0.614334
2500,-2.0215,0.242593,0.919184,0.621434
3000,-2.9764,0.498134,0.921824,0.642507
3500,-3.2935,0.679397,0.921506,0.649452
4000,-3.7985,0.813263,0.919382,0.64939
4500,-4.0608,0.878239,0.921407,0.654342
5000,-4.2795,1.083787,0.923372,0.663827


***** Running Evaluation *****
  Num examples = 3361
  Batch size = 8
Saving model checkpoint to ./output/distilbert-base-uncased-en/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in ./output/distilbert-base-uncased-en/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./output/distilbert-base-uncased-en/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3361
  Batch size = 8
Saving model checkpoint to ./output/distilbert-base-uncased-en/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in ./output/distilbert-base-uncased-en/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./output/distilbert-base-uncased-en/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3361
  Batch size = 8
Saving model checkpoint to ./output/distilbert-base-uncased-en/checkpoint-1

TrainOutput(global_step=5866, training_loss=-1.4765557335620443, metrics={'train_runtime': 13053.8618, 'train_samples_per_second': 7.194, 'train_steps_per_second': 0.449, 'total_flos': 0.0, 'train_loss': -1.4765557335620443, 'epoch': 7.0})