In [None]:
import sys
import os
sys.path.append('../')
import pandas as pd
import torch 
import numpy as np
from transformers import AutoModelForTokenClassification, AutoTokenizer
from datasets import Dataset
from tqdm import tqdm
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
import random
from transformers import DataCollatorForTokenClassification
import evaluate
from util.utils import get_tag_mappings, get_data, compute_metrics
from util.dataloader import PreDataCollator

# Turn off the Weights & Biases integration so the Hugging Face Trainer does not
# try to log runs to wandb. Must be set before the Trainer is created.
os.environ["WANDB_DISABLED"] = "true"

### Env Setup

In [None]:
from torch import cuda

# Run on the GPU whenever torch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

### Seed all

SEED = 42

# Seed every random number generator used in this notebook with the same value:
# Python's `random`, NumPy, torch (CPU) and torch (all CUDA devices).
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(SEED)

### Instructions

Set the variables in the next cell according to the experiment:

``LANG``: Set the language code. The available language codes are listed in the Excel file linked below.

``TOKENIZER_NAME`` and ``MODEL_NAME``: Hugging Face model identifiers (for example ``distilbert-base-uncased``). The values to use are also listed in the Excel file.

``SET``: Select the dataset

- ``None`` --> **None Augmentation** (No Augmentation from wiki) NB: None is **not** a string value here
- ``tags`` --> **Max Augmentation** (Maximum Augmentation from wiki)
- ``LM`` --> **Entity Extractor** (Augmentation from wiki after extracting tags using other NER model)
 
``IS_CRF``: True if you want to try the CRF model. Recommended to finish all non-CRF experiments first


**Please ensure that you are saving the trained models**

[Link to Excel File](https://docs.google.com/spreadsheets/d/11LXkOBWxpWDGMsi9XC72eMNSJI14Qo2iwP8qugwjyqU/edit#gid=0)

### Define Variables

In [None]:

LANG = 'en'  # language code; use None for all languages
MAX_LEN = 256  # passed to PreDataCollator as max_len
TOKENIZER_NAME = 'distilbert-base-uncased'
MODEL_NAME = 'distilbert-base-uncased'
SET = 'tags'  # one of: None, 'tags', 'LM' (None is the Python value, not a string)
IS_CRF = True

# The CRF wrapper is only needed (and only imported) for CRF experiments.
if IS_CRF:
    from model import CRF

# Output directory: ./output/<model>-<lang>[-<set>][-CRF].
# The dataset part is omitted when SET is None; "-CRF" is appended for CRF runs.
run_name_parts = [MODEL_NAME, str(LANG)]
if SET is not None:
    run_name_parts.append(str(SET))
if IS_CRF:
    run_name_parts.append("CRF")
output_dir = "./output/" + "-".join(run_name_parts)
    

### Preparing data

In [None]:
# Load data as pandas dataframe
df = get_data(LANG, SET, train=True)

# Hold out 20% of the rows as the dev set; the fixed seed keeps the split reproducible.
train_df, dev_df = train_test_split(df, random_state=SEED, test_size=0.2)

# When a single language is selected, keep only that language's rows in both splits.
if LANG is not None:
    train_df = train_df.loc[train_df['lang'] == LANG]
    dev_df = dev_df.loc[dev_df['lang'] == LANG]

In [None]:
## Transform into Hugging Face datasets

# Add the whitespace-token count of every sentence as a `length` column.
# `assign` returns a new frame instead of writing into the existing one, so this
# does not trigger pandas' SettingWithCopyWarning on frames that are slices of
# the original data (the output of the split / language filter).
train_df = train_df.assign(length=train_df.sent.apply(lambda x: len(x.split())))
dev_df = dev_df.assign(length=dev_df.sent.apply(lambda x: len(x.split())))

# Convert both splits to `datasets.Dataset` objects for the tokenization step.
train_data = Dataset.from_pandas(train_df)
dev_data = Dataset.from_pandas(dev_df)


In [None]:
# Sanity check: show the sentence and labels of the first training example
first_example = train_data[0]
for field in ('sent', 'labels'):
    print(first_example[field])

### Tokenization

In [None]:
# getting the tags
# get_tag_mappings() returns two lookups; judging by the names these map
# tag -> id and id -> tag (confirm in util/utils.py).
tags_to_ids, ids_to_tags = get_tag_mappings()
# Number of distinct tags; used below to size the classification head.
number_of_labels = len(tags_to_ids)

In [None]:
## Load the tokenizer that matches the pre-trained model (fast implementation)
tokenizer = AutoTokenizer.from_pretrained(
    TOKENIZER_NAME,
    use_fast=True,
)

In [None]:
# Project-specific preprocessing callable, applied to the datasets via `map` below.
# NOTE(review): presumably tokenizes sentences and converts tags to ids - confirm in util/dataloader.py
collator = PreDataCollator(
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    tags_to_ids=tags_to_ids,
)

In [None]:
# Preprocess the training split: the collator receives batches of 4 examples,
# the work is spread over 4 processes, and all original columns are dropped.
train_tokenized = train_data.map(
    collator,
    batched=True,
    batch_size=4,
    num_proc=4,
    remove_columns=train_data.column_names,
)


In [None]:
# Preprocess the dev split with the same settings as the training split:
# batches of 4 examples, 4 processes, original columns dropped.
dev_tokenized = dev_data.map(
    collator,
    batched=True,
    batch_size=4,
    num_proc=4,
    remove_columns=dev_data.column_names,
)

### Training

In [None]:
# Build either the project's CRF model or a plain token-classification model,
# depending on the experiment flag. Only the selected branch is evaluated, so
# `CRF` is not required when IS_CRF is False.
model = (
    CRF(MODEL_NAME, ids_to_tags, number_of_labels, device=device)
    if IS_CRF
    else AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=number_of_labels)
)

# Move the model's parameters to the selected device.
model = model.to(device)

In [None]:
# Training hyperparameters; consumed by the TrainingArguments cell below.
EPOCHS = 7  # number of training epochs
LEARNING_RATE = 1e-04  # initial learning rate
TRAIN_BATCH_SIZE = 8  # per-device training batch size
VALID_BATCH_SIZE = 8  # intended per-device evaluation batch size
SAVE_STEPS = 500  # save a checkpoint every this many steps
EVAL_STEPS = 500  # evaluate (and log) every this many steps
SAVE_LIMIT = 2  # maximum number of checkpoints kept on disk
WARMUP_STEPS = 100  # learning-rate warmup steps

In [None]:
# Batch-time collator for the Trainer; returns PyTorch tensors.
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    return_tensors='pt',
)

In [None]:
from transformers import TrainingArguments

# Trainer configuration. Checkpoints are written to `output_dir`; evaluation,
# logging and checkpointing all happen on a fixed step schedule.
training_args = TrainingArguments(
  output_dir= output_dir,
  group_by_length=True,
  per_device_train_batch_size=TRAIN_BATCH_SIZE,
  # VALID_BATCH_SIZE was defined but never passed, so evaluation silently used
  # the library default; wire it in so the constant actually takes effect.
  per_device_eval_batch_size=VALID_BATCH_SIZE,
  # Gradients are accumulated over 2 steps, so the effective training batch per
  # device is 2 * TRAIN_BATCH_SIZE.
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=EPOCHS,
  fp16=False,
  save_steps=SAVE_STEPS,
  eval_steps=EVAL_STEPS,
  logging_steps=EVAL_STEPS,
  learning_rate=LEARNING_RATE,
  warmup_steps=WARMUP_STEPS,
  save_total_limit=SAVE_LIMIT,
)

In [None]:
from transformers import Trainer

# Pick the metric function that matches the model type.
if IS_CRF:
    # `compute_metrics_crf` is not brought into scope by the notebook's import
    # cell, so referencing it directly raises NameError for CRF runs.
    # NOTE(review): assumed to live in util.utils next to compute_metrics - confirm.
    from util.utils import compute_metrics_crf
    metrics_fn = compute_metrics_crf
else:
    metrics_fn = compute_metrics

# Wire model, data, collator and metrics together.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=metrics_fn,
    train_dataset=train_tokenized,
    eval_dataset=dev_tokenized,
    tokenizer=tokenizer
)

In [None]:
# To continue training from a saved checkpoint instead of starting fresh,
# uncomment the lines below and set CHECKPOINT to the checkpoint's step number:
# CHECKPOINT = 2500
# chkpt_model = f'{output_dir}/checkpoint-{CHECKPOINT}'
# trainer.train(resume_from_checkpoint=chkpt_model)

In [None]:
# Run the full training loop from scratch (checkpoints go to output_dir).
trainer.train()

In [None]:
# Save the final model to a "Final" sub-directory of the run's output directory.
trainer.save_model(f"{output_dir}/Final")