In [1]:
# pip install transformers


In [3]:
import pandas as pd
import os
import torch 
import numpy as np
from transformers import AutoModelForTokenClassification
from datasets import Dataset
from tqdm import tqdm
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
import random
from transformers import DataCollatorForTokenClassification
import evaluate

from dataloader import PreDataCollator
os.environ["WANDB_DISABLED"] = "true"

### Env Setup

In [4]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

### Seed all

SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

### Preparing data

In [5]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'


In [6]:
LANG = 'en' # use None for all lang

In [7]:
# Load data as pandas dataframe
test_df = pd.read_csv('./Dataset/dev.csv')


if LANG!=None:
    test_df = test_df[test_df['lang']==LANG]

In [8]:
## Transform into hugginface dataset

test_data = Dataset.from_pandas(test_df)


In [9]:
# Check random data item

print(test_data[6]['sent'])
print(test_data[1]['labels'])

two  important  voices  who  applied  incommensurability  to  historical  and  philosophical  notions  of  science  in  the  1960s  are  thomas  kuhn  and  paul  feyerabend  . 
 B-OtherPER  I-OtherPER  O  O  O  O  B-OtherPROD  O  O  O  O  O  O  O  B-HumanSettlement  O  O  O


### Tokenization

In [10]:
# getting the tags

def get_tag_mappings():
    with open('tags.txt','r') as file:
        unique_tags = [line.strip() for line in file]


    tags_to_ids = {k: v for v, k in enumerate(unique_tags)}
    ids_to_tags = {v: k for v, k in enumerate(unique_tags)}

    return tags_to_ids, ids_to_tags

tags_to_ids, ids_to_tags = get_tag_mappings()
number_of_labels = len(tags_to_ids)

In [11]:
MAX_LEN = 128
TOKENIZER_NAME = 'distilbert-base-uncased'

In [12]:
## load appropiate tokenizer for pre-trained models

from transformers import BertTokenizerFast, DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained(TOKENIZER_NAME)

In [13]:
collator = PreDataCollator(tokenizer=tokenizer, max_len=MAX_LEN, tags_to_ids = tags_to_ids)

In [14]:

test_tokenized = test_data.map(collator, remove_columns=test_data.column_names, batch_size=4, num_proc=4, batched=True)



      

#0:   0%|          | 0/55 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/55 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/55 [00:00<?, ?ba/s]

#3:   0%|          | 0/55 [00:00<?, ?ba/s]

### Training

In [15]:
MODEL = 'distilbert-base-uncased'
CHECKPOINT = 5000
saved_model_dir = f'./output/{MODEL}-{LANG}/checkpoint-{CHECKPOINT}'
model = AutoModelForTokenClassification.from_pretrained(saved_model_dir, num_labels=number_of_labels)
model = model.to(device)

In [16]:
metric_acc = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")


def compute_metrics_test(preds,labels):
    

    tr_active_acc = labels != -100

    tags = torch.masked_select(labels, tr_active_acc)
    predicts = torch.masked_select(preds, tr_active_acc)

    acc = metric_acc.compute(predictions=predicts, references=tags)
    f1 = metric_f1.compute(predictions=predicts, references=tags, average='macro')
    
    return {'accuracy': acc['accuracy'], 'f1':f1['f1']}, tags.tolist(), predicts.tolist()


def print_predictions(tokens, pred_tags, true_tags):
    

    tokens = tokens.split()
    pred_tags = [ids_to_tags[idx] for idx in pred_tags if idx!=-100]
    true_tags = [ids_to_tags[idx] for idx in true_tags if idx!=-100]
    
    
    if len(tokens) != len(pred_tags):
        print(tokens)
        return " "
    
    output = []
    from colorama import Fore, Style, Back
    
    for t,tl,pl in zip(tokens,true_tags,pred_tags):

        if tl == pl:
            o = f"{t} {Back.GREEN}[{tl}][{pl}]{Style.RESET_ALL}"

        else:
            o = f"{t} {Back.GREEN}[{tl}]{Style.RESET_ALL}{Back.RED}[{pl}]{Style.RESET_ALL}"

        output.append(o)
        
    return " ".join(output)," ".join(pred_tags), " ".join(true_tags)


In [17]:
visualization = []
acc = 0
f1  = 0
outputs = []
test_len = len(test_tokenized)

for i in tqdm(range(test_len)): 

    inp_ids = torch.as_tensor([test_tokenized[i]["input_ids"]]).to(device)
#     print(inp_ids)
    label_ids = torch.as_tensor(test_tokenized[i]["labels"]).to(device)
    
    mask = torch.as_tensor([test_tokenized[i]["attention_mask"]]).to(device)

    logits = model(input_ids=inp_ids, attention_mask=mask).logits

    pred_ids = torch.argmax(logits, dim=-1)[0]
    
    result, tags, predicts = compute_metrics_test(pred_ids,label_ids)
    
    vis, pred_tags, true_tags = print_predictions(test_data[i]['sent'],predicts,tags)
    
    outputs.append((test_data[i]['sent'], pred_tags, true_tags))
    
    acc += result['accuracy']
    f1 += result['f1']
    visualization.append(vis)
    
#     print(output)
#     break
    
    
print(f'Accuracy: {acc/test_len}')
print(f'F1: {f1/test_len}')

100%|█████████████████████████████████████████| 870/870 [00:09<00:00, 93.41it/s]

Accuracy: 0.9260487103017591
F1: 0.7456196553228979





In [20]:
print(visualization[10])

it [42m[O][O][0m stars [42m[O][O][0m tomokazu [42m[B-Artist][B-Artist][0m sugita [42m[I-Artist][I-Artist][0m daisuke [42m[B-OtherPER][0m[41m[B-Artist][0m sakaguchi [42m[I-OtherPER][0m[41m[I-Artist][0m rie [42m[B-Artist][B-Artist][0m kugimiya [42m[I-Artist][I-Artist][0m among [42m[O][O][0m others [42m[O][O][0m . [42m[O][O][0m


In [19]:
df = pd.DataFrame(outputs, columns=['sent','predictions','true'])
df.to_csv(f'./output/{MODEL}-{LANG}/outputs.csv',index=False)