In [1]:
import sys
sys.path.append('../')
import pandas as pd
import os
import torch 
import numpy as np
from transformers import AutoModelForTokenClassification, AutoTokenizer
from datasets import Dataset
from tqdm import tqdm
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
import random
from transformers import DataCollatorForTokenClassification
import evaluate
from model import CRF
from dataloader import PreDataCollator
from util.eval import get_tag_mappings
os.environ["WANDB_DISABLED"] = "true"

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

### Seed all

SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [30]:
# Load data as pandas dataframe
LANG = 'en' # use None for all lang

df = pd.read_csv('./Dataset/train.csv')
train_df, dev_df = train_test_split(df, test_size=0.2, random_state=SEED)


if LANG!=None:
    train_df = train_df[train_df['lang']==LANG]
    dev_df = dev_df[dev_df['lang']==LANG]
    
    
train_data = Dataset.from_pandas(train_df)
dev_data = Dataset.from_pandas(dev_df)


In [47]:
MAX_LEN = 256
TOKENIZER_NAME = 'distilbert-base-multilingual-cased'
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)

In [32]:

print(train_data[0]['sent'])
print(train_data[0]['labels'])
# sentence = train_data[0]['sent'].split()

irving  pichel  actor  and  director  (  b  .  1891  ) 
 B-Artist  I-Artist  O  O  O  O  O  O  B-WrittenWork  O


In [48]:
sentence = ['drama', 'city', '쑥과', '마늘에', '관한', '진실', '(', 'kbs2', '2003', ')']
encoding = tokenizer(sentence,
                         is_split_into_words=True, 
                         return_offsets_mapping=True, 
                         padding='max_length', 
                         truncation=True, 
                         max_length=516)


In [49]:
i = 0
for idx, mapping in enumerate(encoding["offset_mapping"]):
    if mapping[0] == 0 and mapping[1] != 0:
        # overwrite the tag
        try:
#             encoded_tags[idx] = tags[i]
            i += 1
        except Exception as err:
#                     print(encoding["offset_mapping"])
#                     print(i)
            print(sentence)
#                     print(len(tags), tags[i])
i

10

In [44]:
print(encoding['offset_mapping'])

[(0, 0), (0, 5), (0, 4), (0, 1), (0, 1), (0, 1), (1, 2), (1, 2), (0, 1), (0, 1), (1, 2), (1, 2), (1, 2), (2, 3), (2, 3), (0, 1), (0, 1), (0, 1), (1, 2), (1, 2), (1, 2), (0, 1), (0, 1), (0, 1), (1, 2), (1, 2), (1, 2), (0, 1), (0, 2), (2, 3), (3, 4), (0, 4), (0, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),

In [45]:
# len(sentence)
print(encoding['input_ids'][3:6])

[1462, 30014, 30020]


In [46]:
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][3:6]))

['ᄊ', '##ᅮ', '##ᆨ']


In [36]:
# tokenizer.convert_ids_to_tokens(encoding['input_ids'])
tags_to_ids, ids_to_tags = get_tag_mappings()
number_of_labels = len(tags_to_ids)

In [37]:
collator = PreDataCollator(tokenizer=tokenizer, max_len=MAX_LEN, tags_to_ids = tags_to_ids)
train_tokenized = train_data.map(collator, remove_columns=train_data.column_names, batch_size=4, num_proc=4, batched=True)

     

#0:   0%|          | 0/839 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/839 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/839 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/839 [00:00<?, ?ba/s]

['son', 'in', 'law', ':', 'yi', 'sun', ',', 'king', 'sukjong', 'of', 'joseon', '(', '조선', '숙종', '이순', '朝鮮', '肅宗', '李焞', ')']
['son', 'in', 'law', ':', 'yi', 'sun', ',', 'king', 'sukjong', 'of', 'joseon', '(', '조선', '숙종', '이순', '朝鮮', '肅宗', '李焞', ')']
['son', 'in', 'law', ':', 'yi', 'sun', ',', 'king', 'sukjong', 'of', 'joseon', '(', '조선', '숙종', '이순', '朝鮮', '肅宗', '李焞', ')']
['son', 'in', 'law', ':', 'yi', 'sun', ',', 'king', 'sukjong', 'of', 'joseon', '(', '조선', '숙종', '이순', '朝鮮', '肅宗', '李焞', ')']
['drama', 'city', '쑥과', '마늘에', '관한', '진실', '(', 'kbs2', '2003', ')']
['drama', 'city', '쑥과', '마늘에', '관한', '진실', '(', 'kbs2', '2003', ')']
['drama', 'city', '쑥과', '마늘에', '관한', '진실', '(', 'kbs2', '2003', ')']
['drama', 'city', '쑥과', '마늘에', '관한', '진실', '(', 'kbs2', '2003', ')']
['drama', 'city', '쑥과', '마늘에', '관한', '진실', '(', 'kbs2', '2003', ')']
['drama', 'city', '쑥과', '마늘에', '관한', '진실', '(', 'kbs2', '2003', ')']
['drama', 'city', '쑥과', '마늘에', '관한', '진실', '(', 'kbs2', '2003', ')']
['the', 'complex'

In [24]:
print(tokenizer.convert_ids_to_tokens()

['[CLS]', 'ই', '##হ', '##ু', '##দি', '##রা', 'হ', '##িজ', '##রত', 'ব', '##ইট', '##িতে', 'তা', '##ও', '##রা', '##ত', 'ব', '##ই', '##তে', 'ব', '##র', '##্ণ', '##িত', 'হিসাবে', 'দ', '##াস', '##ত্ব', 'থেকে', 'তাদের', 'প', '##াল', '##ান', '##োর', 'স', '##্', '##ম', '##রণ', '##ে', 'এই', 'ছ', '##ু', '##টি', 'উ', '##দ', '##য', '##াপ', '##ন', 'করে', '।', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PA