In [2]:
import sys
import os
sys.path.append('../')
import pandas as pd
import torch 
import numpy as np
from transformers import AutoModelForTokenClassification, AutoTokenizer
from datasets import Dataset
from tqdm import tqdm
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
import random
from transformers import DataCollatorForTokenClassification
import evaluate
from util.utils import feval, get_tag_mappings, get_data 
from util.dataloader import PreDataCollator
# Set the WANDB_DISABLED environment variable for this process
# (presumably to switch off Weights & Biases logging in transformers — confirm).
os.environ["WANDB_DISABLED"] = "true"

2022-12-26 01:40:38.684301: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-26 01:40:39.238040: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.4/lib64
2022-12-26 01:40:39.238107: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.4/lib64
2022-12-26 01:40:39.633773: I tensorflow/compiler/xla/stream_executor/cuda/c

### Env Setup

In [3]:
from torch import cuda

# Prefer the GPU whenever PyTorch reports one; otherwise run on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

### Seed all

# One seed value shared by every random number generator seeded below.
SEED = 42

# Seed PyTorch (default generator and all CUDA devices), NumPy's global RNG
# and Python's `random` module. The calls are independent of one another.
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
random.seed(SEED)

### Define Variables

In [4]:
# --- Evaluation configuration -------------------------------------------------
LANG = 'en'  # language code to evaluate; use None for all languages
MAX_LEN = 256  # passed to PreDataCollator as max_len
TOKENIZER_NAME = 'distilbert-base-uncased'  # name handed to AutoTokenizer.from_pretrained
MODEL_NAME = 'distilbert-base-uncased'  # used to build the output directory name
SET = 'LM'  # 'tags', 'LM' or None; forwarded to get_data
# Checkpoint step to load (resolved as "checkpoint-<CHECKPOINT>"), or 'Final'
# to load the "Final" directory inside output_dir.
CHECKPOINT = '5500'

IS_CRF = False

if IS_CRF:
    # Only needed for CRF runs, so it is imported conditionally.
    from model import CRF

# Directory layout: ./output/<MODEL_NAME>-<LANG>[-<SET>][-CRF]
# The "-<SET>" part is omitted when SET is None; "-CRF" is added for CRF runs.
set_suffix = f"-{SET}" if SET is not None else ""
crf_suffix = "-CRF" if IS_CRF else ""
output_dir = f"./output/{MODEL_NAME}-{LANG}{set_suffix}{crf_suffix}"

### Preparing data

In [5]:
# Load the data as a pandas DataFrame with train=False
# (presumably the held-out split — confirm in util.utils.get_data).
test_df = get_data(LANG, SET, train=False)

# Keep only the rows whose 'lang' column matches LANG.
# LANG=None means "all languages", so no filtering is applied in that case.
if LANG is not None:
    test_df = test_df[test_df['lang'] == LANG]

In [6]:
## Convert the pandas DataFrame into a Hugging Face `datasets.Dataset`

test_data = Dataset.from_pandas(test_df)


In [7]:
# Sanity check: show one fixed example (row 6) — its sentence, then its labels.

sample_row = test_data[6]
for field in ('sent', 'labels'):
    print(sample_row[field])

two  important  voices  who  applied  incommensurability  to  historical  and  philosophical  notions  of  science  in  the  1960s  are  thomas  kuhn  and  paul  feyerabend  . 
 O  O  O  O  O  O  O  O  O  O  O  O  O  O  O  O  O  B-Scientist  I-Scientist  O  B-OtherPER  I-OtherPER  O


### Tokenization

In [8]:
# get_tag_mappings() returns a pair, unpacked here as tags_to_ids and ids_to_tags
# (presumably tag -> id and id -> tag lookups — confirm in util.utils).
tags_to_ids, ids_to_tags = get_tag_mappings()
# Number of entries in tags_to_ids; later passed as num_labels when loading the model.
number_of_labels = len(tags_to_ids)

In [9]:
## Load the appropriate tokenizer for the pre-trained model (fast implementation requested)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)

In [10]:
# Preprocessing callable handed to Dataset.map below; configured with the
# tokenizer, the tag-to-id mapping and the MAX_LEN setting.
collator = PreDataCollator(
    tokenizer=tokenizer,
    tags_to_ids=tags_to_ids,
    max_len=MAX_LEN,
)

In [11]:

# Apply the collator to the whole dataset in batched mode (4 rows per batch,
# 4 worker processes) and drop every original column from the result.
test_tokenized = test_data.map(
    collator,
    batched=True,
    batch_size=4,
    num_proc=4,
    remove_columns=test_data.column_names,
)



      

#0:   0%|          | 0/55 [00:00<?, ?ba/s]

#1:   0%|          | 0/55 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/55 [00:00<?, ?ba/s]

#3:   0%|          | 0/55 [00:00<?, ?ba/s]

### Load Saved Model

In [12]:

# Resolve the directory to load from:
#   "<output_dir>/Final"                    when CHECKPOINT names the final model,
#   "<output_dir>/checkpoint-<CHECKPOINT>"  otherwise.
# The comparison is case-insensitive so that both 'final' and 'Final' select
# the Final directory instead of silently resolving to "checkpoint-final".
if str(CHECKPOINT).lower() == 'final':
    saved_model_dir = f'{output_dir}/Final'
else:
    saved_model_dir = f'{output_dir}/checkpoint-{CHECKPOINT}'

# Load the saved token-classification model with one output label per tag,
# then move it to the selected device.
model = AutoModelForTokenClassification.from_pretrained(saved_model_dir, num_labels=number_of_labels)
model = model.to(device)

### Evaluation

In [13]:
# Run the project's feval helper on the raw and tokenized test sets with the
# loaded model; its result is unpacked into `outputs` and `vis`.
# NOTE(review): the recorded run of this cell stopped with
# "CUDA error: CUBLAS_STATUS_NOT_INITIALIZED". The cause is not visible here
# (possibly GPU memory exhaustion or a label-count mismatch — confirm) and the
# cells below were not executed in that run.
outputs, vis = feval(test_data,test_tokenized, model, device, IS_CRF=IS_CRF)

  0%|                                                   | 0/870 [00:00<?, ?it/s]


RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

In [None]:
# Print the entry of `vis` at index/key 10 as a spot check of the evaluation output.
print(vis[10])

In [None]:
# Tabulate the evaluation outputs under three named columns
# (assumes each item in `outputs` has three fields — confirm against feval).
output_columns = ['sent', 'predictions', 'true']
df = pd.DataFrame(outputs, columns=output_columns)

In [None]:
# Write the table to outputs.csv inside output_dir, without the index column.
outputs_csv_path = f'{output_dir}/outputs.csv'
df.to_csv(outputs_csv_path, index=False)