In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from ast import literal_eval
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from datasets import Dataset
from pprint import pprint

2024-01-06 16:16:55.745761: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-06 16:16:55.782521: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-06 16:16:55.782539: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-06 16:16:55.783555: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-06 16:16:55.789886: I tensorflow/core/platform/cpu_feature_guar

In [2]:
import os
import warnings
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# Suppress the warning about WANDB_DISABLED being deprecated
warnings.filterwarnings("ignore", category=DeprecationWarning)
os.environ["WANDB_SILENT"] = "true"
os.environ["WANDB_DISABLED"] = "true"

In [3]:
df = pd.read_csv('data/ner.csv')
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [4]:
df = df.dropna()
df.shape

(47959, 4)

In [5]:
def get_vocabulary(df):
    """
    Return tag2idx, idx2tag
    """
    def get_vocab_helper(set_, start_idx = 0):
        set2idx = {}
        idx2set = {}
        i = start_idx
        for s in sorted(set_):
            set2idx[s] = i
            idx2set[i] = s
            i += 1
        return set2idx, idx2set
    
    tags = set()
    for _, row in df.iterrows():
        tags.update(literal_eval(row['Tag']))

    tag2idx, idx2tag = get_vocab_helper(tags)

    return tag2idx, idx2tag

In [6]:
tag2idx, idx2tag = get_vocabulary(df)
for tag, idx in tag2idx.items():
    print(f"{tag} : {idx}")

B-art : 0
B-eve : 1
B-geo : 2
B-gpe : 3
B-nat : 4
B-org : 5
B-per : 6
B-tim : 7
I-art : 8
I-eve : 9
I-geo : 10
I-gpe : 11
I-nat : 12
I-org : 13
I-per : 14
I-tim : 15
O : 16


In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

@tf.function
def f1_metric(y_true, y_pred):
    labels = tf.reshape(y_true, [-1])  # Flatten labels
    preds = tf.argmax(y_pred, axis=-1)
    preds = tf.reshape(preds, [-1])  # Flatten preds
    
    # Calculate F1-score with macro average
    f1 = tf.py_function(lambda labels, preds: f1_score(labels.numpy(), preds.numpy(), average='macro'),
                        (labels, preds), tf.float64)
    
    return f1

@tf.function
def accuracy_metric(y_true, y_pred):
    labels = tf.reshape(y_true, [-1])  # Flatten labels
    preds = tf.argmax(y_pred, axis=-1)
    preds = tf.reshape(preds, [-1])  # Flatten preds
    
    # Calculate F1-score with macro average
    f1 = tf.py_function(lambda labels, preds: accuracy_score(labels.numpy(), preds.numpy()),
                        (labels, preds), tf.float64)
    
    return f1

In [8]:
def get_data(df, tag2idx, tokenizer, max_len = 128, split_str = ' ', test_size = 0.15, use_tf = True):
    """
    Returns train_dataset, eval_dataset in pytorch data format
    """
    sentences_li = [txt.split(split_str) for txt in df['Sentence'].values.tolist()]
    labels = [literal_eval(txt) for txt in df['Tag'].values.tolist()]
    labels = [[tag2idx[tag] for tag in tag_li] for tag_li in labels]
    tokenized_inputs = tokenizer(sentences_li, truncation=True, is_split_into_words=True, 
                             padding='max_length', max_length=128, add_special_tokens = False)
    
    aligned_labels = []

    for sentence_li, corr_label in zip(sentences_li, labels):
        temp_label = []
        for i, word in enumerate(sentence_li):
            word_tokens = tokenizer(word, add_special_tokens=False)['input_ids']
            temp_label.extend([corr_label[i]] * len(word_tokens))
        aligned_labels.append(temp_label)

    aligned_labels_padded = pad_sequences(aligned_labels, maxlen = max_len, padding='post', 
                                          truncating='post', value=tag2idx['O'])
    
    train_inputs, eval_inputs, train_labels, eval_labels = train_test_split(
        np.array(tokenized_inputs['input_ids']), np.array(aligned_labels_padded), 
        shuffle=True, test_size=test_size)
    
    if not use_tf:
        train_data = {"input_ids": train_inputs, "labels": train_labels}
        eval_data = {"input_ids": eval_inputs, "labels": eval_labels}

        train_dataset = Dataset.from_dict(train_data)
        eval_dataset = Dataset.from_dict(eval_data)
    else:
        train_dataset = tf.data.Dataset.from_tensor_slices((
            train_inputs, train_labels
        ))
        eval_dataset = tf.data.Dataset.from_tensor_slices((
            eval_inputs, eval_labels
        ))

    return train_dataset, eval_dataset

Using ```distilbert-base-cased```

In [9]:
model_name = 'distilbert-base-cased'
from transformers import DistilBertTokenizerFast #, TFDistilBertModel
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

In [10]:
train_data, eval_data = get_data(df, tag2idx, tokenizer)

2024-01-06 04:35:01.788063: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2024-01-06 04:35:01.788123: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:129] retrieving CUDA diagnostic information for host: cmslab
2024-01-06 04:35:01.788136: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:136] hostname: cmslab
2024-01-06 04:35:01.788410: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:159] libcuda reported version is: 535.129.3
2024-01-06 04:35:01.788453: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:163] kernel reported version is: 525.116.4
2024-01-06 04:35:01.788465: E external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:244] kernel version 525.116.4 does not match DSO version 535.129.3 -- cannot find working devices in this configuration


In [14]:
from transformers import TFDistilBertForTokenClassification
# model = TFDistilBertForTokenClassification.from_pretrained(model_name, 
#                                                            num_labels=len(tag2idx))

In [12]:
# optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
# model.compile(optimizer=optimizer, loss=model.hf_compute_loss, metrics=[f1_metric, accuracy_metric])
# model.fit(train_data.batch(4),
#           epochs=1, 
#           batch_size=4,
#          validation_data=eval_data.batch(4))

In [13]:
# model.fit(train_data.batch(4),
#           epochs=1, 
#           batch_size=4,
#          validation_data=eval_data.batch(4))

In [15]:
# Done with training using script.py, now load the model
model = TFDistilBertForTokenClassification.from_pretrained('models/distilbert-base-cased/')

All model checkpoint layers were used when initializing TFDistilBertForTokenClassification.

All the layers of TFDistilBertForTokenClassification were initialized from the model checkpoint at models/distilbert-base-cased/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForTokenClassification for predictions without further training.


In [12]:
def test_for_sentence(sentence, tokenizer, model):
    sentence_li = [sentence.split(' ')]
    tokenized_inputs = tokenizer(sentence_li, truncation=True, is_split_into_words=True, 
                             padding='max_length', max_length=128, add_special_tokens = False, 
                                 return_tensors = 'tf')
#     print('here')
    pred = np.argmax(model(tokenized_inputs).logits, axis = -1)[0]
    word_ids = tokenized_inputs.word_ids(batch_index = 0)
    out = dict()
    prev_idx = -1
    for i, idx in enumerate(word_ids):
        if idx == None: break
        if idx != prev_idx:
            out[sentence_li[0][idx]] = [idx2tag[pred[i]]]
        else: out[sentence_li[0][idx]].append(idx2tag[pred[i]])
        prev_idx = idx
    print(word_ids)
    print([idx2tag[idx] for idx in pred])
    return out

In [48]:
example = "Aaryan is going on Tuesday to join LAPD in Azerbaijan"
example = "Teddybear is going in Teddyland"
pprint(test_for_sentence(example, tokenizer, model))

[0, 0, 0, 1, 2, 3, 4, 4, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
['B-per', 'B-per', 'B-per', 'O', 'O', 'O', 'B-geo', 'B-geo', 'O', 'O', 'B-geo', 'B-per', 'B-per', 'B-per', 'B-per', 'B-per', 'O', 'B-per', 'B-per', 'O', 'O', 'O', 'B-geo', 'B-geo', 'O', 'B-geo', 'O', 'O', 'B-per', 'B-per', 'B-per', 'B-per', 'B-per', 'B-pe

Using ```dslim/bert-base-NER```

In [19]:
model_name = 'dslim/bert-base-NER'
from transformers import BertTokenizerFast 
tokenizer = BertTokenizerFast.from_pretrained(model_name)

In [64]:
from transformers import TFBertForTokenClassification
model = TFBertForTokenClassification.from_pretrained(model_name)
model.config.num_labels = len(tag2idx)
model.classifier = tf.keras.layers.Dense(len(tag2idx))

All PyTorch model weights were used when initializing TFBertForTokenClassification.

All the weights of TFBertForTokenClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


In [61]:
train_data, eval_data = get_data(df, tag2idx, tokenizer)

In [62]:
next(iter(train_data.batch(1)))

(<tf.Tensor: shape=(1, 128), dtype=int64, numpy=
 array([[ 1109,  6907,  1104,   158,   119,   156,   119,  2978,  5200,
          3234, 14319,  7661,  1144,  1452,  1121,  4182,   119,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,

In [65]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(optimizer=optimizer, loss=model.hf_compute_loss, metrics=[f1_metric, accuracy_metric])
model.fit(train_data.batch(4),
          epochs=1, 
          batch_size=4,
         validation_data=eval_data.batch(4))

  727/10192 [=>............................] - ETA: 1:57:07 - loss: 0.0870 - f1_metric: 0.5296 - accuracy_metric: 0.9788

KeyboardInterrupt: 

In [23]:
# Trained using script.py, now loading
from transformers import TFBertForTokenClassification
model = TFBertForTokenClassification.from_pretrained(f'models/{model_name}')

Some layers from the model checkpoint at models/dslim/bert-base-NER were not used when initializing TFBertForTokenClassification: ['dropout_37', 'dense']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at models/dslim/bert-base-NER and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
print(model.config.num_labels)

17


In [25]:
example = "Aaryan is going on Tuesday to join LAPD in Azerbaijan"
# example = "Teddybear is going in Teddyland"
pprint(test_for_sentence(example, tokenizer, model))

[0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
['I-art', 'I-art', 'I-art', 'I-tim', 'I-tim', 'I-tim', 'I-art', 'I-tim', 'I-tim', 'I-gpe', 'I-gpe', 'I-tim', 'I-org', 'I-org', 'I-tim', 'I-gpe', 'I-org', 'I-art', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-gpe',

Using ```ydshieh/roberta-large-ner-english```

In [9]:
from transformers import AutoTokenizer, TFAutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english")

In [12]:
model = TFAutoModelForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
model.config.num_labels = len(tag2idx)
model.classifier = tf.keras.layers.Dense(len(tag2idx))

2024-01-06 05:49:08.716152: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2024-01-06 05:49:08.716201: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:129] retrieving CUDA diagnostic information for host: cmslab
2024-01-06 05:49:08.716272: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:136] hostname: cmslab
2024-01-06 05:49:08.716687: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:159] libcuda reported version is: 535.129.3
2024-01-06 05:49:08.716758: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:163] kernel reported version is: 525.116.4
2024-01-06 05:49:08.716774: E external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:244] kernel version 525.116.4 does not match DSO version 535.129.3 -- cannot find working devices in this configuration
Some weights of the PyTorch model were not used when initializing 

In [13]:
train_data, eval_data = get_data(df, tag2idx, tokenizer)

In [14]:
next(iter(train_data.batch(1)))

(<tf.Tensor: shape=(1, 128), dtype=int64, numpy=
 array([[12233,     9,    82,    33,  5303,     7,     5,  1424,  1139,
             9,   234,  1253,   260,    11,   436,   128,    29, 23640,
         10197,  2791,    42,   353,     7,  5111, 10165,    11, 23623,
          1097,    11, 35044,   128,    29, 18966,   194,  2156,   511,
             5,  9737,     9,   168,  3517,    11,     5,   443,   479,
             1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,

In [15]:
# optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
# model.compile(optimizer=optimizer, loss=model.hf_compute_loss, metrics=[f1_metric, accuracy_metric])
# model.fit(train_data.batch(4),
#           epochs=1, 
#           batch_size=4,
#          validation_data=eval_data.batch(4))

    3/10192 [..............................] - ETA: 10:12:05 - loss: 1.5586 - f1_metric: 0.0528 - accuracy_metric: 0.5983

KeyboardInterrupt: 