In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
import random
import logging
import re
from tqdm import tqdm
from ast import literal_eval
import matplotlib.pyplot as plt

2023-12-21 06:29:18.723437: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-21 06:29:18.725907: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-21 06:29:18.764480: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-21 06:29:18.764497: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-21 06:29:18.765493: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

In [2]:
df = pd.read_csv('data/ner.csv')
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [3]:
df = df.dropna()
df.shape

(47959, 4)

In [4]:
def get_vocabulary(df, split_str = ' '):
    """
    Return all word2idx, idx2word, tag2idx, idx2tag
    """
    def get_vocab_helper(set_, start_idx = 0):
        set2idx = {}
        idx2set = {}
        i = start_idx
        for s in sorted(set_):
            set2idx[s] = i
            idx2set[i] = s
            i += 1
        return set2idx, idx2set
    
    words = set()
    tags = set()
    for _, row in df.iterrows():
        words.update(row['Sentence'].split(split_str))
        tags.update(literal_eval(row['Tag']))

    word2idx, idx2word = get_vocab_helper(words, start_idx=1)
    tag2idx, idx2tag = get_vocab_helper(tags)

    return word2idx, idx2word, tag2idx, idx2tag

word2idx, idx2word, tag2idx, idx2tag = get_vocabulary(df)

In [5]:
WORDS = len(word2idx.keys())+1 # 1 unknown word
TAGS = len(tag2idx.keys())
print(f"Vocab size: {WORDS}")
print(f"Number of tags: {TAGS}")

Vocab size: 35179
Number of tags: 17


In [6]:
from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [9]:
from transformers import DistilBertTokenizerFast #, TFDistilBertModel
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

In [10]:
from transformers import TFDistilBertForTokenClassification

model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=TAGS)

Downloading model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

2023-12-21 06:40:39.811781: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model that you exp

In [33]:
# Use tokenizer and align labels
sentences_li = [txt.split(' ') for txt in df['Sentence'].values.tolist()]
tokenized_inputs = tokenizer(sentences_li, truncation=True, is_split_into_words=True, 
                             padding='max_length', max_length=64)
len(tokenized_inputs['input_ids'])

47959

In [34]:
labels = []
for i, row in df.iterrows():
    label_li = []
    word_ids = tokenized_inputs.word_ids(batch_index=i)
    label = literal_eval(row['Tag'])
    for word_idx in word_ids:
        if word_idx is None:
            label_li.append(-100)
        else:
            label_li.append(tag2idx[label[word_idx]])
    labels.append(label_li)
tokenized_inputs['labels'] = labels

In [35]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    tokenized_inputs['input_ids'],
    tokenized_inputs['labels']
))

In [36]:
# Creates a new dataset with batches of BATCH_SIZE samples
train_dataset.batch(1).as_numpy_iterator().next()

(array([[  101, 26159,  1104,  8568,  4487,  5067,  1138,  9639,  1194,
          1498,  1106,  5641,  1103,  1594,  1107,  5008,  1105,  4555,
          1103, 10602,  1104,  1418,  2830,  1121,  1115,  1583,   119,
           102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0]], dtype=int32),
 array([[-100,   16,   16,   16,   16,   16,   16,   16,   16,    2,   16,
           16,   16,   16,   16,    2,   16,   16,   16,   16,   16,    3,
           16,   16,   16,   16,   16, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100]],
       dtype=int32))

In [37]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(optimizer=optimizer, loss=model.hf_compute_loss, metrics=['accuracy'])
model.fit(train_dataset.batch(8),
          epochs=10, 
          batch_size=8)

Epoch 1/10
  12/5995 [..............................] - ETA: 44:05 - loss: 2.3203 - accuracy: 0.2038

KeyboardInterrupt: 

In [38]:
# Load model from model/distilbert directory
# Load tokenizer from tokenizer/distilbert directory
model_trained = TFDistilBertForTokenClassification.from_pretrained('model/distilbert', num_labels=TAGS)
tokenizer_trained = DistilBertTokenizerFast.from_pretrained('tokenizer/distilbert')

Some layers from the model checkpoint at model/distilbert were not used when initializing TFDistilBertForTokenClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForTokenClassification were not initialized from the model checkpoint at model/distilbert and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
example_sentence = "Aaryan, Shalu, Vandana and Prince live in Jind"
# Tokenize and predict using trained model
example_sentence = example_sentence.split(' ')
inputs = tokenizer_trained(example_sentence, return_tensors="tf", is_split_into_words=True, truncation=True, padding='max_length', max_length=64)
outputs = model_trained(inputs)[0]
predictions = tf.argmax(outputs, axis=2)
predictions = predictions.numpy().flatten().tolist()
# Print prediction for every word rather than token
word_ids = inputs.word_ids(batch_index=0)
previous_word_idx = None
for word_idx, pred_idx in zip(word_ids, predictions):
    if word_idx is None:
        continue
    if word_idx != previous_word_idx:
        print(f"{example_sentence[word_idx]}: {idx2tag[pred_idx]}")
    # else:
    #     print(f"{example_sentence[word_idx]}: {idx2tag[pred_idx]}")
    previous_word_idx = word_idx

Aaryan,: B-per
Shalu,: B-per
Vandana: B-per
and: O
Prince: B-per
live: O
in: O
Jind: B-geo
