In [21]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from ast import literal_eval
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from sklearn.model_selection import train_test_split
import torch
from datasets import Dataset

In [2]:
df = pd.read_csv('data/ner.csv')
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [3]:
df = df.dropna()
df.shape

(47959, 4)

In [4]:
def get_vocabulary(df):
    """
    Return tag2idx, idx2tag
    """
    def get_vocab_helper(set_, start_idx = 0):
        set2idx = {}
        idx2set = {}
        i = start_idx
        for s in sorted(set_):
            set2idx[s] = i
            idx2set[i] = s
            i += 1
        return set2idx, idx2set
    
    tags = set()
    for _, row in df.iterrows():
        tags.update(literal_eval(row['Tag']))

    tag2idx, idx2tag = get_vocab_helper(tags)

    return tag2idx, idx2tag

In [5]:
tag2idx, idx2tag = get_vocabulary(df)
for tag, idx in tag2idx.items():
    print(f"{tag} : {idx}")

B-art : 0
B-eve : 1
B-geo : 2
B-gpe : 3
B-nat : 4
B-org : 5
B-per : 6
B-tim : 7
I-art : 8
I-eve : 9
I-geo : 10
I-gpe : 11
I-nat : 12
I-org : 13
I-per : 14
I-tim : 15
O : 16


In [30]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)

   # Calculate precision, recall, and F1-score
    precision = precision_score(labels, preds, average='macro')
    recall = recall_score(labels, preds, average='macro')
    f1 = f1_score(labels, preds, average='macro')
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [22]:
def get_data(df, tag2idx, tokenizer, max_len = 128, split_str = ' ', test_size = 0.15):
    """
    Returns train_dataset, eval_dataset in pytorch data format
    """
    sentences_li = [txt.split(split_str) for txt in df['Sentence'].values.tolist()]
    labels = [literal_eval(txt) for txt in df['Tag'].values.tolist()]
    labels = [[tag2idx[tag] for tag in tag_li] for tag_li in labels]
    tokenized_inputs = tokenizer(sentences_li, truncation=True, is_split_into_words=True, 
                             padding='max_length', max_length=128, add_special_tokens = False)
    
    aligned_labels = []

    for sentence_li, corr_label in zip(sentences_li, labels):
        temp_label = []
        for i, word in enumerate(sentence_li):
            word_tokens = tokenizer(word, add_special_tokens=False)['input_ids']
            temp_label.extend([corr_label[i]] * len(word_tokens))
        aligned_labels.append(temp_label)

    aligned_labels_padded = pad_sequences(aligned_labels, maxlen = max_len, padding='post', 
                                          truncating='post', value=tag2idx['O'])
    
    train_inputs, eval_inputs, train_labels, eval_labels = train_test_split(
        np.array(tokenized_inputs['input_ids']), np.array(aligned_labels_padded), 
        shuffle=True, test_size=test_size)
    
    train_data = {"input_ids": train_inputs, "labels": train_labels}
    eval_data = {"input_ids": eval_inputs, "labels": eval_labels}

    train_dataset = Dataset.from_dict(train_data)
    eval_dataset = Dataset.from_dict(eval_data)

    return train_dataset, eval_dataset

In [23]:
from transformers import BertForTokenClassification, AutoTokenizer

model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)
model.config.num_labels = len(tag2idx)
model.classifier = torch.nn.Linear(model.config.hidden_size, model.config.num_labels)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
from transformers import TFBertForTokenClassification

model = TFBertForTokenClassification.from_pretrained(model_name)
print(model.config.num_labels)

2024-01-02 17:04:28.954220: W external/local_xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc:504] Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice. This may result in compilation or runtime failures, if the program we try to run uses routines from libdevice.
Searched for CUDA in the following directories:
  ./cuda_sdk_lib
  /usr/local/cuda-12.2
  /usr/local/cuda
  /home/co21btech11001/anaconda3/lib/python3.11/site-packages/tensorflow/python/platform/../../../nvidia/cuda_nvcc
  /home/co21btech11001/anaconda3/lib/python3.11/site-packages/tensorflow/python/platform/../../../../nvidia/cuda_nvcc
  .
You can choose the search directory by setting xla_gpu_cuda_data_dir in HloModule's DebugOptions.  For most apps, setting the environment variable XLA_FLAGS=--xla_gpu_cuda_data_dir=/path/to/cuda will work.
2024-01-02 17:04:28.968347: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-01-02 17:04:28.973301

UnknownError: Exception encountered when calling layer 'LayerNorm' (type LayerNormalization).

{{function_node __wrapped__Rsqrt_device_/job:localhost/replica:0/task:0/device:GPU:0}} JIT compilation failed. [Op:Rsqrt] name: 

Call arguments received by layer 'LayerNorm' (type LayerNormalization):
  • inputs=tf.Tensor(shape=(1, 2, 768), dtype=float32)

In [24]:
train_data, eval_data = get_data(df, tag2idx, tokenizer)

In [25]:
from transformers import Trainer, TrainingArguments

In [26]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    # compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=eval_data,
)

In [31]:
trainer.train()

RuntimeError: shape '[-1, 9]' is invalid for input of size 17408