In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers datasets evaluate seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [4]:
import pandas as pd

In [5]:
data = pd.read_csv('/content/drive/My Drive/Teste_A3Data/id_desc.csv')

In [6]:
data.head()

Unnamed: 0,id,desc
0,004c744a-2b82-49b3-a536-90dbb68f368e,Vectra is a cybersecurity platform that uses A...
1,0062d2eb-8210-480a-adef-ec91d0e74855,Roadzen is a global leader in the P&C insuranc...
2,00657562-4dd3-41f9-aceb-df114606bc9e,Restream is a multi-streaming solution that al...
3,00b16654-5d01-4f2f-b6b3-ee2ef13c0d9e,DLP works for 1% of companies; fortunately the...
4,00b4e919-4d10-4ce8-b013-b6cf73ebd780,Shift Technology delivers AI-native decision a...


In [7]:
data.shape

(3617, 2)

In [8]:
data[data["desc"].isna()]

Unnamed: 0,id,desc
28,0b24ca09-7a47-4513-91d2-dc0e245c9827,
95,2ea3e420-25ff-43ae-9362-40d0122ceede,
116,37573898-6cf3-4946-9467-0af84fdc7823,
133,4182b4cb-fe1e-4f7f-b090-5455eb84f226,
161,50b3f33a-a7e8-481c-927b-a6fc9c24cde5,
...,...,...
3167,d763163f-c397-4a9b-b0ef-15682ada5c6f,
3192,d9546de6-a6de-436c-b1e0-212091463b13,
3227,dcdf8d62-0f15-4218-bbfc-d5714ca861d0,
3254,debd5d6e-b1af-4a07-82ca-4ec5de31a85c,


In [9]:
data = data.dropna()

In [10]:
dict_data = data.to_dict('records')

In [11]:
import json
with open('/content/drive/My Drive/Teste_A3Data/labels_dict_list.json', 'r') as f:
    labels_dict_list = json.load(f)

In [12]:
def DataSetRowGen():
  id = 0
  for sample_id in labels_dict_list['samples']:
    encoded_input = tokenizer(dict_data[int(sample_id)]['desc'], truncation=True)
    encoded_input['id'] = id
    encoded_input['tokens'] = tokenizer.convert_ids_to_tokens(encoded_input["input_ids"])
    encoded_input['ner_tags'] = []
    encoded_input['labels'] = []
    encoded_input['word_ids'] = encoded_input.word_ids()
    for ix, input_id in enumerate(encoded_input['input_ids']):
      if input_id not in (101, 102):
        if encoded_input['word_ids'][ix-1] != encoded_input['word_ids'][ix]:
          if sample_id in labels_dict_list['samples'] and 'ner_company' in labels_dict_list['samples'][sample_id] and ix >= labels_dict_list['samples'][sample_id]['ner_company']['start'] and ix < labels_dict_list['samples'][sample_id]['ner_company']['end']:
            encoded_input['ner_tags'].append(1)
            encoded_input['labels'].append(1)
          else:
            encoded_input['ner_tags'].append(0)
            encoded_input['labels'].append(0)
        else:
          encoded_input['labels'].append(-100)
      elif input_id in (101, 102):
        encoded_input['labels'].append(-100)
    id += 1
    yield encoded_input

In [13]:
gen = DataSetRowGen()

In [14]:
#next(gen)

In [15]:
#labeled_encoded_input = next(gen)

In [16]:
# print(tokenizer.decode(labeled_encoded_input["input_ids"]))
# print(tokenizer.convert_ids_to_tokens(labeled_encoded_input["input_ids"]))
# print(labeled_encoded_input["labels"])

In [17]:
from datasets import Dataset

In [18]:
ds = Dataset.from_generator(DataSetRowGen)



In [19]:
ds

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'id', 'tokens', 'ner_tags', 'labels', 'word_ids'],
    num_rows: 100
})

In [20]:
ds.remove_columns('word_ids')
ds

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'id', 'tokens', 'ner_tags', 'labels', 'word_ids'],
    num_rows: 100
})

In [21]:
#features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels']

In [22]:
ds_train_test = ds.train_test_split(train_size=0.85, seed=13)
ds_train_valid_test = ds_train_test['train'].train_test_split(train_size=0.83, seed=31)
ds_train_valid_test['validation'] = ds_train_valid_test.pop("test")
ds_train_valid_test['test'] = ds_train_test['test']
ds_train_valid_test



DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'id', 'tokens', 'ner_tags', 'labels', 'word_ids'],
        num_rows: 70
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'id', 'tokens', 'ner_tags', 'labels', 'word_ids'],
        num_rows: 15
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'id', 'tokens', 'ner_tags', 'labels', 'word_ids'],
        num_rows: 15
    })
})

In [23]:
import evaluate
seqeval = evaluate.load("seqeval")

In [24]:
example = ds_train_valid_test["train"][0]

In [25]:
import numpy as np

labels = ['corporation' if i == 1 else 0 for i in example[f"ner_tags"]]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [26]:
id2label = {
    0: "O",
    1: "corporation",
}
label2id = {
    "O": 0,
    "corporation": 1,
}

In [27]:
from transformers import create_optimizer

batch_size = 14
num_train_epochs = 8
num_train_steps = (len(ds) // batch_size) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=7e-7,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

In [28]:
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    "bert-base-cased", num_labels=2, id2label=id2label, label2id=label2id
)

All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

In [30]:
tf_train_set = model.prepare_tf_dataset(
    ds,
    shuffle=True,
    batch_size=14,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    ds,
    shuffle=False,
    batch_size=14,
    collate_fn=data_collator,
)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [31]:
import tensorflow as tf

model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [32]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [33]:
callbacks = [metric_callback]
label_list = ["O", "corporation"]

In [34]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=8, callbacks=callbacks)

Epoch 1/8


  tensor = as_tensor(value)


Epoch 2/8




Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7fbf6c3ddff0>

In [35]:
tf_save_directory = "./tf_save_pretrained"
tokenizer.save_pretrained(tf_save_directory)
model.save_pretrained(tf_save_directory)

In [36]:
from transformers import pipeline

In [37]:
classifier = pipeline("ner", model=tf_save_directory)

Some layers from the model checkpoint at ./tf_save_pretrained were not used when initializing TFBertForTokenClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForTokenClassification were initialized from the model checkpoint at ./tf_save_pretrained.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


In [38]:
text = "LifeScore Labs is an insurtech company delivers data-driven risk-scoring model."
classifier(text)

[{'entity': 'corporation',
  'score': 0.5061104,
  'index': 13,
  'word': '-',
  'start': 52,
  'end': 53}]

In [39]:
text = "A3Data is an artificial intelligence consulting company with specialists from a broad expertise in differentes data related areas, such as Data Science, Machine Learning Engineer, Data Analytics and Data Engineer."
classifier(text)

[{'entity': 'corporation',
  'score': 0.51112074,
  'index': 24,
  'word': 'such',
  'start': 131,
  'end': 135},
 {'entity': 'corporation',
  'score': 0.51281804,
  'index': 26,
  'word': 'Data',
  'start': 139,
  'end': 143}]