In [1]:
!pip install transformers
!pip install tqdm
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 34.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 50.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 51.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 10.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling 

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
label_list = ['O','ORG','PER','DAT','TIM','LOC','EVE','mainLOC','NAT']
labels_to_ids = {k: v for v, k in enumerate(label_list)}
ids_to_labels = {v: k for v, k in enumerate(label_list)}

print(labels_to_ids)
print(ids_to_labels)

{'O': 0, 'ORG': 1, 'PER': 2, 'DAT': 3, 'TIM': 4, 'LOC': 5, 'EVE': 6, 'mainLOC': 7, 'NAT': 8}
{0: 'O', 1: 'ORG', 2: 'PER', 3: 'DAT', 4: 'TIM', 5: 'LOC', 6: 'EVE', 7: 'mainLOC', 8: 'NAT'}


In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
        "HooshvareLab/bert-fa-base-uncased-clf-digimag"
    )

Downloading tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [5]:
import torch

def get_final_labels(text, label):
  encodings = tokenizer(text, return_offsets_mapping=True,
                               padding='max_length', max_length = 512, truncation=True,)
  position_list = encodings['offset_mapping']
  n = len(position_list)
  final_labels = [-100]*n
  word_index = 0
  for label in label:
      interval = label['range']
      label_id = labels_to_ids[label['name']]
      while word_index <= n-1 and position_list[word_index][0] < interval[0] :
        final_labels[word_index] = 0
        word_index+=1
      while word_index <= n-1 and position_list[word_index][1] <= interval[1]:
        final_labels[word_index] = label_id
        word_index+=1
  final_labels[0] = -100 
  return final_labels

class DataSequence(torch.utils.data.Dataset):

    def __init__(self, news_list):
        labels = []
        texts = []
        for news in news_list:

          header = news['header']
          text = news['text']
          header_annotaiton = news['annotations'][0]['header']
          text_annotation = news['annotations'][0]['text']
          header_label = get_final_labels(header,header_annotaiton)
          text_label = get_final_labels(text, text_annotation)

          texts.append(tokenizer(header, return_tensors="pt",
                               padding='max_length', max_length = 512, truncation=True,)) 
          labels.append(header_label)

          texts.append(tokenizer(text, return_tensors="pt",
                               padding='max_length', max_length = 512, truncation=True,))  
          labels.append(text_label)

        self.texts = texts
        self.labels = labels

    def __len__(self):

        return len(self.labels)

    def get_batch_data(self, idx):

        return self.texts[idx]

    def get_batch_labels(self, idx):

        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):

        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_data, batch_labels

In [None]:
from transformers import BertForTokenClassification

class BertModel(torch.nn.Module):

    def __init__(self):

        super(BertModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained("HooshvareLab/bert-fa-base-uncased-clf-digimag", num_labels=9, ignore_mismatched_sizes=True)

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

In [24]:
from torch.utils.data import DataLoader
from torch.optim import SGD
from tqdm import tqdm

def train_loop(model, train, evaluation):

    train_dataset = DataSequence(train)
    val_dataset = DataSequence(evaluation)

    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=1, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print(device)
    optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

    if use_cuda:
        model = model.cuda()

    best_acc = 0
    best_loss = 1000

    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        model.train()
        for train_data, train_label in tqdm(train_dataloader):
            train_label = train_label[0].to(device)
            mask = train_data['attention_mask'][0].to(device)
            input_id = train_data['input_ids'][0].to(device)
            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)
            logits_clean = logits[0][train_label != -100]
            label_clean = train_label[train_label != -100]

            predictions = logits_clean.argmax(dim=1)

            acc = (predictions == label_clean).float().mean()
            total_acc_train += acc
            total_loss_train += loss.item()

            loss.backward()
            optimizer.step()

        model.eval()

        total_acc_val = 0
        total_loss_val = 0
        for val_data, val_label in val_dataloader:

            val_label = val_label[0].to(device)
            mask = val_data['attention_mask'][0].to(device)

            input_id = val_data['input_ids'][0].to(device)

            loss, logits = model(input_id, mask, val_label)

            logits_clean = logits[0][val_label != -100]
            label_clean = val_label[val_label != -100]

            predictions = logits_clean.argmax(dim=1)          

            acc = (predictions == label_clean).float().mean()
            total_acc_val += acc
            total_loss_val += loss.item()

        val_accuracy = total_acc_val / len(val_dataset)
        val_loss = total_loss_val / len(val_dataset)

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(train): .3f} | Accuracy: {total_acc_train / len(train): .3f} | Val_Loss: {total_loss_val / len(evaluation): .3f} | Accuracy: {total_acc_val / len(evaluation): .3f}')

LEARNING_RATE = 1e-2
EPOCHS = 5



0.01


In [None]:
def evaluate(model, df_test):

    test_dataset = DataSequence(df_test)

    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0.0

    for test_data, test_label in test_dataloader:

        test_label = test_label[0].to(device)
        mask = test_data['attention_mask'][0].to(device)
        input_id = test_data['input_ids'][0].to(device)
          
        loss, logits = model(input_id, mask, test_label.long())

        logits_clean = logits[0][test_label != -100]
        label_clean = test_label[test_label != -100]

        predictions = logits_clean.argmax(dim=1)
              
        acc = (predictions == label_clean).float().mean()
        total_acc_test += acc

    val_accuracy = total_acc_test / len(df_test)
    print(f'Test Accuracy: {total_acc_test / len(df_test): .3f}')



In [23]:
def get_mask(text):
  encodings = tokenizer(text, return_offsets_mapping=True,
                               padding='max_length', max_length = 512, truncation=True,)
  position_list = encodings['offset_mapping']
  n = len(position_list)
  final_labels = [-100]*n
  word_index = 1
  print(len(text))
  print('_____________________')
  while word_index < n and position_list[word_index][1]!=0:
    a = position_list[word_index][0]
    b = position_list[word_index][1]
    print(b)
    final_labels[a:b+1] = [0]*(b-a+1)
    word_index+=1
  final_labels[0] = -100
  return final_labels
            
def evaluate_one_text(model, sentence):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    text = tokenizer(sentence, return_tensors="pt",
                              padding='max_length', max_length = 512, truncation=True,)

    mask = text['attention_mask'][0].unsqueeze(0).to(device)

    input_id = text['input_ids'][0].unsqueeze(0).to(device)
    print(input_id.shape)
    label_ids = torch.Tensor(get_mask(sentence)).unsqueeze(0).to(device)

    logits = model(input_id, mask, None)
    logits_clean = logits[0][label_ids != -100]
    print(logits_clean)
    print(logits_clean[0].shape)
    predictions = logits_clean[0][0].argmax(dim=1).tolist()
    print(predictions)
    prediction_label = [ids_to_labels[i] for i in predictions]
    print(sentence)
    print(prediction_label)

In [None]:
import json

path = '/content/drive/MyDrive/Colab Notebooks/dataset_annotated_splited.json'
with open(path, 'r') as f:
    data = json.load(f)
    train_data = data['train']
    test_data = data['test']
    evaluation_data = data['eval']

model = BertModel()
print('training ...')
train_loop(model, train_data, evaluation_data)
print('evaluating ...')


Some weights of BertForTokenClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased-clf-digimag and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([7, 768]) in the checkpoint and torch.Size([9, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([7]) in the checkpoint and torch.Size([9]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


training ...


  cpuset_checked))


cuda


100%|██████████| 2700/2700 [05:06<00:00,  8.82it/s]


Epochs: 1 | Loss:  nan | Accuracy:  nan | Val_Loss:  nan | Accuracy:  nan


100%|██████████| 2700/2700 [05:09<00:00,  8.73it/s]


Epochs: 2 | Loss:  nan | Accuracy:  nan | Val_Loss:  nan | Accuracy:  nan


100%|██████████| 2700/2700 [05:09<00:00,  8.71it/s]


Epochs: 3 | Loss:  nan | Accuracy:  nan | Val_Loss:  nan | Accuracy:  nan


100%|██████████| 2700/2700 [05:09<00:00,  8.73it/s]


Epochs: 4 | Loss:  nan | Accuracy:  nan | Val_Loss:  nan | Accuracy:  nan


100%|██████████| 2700/2700 [05:09<00:00,  8.73it/s]


Epochs: 5 | Loss:  nan | Accuracy:  nan | Val_Loss:  nan | Accuracy:  nan
evaluating ...


In [22]:
# evaluate(model, test_data)
text = 'امین به ایران آمد.'
evaluate_one_text(model , text)


NameError: ignored

In [21]:
import json
k = 4
header = 'header'
# header = 'text'
path = '/content/drive/MyDrive/Colab Notebooks/dataset_annotated_splited.json'
with open(path, 'r') as f:
    data = json.load(f)
    sample_text = data['train'][k][header]
    sample_label = data['train'][k]['annotations'][0][header]
    print(sample_text)
    print(sample_label)
    final_labels =get_final_labels(sample_text, sample_label)
    print(final_labels)
    # print(get_mask(sample_text))

WHO منشاء جدیدی برای ویروس کرونای جدید پیدا کرده است
[{'name': 'ORG', 'range': [0, 3]}]
52
_____________________
3
9
15
20
26
33
38
43
48
52
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -10