In [7]:
!pip install transformers
!pip install tqdm
!pip install torch
!pip install evaluate


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.2.2-py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 3.5 MB/s 
[?25hCollecting datasets>=2.0.0
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 10.3 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 64.7 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.7.1-py

In [9]:
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")

In [2]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
label_list = ["O", "ORG", "PER", "DAT", "TIM", "LOC", "EVE", "mainLOC", "NAT"]
labels_to_ids = {k: v for v, k in enumerate(label_list)}
ids_to_labels = {v: k for v, k in enumerate(label_list)}

print(labels_to_ids)
print(ids_to_labels)

{'O': 0, 'ORG': 1, 'PER': 2, 'DAT': 3, 'TIM': 4, 'LOC': 5, 'EVE': 6, 'mainLOC': 7, 'NAT': 8}
{0: 'O', 1: 'ORG', 2: 'PER', 3: 'DAT', 4: 'TIM', 5: 'LOC', 6: 'EVE', 7: 'mainLOC', 8: 'NAT'}


In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "HooshvareLab/bert-fa-base-uncased-clf-digimag"
)

Downloading tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [5]:
import torch

def get_mask(encodings, n):
    position_list = encodings["offset_mapping"]
    final_labels = [-100] * n
    word_index = 1
    while word_index < n and position_list[word_index][1] != 0:
        b = position_list[word_index][1]
        word_index += 1

    b = min(b, n - 1)
    final_labels[1:b] = [0] * (b - 1)
    return final_labels


def get_final_labels(text, label):
    n = 512
    encodings = tokenizer(
        text,
        return_offsets_mapping=True,
        padding="max_length",
        max_length=n,
        truncation=True,
    )
    position_list = encodings["offset_mapping"]
    final_labels = get_mask(encodings, n)
    word_index = 0
    for label in label:
        interval = label["range"]
        label_id = labels_to_ids[label["name"]]
        while word_index <= n - 1 and position_list[word_index][0] < interval[0]:
            word_index += 1
        while (
            word_index <= n - 1
            and position_list[word_index][1] <= interval[1]
            and position_list[word_index][1] != 0
        ):
            final_labels[word_index] = label_id
            word_index += 1
    final_labels[0] = -100
    return final_labels


class DataSequence(torch.utils.data.Dataset):
    def __init__(self, news_list):
        labels = []
        texts = []
        for news in news_list:
            header = news["header"]
            text = news["text"]
            header_annotaiton = news["annotations"][0]["header"]
            text_annotation = news["annotations"][0]["text"]
            header_label = get_final_labels(header, header_annotaiton)
            text_label = get_final_labels(text, text_annotation)

            for t, l in [(header,header_label), (text, text_label)]:
                texts.append(
                    tokenizer(
                        t,
                        return_tensors="pt",
                        padding="max_length",
                        max_length=512,
                        truncation=True,
                    )
                )
                labels.append(l)

        print("labels_lenght")
        print(len(labels_to_ids))
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def get_batch_data(self, idx):
        return self.texts[idx]

    def get_batch_labels(self, idx):
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):
        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)
        return batch_data, batch_labels

In [6]:
from transformers import BertForTokenClassification

class BertModel(torch.nn.Module):
    def __init__(self):
        super(BertModel, self).__init__()
        self.bert = BertForTokenClassification.from_pretrained(
            "HooshvareLab/bert-fa-base-uncased-clf-digimag",
            num_labels=9,
            ignore_mismatched_sizes=True,
        )

    def forward(self, input_id, mask, label):
        output = self.bert(
            input_ids=input_id, attention_mask=mask, labels=label, return_dict=False
        )
        return output

In [8]:
LEARNING_RATE = 1e-2
EPOCHS = 5

In [None]:
from torch.utils.data import DataLoader
from torch.optim import SGD
from tqdm import tqdm


def train_loop(model, train, evaluation):
    train_dataset = DataSequence(train)
    train_dataloader = DataLoader(
        train_dataset, num_workers=4, batch_size=1, shuffle=True
    )

    optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

    if use_cuda:
        model = model.cuda()

    best_loss = 1000

    for epoch_num in range(EPOCHS):
        total_loss_train = 0
        model.train()
        for input_batch, batch_labels in tqdm(train_dataloader):
            batch_labels = batch_labels[0].to(device)
            mask = input_batch["attention_mask"][0].to(device)
            input_id = input_batch["input_ids"][0].to(device)
            optimizer.zero_grad()

            loss, logits = model(input_id, mask, batch_labels)
            logits_clean = logits[0][batch_labels != -100]
            label_clean = batch_labels[batch_labels != -100]

            predictions = logits_clean.argmax(dim=1)
            total_loss_train += loss.item()

            loss.backward()
            optimizer.step()

        evaluate(model, evaluation)

        print(
            f"Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(train_dataset): .3f} | Val_Loss: {total_loss_val / len(val_dataset): .3f}"
        )



In [None]:
import evaluate as e

f1_metric = e.load("f1")
accuracy_metric = e.load("accuracy")
recall_metric = e.load("recall")

def evaluate_metrics(predictions, label_clean):
    f1_score_w = f1_metric.compute(
        predictions=predictions, references=label_clean, average="weighted"
    )
    print("f1 score w is", f1_score_w)
    f1_score_mac = f1_metric.compute(
        predictions=predictions, references=label_clean, average="macro"
    )
    print("f1 score mac is", f1_score_mac)
    f1_score_mic = f1_metric.compute(
        predictions=predictions, references=label_clean, average="micro"
    )
    print("f1 score mic is", f1_score_mic)
    f1_score_non = f1_metric.compute(
        predictions=predictions, references=label_clean, average=None
    )
    print("f1 score non is", f1_score_non)
    accuracy_score = accuracy_metric.compute(
        predictions=predictions, references=label_clean
    )
    print("accuracy is", accuracy_score)

    recall_score = recall_metric.compute(
        predictions=predictions, references=label_clean, average="weighted"
    )
    print("recall w is", recall_score)
    recall_score = recall_metric.compute(
        predictions=predictions, references=label_clean, average="macro"
    )
    print("recall mac is", recall_score)
    recall_score = recall_metric.compute(
        predictions=predictions, references=label_clean, average="micro"
    )
    print("recall mic is", recall_score)
    recall_score = recall_metric.compute(
        predictions=predictions, references=label_clean, average=None
    )
    print("recall non is", recall_score)

def evaluate(model, test):
    model.eval()
    test_dataset = DataSequence(test)
    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0.0
    for test_data, test_label in test_dataloader:
        test_label = test_label[0].to(device)
        mask = test_data["attention_mask"][0].to(device)
        input_id = test_data["input_ids"][0].to(device)

        loss, logits = model(input_id, mask, test_label.long())
        logits_clean = logits[0][test_label != -100]
        label_clean = test_label[test_label != -100]

        predictions = logits_clean.argmax(dim=1)

        acc = (predictions == label_clean).float().mean()
        total_acc_test += acc

    evaluate_metrics(predictions, label_clean)

    val_accuracy = total_acc_test / len(test_dataset)
    print(f"Test Accuracy: {total_acc_test / len(test_dataset): .3f}")

In [None]:
def show_result(text, predictions):
    encoding = tokenizer(
        text,
        return_offsets_mapping=True,
        padding="max_length",
        max_length=512,
        truncation=True,
    )
    intervals = []
    for interval in encoding["offset_mapping"]:
        if not (interval[0] == 0 and interval[1] == 0):
            intervals.append(interval)
    print(len(intervals))
    print(len(predictions))
    for interval, label in zip(intervals, predictions):
        # print(interval)
        print(text[interval[0] : interval[1]], "   ", label)

In [None]:
def evaluate_one_text(model, sentence):
    if use_cuda:
        model = model.cuda()

    encoding = tokenizer(
        sentence,
        return_tensors="pt",
        return_offsets_mapping=True,
        padding="max_length",
        max_length=512,
        truncation=True,
    )
    mask = encoding["attention_mask"][0].unsqueeze(0).to(device)

    input_id = encoding["input_ids"][0].unsqueeze(0).to(device)
    label_ids = torch.Tensor(get_mask(sentence, 512)).unsqueeze(0).to(device)

    logits = model(input_id, mask, None)
    logits_clean = logits[0][label_ids != -100]
    # for g in logits[0][label_ids != -100].argmax(dim=1).tolist():
    #   print(g)
    # print(logits_clean)
    # print(logits_clean.shape)
    predictions = logits_clean.argmax(dim=1).tolist()
    # print(predictions)
    prediction_label = [ids_to_labels[i] for i in predictions]
    print(sentence)
    print(prediction_label)
    show_result(sentence, prediction_label)

In [None]:
import json

path = "/content/drive/MyDrive/Colab Notebooks/dataset_annotated_splited.json"
with open(path, "r") as f:
    data = json.load(f)
    train_data = data["train"]
    test_data = data["test"]
    evaluation_data = data["eval"]

model = BertModel()
print("training ...")
train_loop(model, train_data, evaluation_data)
print("evaluating ...")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased-clf-digimag and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([7, 768]) in the checkpoint and torch.Size([9, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([7]) in the checkpoint and torch.Size([9]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


training ...
labels_lenght
9
labels_lenght
9
cuda


  cpuset_checked))
100%|██████████| 2700/2700 [05:01<00:00,  8.97it/s]


Epochs: 1 | Loss:  0.127 | Accuracy:  0.969 | Val_Loss:  0.073 | Accuracy:  0.976


100%|██████████| 2700/2700 [05:02<00:00,  8.92it/s]


Epochs: 2 | Loss:  0.071 | Accuracy:  0.976 | Val_Loss:  0.063 | Accuracy:  0.977


100%|██████████| 2700/2700 [05:02<00:00,  8.92it/s]


Epochs: 3 | Loss:  0.060 | Accuracy:  0.979 | Val_Loss:  0.070 | Accuracy:  0.974


100%|██████████| 2700/2700 [05:02<00:00,  8.92it/s]


Epochs: 4 | Loss:  0.051 | Accuracy:  0.982 | Val_Loss:  0.060 | Accuracy:  0.978


100%|██████████| 2700/2700 [05:02<00:00,  8.92it/s]


Epochs: 5 | Loss:  0.045 | Accuracy:  0.984 | Val_Loss:  0.065 | Accuracy:  0.978
evaluating ...


In [None]:
evaluate(model, test_data)
# text = 'امین به ایران آمد.'
# text = "به گزارش خبرنگار مهر، نماینده ولی فقیه در آذربایجان شرقی پیش از ظهر امروز در مراسم بزرگداشت یوم الله ۱۲ بهمن که در تالار اجتماعات مصلی اعظم امام خمینی ره برگزار شد گفت: مشکلات اقتصادی و تحریم ها در کشور وجود دارد اما باید قدردان انقلاب اسلامی ایران بود و به همین دلیل، باید توگه بیشتری به موفقیت های به دست آمده در طول دوران انقلاب اسلامی داشت. حجت الاسلام و المسلمین سید محمد علی آل هاشم ادامه داد: سرعت پیشرفت علم در ایران بعد از انقلاب اسلامی و هم اکنون، ۱۱ برابر دنیاست؛ امروزه جمهوری اسلامی ایران، هشتمین کشور تولید کننده اورانیوم ۲۰ درصد جهان است"
# # text = "به گزارش برنا؛ تقریبا از اسفند ماه سال گذشته واکسیناسیون عمومی در کشور با واردات واکسن های خارجی کرونا انجام شد و این روند به صورتی بود که محموله های جدید واکسن پس از خریداری شدن به کشور وارد می شد و تزریق ها برای گروه های اولویت دار انجام می گرفت البته در این میان جهش های جدیدی از ویروس در کشور زیاد شد و در مقابل واردات واکسن های خارجی با مشکلاتی مواجه بود و مسیر این اقدام با پستی ها و بلندی های زیادی رو به رو شد اما در حال حاضر با وجود همه اتفاقات بنا به گفته مسئولان ستاد مقابله با کرونا دو هفته ای از برنامه واکسیناسیون عقب هستیم و دلیل اصلی این اتفاق محدودیت وجود واکسن است. مسعود یونسیان، استاد اپیدمیولوژی دانشگاه علوم پزشکی تهران در گفت وگو با خبرنگار برنا درباره خرید واکسن های خارجی توسط شرکت های خصوصی گفت: دولت از خرید واکسن شرکت های خصوصی استقبال می کند و اصولا بسیاری از کشور های دیگر نیز واردات واکسن های خارجی را به شرکت های خصوصی سپرده اند اما در نهایت تحویل وزارت بهداشت می شود"
# evaluate_one_text(model , text)

labels_lenght
9


  cpuset_checked))


f1 score w is {'f1': 0.9901846134655564}
f1 score mac is {'f1': 0.6653439153439153}
f1 score mic is {'f1': 0.9921414538310412}
f1 score non is {'f1': array([0.99603175, 1.        , 0.        ])}
accuracy is {'accuracy': 0.9921414538310412}
recall w is {'recall': 0.9921414538310412}
recall mac is {'recall': 0.6660039761431412}
recall mic is {'recall': 0.9921414538310412}
recall non is {'recall': array([0.99801193, 1.        , 0.        ])}
Test Accuracy:  0.974


In [None]:
import json

k = 4
header = "header"
# header = 'text'
path = "/content/drive/MyDrive/Colab Notebooks/dataset_annotated_splited.json"
with open(path, "r") as f:
    data = json.load(f)
    sample_text = data["train"][k][header]
    sample_label = data["train"][k]["annotations"][0][header]
    print(sample_text)
    print(sample_label)
    final_labels = get_final_labels(sample_text, sample_label)
    print(final_labels)
    # print(get_mask(sample_text))

WHO منشاء جدیدی برای ویروس کرونای جدید پیدا کرده است
[{'name': 'ORG', 'range': [0, 3]}]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -1