# Install Packages

install required packages using requirement.txt file.

In [1]:
!pip install transformers
!pip install tqdm
!pip install torch
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Import Libraries

In [30]:
import torch
import json
import evaluate as e
from google.colab import drive
from transformers import AutoTokenizer, BertForTokenClassification
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.optim import SGD

## Check whether cuda is available

Check whether cuda is available and based on this, device object is built that is used in for pytorch tensors computation.

In [31]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

In [32]:
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Hyper-Parameter Setting

In this section, hyper-parameters that used in bert fine-tuning are defined. hyper-parameter optimization (HPO) will be done in the next parts.

TODO: HPO

In [33]:
MAX_LEN = 512
LEARNING_RATE = 1e-2
EPOCHS = 5
BATCH_SIZE = 4
BIO = False

# Load Data

load train, validation, test dataset with json.

In [34]:
# path = "/content/drive/MyDrive/Colab Notebooks/dataset_annotated_splited.json"
path = "/content/drive/MyDrive/NLP/HW5/dataset_annotated_splited.json"
with open(path, "r") as f:
    data = json.load(f)
    train_data = data["train"]
    test_data = data["test"]
    val_data = data["eval"]

print(train_data[0].keys())
print(train_data[0]['annotations'])

dict_keys(['header', 'text', 'annotations'])
[{'header': [{'name': 'DAT', 'range': [27, 37]}], 'text': [{'name': 'ORG', 'range': [9, 24]}, {'name': 'ORG', 'range': [32, 49]}, {'name': 'PER', 'range': [51, 62]}, {'name': 'ORG', 'range': [68, 79]}, {'name': 'ORG', 'range': [153, 164]}, {'name': 'PER', 'range': [210, 222]}, {'name': 'DAT', 'range': [269, 280]}, {'name': 'ORG', 'range': [349, 360]}, {'name': 'PER', 'range': [369, 375]}, {'name': 'PER', 'range': [414, 426]}, {'name': 'TIM', 'range': [478, 485]}, {'name': 'DAT', 'range': [465, 470]}, {'name': 'ORG', 'range': [510, 521]}]}, {'header': [], 'text': [{'name': 'PER', 'range': [51, 62]}, {'name': 'PER', 'range': [210, 222]}, {'name': 'PER', 'range': [369, 375]}, {'name': 'PER', 'range': [414, 426]}]}]


## Label to ID Mapping

In this section, labels to ids and ids to labels are built for next usage in bert fine-tuning training.

In [35]:
if BIO:
    label_list = [
        "O", 
        "B_ORG", "B_PER", "B_DAT", "B_TIM", "B_LOC", "B_EVE", "B_mainLOC", "B_NAT",
        "I_ORG", "I_PER", "I_DAT", "I_TIM", "I_LOC", "I_EVE", "I_mainLOC", "I_NAT"
    ]
else:
    label_list = ["O", "ORG", "PER", "DAT", "TIM", "LOC", "EVE", "mainLOC", "NAT"]
labels_to_ids = {k: v for v, k in enumerate(label_list)}
ids_to_labels = {v: k for v, k in enumerate(label_list)}

print(labels_to_ids)
print(ids_to_labels)

{'O': 0, 'ORG': 1, 'PER': 2, 'DAT': 3, 'TIM': 4, 'LOC': 5, 'EVE': 6, 'mainLOC': 7, 'NAT': 8}
{0: 'O', 1: 'ORG', 2: 'PER', 3: 'DAT', 4: 'TIM', 5: 'LOC', 6: 'EVE', 7: 'mainLOC', 8: 'NAT'}


## Initialise Bert Tokenizer

In this section, **ParsBERT(v2.0)** tokenizer is used for tokenization. ParsBERT (v2.0) is a Transformer-based Model for Persian Language Understanding that reconstructed the vocabulary and fine-tuned the ParsBERT v1.1 on the new Persian corpora in order to provide some functionalities for using ParsBERT in other scopes! Follow the ParsBERT repo for the latest information about previous and current models. Persian Text Classification [DigiMag, Persian News] The task target is labeling texts in a supervised manner in both existing datasets DigiMag and Persian News. A total of 8,515 articles scraped from **Digikala** Online Magazine. This dataset includes seven different classes.

In [36]:
tokenizer = AutoTokenizer.from_pretrained(
    "HooshvareLab/bert-fa-base-uncased-clf-digimag"
)

# texts = ['امین', 
#          'آرمین پسر خوبی است'
#          ]
# encodings = tokenizer(
#     texts,
#     return_offsets_mapping=True,
#     padding="max_length",
#     max_length=120,
#     truncation=True,
#     return_tensors="pt"
# )
# encodings['input_ids'].shape

# Data Preprocessing

In this section, pre-processing is done on the raw data to convert the input data into the form of the huggingface transformers library.

## Handle Overlaps between Named Entity Tags

In this section, some functions are defined to handle overlapping ner tags in such a way that the inner tags are removed and only the outermost tags are kept.

In [37]:
def has_intersection(first, second):
    if first[0] < second[0]:
        if first[1] <= second[0]:
            return False
        else:
            return True
    else:
        if first[0] >= second[1]:
            return False
        else:
            return True

def remove_annotation_overlap(annotations):
    annotations = sorted(annotations, key=lambda x: x["range"][0])
    n = len(annotations)
    if n == 0:
        return []
    i = 0
    j = 1
    while i < n and j < n:
        first = annotations[i]
        first_range = first["range"]
        second = annotations[j]
        second_range = second["range"]
        if has_intersection(first_range, second_range):
            new = first if (first_range[1]-first_range[0]) > (second_range[1]-second_range[0]) else second
            annotations[i]= new
            annotations[j] = None
        else:
            i = j
        j += 1

    annotations = list(filter(lambda x: not x is None, annotations))
    return annotations


In [38]:
# DONT REMOVE

# data = train_data[1]
# text = data['text']
# annotations = []
# for i in range(len(data['annotations'])): 
#     annotations.extend(data['annotations'][i]["text"])

# print(text)
# print(len(text.split()))
# print(annotations)
# print(len(data['annotations']))
  

# encodings = tokenizer(
#     text,
#     return_offsets_mapping=True,
#     padding="max_length",
#     max_length=120,
#     truncation=True,
#     return_tensors="pt"
# )
# # print(encodings)
# tokenized = tokenizer.convert_ids_to_tokens(encodings["input_ids"][0])
# print(tokenized)
# print(encodings.word_ids())
# labels = get_final_label(encodings, annotations)
# labels = list(map(lambda x: ids_to_labels[x] if x != -100 else None, labels))
# print(labels)
# print(list(zip(tokenized, labels)))

# Character Level to Token Level Indexing

In this section, some functions are defined to handle token level indexing. to overcome token level indexing, CLS and END must be considered.

In [39]:
def get_starting_token_index(tag_start, word_index, token_offsets):
    while word_index <= MAX_LEN - 1 and token_offsets[word_index][0] < tag_start:
        word_index += 1
    return word_index

def get_ending_token_index(tag_stop, word_index, token_offsets):
    while (
        word_index <= MAX_LEN - 1
        and token_offsets[word_index][1] < tag_stop
        and token_offsets[word_index][1] != 0
    ):
        word_index += 1
    return word_index

def get_final_label(encoding, annotation):
    token_offsets = encoding["offset_mapping"]
    input_ids = encoding["input_ids"]
    end_element = torch.argmin(input_ids)
    final_labels = [-100] * MAX_LEN
    final_labels[1:end_element] = [0] * (end_element - 1)

    annotations_without_overlap = remove_annotation_overlap(annotation)

    word_index = 1
    for label in annotations_without_overlap:
        interval = label["range"]
        label_name = label["name"]
        word_index = get_starting_token_index(interval[0], word_index, token_offsets)
        start_token_index = word_index
        if start_token_index == 512:
          break
        word_index = get_ending_token_index(interval[1], word_index, token_offsets)
        end_token_index = word_index
        if BIO:
            final_labels[start_token_index:end_token_index+1] = [labels_to_ids["I_"+label_name]] * (end_token_index-start_token_index+1)
            final_labels[start_token_index] = labels_to_ids["B_"+label_name]
        else:
            final_labels[start_token_index:end_token_index+1] = [labels_to_ids[label_name]] * (end_token_index-start_token_index+1)

        word_index += 1
    return final_labels



# Define DataSequence and DataLoader

In this section, DataSequence and DataLoader that used in bert fine-tuning are defined.

In [40]:
class DataSequence(torch.utils.data.Dataset):
    def __init__(self, news_list):
        labels = []
        texts = []
        for news in news_list:
            header = news["header"]
            text = news["text"]
            header_annotaition = []
            text_annotation = []
            for i in range(len(news["annotations"])): 
                header_annotaition.extend(news["annotations"][i]["header"])
                text_annotation.extend(news["annotations"][i]["text"])

            for t, annotation in [(header,header_annotaition), (text, text_annotation)]:
                encoding = tokenizer(
                    t,
                    return_offsets_mapping=True,
                    padding='max_length',
                    max_length=MAX_LEN, # including [CLS] end [SEP]
                    truncation=True,
                    return_tensors="pt",
                )
                for key in ['input_ids', 'attention_mask', 'token_type_ids', 'offset_mapping']:
                    encoding[key] = encoding[key][0]
                label = get_final_label(encoding, annotation)
                texts.append(encoding)
                labels.append(label)


        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def get_batch_data(self, idx):
        return self.texts[idx]

    def get_batch_labels(self, idx):
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):
        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)
        return batch_data, batch_labels

def to_device(data, device):
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device)

class DeviceDataLoader():
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device

    def __iter__(self):
        for b in self.dl: 
            yield to_device(b, self.device)

    def __len__(self):
        return len(self.dl)

# Initialise DataSequence and DataLoader Object

In [41]:
def get_dataloader(data, batch=None, cuda=True):
    dataset = DataSequence(data)
    print('len ds: ', len(dataset))
    if batch is None:
        batch = len(dataset)
    dataloader = DataLoader(
        dataset, num_workers=1, batch_size=batch, shuffle=True
    )
    if cuda:
        dataloader = DeviceDataLoader(dataloader, device)
    return dataloader

train_dataloader = get_dataloader(train_data, BATCH_SIZE)
val_dataloader = get_dataloader(val_data, BATCH_SIZE)
test_dataloader = get_dataloader(test_data)

len ds:  2700
len ds:  150
len ds:  150


In [42]:
print(len(train_dataloader))
print(len(val_dataloader))
print(len(test_dataloader))

675
38
1


In [43]:
for xb, yb in train_dataloader:
  # print(xb)
  print(xb['input_ids'].shape)
  print(xb['attention_mask'].shape)
  print(yb.shape)
  break

torch.Size([4, 512])
torch.Size([4, 512])
torch.Size([4, 512])


# Define Bert NER Model

In [44]:
from transformers import BertForTokenClassification

class BertNER(torch.nn.Module):
    def __init__(self):
        super(BertNER, self).__init__()
        self.bert = BertForTokenClassification.from_pretrained(
            "HooshvareLab/bert-fa-base-uncased-clf-digimag",
            num_labels=9,
            ignore_mismatched_sizes=True,
        )

    def forward(self, input_batch, labels):
        input_ids = input_batch["input_ids"]
        mask = input_batch["attention_mask"]
        output = self.bert(
            input_ids=input_ids, attention_mask=mask, labels=labels, return_dict=False
        )
        return output

In [45]:
model = BertNER()
if use_cuda:
    model = model.cuda()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased-clf-digimag and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([7, 768]) in the checkpoint and torch.Size([9, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([7]) in the checkpoint and torch.Size([9]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train Loop

In [46]:
from torch.optim import SGD

def train_loop(model, train_dataloader, val_dataloader):
    optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

    for epoch_num in range(EPOCHS):
        total_loss_train = 0
        model.train()
        i = 0
        for input_batch, batch_labels in tqdm(train_dataloader):
            loss, logits = model(input_batch, batch_labels)
            total_loss_train += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Epochs: {epoch_num} Loss: {total_loss_train}")
        evaluate(model, val_dataloader)



# Model Evaluation

For Evaluation task, evaluate library is used. Evaluate is a library that makes evaluating and comparing models and reporting their performance easier and more standardized.



In [48]:
import evaluate as e
from pprint import pprint

f1_metric = e.load("f1")
accuracy_metric = e.load("accuracy")
recall_metric = e.load("recall")

def evaluate_metrics(predictions, label_clean):
    metrics = {}
    metrics['size'] = predictions.shape[0]

    predictions = predictions.view(-1)
    label_clean = label_clean.view(-1)
    metrics['f1_weighted'] = f1_metric.compute(
        predictions=predictions, references=label_clean, average="weighted"
    )['f1']
    metrics['f1_macro'] = f1_metric.compute(
        predictions=predictions, references=label_clean, average="macro"
    )['f1']
    metrics['f1_micro'] = f1_metric.compute(
        predictions=predictions, references=label_clean, average="micro"
    )['f1']
    # metrics['f1_none'] = f1_metric.compute(
    #     predictions=predictions, references=label_clean, average=None
    # )['f1']
    metrics['accuracy'] = accuracy_metric.compute(
        predictions=predictions, references=label_clean
    )['accuracy']
    metrics['recall_weighted'] = recall_metric.compute(
        predictions=predictions, references=label_clean, average="weighted"
    )['recall']
    metrics['recall_macro'] = recall_metric.compute(
        predictions=predictions, references=label_clean, average="macro"
    )['recall']
    metrics['recall_micro'] = recall_metric.compute(
        predictions=predictions, references=label_clean, average="micro"
    )['recall']
    # metrics['recall_none'] = recall_metric.compute(
    #     predictions=predictions, references=label_clean, average=None
    # )['recall']
    # pprint(metrics)
    return metrics

def average_metrics(metrics):
    average_metrics = {key:0 for key in metrics[0].keys() if key != 'size'}
    metric_names = average_metrics.keys()
    total_size = 0
    for metric_dict in metrics:
        size = metric_dict.pop('size')
        total_size += size
        for metric, value in metric_dict.items():
            average_metrics[metric] += value*size
    
    average_metrics = {key: value/total_size for key, value in average_metrics.items()}
    return average_metrics


def evaluate(model, val_dataloader):
    model.eval()
    metrics = []
    for input_batch, batch_labels in val_dataloader:
        loss, logits = model(input_batch, batch_labels)
        predictions = logits.argmax(dim=2)
        predictions[batch_labels == -100] = -100
        metrics.append(evaluate_metrics(predictions, batch_labels))
    
    metrics = average_metrics(metrics)
    pprint(metrics)

In [49]:
evaluate(model, val_dataloader)
# evaluate(model, train_dataloader)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

{'accuracy': 0.8658854166666666,
 'f1_macro': 0.13922344982400237,
 'f1_micro': 0.8658854166666666,
 'f1_weighted': 0.873359756893056,
 'recall_macro': 0.17168758643537202,
 'recall_micro': 0.8658854166666666,
 'recall_weighted': 0.8658854166666666}


In [None]:
print("training ...")
train_loop(model, train_dataloader, val_dataloader)
# print("evaluating ...")

training ...


 30%|███       | 205/675 [01:14<02:50,  2.76it/s]

In [None]:
evaluate(model, test_dataloader)
# evaluate(model, val_dataloader)
# evaluate(model, train_dataloader)
# text = 'امین به ایران آمد.'
# text = "به گزارش خبرنگار مهر، نماینده ولی فقیه در آذربایجان شرقی پیش از ظهر امروز در مراسم بزرگداشت یوم الله ۱۲ بهمن که در تالار اجتماعات مصلی اعظم امام خمینی ره برگزار شد گفت: مشکلات اقتصادی و تحریم ها در کشور وجود دارد اما باید قدردان انقلاب اسلامی ایران بود و به همین دلیل، باید توگه بیشتری به موفقیت های به دست آمده در طول دوران انقلاب اسلامی داشت. حجت الاسلام و المسلمین سید محمد علی آل هاشم ادامه داد: سرعت پیشرفت علم در ایران بعد از انقلاب اسلامی و هم اکنون، ۱۱ برابر دنیاست؛ امروزه جمهوری اسلامی ایران، هشتمین کشور تولید کننده اورانیوم ۲۰ درصد جهان است"
# # text = "به گزارش برنا؛ تقریبا از اسفند ماه سال گذشته واکسیناسیون عمومی در کشور با واردات واکسن های خارجی کرونا انجام شد و این روند به صورتی بود که محموله های جدید واکسن پس از خریداری شدن به کشور وارد می شد و تزریق ها برای گروه های اولویت دار انجام می گرفت البته در این میان جهش های جدیدی از ویروس در کشور زیاد شد و در مقابل واردات واکسن های خارجی با مشکلاتی مواجه بود و مسیر این اقدام با پستی ها و بلندی های زیادی رو به رو شد اما در حال حاضر با وجود همه اتفاقات بنا به گفته مسئولان ستاد مقابله با کرونا دو هفته ای از برنامه واکسیناسیون عقب هستیم و دلیل اصلی این اتفاق محدودیت وجود واکسن است. مسعود یونسیان، استاد اپیدمیولوژی دانشگاه علوم پزشکی تهران در گفت وگو با خبرنگار برنا درباره خرید واکسن های خارجی توسط شرکت های خصوصی گفت: دولت از خرید واکسن شرکت های خصوصی استقبال می کند و اصولا بسیاری از کشور های دیگر نیز واردات واکسن های خارجی را به شرکت های خصوصی سپرده اند اما در نهایت تحویل وزارت بهداشت می شود"
# evaluate_one_text(model , text)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


RuntimeError: ignored

In [None]:
from transformers import pipeline
nlp = pipeline("ner", model=model.bert.to('cpu'), tokenizer=tokenizer)
example = "حسین تقوی به سازمان جهاد کشاورزی رفت."

ner_results = nlp(example)
print(ner_results)

[{'entity': 'LABEL_2', 'score': 0.9788249, 'index': 1, 'word': 'حسین', 'start': 0, 'end': 4}, {'entity': 'LABEL_2', 'score': 0.9859956, 'index': 2, 'word': 'تقوی', 'start': 5, 'end': 9}, {'entity': 'LABEL_0', 'score': 0.9809533, 'index': 3, 'word': 'به', 'start': 10, 'end': 12}, {'entity': 'LABEL_1', 'score': 0.9332991, 'index': 4, 'word': 'سازمان', 'start': 13, 'end': 19}, {'entity': 'LABEL_1', 'score': 0.9741847, 'index': 5, 'word': 'جهاد', 'start': 20, 'end': 24}, {'entity': 'LABEL_1', 'score': 0.9170967, 'index': 6, 'word': 'کشاورزی', 'start': 25, 'end': 32}, {'entity': 'LABEL_0', 'score': 0.9796014, 'index': 7, 'word': 'رفت', 'start': 33, 'end': 36}, {'entity': 'LABEL_0', 'score': 0.97118855, 'index': 8, 'word': '.', 'start': 36, 'end': 37}]


In [None]:
# def show_result(text, predictions):
#     encoding = tokenizer(
#         text,
#         return_offsets_mapping=True,
#         padding="max_length",
#         max_length=512,
#         truncation=True,
#     )
#     intervals = []
#     for interval in encoding["offset_mapping"]:
#         if not (interval[0] == 0 and interval[1] == 0):
#             intervals.append(interval)
#     print(len(intervals))
#     print(len(predictions))
#     for interval, label in zip(intervals, predictions):
#         # print(interval)
#         print(text[interval[0] : interval[1]], "   ", label)

In [None]:
# def evaluate_one_text(model, sentence):
#     encoding = tokenizer(
#         sentence,
#         return_tensors="pt",
#         return_offsets_mapping=True,
#         padding="max_length",
#         max_length=512,
#         truncation=True,
#     )

#     logits = model(encoding, None)
#     logits_clean = logits[0][label_ids != -100]
#     # for g in logits[0][label_ids != -100].argmax(dim=1).tolist():
#     #   print(g)
#     # print(logits_clean)
#     # print(logits_clean.shape)
#     predictions = logits_clean.argmax(dim=1).tolist()
#     # print(predictions)
#     prediction_label = [ids_to_labels[i] for i in predictions]
#     print(sentence)
#     print(prediction_label)
#     show_result(sentence, prediction_label)

In [None]:
# import json

# k = 4

# data = train_data[k]
# text = data['text']
# annotations = []
# for i in range(len(data['annotations'])): 
#     annotations.extend(data['annotations'][i]["text"])

# print(text)
# print(annotations)
# final_labels = get_final_label(text, annotations)
# print(final_labels)

به گزارش خبرگزاری علم و فناوری و به نقل از IFL Science، دکتر «پیتر دازاک» که یک اکولوژیست از سازمان «اکو هلت الیانس» است و در هیئت فرستاده سازمان بهداشت چهانی به چین قرار داشت، به « ان پی آر» گفت که تحقیقات اخیر آنها شواهد جدیدی را نشان داده است که مزارع حیات وحش به فروشندگان حیوانات در بازار غذاهای دریایی در ووهان، حیواناتی را می فروختند. با توجه به این تئوری می توان اینطور نتیجه گرفت که ممکن است خفاش های وحشی ویروس را به برخی از حیوانات پرورش یافته انتقال داده باشند و حیوانات که به ووهان منتقل شده اند، آن را به انسانها منتقل بوده باشند
[{'name': 'ORG', 'range': [9, 30]}, {'name': 'ORG', 'range': [43, 54]}, {'name': 'PER', 'range': [62, 72]}, {'name': 'ORG', 'range': [93, 116]}, {'name': 'ORG', 'range': [139, 158]}, {'name': 'LOC', 'range': [162, 165]}, {'name': 'ORG', 'range': [182, 190]}, {'name': 'mainLOC', 'range': [311, 316]}, {'name': 'LOC', 'range': [489, 494]}, {'name': 'ORG', 'range': [9, 30]}, {'name': 'ORG', 'range': [43, 55]}, {'name': 'PER', 'range': [61, 73]}, {'name': '

TypeError: ignored