<a href="https://colab.research.google.com/github/ZahraDehghani99/Natural-Language-Processing/blob/main/HW6/NER_with_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Home Work6 - Named Entity Recognition using Transformers

Named Entity Recognition using ParsBERT and Multiligual modles


In [1]:
!pip install -q datasets 
!pip install -q tokenizers
!pip install -q transformers 
!pip install -q seqeval 

In [2]:
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
#check if gpu is present
import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [5]:
!nvidia-smi

Tue Aug  9 17:34:36 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
base_path = "/content/drive/MyDrive/HW6/"

## Read data and Data preprocessing

### Define functions

Let's define some helper functions.

#### Define function to convert text file to dataframe

In this section we want to convert text file to the dataframe in order to do label encoder.

In [7]:
def convert_file2df(file_name):

  with open(base_path + file_name ,'r') as f:
      lines = f.read().splitlines()

  words, tags = [], []
  for i in range(len(lines)):
    try : 
      x, y = lines[i].split(" ")
      words.append(x)
      tags.append(y)
    except ValueError: # if we have empty line
      pass

  data = {'tokens': words, 'tags': tags}  
  df = pd.DataFrame(data)
  output_file_name = file_name.split(".")[0] 
  df.to_csv(base_path + output_file_name + "_dataframe", index=False)
  return df  

#### Define function to convert text file to dictionary 

In [8]:
def convert_txt2dict(file_name):
  
  '''
  This function read input file and then convert in to the dictionary.
  Producted dictionary have two keys (tokens and tags) :
  tokens contain tokens of each sentence and tags contain tags of each sentence.
  '''
  with open(base_path + file_name ,'r') as f:
      lines = f.read().splitlines()

  #because after each sentence we have blank line, we should append blank
  #line at the end of the last sentence in the file    
  lines.append("") 

  words, tags = [], []
  words_line, tags_lines = [], []
  for i in range(len(lines)):
    try : 
        x, y = lines[i].split(" ")
        words_line.append(x)
        tags_lines.append(y)
    except ValueError: # if we have empty line
        words.append(words_line)
        tags.append(tags_lines)
        words_line, tags_lines= [], []

  data = {'tokens': words, 'ner_tags': tags}   
  output_file_name = file_name.split(".")[0]    
  output_file = open(base_path + output_file_name + ".pkl", 'wb')
  pickle.dump(data, output_file)
  output_file.close()  


#### Define function to save and load pkl files

In [9]:
def save_dict_in_pkl(dict_name, output_file_name):
  output_file = open(base_path +  output_file_name, 'wb')
  pickle.dump(dict_name, output_file)
  output_file.close() 

def load_pkl(file_name):
  with open(base_path + file_name,'rb') as f:
      file = pickle.load(f)
  return file    

### Read data and convert it to dataframe

In [None]:
train_data = convert_file2df("Train.txt")
test_data = convert_file2df("Test.txt")

In [None]:
train_data.head()

Unnamed: 0,tokens,tags
0,ميشوند,V
1,.,DELM
2,نتيجهي,N
3,بحث,N
4,بالا,ADJ


In [None]:
print(f'train_data shape : {train_data.shape}')

train_data shape : (259794, 2)


In [None]:
train_data.tags.value_counts()

N          109310
P           32580
ADJ         29832
DELM        25844
V           22327
CON         22094
PRO          5714
DET          4130
ADV          3646
QUA          1820
AR           1175
IF            396
SPEC          350
MS            198
MORP          132
PP             85
MQUA           81
PS             31
DEFAULT        20
OH             12
NP             10
OHH             5
INT             2
Name: tags, dtype: int64

In [None]:
train_tags = [tag for tag, count in train_data.tags.value_counts().items()]
print(f'len train tags : {len(train_tags)}')

len train tags : 23


In [None]:
test_data.head()

Unnamed: 0,tokens,tags
0,هجري,ADJ
1,شمسي,ADJ
2,فوت,N
3,كرد,V
4,و,CON


In [None]:
print(f'test_data shape : {test_data.shape}')

test_data shape : (259794, 2)


In [None]:
test_data.tags.value_counts()

N          115385
ADJ         32226
P           30790
DELM        27934
CON         20781
V           18826
PRO          4493
DET          3839
ADV          3096
QUA          1418
SPEC          380
IF            177
MORP          176
AR             75
MS             62
PP             56
MQUA           37
DEFAULT        21
OH             10
PS              7
INT             5
Name: tags, dtype: int64

In [None]:
test_tags = [tag for tag, count in train_data.tags.value_counts().items()]
print(f'len test tags : {len(test_tags)}')

len test tags : 23


#### Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train = le.fit_transform(train_data["tags"])

print(f'y_train : {y_train}')

y_train : [22  5 12 ... 12 22  5]


let's create a dictionary to map tags to int.

In [None]:
tag2int = {}
inv =  le.inverse_transform([i for i in range(len(train_tags))])
for i in range(len(inv)):
  tag2int[inv[i]] = i

print(f'tag2int : {tag2int}')

tag2int : {'ADJ': 0, 'ADV': 1, 'AR': 2, 'CON': 3, 'DEFAULT': 4, 'DELM': 5, 'DET': 6, 'IF': 7, 'INT': 8, 'MORP': 9, 'MQUA': 10, 'MS': 11, 'N': 12, 'NP': 13, 'OH': 14, 'OHH': 15, 'P': 16, 'PP': 17, 'PRO': 18, 'PS': 19, 'QUA': 20, 'SPEC': 21, 'V': 22}


In [None]:
# save_dict_in_pkl(tag2int, "tag2int.pkl")
tag2int = load_pkl("tag2int.pkl")

#### Convert text file to dictionary 

برای استفاده از دیتاست باید آن را به فرمت قابل قبول در آوریم 

تبدیل به فرمت قابل قبول برای transformers

lets convert train.txt and test.txt to dictionary.

In [None]:
convert_txt2dict("Train.txt")
convert_txt2dict("Test.txt")

Than we load these dictinaries.

In [None]:
train = load_pkl("Train.pkl")
test = load_pkl("Test.pkl")

In [None]:
print(f'len of train.pkl : {len(train)}')
print(f'key of train.pkl : {train.keys()}')
print(f'first elements of train["tokens"] :\n{train["tokens"][0]}\n')
print(f'first elements of train["ner_tags"] :\n{train["ner_tags"][0]}')

len of train.pkl : 2
key of train.pkl : dict_keys(['tokens', 'ner_tags'])
first elements of train["tokens"] :
['ميشوند', '.']

first elements of train["ner_tags"] :
['V', 'DELM']


In [None]:
print(f'number of sentences in train : {len(train["tokens"])}')

tokens = 0
for sent in train["tokens"]:
  tokens += len(sent)

print(f'number of tokens in train :  {tokens}')

number of sentences in train : 8722
number of tokens in train :  259794


In [None]:
print(f'len of test.pkl : {len(test)}')
print(f'key of test.pkl : {test.keys()}')
print(f'first elements of test["tokens"] :\n{test["tokens"][0]}\n')
print(f'first elements of test["ner_tags"] :\n{test["ner_tags"][0]}')

len of test.pkl : 2
key of test.pkl : dict_keys(['tokens', 'ner_tags'])
first elements of test["tokens"] :
['هجري', 'شمسي', 'فوت', 'كرد', 'و', 'در', 'قم', 'دفن', 'شد', '.']

first elements of test["ner_tags"] :
['ADJ', 'ADJ', 'N', 'V', 'CON', 'P', 'N', 'N', 'V', 'DELM']


In [None]:
print(f'number of sentences in test : {len(test["tokens"])}')

tokens = 0
for sent in test["tokens"]:
  tokens += len(sent)

print(f'number of tokens in test :  {tokens}')

number of sentences in test : 9280
number of tokens in test :  259794


Let's convert tags into int in train and test dictionaries

In [None]:
tag2int["N"]

12

In [None]:
ner_tags_train = []

for sent in train["ner_tags"]:
  sent_tags_int = []
  for token in sent:
    sent_tags_int.append(tag2int[token])
  ner_tags_train.append(sent_tags_int)

In [None]:
ner_tags_test = []

for sent in test["ner_tags"]:
  sent_tags_int = []
  for token in sent:
    sent_tags_int.append(tag2int[token])
  ner_tags_test.append(sent_tags_int)

In [None]:
print(f'len ner_tags_train : {len(ner_tags_train)}')
print(f'len ner_tags_test : {len(ner_tags_test)}')

len ner_tags_train : 8722
len ner_tags_test : 9280


In [None]:
train_data = {"tokens": train["tokens"], "ner_tags": ner_tags_train}
test_data = {"tokens": test["tokens"], "ner_tags": ner_tags_test}

In [None]:
# save_dict_in_pkl(train_data, "Train_modified.pkl")
# save_dict_in_pkl(test_data, "Test_modified.pkl")

### Create dataset in the DatasetDict format

In [10]:
train_data = load_pkl("Train_modified.pkl")
test_data = load_pkl("Test_modified.pkl") 
tag2int = load_pkl("tag2int.pkl")

In [11]:
df_train = pd.DataFrame(train_data)
df_train.head()

Unnamed: 0,tokens,ner_tags
0,"[ميشوند, .]","[22, 5]"
1,"[نتيجهي, بحث, بالا, اين, است, كه, فعلها, از, ن...","[12, 12, 0, 18, 22, 3, 12, 16, 12, 12, 16, 12,..."
2,"[صورتهاي, گسترشيافته, و, نيز, الگوي, برجستگي, ...","[12, 0, 3, 3, 12, 12, 20, 12, 16, 12, 12, 16, ..."
3,"[هر, كدام, از, فعلهاي, سهگانه, بالا, در, صورت,...","[20, 12, 16, 12, 0, 0, 16, 12, 12, 12, 0, 22, ..."
4,"[در, زير, به, الگوي, برجستگي, گروههاي, فعلي, م...","[16, 12, 16, 12, 12, 12, 0, 22, 3, 12, 18, 0, ..."


In [15]:
df_test = pd.DataFrame(test_data)
df_test.head()

Unnamed: 0,tokens,ner_tags
0,"[هجري, شمسي, فوت, كرد, و, در, قم, دفن, شد, .]","[0, 0, 12, 22, 3, 16, 12, 12, 22, 5]"
1,"[پروين, اعتصامي, ،, از, داستان, "", ويلانالدوله...","[12, 12, 5, 16, 12, 5, 12, 5, 12, 5, 18, 22, 1..."
2,"[محض, خالي, نبودن, عريضه, با, چايي, مقدار, معت...","[16, 0, 12, 12, 16, 12, 12, 0, 12, 0, 0, 22, 5..."
3,"[بعد, معلوم, ميشود, وقتيكه, ويلانالدوله, خواب,...","[3, 0, 22, 3, 12, 12, 22, 5, 12, 16, 12, 5, 12..."
4,"[ويلانالدوله, خدا, را, شكر, ميكند, كه, آخرش, پ...","[12, 12, 16, 12, 22, 3, 12, 12, 16, 12, 12, 3,..."


In [12]:
import datasets
from datasets import DatasetDict

dataset = DatasetDict()

In [13]:
dataset

DatasetDict({
    
})

https://discuss.huggingface.co/t/convert-a-list-of-dictionaries-to-hugging-face-dataset-object/14670

In [16]:
train = datasets.Dataset.from_pandas(pd.DataFrame(df_train))
test = datasets.Dataset.from_pandas(pd.DataFrame(df_test))

dataset['train'] = train
dataset['test'] = test

In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 8722
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 9280
    })
})

In [18]:
dataset['train'][0]

{'ner_tags': [22, 5], 'tokens': ['ميشوند', '.']}

In [None]:
dataset['train'].features

{'ner_tags': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [None]:
example = dataset["train"][1]
pd.DataFrame([example["tokens"], example["ner_tags"]],
['Tokens', 'Tags'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
Tokens,نتيجهي,بحث,بالا,اين,است,كه,فعلها,از,نظر,ساختمان,...,سه,گروه,ساده,،,پيشوندي,و,گروهي,تقسيم,ميشوند,.
Tags,12,12,0,18,22,3,12,16,12,12,...,12,12,0,5,0,3,0,12,22,5


## Define needed functions

In [20]:
label_names = [" "+key for key, value in tag2int.items()]
print(f'label names : \n {label_names}')

label names : 
 [' ADJ', ' ADV', ' AR', ' CON', ' DEFAULT', ' DELM', ' DET', ' IF', ' INT', ' MORP', ' MQUA', ' MS', ' N', ' NP', ' OH', ' OHH', ' P', ' PP', ' PRO', ' PS', ' QUA', ' SPEC', ' V']


### Tokenizing

In [29]:
#Get the values for input_ids, attention_mask, adjusted labels
def tokenize_adjust_labels(all_samples_per_split):
  tokenized_samples = tokenizer.batch_encode_plus(all_samples_per_split["tokens"], is_split_into_words=True, truncation=True)
  
  total_adjusted_labels = []
  
  for k in range(0, len(tokenized_samples["input_ids"])):
    prev_wid = -1
    word_ids_list = tokenized_samples.word_ids(batch_index=k)
    existing_label_ids = all_samples_per_split["ner_tags"][k]
    i = -1
    adjusted_label_ids = []
   
    for word_idx in word_ids_list:
      # Special tokens have a word id that is None. We set the label to -100 so they are automatically
      # ignored in the loss function.
      if(word_idx is None):
        adjusted_label_ids.append(-100)
      elif(word_idx!=prev_wid):
        i = i + 1
        adjusted_label_ids.append(existing_label_ids[i])
        prev_wid = word_idx
      else:
        label_name = label_names[existing_label_ids[i]]
        adjusted_label_ids.append(existing_label_ids[i])
        
    total_adjusted_labels.append(adjusted_label_ids)
  
  #add adjusted labels to the tokenized samples
  tokenized_samples["labels"] = total_adjusted_labels
  return tokenized_samples

### Compute metric

In [22]:
from datasets import load_metric

metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p

    #select predicted index with maximum logit for each token
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }



Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

##NER using ParsBERT v1.0 Model

###Tokenizing the whole dataset

In [69]:
from transformers import AutoTokenizer
model_name = "HooshvareLab/bert-base-parsbert-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Could not locate the tokenizer configuration file, will try to use the model config instead.
https://huggingface.co/HooshvareLab/bert-base-parsbert-uncased/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpobqdgzky


Downloading config.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

storing https://huggingface.co/HooshvareLab/bert-base-parsbert-uncased/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/d3b7c3283a6a4ad4471f59269c9de8adadfab0b05eebf49a64e046fca56cdab2.58cfea678e7bd2c1de3bfd4a5357101526b9fbc32a994b9456047e55b0afbebe
creating metadata file for /root/.cache/huggingface/transformers/d3b7c3283a6a4ad4471f59269c9de8adadfab0b05eebf49a64e046fca56cdab2.58cfea678e7bd2c1de3bfd4a5357101526b9fbc32a994b9456047e55b0afbebe
loading configuration file https://huggingface.co/HooshvareLab/bert-base-parsbert-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/d3b7c3283a6a4ad4471f59269c9de8adadfab0b05eebf49a64e046fca56cdab2.58cfea678e7bd2c1de3bfd4a5357101526b9fbc32a994b9456047e55b0afbebe
Model config BertConfig {
  "_name_or_path": "HooshvareLab/bert-base-parsbert-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "

Downloading vocab.txt:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

storing https://huggingface.co/HooshvareLab/bert-base-parsbert-uncased/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/b80b05f64dc19f3c880b7074ef09108d0bc244e4b6f50d6dba094da0f1c231fd.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474
creating metadata file for /root/.cache/huggingface/transformers/b80b05f64dc19f3c880b7074ef09108d0bc244e4b6f50d6dba094da0f1c231fd.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474
loading file https://huggingface.co/HooshvareLab/bert-base-parsbert-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/b80b05f64dc19f3c880b7074ef09108d0bc244e4b6f50d6dba094da0f1c231fd.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474
loading file https://huggingface.co/HooshvareLab/bert-base-parsbert-uncased/resolve/main/tokenizer.json from cache at None
loading file https://huggingface.co/HooshvareLab/bert-base-parsbert-uncased/resolve/main/added_tokens.json from cache at None
l

In [70]:
tokenizer

PreTrainedTokenizerFast(name_or_path='HooshvareLab/bert-base-parsbert-uncased', vocab_size=100000, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [71]:
def tokenize_function(examples):
    return tokenizer(examples["tokens"], padding="max_length", max_length=512, truncation=True, is_split_into_words=True)

In [None]:
tokenized_datasets_ = dataset.map(tokenize_function, batched=True)

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

In [None]:
tokenized_datasets_

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8722
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9280
    })
})

In [None]:
tokenized_datasets_['train'][0]['input_ids'][:20]

[2, 328, 1216, 16628, 15, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
tokenized_datasets_['train'][0]['ner_tags'][:20]

[22, 5]

In [None]:
len(tokenized_datasets_['train'][0]['input_ids']) == len(tokenized_datasets_['train'][0]['ner_tags'])


False

so we should use tokenize_adjust_labels

In [72]:
tokenized_dataset = dataset.map(tokenize_adjust_labels, batched=True, remove_columns=['tokens', 'ner_tags'])

  0%|          | 0/9 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/10 [00:00<?, ?ba/s]

In [73]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8722
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 9280
    })
})

In [74]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)

In [75]:
data_collator

DataCollatorForTokenClassification(tokenizer=PreTrainedTokenizerFast(name_or_path='HooshvareLab/bert-base-parsbert-uncased', vocab_size=100000, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='pt')

### Fine-Tuning Transformers

In [76]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForTokenClassification, AdamW

In [77]:
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_names))
model.to(device)

loading configuration file https://huggingface.co/HooshvareLab/bert-base-parsbert-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/d3b7c3283a6a4ad4471f59269c9de8adadfab0b05eebf49a64e046fca56cdab2.58cfea678e7bd2c1de3bfd4a5357101526b9fbc32a994b9456047e55b0afbebe
Model config BertConfig {
  "_name_or_path": "HooshvareLab/bert-base-parsbert-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19"

Downloading pytorch_model.bin:   0%|          | 0.00/624M [00:00<?, ?B/s]

storing https://huggingface.co/HooshvareLab/bert-base-parsbert-uncased/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/a0c74740a336a1e8d32c2a24ddd7f51256436208e6ee744101707612f4e754a1.f8e37b3c71897e638d5002618547e9f2cee56b935e0cace3465cf4a33ae9f446
creating metadata file for /root/.cache/huggingface/transformers/a0c74740a336a1e8d32c2a24ddd7f51256436208e6ee744101707612f4e754a1.f8e37b3c71897e638d5002618547e9f2cee56b935e0cace3465cf4a33ae9f446
loading weights file https://huggingface.co/HooshvareLab/bert-base-parsbert-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/a0c74740a336a1e8d32c2a24ddd7f51256436208e6ee744101707612f4e754a1.f8e37b3c71897e638d5002618547e9f2cee56b935e0cace3465cf4a33ae9f446
Some weights of the model checkpoint at HooshvareLab/bert-base-parsbert-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.pred

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(100000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [78]:
from transformers import TrainingArguments, Trainer

batch_size = 16
logging_steps = len(tokenized_dataset['train']) // batch_size
epochs = 3

training_args = TrainingArguments(
    output_dir= base_path + model_name + "-finetuned",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [79]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [80]:
trainer.train_dataset[0]

{'attention_mask': [1, 1, 1, 1, 1, 1],
 'input_ids': [2, 328, 1216, 16628, 15, 4],
 'labels': [-100, 22, 22, 22, 5, -100],
 'token_type_ids': [0, 0, 0, 0, 0, 0]}

In [81]:
trainer.train()

***** Running training *****
  Num examples = 8722
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1638


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3066,0.181822,0.899004,0.899888,0.899446,0.941282
2,0.1291,0.173521,0.903682,0.913036,0.908335,0.945706
3,0.0779,0.188907,0.908911,0.918063,0.913464,0.949149


Saving model checkpoint to /content/drive/MyDrive/HW6/HooshvareLab/bert-base-parsbert-uncased-finetuned/checkpoint-500
Configuration saved in /content/drive/MyDrive/HW6/HooshvareLab/bert-base-parsbert-uncased-finetuned/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/HW6/HooshvareLab/bert-base-parsbert-uncased-finetuned/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/HW6/HooshvareLab/bert-base-parsbert-uncased-finetuned/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/HW6/HooshvareLab/bert-base-parsbert-uncased-finetuned/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 9280
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/HW6/HooshvareLab/bert-base-parsbert-uncased-finetuned/checkpoint-1000
Configuration saved in /content/drive/MyDrive/HW6/HooshvareLab/bert-base-parsbert-uncased-finetuned/checkpoint-1000/config.json
Model weight

TrainOutput(global_step=1638, training_loss=0.17099082120608933, metrics={'train_runtime': 851.4466, 'train_samples_per_second': 30.731, 'train_steps_per_second': 1.924, 'total_flos': 1530063532190172.0, 'train_loss': 0.17099082120608933, 'epoch': 3.0})

### Evaluation on the Test set

In [82]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 9280
  Batch size = 16


{'epoch': 3.0,
 'eval_accuracy': 0.9491489871748772,
 'eval_f1': 0.9134640733993763,
 'eval_loss': 0.18890716135501862,
 'eval_precision': 0.9089113029011127,
 'eval_recall': 0.9180626835032105,
 'eval_runtime': 70.9438,
 'eval_samples_per_second': 130.808,
 'eval_steps_per_second': 8.175}

In [83]:
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)
# Remove ignored index (special tokens)
true_predictions = [
    [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
results = metric.compute(predictions=true_predictions, references=true_labels)
results

***** Running Prediction *****
  Num examples = 9280
  Batch size = 16


{'ADJ': {'f1': 0.8297724066425305,
  'number': 28953,
  'precision': 0.81690093708166,
  'recall': 0.8430559872897454},
 'ADV': {'f1': 0.7610559946191359,
  'number': 3047,
  'precision': 0.7803448275862069,
  'recall': 0.7426977354775188},
 'AR': {'f1': 0.10526315789473684,
  'number': 21,
  'precision': 0.11764705882352941,
  'recall': 0.09523809523809523},
 'CON': {'f1': 0.9637146675907134,
  'number': 20142,
  'precision': 0.9609081934846989,
  'recall': 0.9665375831595671},
 'DEFAULT': {'f1': 0.0, 'number': 21, 'precision': 0.0, 'recall': 0.0},
 'DELM': {'f1': 0.9791907302495279,
  'number': 27109,
  'precision': 0.973390683093971,
  'recall': 0.9850603120734811},
 'DET': {'f1': 0.9694686241392749,
  'number': 3838,
  'precision': 0.9668307851775071,
  'recall': 0.9721208963001563},
 'IF': {'f1': 0.9348441926345609,
  'number': 177,
  'precision': 0.9375,
  'recall': 0.9322033898305084},
 'INT': {'f1': 0.0, 'number': 5, 'precision': 0.0, 'recall': 0.0},
 'MORP': {'f1': 0.365671641

### Test the pipeline with a sample news

In [84]:
from transformers import pipeline
model_id = "/content/drive/MyDrive/HW6/HooshvareLab/bert-base-parsbert-uncased-finetuned/checkpoint-1500"
token_classifier = pipeline("ner", model=model_id, tokenizer=model_id)
example = "دانشگاه تهران در میدان انقلاب واقع شده است."

loading configuration file /content/drive/MyDrive/HW6/HooshvareLab/bert-base-parsbert-uncased-finetuned/checkpoint-1500/config.json
Model config BertConfig {
  "_name_or_path": "/content/drive/MyDrive/HW6/HooshvareLab/bert-base-parsbert-uncased-finetuned/checkpoint-1500",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22"
  },
  "initializer_ra

In [85]:
ner_results = token_classifier(example)
print(ner_results)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'LABEL_12', 'score': 0.99930835, 'index': 1, 'word': 'دانشگاه', 'start': 0, 'end': 7}, {'entity': 'LABEL_12', 'score': 0.99840504, 'index': 2, 'word': 'تهران', 'start': 8, 'end': 13}, {'entity': 'LABEL_16', 'score': 0.99920374, 'index': 3, 'word': 'در', 'start': 14, 'end': 16}, {'entity': 'LABEL_12', 'score': 0.9996555, 'index': 4, 'word': 'میدان', 'start': 17, 'end': 22}, {'entity': 'LABEL_12', 'score': 0.99726856, 'index': 5, 'word': 'انقلاب', 'start': 23, 'end': 29}, {'entity': 'LABEL_0', 'score': 0.99427754, 'index': 6, 'word': 'واقع', 'start': 30, 'end': 34}, {'entity': 'LABEL_0', 'score': 0.9180323, 'index': 7, 'word': 'شده', 'start': 35, 'end': 38}, {'entity': 'LABEL_22', 'score': 0.9991536, 'index': 8, 'word': 'است', 'start': 39, 'end': 42}, {'entity': 'LABEL_5', 'score': 0.99943537, 'index': 9, 'word': '.', 'start': 42, 'end': 43}]


##NER using Multilingual Model

###Tokenizing the whole dataset

In [30]:
from transformers import AutoTokenizer
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [31]:
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-multilingual-cased', vocab_size=119547, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [32]:
tokenized_dataset = dataset.map(tokenize_adjust_labels, batched=True, remove_columns=['tokens', 'ner_tags'])

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

In [33]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8722
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 9280
    })
})

In [34]:
tokenized_dataset['train']['labels'][0]

[-100, 22, 22, 22, 5, -100]

In [35]:
len(tokenized_dataset['train']['input_ids'][1])

37

In [36]:
len(tokenized_dataset['train']['labels'][1])

37

In [37]:
tokenized_dataset['train'][:2]

{'attention_mask': [[1, 1, 1, 1, 1, 1],
  [1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1]],
 'input_ids': [[101, 788, 89218, 33571, 119, 102],
  [101,
   789,
   26614,
   28437,
   10461,
   78812,
   40920,
   763,
   11294,
   10566,
   786,
   10388,
   95458,
   10742,
   10383,
   21312,
   67653,
   10327,
   23823,
   22964,
   775,
   49447,
   752,
   817,
   89218,
   33571,
   10461,
   791,
   22964,
   10461,
   766,
   104972,
   788,
   89218,
   33571,
   119,
   102]],
 'labels': [[-100, 22, 22, 22, 5, -100],
  [-100,
   12,
   12,
   12,
   12,
   12,
   0,
   18,
   18,
   22,
   3,
   3,
   12,
   12,
   16,
   12,
   12,
   16,
   12,
   12,
   0,
   0,
   5,
   0,
   0,
   0,
   0,
   3,
   0,
   0,
   12,
   12,
   22,
   22,
   22,
   5,
   -100]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0],
  [0

In [38]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)

In [39]:
data_collator

DataCollatorForTokenClassification(tokenizer=PreTrainedTokenizerFast(name_or_path='bert-base-multilingual-cased', vocab_size=119547, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='pt')

### Fine-Tuning Transformers

In [40]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForTokenClassification, AdamW

In [41]:
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_names))
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [45]:
dataset["train"][1]

{'ner_tags': [12,
  12,
  0,
  18,
  22,
  3,
  12,
  16,
  12,
  12,
  16,
  12,
  12,
  0,
  5,
  0,
  3,
  0,
  12,
  22,
  5],
 'tokens': ['نتيجهي',
  'بحث',
  'بالا',
  'اين',
  'است',
  'كه',
  'فعلها',
  'از',
  'نظر',
  'ساختمان',
  'به',
  'سه',
  'گروه',
  'ساده',
  '،',
  'پيشوندي',
  'و',
  'گروهي',
  'تقسيم',
  'ميشوند',
  '.']}

In [42]:
example = dataset["train"][1]
labels = [label_names[i] for i in example[f"ner_tags"]]
metric.compute(predictions=[labels], references=[labels])

{'ADJ': {'f1': 1.0, 'number': 4, 'precision': 1.0, 'recall': 1.0},
 'CON': {'f1': 1.0, 'number': 2, 'precision': 1.0, 'recall': 1.0},
 'DELM': {'f1': 1.0, 'number': 2, 'precision': 1.0, 'recall': 1.0},
 'N': {'f1': 1.0, 'number': 5, 'precision': 1.0, 'recall': 1.0},
 'P': {'f1': 1.0, 'number': 2, 'precision': 1.0, 'recall': 1.0},
 'PRO': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'V': {'f1': 1.0, 'number': 2, 'precision': 1.0, 'recall': 1.0},
 'overall_accuracy': 1.0,
 'overall_f1': 1.0,
 'overall_precision': 1.0,
 'overall_recall': 1.0}

In [46]:
from transformers import TrainingArguments, Trainer

batch_size = 16
logging_steps = len(tokenized_dataset['train']) // batch_size
epochs = 2

training_args = TrainingArguments(
    output_dir= base_path + model_name + "-finetuned",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps)

In [47]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [48]:
trainer.train_dataset[0]

{'attention_mask': [1, 1, 1, 1, 1, 1],
 'input_ids': [101, 788, 89218, 33571, 119, 102],
 'labels': [-100, 22, 22, 22, 5, -100],
 'token_type_ids': [0, 0, 0, 0, 0, 0]}

In [49]:
trainer.train()

***** Running training *****
  Num examples = 8722
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1092


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2716,0.151097,0.92469,0.933608,0.929128,0.95206
2,0.1016,0.145442,0.932464,0.941523,0.936972,0.956577


Saving model checkpoint to /content/drive/MyDrive/HW6/bert-base-multilingual-cased-finetuned/checkpoint-500
Configuration saved in /content/drive/MyDrive/HW6/bert-base-multilingual-cased-finetuned/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/HW6/bert-base-multilingual-cased-finetuned/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/HW6/bert-base-multilingual-cased-finetuned/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/HW6/bert-base-multilingual-cased-finetuned/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 9280
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/HW6/bert-base-multilingual-cased-finetuned/checkpoint-1000
Configuration saved in /content/drive/MyDrive/HW6/bert-base-multilingual-cased-finetuned/checkpoint-1000/config.json
Model weights saved in /content/drive/MyDrive/HW6/bert-base-multilingual-cased-finetuned/

TrainOutput(global_step=1092, training_loss=0.186533564722145, metrics={'train_runtime': 696.2907, 'train_samples_per_second': 25.053, 'train_steps_per_second': 1.568, 'total_flos': 1360046491078656.0, 'train_loss': 0.186533564722145, 'epoch': 2.0})

### Evaluation on the Test set

In [50]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 9280
  Batch size = 16


{'epoch': 2.0,
 'eval_accuracy': 0.9565766304528389,
 'eval_f1': 0.9369716059852388,
 'eval_loss': 0.1454424113035202,
 'eval_precision': 0.9324639956123346,
 'eval_recall': 0.9415230083975041,
 'eval_runtime': 106.8799,
 'eval_samples_per_second': 86.826,
 'eval_steps_per_second': 5.427}

In [51]:
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)
# Remove ignored index (special tokens)
true_predictions = [
    [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
results = metric.compute(predictions=true_predictions, references=true_labels)
results

***** Running Prediction *****
  Num examples = 9280
  Batch size = 16


{'ADJ': {'f1': 0.8673174381823747,
  'number': 28953,
  'precision': 0.8542999765486281,
  'recall': 0.8807377473836908},
 'ADV': {'f1': 0.7940643033800494,
  'number': 3047,
  'precision': 0.7978793903247183,
  'recall': 0.7902855267476206},
 'AR': {'f1': 0.1714285714285714,
  'number': 21,
  'precision': 0.12244897959183673,
  'recall': 0.2857142857142857},
 'CON': {'f1': 0.983554586883297,
  'number': 20142,
  'precision': 0.9813185726994168,
  'recall': 0.9858008142190448},
 'DEFAULT': {'f1': 0.0, 'number': 21, 'precision': 0.0, 'recall': 0.0},
 'DELM': {'f1': 0.9948925225059709,
  'number': 27110,
  'precision': 0.9910688140556368,
  'recall': 0.9987458502397639},
 'DET': {'f1': 0.9776791541574206,
  'number': 3838,
  'precision': 0.9795971749934607,
  'recall': 0.9757686294945284},
 'IF': {'f1': 0.9162011173184358,
  'number': 177,
  'precision': 0.9060773480662984,
  'recall': 0.9265536723163842},
 'INT': {'f1': 0.0, 'number': 5, 'precision': 0.0, 'recall': 0.0},
 'MORP': {'f1':

In [55]:
predictions.shape

(9280, 505)

In [54]:
labels.shape

(9280, 505)

### Test the pipeline with a sample news

In [66]:
from transformers import pipeline
model_id = "/content/drive/MyDrive/HW6/bert-base-multilingual-cased-finetuned/checkpoint-1000"
token_classifier = pipeline("ner", model=model_id, tokenizer=model_id)
example = "دانشگاه تهران در میدان انقلاب واقع شده است."

loading configuration file /content/drive/MyDrive/HW6/bert-base-multilingual-cased-finetuned/checkpoint-1000/config.json
Model config BertConfig {
  "_name_or_path": "/content/drive/MyDrive/HW6/bert-base-multilingual-cased-finetuned/checkpoint-1000",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22"
  },
  "initiali

In [67]:
ner_results = token_classifier(example)
print(ner_results)

[{'entity': 'LABEL_12', 'score': 0.9989335, 'index': 1, 'word': 'دانشگاه', 'start': 0, 'end': 7}, {'entity': 'LABEL_12', 'score': 0.99700147, 'index': 2, 'word': 'تهران', 'start': 8, 'end': 13}, {'entity': 'LABEL_16', 'score': 0.99870527, 'index': 3, 'word': 'در', 'start': 14, 'end': 16}, {'entity': 'LABEL_12', 'score': 0.9989802, 'index': 4, 'word': 'میدان', 'start': 17, 'end': 22}, {'entity': 'LABEL_12', 'score': 0.99521565, 'index': 5, 'word': 'انقلاب', 'start': 23, 'end': 29}, {'entity': 'LABEL_0', 'score': 0.9925498, 'index': 6, 'word': 'واقع', 'start': 30, 'end': 34}, {'entity': 'LABEL_0', 'score': 0.8014772, 'index': 7, 'word': 'شده', 'start': 35, 'end': 38}, {'entity': 'LABEL_22', 'score': 0.99891186, 'index': 8, 'word': 'است', 'start': 39, 'end': 42}, {'entity': 'LABEL_5', 'score': 0.9985612, 'index': 9, 'word': '.', 'start': 42, 'end': 43}]
