# **Preparing**

**Import Necessary Libraries**

In [None]:
from google.colab import drive
import pandas as pd
import json
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow.keras.utils import to_categorical
import string
import matplotlib.pyplot as plt
from collections import Counter
import os
import sys
import logging
from pathlib import Path
import random
import gensim
import  codecs
from tqdm import tqdm
from fastprogress.fastprogress import master_bar, progress_bar
plt.style.use("ggplot")

**Configurations To Use In The Script**

In [None]:
# Character substitutions applied to every raw line before parsing: each key is
# replaced by its value with str.replace, in the order listed here.
normalizer = {
 "؛" : ";",
 "«" : "<<",
 "؟" : "?",
 # NOTE(review): "²" (and "½" below) are mapped to "." — confirm this is intended.
 "²" : ".",
 "،" : ",",
 "»" : ">>",
 "×" : "*",
 "ة" : "ه",
 "–" : "-",
 "ؤ" : "و",
 "½" : ".",
 "ئ" : "ی",
 "…" : ".",
}

In [None]:
# Paths and limits used throughout the notebook.
config = {
    # Drive folder holding the training data; the fine-tuned model is also saved under it.
    "parent_dir" : "/content/gdrive/MyDrive/NLP-Spring 99-00/HW/HW2/",
    # NOTE(review): not referenced by any visible cell — confirm whether it is still needed.
    "model_dir" : "/content/gdrive/MyDrive/NLP-Spring 99-00/HW/HW2/Ph1/Model",
    # File name of the training data, appended to "parent_dir" when the file is read.
    "train_data_name" : "train.data",
    # NOTE(review): not referenced by any visible cell; MAX_LEN (also 128) is defined separately.
    "length_threshold" : 128
}

**Mount Drive**

In [None]:
# Mount Google Drive at /content/gdrive, the root of the paths stored in `config`.
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


**Read Data**

In [None]:
# Load every line of the training file whose location is described by `config`.
train_path = config["parent_dir"] + config["train_data_name"]
with open(train_path, mode='r', encoding="utf-8") as f:
    raw_data = f.readlines()


def _apply_normalizer(text):
    """Return `text` after applying each `normalizer` substitution in dict order."""
    for source_char, replacement in normalizer.items():
        text = text.replace(source_char, replacement)
    return text


# Apply the character corrections to each line, then drop the raw copy since
# only the normalized text is used from here on.
normalized_data = [_apply_normalizer(raw_line) for raw_line in raw_data]
del raw_data

# **Preprocess**

**Break Text Into Sentences And Keep Labels**

In [None]:
# Group consecutive "word ... label" lines into sentences; a line that is empty
# or starts with a newline closes the sentence collected so far.
split_labeled_text = []
sentence = []
for line in normalized_data:
    is_separator = (not line) or line.startswith("\n")
    if not is_separator:
        # First space-separated field is the word, the last one is its label.
        fields = line.split(' ')
        sentence.append([fields[0], fields[-1].rstrip("\n")])
    elif sentence:
        split_labeled_text.append(sentence)
        sentence = []

# Flush the final sentence when the data does not end with a separator line.
if sentence:
    split_labeled_text.append(sentence)
    sentence = []

#split_labeled_text = [[[word , label] , [word , label] , [word , label] , ... , [word , label]] , ....]

**Separate Each Sentence Into Parallel Lists Of Words And Labels**

In [None]:
# Split every [word, label] pair so that `sentences[i][j]` is a word and
# `labels[i][j]` is the label of that same word.
sentences = [[pair[0] for pair in tagged_sentence] for tagged_sentence in split_labeled_text]
labels = [[pair[1] for pair in tagged_sentence] for tagged_sentence in split_labeled_text]

**Extract All Words In The Sentences**

In [None]:
char_list = set()

# Collect the distinct tokens across all sentences and show how many there are.
word_list = list({token for sent in sentences for token in sent})
len(word_list)

57296

**Create Dictionary To Save Each Label With Corresponding Id**

In [None]:
# Forward map (label name -> id) and its inverse (id -> label name).
label_2_idx = {"gen_negative": 0, "gen_positive": 1}
idx_2_label = dict(zip(label_2_idx.values(), label_2_idx))

In [None]:
# Replace every label name by its integer id, keeping the per-sentence nesting.
labels_to_idx = [
    [label_2_idx[tag] for tag in sentence_tags]
    for sentence_tags in labels
]
labels = labels_to_idx

**Shuffle Sentences And Corresponding Labels To Reduce Dependencies Between Successive Sentences**

In [None]:
# Both lists are shuffled in one call so words and labels stay paired; the fixed
# random_state makes the permutation repeatable.
sentences, labels = shuffle(sentences, labels, random_state=42)

**Split Data To Train, Test And Validation Sets**

In [None]:
# Hold out 20% of the data, then halve the hold-out into validation and test
# sets (80/10/10 overall). A fixed random_state makes the split reproducible,
# and separate hold-out names keep this cell idempotent when re-run.
SPLIT_SEED = 42
train_sentences, holdout_sentences, train_labels, holdout_labels = train_test_split(
    sentences, labels, test_size=0.2, random_state=SPLIT_SEED)
valid_sentences, test_sentences, valid_labels, test_labels = train_test_split(
    holdout_sentences, holdout_labels, test_size=0.5, random_state=SPLIT_SEED)

# **Model**

In [None]:
!pip install transformers



**Import Libraries**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import transformers
import tensorflow as tf
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForTokenClassification, BertTokenizer, BertConfig, BertModel

**Define Some Important Variables**

In [None]:
# Fixed length every encoded example is padded/truncated to by CustomDataset.
MAX_LEN = 128
# Per-device batch sizes passed to TrainingArguments.
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
# Learning rate intended for fine-tuning.
LEARNING_RATE = 2e-05
# Tokenizer loaded from the same checkpoint name that is fine-tuned later in the notebook.
tokenizer = BertTokenizer.from_pretrained('HooshvareLab/bert-base-parsbert-uncased')

**Custom Class For Feeding Data Into The Pars Bert Model**

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that encodes pre-split sentences for token classification.

    Args:
        tokenizer: object exposing ``encode_plus`` (a ``BertTokenizer`` in this notebook).
        sentences: list of sentences, each a list of word strings.
        labels: list of per-word integer label lists, parallel to ``sentences``.
        max_len: fixed length every encoded example is padded/truncated to.
    """

    def __init__(self, tokenizer, sentences, labels, max_len):
        self.len = len(sentences)
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return ``input_ids``/``attention_mask`` tensors and a ``labels`` list of length ``max_len``."""
        sentence = self.sentences[index]
        inputs = self.tokenizer.encode_plus(
            sentence,
            None,
            truncation=True,
            is_split_into_words=True,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        try:
            # Build a fresh list: extending the stored list in place would grow the
            # caller's label lists by `max_len` entries on every access.
            label = list(self.labels[index][:self.max_len])
            # -100 marks padded positions; compute_metrics filters on this value.
            label += [-100] * (self.max_len - len(label))
        except Exception:
            # Report which example failed before propagating the error.
            print(index)
            print(sentence)
            raise

        # NOTE(review): labels are per word and occupy positions 0..n-1, while
        # `input_ids` also contains special tokens and possibly several sub-word
        # pieces per word — confirm whether labels should be aligned to tokens.
        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            'labels': label
        }

    def __len__(self):
        return self.len

**Create Train, Test And Validation Sets**

In [None]:
# Wrap each split (train, validation, test) in a CustomDataset that shares the
# tokenizer and the fixed sequence length.
training_set, validation_set, testing_set = (
    CustomDataset(tokenizer, split_sentences, split_labels, MAX_LEN)
    for split_sentences, split_labels in (
        (train_sentences, train_labels),
        (valid_sentences, valid_labels),
        (test_sentences, test_labels),
    )
)

In [None]:
!pip install datasets
!pip install seqeval



In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Size the classification head from the label mapping instead of a hard-coded 2,
# and store the label names in the model config so saved checkpoints carry them.
model = AutoModelForTokenClassification.from_pretrained(
    'HooshvareLab/bert-base-parsbert-uncased',
    num_labels=len(label_2_idx),
    id2label=idx_2_label,
    label2id=label_2_idx,
)

Some weights of the model checkpoint at HooshvareLab/bert-base-parsbert-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were no

**Train Arguments**

In [None]:
# Trainer hyper-parameters. The learning rate and batch sizes come from the
# constants defined earlier so there is a single place to tune them.
args = TrainingArguments(
    "test-ezafe",                      # output directory for checkpoints
    evaluation_strategy = "epoch",     # evaluate on the validation set after every epoch
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=1,                # keep at most one checkpoint on disk
)

In [None]:
from datasets import load_metric
# seqeval metric object; compute_metrics calls metric.compute on label-name sequences.
# NOTE(review): `load_metric` may be deprecated or removed in newer `datasets`
# releases — confirm against the installed version.
metric = load_metric("seqeval")

In [None]:
# Sanity-check the metric on a trivially perfect prediction.
# A dedicated name is used so the global `labels` (per-sentence label ids built
# during preprocessing) is not overwritten by the list of label names.
label_names = list(label_2_idx.keys())
metric.compute(predictions=[label_names], references=[label_names])



{'en_negative': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'en_positive': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'overall_accuracy': 1.0,
 'overall_f1': 1.0,
 'overall_precision': 1.0,
 'overall_recall': 1.0}

**A Function For Computing Evaluation Metrics**

In [None]:
import numpy as np

def compute_metrics(p):
    """Turn raw Trainer predictions into precision/recall/F1/accuracy.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` holds per-position class
            scores with the class axis at index 2; ``labels`` holds the gold label
            ids, where ``-100`` marks positions to ignore.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy", taken from
        the ``overall_*`` entries returned by ``metric.compute``.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Keep only the positions whose gold id is not the ignore value.
    true_predictions = [
        [idx_2_label[pred_id] for (pred_id, gold_id) in zip(prediction, label) if gold_id != -100]
        for prediction, label in zip(predictions, labels)
    ]
    # References must come from the gold ids. Building them from the predicted ids
    # makes references identical to predictions and every score a meaningless 1.0.
    true_labels = [
        [idx_2_label[gold_id] for (pred_id, gold_id) in zip(prediction, label) if gold_id != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer to assemble dataset items into batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

**Train The Model**

In [None]:
# Bundle the model, hyper-parameters, data splits, collator and metric function.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=training_set,
    eval_dataset=validation_set,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; validation metrics come from compute_metrics once per epoch.
trainer.train()

  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0828,0.062748,1.0,1.0,1.0,1.0
2,0.0538,0.055342,1.0,1.0,1.0,1.0


  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}
  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}
  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}
  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}
  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}
  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}
  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}


TrainOutput(global_step=3506, training_loss=0.09109377295747867, metrics={'train_runtime': 3028.6235, 'train_samples_per_second': 1.158, 'total_flos': 22180537052160.0, 'epoch': 2.0, 'init_mem_cpu_alloc_delta': 2164396032, 'init_mem_gpu_alloc_delta': 649538048, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 237015040, 'train_mem_gpu_alloc_delta': 2012620800, 'train_mem_cpu_peaked_delta': 94105600, 'train_mem_gpu_peaked_delta': 3586833408})

**Save The Model**

In [None]:
# The model is written under config['parent_dir']; config['model_dir'] is not used here.
trainer.save_model(config['parent_dir'] + 'our-pars-bert-model')


**Test The Model**

In [None]:
# Run the fine-tuned model on the held-out test split; the following cells read
# `result.predictions` and `result.label_ids`.
result = trainer.predict(testing_set)

  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}




In [None]:
# Pick the highest-scoring class along the last axis of the raw predictions.
predict_lables = np.argmax(result.predictions,-1)

In [None]:
# Reference label ids returned alongside the predictions
# (presumably padded with -100 by CustomDataset — confirm).
true_lables = result.label_ids

**Sample Test**

In [None]:
# Print one test sentence word by word next to its predicted and reference label ids.
sample_idx = 999
sample_rows = zip(
    test_sentences[sample_idx],
    predict_lables[sample_idx],
    true_lables[sample_idx],
)
for word, predicted_id, reference_id in sample_rows:
    print(word, predicted_id, reference_id)

من 0 0
احکام 1 1
خداوند 0 0
را 0 0
به 0 0
گونه‌ای 0 0
سامان‌یافته 0 0
به 0 0
گوش 1 1
شما 0 0
رساندم 0 0
و 0 0
آنچه 0 0
را 0 0
که 0 0
به 0 0
سود 1 1
شماست 0 0
پوشیده 0 0
نداشتم 0 0
و 0 0
همه 0 0
چیز 0 0
را 0 0
از 0 0
آغاز 0 0
تا 0 0
پایان 0 0
باز 0 0
نمودم 0 0
. 0 0
