# **Preparing**

**Import Necessary Libraries**

In [1]:
from google.colab import drive
import pandas as pd
import json
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow.keras.utils import to_categorical
import string
import matplotlib.pyplot as plt
from collections import Counter
import os
import sys
import logging
from pathlib import Path
import random
import gensim
import  codecs
from tqdm import tqdm
from fastprogress.fastprogress import master_bar, progress_bar
# Select matplotlib's "ggplot" style sheet for any figures drawn later in the notebook.
plt.style.use("ggplot")

**Configurations To Use In The Script**

In [2]:
# Character-level substitutions applied to every raw line before it is parsed.
# Each key is a single character; each value is the text that replaces it.
normalizer = {
    "؛": ";",   # Arabic semicolon
    "«": "<<",  # opening guillemet
    "؟": "?",   # Arabic question mark
    "²": ".",   # superscript two
    "،": ",",   # Arabic comma
    "»": ">>",  # closing guillemet
    "×": "*",   # multiplication sign
    "ة": "ه",   # teh marbuta -> heh
    "–": "-",   # dash -> ASCII hyphen
    "ؤ": "و",   # waw with hamza -> plain waw
    "½": ".",   # one-half fraction
    "ئ": "ی",   # yeh with hamza -> plain yeh
    "…": ".",   # ellipsis
}

In [3]:
# Locations used by the rest of the notebook, all relative to the mounted Drive.
config = dict(
    parent_dir="/content/gdrive/MyDrive/NLP-Spring 99-00/HW/HW2/",  # base folder (trailing slash required: paths are built by concatenation)
    our_model="our-pars-bert-model",                                # saved model / tokenizer directory name
    test_data_name="test.data",                                     # test-set file name
)

**Mount Drive**

In [4]:
# Mount Google Drive into the Colab filesystem at /content/gdrive; every path in
# `config` is rooted under this mount point.
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


**Read Data**

In [5]:
# Read the raw test file from the location described by the config dictionary.
test_data_path = config["parent_dir"] + config["test_data_name"]
with open(test_data_path, 'r', encoding="utf-8") as data_file:
    raw_data = data_file.readlines()

# Apply the character corrections from `normalizer` to every line, one
# replacement after another, keeping the lines in their original order.
normalized_data = []
for raw_line in raw_data:
    cleaned_line = raw_line
    for source_char, replacement in normalizer.items():
        cleaned_line = cleaned_line.replace(source_char, replacement)
    normalized_data.append(cleaned_line)

# The un-normalized lines are not needed any more.
del raw_data

# **Preprocess**

**Break Text Into Sentences And Keep Labels**

In [6]:
# Group consecutive token lines into sentences; a blank line closes the
# sentence collected so far.
split_labeled_text = []
sentence = []
for line in normalized_data:
    is_separator = (not line) or line[0] == "\n"
    if not is_separator:
        # First space-separated field is the word, last one is the label.
        fields = line.split(' ')
        sentence.append([fields[0], fields[-1].rstrip("\n")])
    elif sentence:
        split_labeled_text.append(sentence)
        sentence = []

# Flush the final sentence when the file does not end with a blank line.
if sentence:
    split_labeled_text.append(sentence)
    sentence = []

# Resulting shape: split_labeled_text = [[[word, label], [word, label], ...], ...]

**Separate Each Sentence Into Parallel Lists Of Words And Labels**

In [7]:
# Unzip every [word, label] pair into two parallel structures:
# sentences[i][j] is the j-th word of sentence i and labels[i][j] is its label.
sentences = [[word for word, _ in pairs] for pairs in split_labeled_text]
labels = [[tag for _, tag in pairs] for pairs in split_labeled_text]

**Extract All Words In The Sentences**

In [8]:
# NOTE(review): char_list is created but never filled in the visible notebook.
char_list = set()

# Collect the distinct tokens that occur anywhere in the test sentences.
word_list = list({token for sent in sentences for token in sent})
len(word_list)

16404

**Create Dictionary To Save Each Label With Corresponding Id**

In [9]:
# Forward map (label name -> integer id); ids follow the order of the names.
label_2_idx = {name: idx for idx, name in enumerate(("gen_negative", "gen_positive"))}
# Reverse map (integer id -> label name), derived from the forward map.
idx_2_label = dict((idx, name) for name, idx in label_2_idx.items())

In [10]:
# Translate every label name into its integer id, sentence by sentence.
labels_to_idx = [
    [label_2_idx[tag] for tag in sentence_tags]
    for sentence_tags in labels
]
# From here on `labels` holds integer ids instead of label names, so this cell
# cannot be re-run without first re-running the cell that builds `labels`.
labels = labels_to_idx

# **Model**

In [11]:
!pip install transformers



**Import Libraries**

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import transformers
import tensorflow as tf
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForTokenClassification, BertTokenizer, BertConfig, BertModel

**Define Some Important Variables**

In [13]:
# Length passed to CustomDataset: every encoded sentence and its label list is
# padded/truncated to this many positions.
MAX_LEN = 128
# NOTE(review): the two batch sizes and the learning rate are not referenced
# anywhere else in the visible notebook — presumably carried over from the
# training notebook; confirm before relying on them.
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
LEARNING_RATE = 2e-05
# Load the tokenizer from the same Drive directory the model is later loaded from.
tokenizer = BertTokenizer.from_pretrained(config['parent_dir'] + config['our_model'])

**Custom Class For Feeding Data Into The Pars Bert Model**

In [14]:
class CustomDataset(Dataset):
    """Dataset that encodes pre-split sentences for the token-classification model.

    Each item is a dict with:
      * ``input_ids`` / ``attention_mask``: ``torch.long`` tensors built from the
        tokenizer output (the tokenizer is asked to pad/truncate to ``max_len``).
      * ``labels``: a plain list of exactly ``max_len`` ints — the sentence's label
        ids followed by ``-100`` filler (the value ``compute_metrics`` in this
        notebook filters out).

    NOTE(review): labels are padded by word position and are not re-aligned to the
    sub-word / special tokens produced by the tokenizer — confirm this is the same
    convention the model was trained with.
    """

    def __init__(self, tokenizer, sentences, labels, max_len):
        """Store the inputs; nothing is tokenized until an item is requested.

        Args:
            tokenizer: object exposing ``encode_plus`` (a ``BertTokenizer`` here).
            sentences: sequence of sentences, each a list of word strings.
            labels: sequence of per-sentence label-id lists, parallel to ``sentences``.
            max_len: number of positions every encoded item is padded/truncated to.
        """
        self.len = len(sentences)
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Encode sentence ``index`` and return its tensors plus padded labels.

        The stored label lists are never modified, so requesting the same item
        repeatedly always yields the same result.

        Raises:
            IndexError: if ``index`` has no entry in ``sentences`` or ``labels``.
        """
        sentence = self.sentences[index]
        inputs = self.tokenizer.encode_plus(
            sentence,
            None,
            truncation=True,
            is_split_into_words=True,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding="max_length"` is the supported spelling of the deprecated
            # `pad_to_max_length=True` flag.
            padding="max_length",
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        try:
            sentence_labels = self.labels[index]
        except IndexError as exc:
            # Keep the exception type but say which sentence is missing labels.
            raise IndexError(
                f"no labels found for sentence index {index}: {sentence}"
            ) from exc

        # Pad/truncate a *copy*: extending the stored list in place would make
        # the caller's label data grow by max_len entries on every access.
        label = list(sentence_labels[:self.max_len])
        label.extend([-100] * (self.max_len - len(label)))

        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            'labels': label
        }

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return self.len

**Create Test Set**

In [15]:
# Wrap the test sentences and their integer label ids in the dataset that is
# later handed to trainer.predict.
testing_set = CustomDataset(tokenizer, sentences, labels , MAX_LEN)

In [16]:
!pip install datasets
!pip install seqeval



**Test The Model**

In [17]:
# NOTE(review): `load_metric` has been deprecated in favour of the separate
# `evaluate` package — confirm the installed `datasets` version still provides it.
from datasets import load_metric
# seqeval-backed metric object; compute_metrics calls `metric.compute` on it.
metric = load_metric("seqeval")

In [18]:
# Smoke-test the metric by scoring the label names against themselves; every
# overall score is expected to be 1.0.
# The names live in `label_names` rather than `labels`, so the per-sentence label
# ids passed to CustomDataset are not overwritten and the dataset cell stays
# safe to re-run after this one.
# The recorded output lists the types as 'en_negative' / 'en_positive',
# presumably because seqeval treats the first character of a tag as a chunk prefix.
label_names = list(label_2_idx.keys())
metric.compute(predictions=[label_names], references=[label_names])



{'en_negative': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'en_positive': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'overall_accuracy': 1.0,
 'overall_f1': 1.0,
 'overall_precision': 1.0,
 'overall_recall': 1.0}

In [19]:
import numpy as np

def compute_metrics(p):
    """Turn raw model scores and reference ids into overall seqeval scores.

    Args:
        p: pair ``(predictions, labels)``; ``predictions`` is reduced with
            ``argmax`` over axis 2 and ``labels`` holds the reference ids, with
            ``-100`` marking positions to skip.

    Returns:
        dict with the ``precision``, ``recall``, ``f1`` and ``accuracy`` values
        taken from the metric's ``overall_*`` entries.
    """
    scores, reference_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Drop ignored positions (reference id -100) and map ids to label names.
    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        kept_pairs = [
            (predicted, reference)
            for predicted, reference in zip(predicted_row, reference_row)
            if reference != -100
        ]
        true_predictions.append([idx_2_label[predicted] for predicted, _ in kept_pairs])
        true_labels.append([idx_2_label[reference] for _, reference in kept_pairs])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [20]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
 
# Drive directory holding the saved model (the tokenizer is loaded from the same place).
model_dir = config['parent_dir'] + config['our_model']
# Size the classification head from the label mapping instead of repeating the
# literal 2, so the two cannot drift apart.
model = AutoModelForTokenClassification.from_pretrained(model_dir, num_labels=len(label_2_idx))
# No TrainingArguments are passed, so the Trainer runs with its defaults; in the
# visible notebook it is only used for prediction.
trainer = Trainer(model, compute_metrics=compute_metrics, tokenizer=tokenizer)

In [21]:
# trainer.evaluate(testing_set)

In [22]:
# Run inference over the whole test set; the following cells read
# `result.predictions` and `result.label_ids`.
result = trainer.predict(testing_set)





In [23]:
# Reference label ids reported by trainer.predict; indexed per sentence in the results cell.
true_lables = result.label_ids

In [24]:
# Predicted class id per position: index of the highest score along the last axis.
predict_lables = np.argmax(result.predictions,-1)

**Results**

In [25]:
# Flatten predictions (pl) and references (tl) into two parallel lists, keeping
# only the first len(sentence) positions of every row so that the padded tail
# is left out of the scores.
pl, tl = [], []

for row, words in enumerate(sentences):
    n_words = len(words)
    pl += predict_lables[row][:n_words].tolist()
    tl += true_lables[row][:n_words].tolist()

In [26]:
from sklearn.metrics import classification_report, 	f1_score, accuracy_score
# Compute all scores first, then print them in the same order and format.
accuracy = accuracy_score(tl, pl)
f1 = f1_score(tl, pl)
report = classification_report(tl, pl, target_names=['Negative' , 'Positive'])

print("Accuracy : " , accuracy)
print("F1-Score : " , f1)
print("------------------Classification Report------------------")
print(report)

Accuracy :  0.9773980154355016
F1-Score :  0.9532629110529688
------------------Classification Report------------------
              precision    recall  f1-score   support

    Negative       0.98      0.99      0.99    144250
    Positive       0.96      0.95      0.95     46220

    accuracy                           0.98    190470
   macro avg       0.97      0.97      0.97    190470
weighted avg       0.98      0.98      0.98    190470



**Sample Test**

In [27]:
# print(len(sentences[0]))
# print(predict_lables[0])
# print(true_lables[0])
# print(' '.join(sentences[0]))
# print(tokenizer.convert_ids_to_tokens(testing_set[0]['input_ids'].numpy().tolist()))

In [28]:
# Author's scratch notes (translated from Persian): "very", "whether"
# Pick one test sentence at random (unseeded, so it differs between runs) and
# print each word next to its predicted and reference label id.
sample_index = np.random.randint(0 , len(sentences))
sample_rows = zip(sentences[sample_index], predict_lables[sample_index], true_lables[sample_index])
for word, predicted_id, reference_id in sample_rows:
    print(word, predicted_id, reference_id)

مقام 1 1
رهبری 0 0
با 0 0
اشاره 0 0
به 0 0
حساسیت 1 1
نقش 1 1
رسانه‌ها 0 0
و 0 0
روزنامه‌ها 0 0
در 0 0
جهت‌دهی 0 0
و 0 0
شتاب 0 0
بخشیدن 0 0
به 0 0
اصلاحات 1 1
حقیقی 0 0
یا 0 0
انحرافی 0 0
افزودند 0 0
: 0 0
این 0 0
بحث 0 0
به 0 0
مقوله 1 1
آزادی 1 1
بیان 0 0
بازنمی‌گردد 0 0
, 0 0
چراکه 0 0
آزادی 0 0
از 0 0
مهم‌ترین 0 0
دغدغه‌های 1 1
مسیولان 1 1
نظام 1 1
اسلامی 0 0
است 0 0
, 0 0
اما 0 0
سم‌پاشی 0 0
و 0 0
گمراه‌سازی 0 0
و 0 0
منحرف 0 0
ساختن 1 1
اصلاحات 1 0
آن 0 0
هم 0 0
در 0 0
موقعیت 1 1
حساس 1 1
امروز 1 1
کشور 0 0
ممنوع 0 0
است 0 0
و 0 0
ما 0 0
نمی‌توانیم 0 0
اجازه 0 0
دهیم 0 0
به 0 0
اسم 1 1
دفاع 0 0
از 0 0
آزادی 0 0
, 0 0
از 0 0
شیوه‌های 1 1
مورد 1 1
نظر 1 1
دشمن 1 0
برای 1 1
ایجاد 1 1
انحراف 0 0
در 0 0
اصلاحات 0 0
, 0 0
در 0 0
کشور 0 0
استفاده 0 0
شود 0 0
. 0 0
