In [None]:
!pip install transformers 
!pip install datasets 
!pip install parsivar 
!pip install hazm 
!pip install nlpaug

In [None]:
import os
import csv
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from transformers import AdamW
from sklearn.model_selection import train_test_split
import torch
import numpy as np
import datasets
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModel
from tqdm.notebook import tqdm_notebook
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from parsivar import Normalizer
import hazm
import nlpaug.augmenter.word as naw
import random
from datasets import load_dataset

In [None]:
class myNormalizer:
    """Two-stage text normalizer: a parsivar ``Normalizer`` followed by a hazm one."""

    def __init__(self):
        # Stage 1: parsivar normalizer, built with pinglish_conversion_needed=True.
        self.my_normalizer = Normalizer(pinglish_conversion_needed=True)

        # Stage 2: hazm normalizer; its options are gathered in one mapping.
        hazm_options = {
            "remove_extra_spaces": True,
            "persian_numbers": False,
            "persian_style": True,
            "punctuation_spacing": False,
            "remove_diacritics": True,
            "affix_spacing": True,
            "token_based": True,
        }
        self.hazm_normalizer = hazm.Normalizer(**hazm_options)

    def text_normalization(self, txt):
        """Normalize ``txt`` and return the result.

        Newlines are first replaced by single spaces, then the text is passed
        through the parsivar normalizer and finally through the hazm normalizer.
        """
        single_line = txt.replace("\n", " ")
        parsivar_output = self.my_normalizer.normalize(single_line)
        return self.hazm_normalizer.normalize(parsivar_output)

In [None]:
# Load the ParsiNLU entailment dataset and keep a handle on its training split
# for the quick preview in the next cell.
data1 = load_dataset("persiannlp/parsinlu_entailment")
dt = data1["train"]

In [None]:
# Preview the first rows of the training split as a DataFrame.
pd.DataFrame(dt).head()

In [None]:
class Data_set:
    """Wrapper around the ParsiNLU entailment splits.

    On construction the three splits of ``persiannlp/parsinlu_entailment`` are
    loaded into pandas DataFrames (``train_df``, ``valid_df``, ``test_df``).
    ``get_datasets`` then normalizes both sentences of every record, tokenizes
    them as a pair, runs them through ``model`` and stores the second model
    output as the record's ``embedding``.
    """

    # String labels found in the "label" column -> integer class ids.
    LABEL_MAP = {"c": 0, "n": 1, "e": 2}
    # Token length every sentence pair is padded / truncated to.
    MAX_LEN = 512
    # Seed used to shuffle the training split (kept at its historical value).
    SHUFFLE_SEED = 40

    def __init__(self, seed=42, test_percentage=0.15, validation_percentage=0.15):
        """Load the three dataset splits as DataFrames.

        ``seed``, ``test_percentage`` and ``validation_percentage`` are accepted
        for backward compatibility but are not used: the splits that ship with
        the dataset are taken as they are.
        """
        data = load_dataset("persiannlp/parsinlu_entailment")
        self.train_df = pd.DataFrame(data["train"])
        self.valid_df = pd.DataFrame(data["validation"])
        self.test_df = pd.DataFrame(data["test"])

    @classmethod
    def _encode_label(cls, value):
        """Map a string label to its class id; pass non-string values through."""
        return cls.LABEL_MAP.get(value) if isinstance(value, str) else value

    def _prepare_frame(self, frame):
        """Return a copy of ``frame`` with only the used columns and encoded labels."""
        frame = frame[["sent1", "sent2", "label"]].copy()
        # Only strings are mapped, so calling get_datasets() twice no longer
        # turns already-encoded integer labels into None.
        frame["label"] = frame["label"].apply(self._encode_label)
        return frame

    def preprocess(self, record, tokenizer, model):
        """Tokenize one record and compute its embedding with ``model``.

        Args:
            record: mapping with the keys ``"sent1"``, ``"sent2"`` and ``"label"``.
            tokenizer: callable invoked as ``tokenizer(sent1, sent2, **options)``;
                its result must provide ``input_ids``, ``attention_mask`` and
                ``token_type_ids``.
            model: torch module called with those three inputs as keyword
                arguments. Inputs are created on the device of its parameters.

        Returns:
            dict with the tokenizer outputs, the record's label and
            ``"embedding"``: the second element of the model output with
            ``squeeze(1)`` applied (presumably the pooled output of a
            BERT-style model, shape ``(1, hidden)`` -- TODO confirm).
        """
        # Create the normalizer lazily so preprocess() also works when it is
        # called directly, without going through get_datasets() first.
        if getattr(self, "normalize", None) is None:
            self.normalize = myNormalizer().text_normalization

        normal_claim = self.normalize(record["sent1"])
        normal_body = self.normalize(record["sent2"])
        encoded_inputs = tokenizer(
            normal_claim,
            normal_body,
            truncation="only_second",
            max_length=self.MAX_LEN,
            padding="max_length",
            return_overflowing_tokens=False,
            return_offsets_mapping=False,
        )

        # Follow the model's device instead of assuming CUDA is available.
        device = next(model.parameters()).device

        def as_batch(key):
            # Integer ids with a leading batch dimension of size 1.
            return torch.tensor(
                encoded_inputs[key], dtype=torch.long, device=device
            ).unsqueeze(0)

        with torch.no_grad():
            vector = model(
                input_ids=as_batch("input_ids"),
                attention_mask=as_batch("attention_mask"),
                token_type_ids=as_batch("token_type_ids"),
            )
        return {
            "input_ids": encoded_inputs["input_ids"],
            "token_type_ids": encoded_inputs["token_type_ids"],
            "attention_mask": encoded_inputs["attention_mask"],
            "label": record["label"],
            "embedding": vector[1].squeeze(1),
        }

    def get_wieghts(self):
        """Return normalized class weights for classes 0, 1 and 2.

        The weight of a class is ``1 - count / total`` on the training labels,
        and the three weights are rescaled to sum to 1. Counts are looked up per
        class id, so the result is the same whether the labels are still strings
        (``"c"``, ``"n"``, ``"e"``) or already encoded as integers.

        Raises:
            ValueError: if the training frame holds no labels.
        """
        labels = self.train_df["label"].apply(self._encode_label)
        counts = labels.value_counts()
        total_number = counts.sum()
        if total_number == 0:
            raise ValueError("cannot compute class weights without training labels")
        # Fixed class order; a class that never occurs gets a count of 0.
        class_ids = sorted(self.LABEL_MAP.values())
        counts = counts.reindex(class_ids, fill_value=0)
        raw = [1 - counts.loc[class_id] / total_number for class_id in class_ids]
        return torch.Tensor(raw) / sum(raw)

    def get_datasets(self, tokenizer, model, base_model="bert"):
        """Build the torch-formatted ``DatasetDict`` with embeddings for all splits.

        Args:
            tokenizer: tokenizer forwarded to ``preprocess``.
            model: model forwarded to ``preprocess``.
            base_model: stored on the instance as ``self.base_model``.

        Returns:
            ``DatasetDict`` with the keys ``"train"`` (shuffled), ``"validation"``
            and ``"test"``; each record holds the fields returned by ``preprocess``.
        """
        self.base_model = base_model
        # encode labels and remove other columns
        self.train_df = self._prepare_frame(self.train_df)
        self.valid_df = self._prepare_frame(self.valid_df)
        self.test_df = self._prepare_frame(self.test_df)

        # text normalization
        self.normalize = myNormalizer().text_normalization

        frames = {
            "train": self.train_df,
            "validation": self.valid_df,
            "test": self.test_df,
        }
        # prepare coherent dataset
        dataset = DatasetDict()
        for split_name, frame in frames.items():
            # create dataset object from pandas data
            split = Dataset.from_pandas(frame)
            print(f"preprocess and normalize {split_name} data ")
            split = split.map(
                lambda x: self.preprocess(x, tokenizer, model),
                remove_columns=split.column_names,
            )
            if split_name == "train":
                # shuffle train dataset
                split = split.shuffle(seed=self.SHUFFLE_SEED)
            # ready for use in torch
            split.set_format(type="torch")
            dataset[split_name] = split
        return dataset

In [None]:
# Load the tokenizer and the fine-tuned encoder, then embed every split.
c_model = "HooshvareLab/bert-base-parsbert-uncased"
tokenizer = AutoTokenizer.from_pretrained(c_model)
model_name = "bert"
print("prepare data")
data = Data_set()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModel.from_pretrained(c_model)
# NOTE(review): the checkpoint path ends in ".csv" but is read with torch.load;
# confirm this is the intended file.
# map_location keeps the load working when the checkpoint was saved on a GPU
# and the notebook is running on a CPU-only machine.
state_dict = torch.load(
    "/content/drive/MyDrive/bert_pars_nli_prob.csv", map_location=device
)
# strict=False tolerates key mismatches; report them instead of hiding them so
# an accidentally unloaded checkpoint is noticed.
load_result = model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(
        f"checkpoint mismatch: {len(load_result.missing_keys)} missing keys, "
        f"{len(load_result.unexpected_keys)} unexpected keys"
    )
model.to(device)
# The model is only used for inference here, so make evaluation mode explicit.
model.eval()

dataset = data.get_datasets(tokenizer, model)

In [None]:
# Test-split embeddings with dimension 1 squeezed out.
# presumably (n_samples, 1, hidden) -> (n_samples, hidden) -- TODO confirm
vectors = dataset["test"]["embedding"].squeeze(1)

In [None]:
# Train-split embeddings (dimension 1 squeezed out) and their labels.
# `t_lables` (sic) is the name the later cells use, so it is kept as is.
t_vectors = dataset["train"]["embedding"].squeeze(1)
t_lables = dataset["train"]["label"]

In [None]:
# Inspect the shape of the test-split embedding tensor.
vectors.shape

In [None]:
# Labels of the test split, aligned with the rows of `vectors`.
labels = dataset["test"]["label"]

In [None]:
def get_distance_Matrix(vectors):
    """Return the squared Euclidean distances between all rows of ``vectors``.

    Uses ``|x_i - x_j|^2 = |x_i|^2 + |x_j|^2 - 2 <x_i, x_j>``. The diagonal is
    set to ``inf`` so that a row is never picked as its own nearest neighbour
    when the result is passed to ``argmin``.

    Args:
        vectors: tensor of shape ``(n_samples, n_features)``. A 1-D tensor is
            treated as ``n_samples`` values of a single feature. Integer
            tensors are converted to float so the diagonal can hold ``inf``.

    Returns:
        A new ``(n_samples, n_samples)`` tensor; the input is not modified.
    """
    X = vectors if vectors.is_floating_point() else vectors.float()
    if X.dim() == 1:
        X = X.unsqueeze(1)
    M = torch.matmul(X, X.T)
    d = torch.diag(M)
    # Broadcasting replaces the former Python list-of-lists: entry (i, j) is
    # d[j] + d[i] - 2 * M[i, j], computed on the input's device and dtype.
    D = d.unsqueeze(0) + d.unsqueeze(1) - (2 * M)
    D.fill_diagonal_(torch.inf)
    return D

In [None]:
def get_SI_score(vectors, labels, normalize=False):
    """Count the samples whose nearest neighbour carries the same label.

    Args:
        vectors: tensor passed to ``get_distance_Matrix`` (one row per sample).
        labels: one label per row of ``vectors``.
        normalize: when True, return the count divided by the number of
            samples instead of the raw count. Defaults to False, which keeps
            the original return value.

    Returns:
        A 0-dim tensor: the match count, or the match fraction when
        ``normalize`` is True.

    Raises:
        ValueError: if the number of labels differs from the number of rows.
    """
    labels = torch.as_tensor(labels)
    if labels.shape[0] != len(vectors):
        raise ValueError(
            f"got {labels.shape[0]} labels for {len(vectors)} vectors"
        )
    D = get_distance_Matrix(vectors)
    # Row index of the closest other sample for every column of D.
    idxs = torch.argmin(D, 0)
    score = torch.sum(labels[idxs] == labels)
    if normalize:
        return score / len(vectors)
    return score

In [None]:
# Nearest-neighbour score using feature 0 only.
# unsqueeze(1) keeps one row per sample, shape (n_samples, 1). The previous
# unsqueeze(0) produced a single row, so the distance matrix collapsed to 1x1
# and every sample was compared against sample 0.
D = get_distance_Matrix(vectors[:, 0].unsqueeze(1))
idxs = torch.argmin(D, 0)
score = torch.sum(labels[idxs] == labels)
score

In [None]:
# Display the current distance matrix `D`.
D

In [None]:
# For every sample, find its nearest neighbour in `D` and count how many of
# those neighbours share the sample's label.
idxs = D.argmin(dim=0)
score = (labels[idxs] == labels).sum()

In [None]:
# Forward selection: start by looking at a single feature column.
# Shape of feature 0 taken across all test samples.
vectors[:, 0].shape

In [None]:
def get_score_zero(v, idx, source=None):
    """Return pairwise absolute differences for a single feature.

    Entry ``[i, j]`` of the result is ``|source[j, idx] - v[i]|``; the diagonal
    is set to ``inf`` so ``argmin`` never pairs an entry with itself.

    Args:
        v: 1-D tensor of reference values, one per output row.
        idx: column of ``source`` to compare against.
        source: 2-D tensor to read column ``idx`` from. Defaults to the
            notebook-level ``vectors`` tensor, which is what the original
            two-argument form always used.

    Returns:
        A new float tensor of shape ``(len(v), len(source))``.
    """
    if source is None:
        source = vectors
    column = source[:, idx]
    # Broadcasting replaces the former per-element Python loop.
    diffs = torch.abs(column.unsqueeze(0) - v.unsqueeze(1))
    if not diffs.is_floating_point():
        # An integer tensor cannot hold the inf written to the diagonal.
        diffs = diffs.float()
    return diffs.fill_diagonal_(torch.inf)

In [None]:
# Check the shape of the single-feature distance matrix for feature 0.
get_score_zero(vectors[:, 0], 0).shape

In [None]:
# Score every feature on its own: for feature i, pair each sample with its
# nearest neighbour along that feature and count matching labels.
index_max = 0
scores = []
for i in tqdm_notebook(range(len(vectors[0]))):
    single_feature_distances = get_score_zero(vectors[:, i], i)
    idxs = single_feature_distances.argmin(dim=0)
    score = (labels[idxs] == labels).sum()
    scores.append(score)

In [None]:
# Highest value in `scores`.
torch.max(torch.Tensor(scores))

In [None]:
# Position of the highest value in `scores`, i.e. the best-scoring single feature.
f1 = torch.argmax(torch.Tensor(scores))

In [None]:
# Recompute the nearest-neighbour label matches for the best single feature `f1`.
best_feature_distances = get_score_zero(vectors[:, f1], f1)
idxs = best_feature_distances.argmin(dim=0)
(labels[idxs] == labels).sum()

In [None]:
# Greedy forward selection on the test embeddings: starting from the best
# single feature `f1`, repeatedly add the feature that gives the highest
# nearest-neighbour label-match count until `k` features are selected.
k = 3
best_feature_set = [f1.tolist()]
for fk in range(len(best_feature_set), k):

    scores = []
    for i in tqdm_notebook(range(len(vectors[0]))):
        if i in best_feature_set:
            # Never re-select a feature: a duplicate adds no information but
            # could still win the argmax on a tie.
            scores.append(-1)
            continue
        D = get_distance_Matrix(
            vectors[:, torch.tensor(best_feature_set + [i], dtype=torch.long)]
        )
        idxs = torch.argmin(D, 0)
        score = torch.sum(labels[idxs] == labels)
        scores.append(score.item())
    # scores[i] belongs to feature i, so the argmax is the feature index.
    index_best_new_feture = torch.argmax(torch.tensor(scores))
    best_feature_set = best_feature_set + [index_best_new_feture.tolist()]

In [None]:
# Show the selected feature indices as a tensor.
torch.Tensor(best_feature_set)

In [None]:
# Index of each sample's nearest neighbour when only feature 0 is used.
torch.argmin(get_score_zero(vectors[:, 0], 0), 0)

In [None]:
# Number of rows (samples) in `vectors`.
len(vectors)

In [None]:
# Fraction of test samples whose nearest neighbour, measured on the first two
# selected features only, has the same label.
D = get_distance_Matrix(
    vectors[:, torch.tensor(best_feature_set[:2], dtype=torch.long)]
)
idxs = D.argmin(dim=0)
score = (labels[idxs] == labels).sum()
score / len(vectors)

In [None]:
# Shape of the test-split embedding tensor.
vectors.shape

In [None]:
# Greedy forward selection on the training embeddings: starting from an empty
# set, repeatedly add the feature that gives the highest nearest-neighbour
# label-match count until `k` features are selected.
k = 5
best_feature_set = []
# Start counting at the current set size so exactly `k` features are selected.
# The previous range(1, k) stopped one short when starting from an empty set.
for fk in range(len(best_feature_set), k):

    scores = []
    for i in tqdm_notebook(range(len(t_vectors[0]))):
        if i in best_feature_set:
            # Never re-select a feature: a duplicate adds no information but
            # could still win the argmax on a tie.
            scores.append(-1)
            continue
        D = get_distance_Matrix(
            t_vectors[:, torch.tensor(best_feature_set + [i], dtype=torch.long)]
        )
        idxs = torch.argmin(D, 0)
        score = torch.sum(t_lables[idxs] == t_lables)
        scores.append(score.item())
    # scores[i] belongs to feature i, so the argmax is the feature index.
    index_best_new_feture = torch.argmax(torch.tensor(scores))
    best_feature_set = best_feature_set + [index_best_new_feture.tolist()]

In [None]:
# Show the selected feature indices.
best_feature_set

In [None]:
# Show the selected feature indices (same display as the previous cell).
best_feature_set

In [None]:
# Fraction of test samples whose nearest neighbour, measured on the first two
# features of `best_feature_set` only, has the same label.
D = get_distance_Matrix(
    vectors[:, torch.tensor(best_feature_set[:2], dtype=torch.long)]
)
idxs = D.argmin(dim=0)
score = (labels[idxs] == labels).sum()
score / len(vectors)