In [1]:
!pip install transformers
!pip install datasets
!git clone https: // github.com/Zarharan/PersianStanceDetection
!pip install parsivar
!pip install hazm
!pip install nlpaug
!mkdir best_model


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
fatal: destination path 'PersianStanceDetection' already exists and is not an empty directory.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nltk==3.4.5
  Using cached nltk-3.4.5-py3-none-any.whl
Installing collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.3
    Uninstalling nltk-3.3:
      Successfully uninstalled nltk-3.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
hazm 0.7.0 requires nltk==3.3, but you have nltk 3.4.5 which is incompatible.[0m[31m
[0mSuccessfully installed nltk-3.4.5
Looking in indexes: https://pypi.org/simple, https://us-python.

In [2]:
import os
import csv
import pandas as pd
from tqdm import tqdm
import torch
from random import shuffle
from torch import nn
from transformers import AdamW
from sklearn.model_selection import train_test_split
import torch
import numpy as np
import datasets
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModel
from tqdm.notebook import tqdm_notebook
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from parsivar import Normalizer
import hazm
import nlpaug.augmenter.word as naw
import random


In [3]:
class utils():
    """Text clean-up helper that chains the parsivar and hazm normalizers."""

    def __init__(self):
        # Options forwarded verbatim to hazm's Normalizer.
        hazm_options = dict(
            remove_extra_spaces=True,
            persian_numbers=False,
            persian_style=False,
            punctuation_spacing=False,
            remove_diacritics=True,
            affix_spacing=True,
            token_based=True,
        )
        self.my_normalizer = Normalizer(pinglish_conversion_needed=True)
        self.hazm_normalizer = hazm.Normalizer(**hazm_options)

    def text_normalization(self, txt):
        """Return `txt` with newlines replaced by spaces, normalized by
        `my_normalizer` first and `hazm_normalizer` second."""
        single_line = txt.replace("\n", " ")
        first_pass = self.my_normalizer.normalize(single_line)
        return self.hazm_normalizer.normalize(first_pass)


In [4]:
class DataSet():
    """Loads the claim/body stance CSV, splits it and prepares tokenized datasets.

    Typical use: construct, optionally call `data_augmentation()`, then call
    `get_datasets(tokenizer)`; `get_wieghts()` is only valid after
    `get_datasets` has created the integer "label" column.
    """

    # Stance string -> integer class id.
    LABEL_MAP = {"Agree": 0, "Disagree": 1, "Discuss": 2, "Unrelated": 3}
    DEFAULT_CSV = "/content/PersianStanceDetection/Dataset/ArticleToClaim.csv"
    AUGMENTATION_MODEL = 'HooshvareLab/bert-fa-base-uncased'

    def __init__(self, seed=30, test_percentage=0.15, validation_percentage=0.15, data_path=None):
        """Read the CSV and create train / validation / test frames.

        Args:
            seed: random state used for both splits and for choosing the
                augmentation operation, so repeated runs give the same data.
            test_percentage: fraction of all rows held out for testing.
            validation_percentage: fraction of the *remaining* rows (after the
                test split) held out for validation.
            data_path: optional CSV location; defaults to `DEFAULT_CSV`.
        """
        self.seed = seed
        self.data = pd.read_csv(data_path or self.DEFAULT_CSV, encoding='utf-8')
        self.train_valid_df, self.test_df = train_test_split(
            self.data, test_size=test_percentage, random_state=seed)
        self.train_df, self.valid_df = train_test_split(
            self.train_valid_df, test_size=validation_percentage, random_state=seed)
        print("data counts: ", self.train_df["Stance"].value_counts())

    def _build_augmenter(self, choice):
        """Create the nlpaug augmenter for operation id `choice` (0..3)."""
        # Fall back to CPU so augmentation also works without a GPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        if choice == 0:
            return naw.ContextualWordEmbsAug(
                model_path=self.AUGMENTATION_MODEL, action="substitute", device=device)
        if choice == 1:
            return naw.ContextualWordEmbsAug(
                model_path=self.AUGMENTATION_MODEL, action="insert", device=device)
        if choice == 2:
            return naw.RandomWordAug(action='swap')
        return naw.RandomWordAug(action='delete')

    def create_dataset(self, dataset, number):
        """Return `number` augmented copies of every row in `dataset`.

        For each copy one of four operations (contextual substitute,
        contextual insert, random swap, random delete) is picked at random and
        applied to the row's "body"; "claim" and "Stance" are copied as-is.

        Args:
            dataset: DataFrame with "claim", "body" and "Stance" columns.
            number: augmented rows to create per input row.

        Returns:
            DataFrame with columns "claim", "body", "Stance".
        """
        rng = random.Random(getattr(self, "seed", None))
        augmenters = {}  # built lazily and reused instead of once per row
        records = []
        print("data augmentation with EDA")
        rows = dataset[["claim", "body", "Stance"]].itertuples(index=False, name=None)
        for claim, body, stance in tqdm_notebook(rows, total=len(dataset)):
            for _ in range(number):
                choice = rng.randint(0, 3)
                if choice not in augmenters:
                    augmenters[choice] = self._build_augmenter(choice)
                augmented = augmenters[choice].augment(str(body))
                # `augment` may hand back a plain string or a list of strings.
                new_body = augmented if isinstance(augmented, str) else " ".join(augmented)
                records.append({"claim": str(claim), "body": new_body, "Stance": stance})
        # Build the frame once; growing a DataFrame row by row is quadratic.
        return pd.DataFrame(records, columns=["claim", "body", "Stance"])

    def data_augmentation(self):
        """Append one augmented copy of every "Agree" training row to `train_df`."""
        augmented_agree = self.create_dataset(
            self.train_df[(self.train_df["Stance"] == "Agree")], 1)
        self.train_df = pd.concat(
            [self.train_df, augmented_agree], ignore_index=True)
        print("after data augmentation :",
              self.train_df["Stance"].value_counts())

    @staticmethod
    def _encode(tokenizer, text, limit):
        """Tokenize `text`, truncating to at most `limit` positions."""
        # +2 presumably reserves room for the tokenizer's special tokens.
        # max_length never exceeds the real length, so padding="max_length"
        # adds no padding here; batches are padded later when collated.
        length = min(len(tokenizer.tokenize(text)) + 2, limit)
        return tokenizer(
            text,
            truncation=True,
            max_length=length,
            padding="max_length",
            return_overflowing_tokens=False,
            return_offsets_mapping=False
        )

    def preprocess(self, record, tokenizer):
        """Normalize and tokenize one record's claim (<=256) and body (<=512).

        Requires `self.normalize`, which `get_datasets` sets.

        Returns:
            dict with "input_ids"/"attention_mask" (claim), "label", and
            "input_ids_body"/"attention_mask_body" (body).
        """
        encoded_inputs_claim = self._encode(tokenizer, self.normalize(record["claim"]), 256)
        encoded_inputs_body = self._encode(tokenizer, self.normalize(record["body"]), 512)
        return {"input_ids": encoded_inputs_claim["input_ids"], "attention_mask": encoded_inputs_claim["attention_mask"], "label": record["label"],
                "input_ids_body": encoded_inputs_body["input_ids"], 'attention_mask_body': encoded_inputs_body["attention_mask"]}

    def get_wieghts(self):
        """Return a 4-element tensor of per-class loss weights from `train_df`.

        With p_i the share of class i in the training labels, the weight is
        1 - (1 - p_i) / sum_j(1 - p_j). Classes missing from the training
        frame count as p_i = 0 instead of raising KeyError.

        NOTE(review): this simplifies to (2 + p_i) / 3, i.e. more frequent
        classes get *larger* weights. Numerics are kept as they were; confirm
        whether inverse-frequency weighting was intended.
        """
        n_classes = len(self.LABEL_MAP)
        counts = self.train_df["label"].value_counts().reindex(range(n_classes), fill_value=0)
        total_number = counts.sum()
        complements = [1 - counts[label] / total_number for label in range(n_classes)]
        wt = float(sum(complements))
        weights = 1 - torch.Tensor(complements) / wt
        return weights

    def get_datasets(self, tokenizer, base_model="bert"):
        """Encode the three splits and return them as a `DatasetDict`.

        Adds the integer "label" column, keeps only claim/body/label,
        normalizes and tokenizes every row, shuffles the training split and
        sets the torch output format.

        Returns:
            DatasetDict with "train", "validation" and "test" entries.
        """
        self.base_model = base_model

        def _labelled(frame):
            # assign() returns a new frame, so split slices are never written
            # to in place (avoids SettingWithCopyWarning).
            labels = frame["Stance"].apply(self.LABEL_MAP.get)
            return frame.assign(label=labels)[["claim", "body", "label"]]

        self.train_df = _labelled(self.train_df)
        self.valid_df = _labelled(self.valid_df)
        self.test_df = _labelled(self.test_df)

        # text normalization
        self.normalize = utils().text_normalization

        dataset = DatasetDict()
        splits = (("train", self.train_df), ("validation", self.valid_df), ("test", self.test_df))
        for split_name, frame in splits:
            print("preprocess and normalize {} data ".format(split_name))
            raw = Dataset.from_pandas(frame)
            encoded = raw.map(lambda x: self.preprocess(x, tokenizer),
                              remove_columns=raw.column_names)
            if split_name == "train":
                encoded = encoded.shuffle(seed=42)
            # ready for use in torch
            encoded.set_format(type='torch')
            dataset[split_name] = encoded
        return dataset


In [5]:
class BaseModel(nn.Module):
    """Transformer encoder followed by separate BiLSTMs for claim and body.

    The token sequence is cut at `sep_idx`; each part goes through its own
    2-layer bidirectional LSTM, and the resulting four 300-d state vectors
    are concatenated (1200-d) and fed to a small classifier with 4 outputs.
    """

    def __init__(self):
        super().__init__()
        # `c_model` is the module-level checkpoint name and must be defined
        # before the model is instantiated.
        self.bert = AutoModel.from_pretrained(c_model, output_hidden_states=True)

        encoder_width = self.bert.config.hidden_size
        lstm_options = dict(num_layers=2, bidirectional=True, batch_first=True)
        self.lstm_claim = nn.LSTM(encoder_width, 300, **lstm_options)
        self.lstm_body = nn.LSTM(encoder_width, 300, **lstm_options)

        # 1200 = 2 sequences x 2 directions x 300 hidden units.
        self.fc1 = nn.Linear(1200, 512)
        self.act = nn.ReLU()
        self.drop = nn.Dropout(0.2)
        self.fc2 = nn.Linear(512, 4)

    def forward(self, input_ids, attention_mask, sep_idx):
        """Return `(logits, pooled_output)` for a batch.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: attention mask passed to the encoder.
            sep_idx: position separating the claim part ([:sep_idx]) from the
                body part ([sep_idx:]) along the sequence axis.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Element 2 of the encoder output is expected to be the hidden-states
        # tuple (requested via output_hidden_states=True); entry 11 is used.
        token_states = encoded[2][11]

        _, (claim_state, _) = self.lstm_claim(token_states[:, :sep_idx])
        _, (body_state, _) = self.lstm_body(token_states[:, sep_idx:])

        # Rows 2 and 3 of the final hidden state are concatenated for each LSTM.
        pooled_output = torch.cat(
            (claim_state[2], claim_state[3], body_state[2], body_state[3]), dim=1)

        hidden = self.drop(self.act(self.fc1(pooled_output)))
        return self.fc2(hidden), pooled_output


In [6]:
def collate_batch(batch, sep_token_id=None, pad_token_id=0):
    """Merge per-example claim/body encodings into one padded batch.

    Claims and bodies are padded separately to their longest member, then
    joined as ``claim | separator | body`` and cut to 512 positions.

    Args:
        batch: list of dicts with "input_ids", "attention_mask",
            "input_ids_body", "attention_mask_body" and "label".
        sep_token_id: id inserted between claim and body. Defaults to the
            module-level `tokenizer.sep_token_id`.
        pad_token_id: id used to pad the token-id sequences (default 0).

    Returns:
        dict with integer tensors "input_ids", "attention_mask" and "label",
        plus "sep_idx": the padded claim length, i.e. the column where the
        inserted separator sits before truncation.
    """
    if sep_token_id is None:
        sep_token_id = tokenizer.sep_token_id

    def _padded(key, fill):
        rows = [torch.as_tensor(item[key], dtype=torch.long) for item in batch]
        return torch.nn.utils.rnn.pad_sequence(rows, batch_first=True, padding_value=fill)

    input_ids_claims = _padded("input_ids", pad_token_id)
    input_ids_body = _padded("input_ids_body", pad_token_id)
    attention_mask_claims = _padded("attention_mask", 0)
    attention_mask_body = _padded("attention_mask_body", 0)

    n_rows = input_ids_claims.shape[0]
    # Build the separator column as integers so concatenation does not
    # silently promote the token ids to float.
    sep_column = torch.full((n_rows, 1), sep_token_id, dtype=torch.long)
    sep_mask = torch.ones((n_rows, 1), dtype=torch.long)

    input_ids = torch.cat((input_ids_claims, sep_column, input_ids_body), dim=1)
    attention_mask = torch.cat((attention_mask_claims, sep_mask, attention_mask_body), dim=1)
    labels = torch.tensor([int(item["label"]) for item in batch], dtype=torch.long)

    return {"input_ids": input_ids[:, :512], "attention_mask": attention_mask[:, :512], "label": labels, "sep_idx": input_ids_claims.shape[1]}


In [7]:
def validation(model, val_loader, loss_fn, loss_fn2, device="cuda", test=False):
    """Evaluate `model` on `val_loader`.

    Args:
        model: module called as model(input_ids=, attention_mask=, sep_idx=)
            and returning `(logits, features)`.
        val_loader: iterable of batches with "input_ids", "attention_mask",
            "sep_idx" and "label".
        loss_fn: loss applied to `(logits, labels)`.
        loss_fn2: accepted for signature symmetry with `train`; not used here.
        device: device the model and inputs are moved to.
        test: when True, also print a per-class classification report.

    Returns:
        (mean loss per batch, accuracy, macro-averaged F1).
    """
    model.eval()
    model.to(device)

    val_loss = 0.0
    reals = []
    preds_list = []
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device).to(torch.int)
            attention_mask = batch["attention_mask"].to(device).to(torch.int)
            sep_idx = batch["sep_idx"]
            labels = batch["label"]
            preds, _ = model(input_ids=input_ids,
                             attention_mask=attention_mask, sep_idx=sep_idx)
            val_loss += loss_fn(preds, labels.to(device)).item()
            reals += labels.cpu().numpy().tolist()
            preds_list += preds.argmax(axis=1).cpu().numpy().tolist()

    val_loss = val_loss / len(val_loader)
    accuracy = accuracy_score(reals, preds_list, normalize=True)
    val_f1_macro = f1_score(reals, preds_list, average='macro')
    if test:
        # Passing `labels` keeps the report aligned with the four names even
        # when a class is absent from this split.
        print(classification_report(reals, preds_list, labels=[0, 1, 2, 3], target_names=[
              "Agree", "Disagree", "Discuss", "unrelated"]))
    return val_loss, accuracy, val_f1_macro


In [8]:
def train(model, optimizer, loss_fn, loss_fn2, train_loader, validation_loader, test_loader, epochs, scheduler, device="cuda"):
    """Fine-tune `model`, checkpoint the best epochs and report test metrics.

    Each step minimises `loss_fn(logits, labels) + loss_fn2(features, labels)`.
    After every epoch the model is validated and saved under
    /content/best_model/ when it reaches the lowest validation loss
    ("best_loss" + model_name + "loss.w") and/or the highest validation
    macro-F1 (model_name + ".w"). Training stops early once macro-F1 has not
    improved for 3 epochs (checked only after epoch index 3).

    Relies on the module-level `model_name` for checkpoint file names.
    The final test evaluation uses the weights of the last epoch run, not a
    reloaded checkpoint.

    Returns:
        None; progress and test metrics are printed.
    """
    checkpoint_dir = "/content/best_model/"
    os.makedirs(checkpoint_dir, exist_ok=True)

    model.to(device)
    # Sentinels that any real value beats, so the first epoch always
    # produces both checkpoints.
    best_loss = float("inf")
    best_score = float("-inf")
    best_epoch = 0
    for epoch in range(epochs):
        training_loss = 0.0
        model.train()
        t_labels = []
        t_preds = []
        for batch in tqdm_notebook(train_loader):
            input_ids = batch["input_ids"].to(device).to(torch.int)
            attention_mask = batch["attention_mask"].to(device).to(torch.int)
            labels = batch["label"]
            sep_idx = batch["sep_idx"]
            targets = labels.to(device)

            t_labels += labels.cpu().numpy().tolist()

            optimizer.zero_grad()
            preds, feats = model(
                input_ids=input_ids, attention_mask=attention_mask, sep_idx=sep_idx)
            loss = loss_fn(preds, targets) + loss_fn2(feats, targets)
            loss.backward()
            optimizer.step()
            training_loss += loss.item()
            t_preds += preds.argmax(axis=1).cpu().numpy().tolist()

        training_loss = training_loss / len(train_loader)
        train_accuracy = accuracy_score(t_labels, t_preds, normalize=True)
        valid_loss, valid_acc, valid_f1_macro = validation(
            model, validation_loader, loss_fn, loss_fn2, device, False)

        # save model based on minimum loss
        if valid_loss < best_loss:
            best_loss = valid_loss
            torch.save(model.state_dict(), checkpoint_dir +
                       "best_loss" + model_name + "loss.w")

        # save model based on best validation score (here is macro-f1)
        if best_score < valid_f1_macro:
            best_score = valid_f1_macro
            best_epoch = epoch
            torch.save(model.state_dict(),
                       checkpoint_dir + model_name + ".w")

        # Report before the early-stop check so the last epoch is logged too.
        print('Epoch: {},training loss: {:.2f} , train accuracy: {:.2f} , validation loss: {:.2f} , validation accuracy: {:.2f} , validaton f1: {:.2f}'.format(
            epoch, training_loss, train_accuracy, valid_loss, valid_acc, valid_f1_macro))

        # condition for stop training
        if (epoch - best_epoch) >= 3 and epoch > 3:
            break

        scheduler.step()

    test_loss, test_accuracy, f1_macro_test = validation(
        model, test_loader, loss_fn, loss_fn2, device, True)
    print('test loss: {:.2f}  test  accuracy: {:.2f}'.format(
        test_loss, test_accuracy))


In [9]:
def normalization(data):
    """Min-max scale every row of `data` to [0, 1], in place.

    Rows whose values are all equal (zero range) become all zeros instead of
    NaN from a 0/0 division.

    Args:
        data: indexable collection of tensors (e.g. a 2-D float tensor).

    Returns:
        The same `data` object, modified in place.
    """
    for i in range(len(data)):
        row = data[i]
        lowest = torch.min(row)
        _range = torch.max(row) - lowest
        if _range == 0:
            data[i] = torch.zeros_like(row)
        else:
            data[i] = (row - lowest) / _range
    return data


class SupConLoss(nn.Module):
    """Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf.
    It also supports the unsupervised contrastive loss in SimCLR.

    This variant treats every sample as a single view: `features` is one
    vector per sample, and similarity logits are min-max rescaled to [0, 1]
    per row before the softmax-style normalisation.
    """

    def __init__(self, temperature=0.07, contrast_mode='one',
                 base_temperature=0.07):
        super().__init__()
        self.temperature = temperature
        self.contrast_mode = contrast_mode
        self.base_temperature = base_temperature

    def forward(self, features, labels=None, mask=None):
        """Compute loss for model. If both `labels` and `mask` are None,
        it degenerates to SimCLR unsupervised loss:
        https://arxiv.org/pdf/2002.05709.pdf
        Args:
            features: hidden vectors of shape [bsz, dim]; inputs with more
                dimensions are flattened to [bsz, -1].
            labels: ground truth of shape [bsz].
            mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j
                has the same class as sample i. Can be asymmetric.
        Returns:
            A loss scalar.
        Raises:
            ValueError: if `features` has fewer than 2 dimensions, if both
                `labels` and `mask` are given, if the label count differs
                from the batch size, or if `contrast_mode` is unknown.
        """
        # Use the tensor's own device so non-default GPUs work as well.
        device = features.device

        if features.dim() < 2:
            raise ValueError('`features` needs to be [bsz, dim, ...], '
                             'at least 2 dimensions are required')
        if features.dim() > 2:
            # A view axis is added below, so extra dims are folded into the
            # feature axis to keep one vector per sample.
            features = features.reshape(features.shape[0], -1)

        # get batch_size
        batch_size = features.shape[0]

        if labels is not None and mask is not None:
            raise ValueError('Cannot define both `labels` and `mask`')
        elif labels is None and mask is None:
            mask = torch.eye(batch_size, dtype=torch.float32).to(device)
        elif labels is not None:
            labels = labels.contiguous().view(-1, 1)
            if labels.shape[0] != batch_size:
                raise ValueError(
                    'Num of labels does not match num of features')
            # mask[i, j] = 1 when samples i and j share a label.
            mask = torch.eq(labels, labels.T).float().to(device)
        else:
            mask = mask.float().to(device)

        # [bsz, dim] -> [bsz, 1, dim], L2-normalised along the feature axis.
        # `nn.functional` is used so the class does not depend on an alias
        # that is only imported in a later notebook cell.
        features = nn.functional.normalize(features.unsqueeze(dim=1), dim=2)
        contrast_count = features.shape[1]
        contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0)

        if self.contrast_mode == 'one':
            anchor_feature = features[:, 0]
            anchor_count = 1
        elif self.contrast_mode == 'all':
            anchor_feature = contrast_feature
            anchor_count = contrast_count
        else:
            raise ValueError('Unknown mode: {}'.format(self.contrast_mode))

        # compute logits
        anchor_dot_contrast = torch.div(
            torch.matmul(anchor_feature, contrast_feature.T),
            self.temperature)

        # for numerical stability
        logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
        logits = anchor_dot_contrast - logits_max.detach()

        # Smallest positive normal value: it only takes effect when a
        # quantity is exactly zero (e.g. a batch of one), turning 0/0 and
        # log(0) into finite values without altering ordinary batches.
        tiny = torch.finfo(logits.dtype).tiny

        # Rescale each row of logits to [0, 1].
        logits_min, _ = torch.min(logits, dim=1, keepdim=True)
        logits_max, _ = torch.max(logits, dim=1, keepdim=True)
        _range = (logits_max - logits_min).clamp_min(tiny)
        logits = torch.div(logits - logits_min, _range)

        # tile mask
        mask = mask.repeat(anchor_count, contrast_count)

        # mask-out self-contrast cases
        logits_mask = torch.scatter(
            torch.ones_like(mask),
            1,
            torch.arange(batch_size * anchor_count, device=device).view(-1, 1),
            0
        )
        mask = mask * logits_mask

        exp_logits = torch.exp(logits) * logits_mask

        log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True).clamp_min(tiny))

        # compute mean of log-likelihood over positive
        # (+1 in the denominator keeps anchors without positives finite)
        mean_log_prob_pos = (mask * log_prob).sum(1) / (mask.sum(1) + 1)

        # loss
        loss = - (self.temperature / self.base_temperature) * mean_log_prob_pos
        loss = loss.view(anchor_count, batch_size).mean()

        return loss


In [10]:
# Seed every RNG involved (data split, augmentation choice, weight init,
# batch shuffling) so a fresh run of the notebook is repeatable.
SEED = 30
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

model_name = "bert"
c_model = "HooshvareLab/bert-fa-base-uncased"
# "persiannlp/mbert-base-parsinlu-entailment"
tokenizer = AutoTokenizer.from_pretrained(c_model)
print("prepare data")
data = DataSet(seed=SEED)
data.data_augmentation()
dataset = data.get_datasets(tokenizer)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epoch = 8  # maximum number of training epochs

# Class-weighted cross-entropy plus the supervised contrastive loss.
weights = data.get_wieghts()
loss_fn = nn.CrossEntropyLoss(weight=weights.to(device))
supconloss = SupConLoss()
model = BaseModel()
optimizer = AdamW(model.parameters(), lr=2e-5)
# Learning rate is multiplied by 0.9 after every epoch.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)
train_loader = torch.utils.data.DataLoader(
    dataset["train"], batch_size=14, collate_fn=collate_batch, shuffle=True)
valid_loader = torch.utils.data.DataLoader(
    dataset["validation"], batch_size=2, collate_fn=collate_batch)
# Evaluation metrics do not depend on batch order, so the test loader is
# left unshuffled.
test_loader = torch.utils.data.DataLoader(
    dataset["test"], batch_size=2, collate_fn=collate_batch, shuffle=False)


prepare data
data counts:  Discuss      775
Unrelated    408
Disagree     152
Agree        107
Name: Stance, dtype: int64
data augmentation with EDA


  0%|          | 0/107 [00:00<?, ?it/s]

after data augmentation : Discuss      775
Unrelated    408
Agree        214
Disagree     152
Name: Stance, dtype: int64
preprocess and normalize train data 




  0%|          | 0/1549 [00:00<?, ?ex/s]

preprocess and normalize validation data 


  0%|          | 0/255 [00:00<?, ?ex/s]

preprocess and normalize test data 


  0%|          | 0/300 [00:00<?, ?ex/s]

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
import torch.nn.functional as F

# Start fine-tuning; arguments are spelled out by name to make the pairing
# with train()'s parameters explicit.
train(
    model=model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    loss_fn2=supconloss,
    train_loader=train_loader,
    validation_loader=valid_loader,
    test_loader=test_loader,
    epochs=epoch,
    scheduler=scheduler,
    device=device,
)


  0%|          | 0/111 [00:00<?, ?it/s]

Epoch: 0,training loss: 3.00 , train accuracy: 0.51 , validation loss: 0.86 , validation accuracy: 0.64 , validaton f1: 0.32


  0%|          | 0/111 [00:00<?, ?it/s]

Epoch: 1,training loss: 2.43 , train accuracy: 0.76 , validation loss: 0.61 , validation accuracy: 0.76 , validaton f1: 0.59


  0%|          | 0/111 [00:00<?, ?it/s]

Epoch: 2,training loss: 2.02 , train accuracy: 0.88 , validation loss: 0.58 , validation accuracy: 0.82 , validaton f1: 0.71


  0%|          | 0/111 [00:00<?, ?it/s]

Epoch: 3,training loss: 1.81 , train accuracy: 0.95 , validation loss: 0.56 , validation accuracy: 0.82 , validaton f1: 0.75


  0%|          | 0/111 [00:00<?, ?it/s]

Epoch: 4,training loss: 1.70 , train accuracy: 0.97 , validation loss: 0.78 , validation accuracy: 0.79 , validaton f1: 0.64


  0%|          | 0/111 [00:00<?, ?it/s]

Epoch: 5,training loss: 1.65 , train accuracy: 0.98 , validation loss: 0.92 , validation accuracy: 0.75 , validaton f1: 0.66


  0%|          | 0/111 [00:00<?, ?it/s]

Epoch: 6,training loss: 1.63 , train accuracy: 0.98 , validation loss: 0.60 , validation accuracy: 0.84 , validaton f1: 0.75


  0%|          | 0/111 [00:00<?, ?it/s]

Epoch: 7,training loss: 1.57 , train accuracy: 0.99 , validation loss: 0.61 , validation accuracy: 0.86 , validaton f1: 0.80
              precision    recall  f1-score   support

       Agree       0.56      0.67      0.61        15
    Disagree       0.81      0.79      0.80        28
     Discuss       0.83      0.85      0.84       163
   unrelated       0.83      0.79      0.81        94

    accuracy                           0.81       300
   macro avg       0.76      0.77      0.76       300
weighted avg       0.82      0.81      0.81       300

test loss: 0.75  test  accuracy: 0.81


In [12]:
# Evaluate both saved checkpoints on the test set: first the one with the
# best validation macro-F1, then the one with the lowest validation loss.
# File names are derived from `model_name` so they follow what train() wrote.
checkpoint_paths = [
    "/content/best_model/" + model_name + ".w",
    "/content/best_model/" + "best_loss" + model_name + "loss.w",
]
for checkpoint_path in checkpoint_paths:
    # map_location lets weights saved on a GPU load on the current device.
    state_dict = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(state_dict, strict=True)
    test_loss, test_accuracy, test_f1 = validation(
        model, test_loader, loss_fn, supconloss, device=device, test=True)


              precision    recall  f1-score   support

       Agree       0.47      0.60      0.53        15
    Disagree       0.85      0.79      0.81        28
     Discuss       0.83      0.86      0.84       163
   unrelated       0.85      0.78      0.81        94

    accuracy                           0.81       300
   macro avg       0.75      0.76      0.75       300
weighted avg       0.82      0.81      0.81       300

              precision    recall  f1-score   support

       Agree       0.41      0.80      0.55        15
    Disagree       0.86      0.68      0.76        28
     Discuss       0.87      0.84      0.85       163
   unrelated       0.81      0.79      0.80        94

    accuracy                           0.81       300
   macro avg       0.74      0.78      0.74       300
weighted avg       0.83      0.81      0.81       300

