# Mounting Google Drive and setting the working directory

In [None]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Switch to the project folder; later cells use paths relative to it
# (e.g. 'Dataset/train.csv', 'Models/BERT/...').
%cd /content/drive/MyDrive/StanceCat-COV19

Mounted at /content/drive
/content/drive/MyDrive/StanceCat-COV19


# Imports

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import re
import nltk
from tqdm import tqdm
from sklearn.metrics import f1_score
nltk.download('punkt')
from torch.optim import Adam
import string

!pip install emoji==1.4.2
import emoji

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji==1.4.2
  Downloading emoji-1.4.2.tar.gz (184 kB)
[K     |████████████████████████████████| 184 kB 16.6 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.4.2-py3-none-any.whl size=186469 sha256=fc6ccf0db1cafa24d038da994544538f758b3e184041c953c51fb0f8fac67590
  Stored in directory: /root/.cache/pip/wheels/71/4d/3c/cada364d4ea0026deee7208dee1e61bcebd20aa2ae5dc154ba
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-1.4.2


# Data PreProcessing

### Read and visualize data

In [None]:
# Load the labelled splits from the project's Dataset folder.
train = pd.read_csv('Dataset/train.csv')
dev = pd.read_csv('Dataset/dev.csv')

# Pull out the three columns used throughout the notebook (still pandas Series here).
text_train, category_train, stance_train = (train[column] for column in ('text', 'category', 'stance'))
text_dev, category_dev, stance_dev = (dev[column] for column in ('text', 'category', 'stance'))

In [None]:
# Preview the first ten training rows.
train.head(10)

Unnamed: 0,text,category,stance
0,بيل غيتس يتلقى لقاح #كوفيد19 من غير تصوير الاب...,celebrity,1
1,وزير الصحة لحد اليوم وتحديدا هلأ بمؤتمروا الصح...,info_news,1
2,قولكن رح يكونو اد المسؤولية ب لبنان لما يوصل ...,info_news,1
3,#تركيا.. وزير الصحة فخر الدين قوجة يتلقى أول ج...,celebrity,1
4,وئام وهاب يشتم الدول الخليجية في كل طلة اعلامي...,personal,0
5,"لقاح #كورونا في أميركا.. قلق متزايد من ""التوزي...",info_news,0
6,لبنان اشترى مليونان لقاح امريكي اذا شلنا يلي ع...,info_news,1
7,من عوارض لقاح كورونا<LF>هو تهكير حسابك عتويتر<...,personal,0
8,هناك 1780 مليونيراً في لبنان. ماذا لو فُرضت ال...,unrelated,0
9,دعبول حضرتك منو انت وتطلب من قائد دولة إسلامية...,info_news,1


In [None]:
# Preview the first ten development rows.
dev.head(10)

Unnamed: 0,text,category,stance
0,#مريم_رجوي: <LF>حظر خامنئي المجرم شراء #لقاح_ك...,info_news,1
1,#الصحة:<LF>•تم إعطاء 259.530 جرعة من لقاح #كور...,plan,1
2,#خادم_الحرمين - حفظه الله - يتلقى الجرعة الأول...,celebrity,1
3,#الصحه_العالميه: لقاحات #كورونا آمنة ولا خوف م...,info_news,1
4,"#وزيرة_الصحة ""#هالة_زايد"" تقول إنه يجرى مراجعة...",info_news,1
5,2️⃣ وانتهى الفريق من الدراسات قبل السريرية ونش...,info_news,1
6,عاجل 🔴 <LF>.<LF><LF>.<LF><LF>وزارة الصحة :<LF>...,plan,1
7,#فيديو | السفير الأميركي لدى السعودية بعد تلقي...,info_news,1
8,تصريحات وبس الحكومة مع السيسي علي حسب اللقطة! ...,info_news,0
9,الاتحاد الاوروبي تفاوض لشراء لقاحات الكورونا م...,info_news,1


In [None]:
# Materialise each column as a NumPy array for the preprocessing and dataset code below.
text_train, category_train, stance_train = (
    np.array(train[column]) for column in ('text', 'category', 'stance')
)
text_dev, category_dev, stance_dev = (
    np.array(dev[column]) for column in ('text', 'category', 'stance')
)

# One line of shapes per split (text, category, stance).
print(*(array.shape for array in (text_train, category_train, stance_train)))
print(*(array.shape for array in (text_dev, category_dev, stance_dev)))

(6988,) (6988,) (6988,)
(1000,) (1000,) (1000,)


### Preprocessing

In [None]:
# Arabic text normalisation: tashkeel (diacritics) removal and elongation squeezing.
def clean_str(text):
    """Return `text` without tashkeel marks and with repeated characters squeezed.

    Steps, in order:
      1. delete the code points U+0617-U+061A and U+064B-U+0652;
      2. shorten any run of one repeated character (newline excluded) to two;
      3. reduce the doubled letters waw, yaa and alef to a single letter;
      4. strip leading and trailing whitespace.
    """
    # 1) tashkeel marks are simply deleted
    without_marks = re.sub(r'[\u0617-\u061A\u064B-\u0652]', '', text)

    # 2) "aaaa" -> "aa": a back-reference run of any length becomes exactly two
    squeezed = re.sub(r'(.)\1+', r'\1\1', without_marks)

    # 3) for these three letters even a doubled occurrence becomes a single letter
    for doubled, single in (('وو', 'و'), ('يي', 'ي'), ('اا', 'ا')):
        squeezed = squeezed.replace(doubled, single)

    # 4) trim
    return squeezed.strip()

def PreProcessing(text, unk=''):
  """Normalise a collection of tweets in place and return it.

  For every entry: links are removed, a run of one repeated emoji is capped at
  two occurrences, each emoji is surrounded by spaces, `clean_str` is applied
  and repeated spaces are collapsed to one.

  Args:
    text: indexable, mutable collection of strings (the notebook passes copies
      of NumPy arrays). Every entry is overwritten with its cleaned version.
    unk: unused; kept so existing calls keep working.

  Returns:
    The same `text` object that was passed in.
  """
  # Look the emoji table up once instead of once per character.
  emoji_table = emoji.UNICODE_EMOJI['en']

  for i in range(len(text)):
    tweet = text[i]

    # Remove links. Only the URL token itself is matched; the previous greedy
    # `.*` pattern also deleted all text that followed the link.
    tweet = re.sub(r'https?://\S+', '', tweet)

    # Limit repeated emojis to 2: `cnt` counts how many times `prev_emoji`
    # has been kept in the current uninterrupted run.
    prev_emoji = None
    cnt = 0
    kept_chars = []
    for c in tweet:
      if c in emoji_table:
        if prev_emoji == c and cnt >= 2:
          continue
        if prev_emoji != c:
          cnt = 0
        prev_emoji = c
        cnt += 1
      else:
        prev_emoji = None
        cnt = 0
      kept_chars.append(c)

    # Separate emojis from neighbouring characters with spaces.
    tweet = ''.join((' ' + c + ' ') if c in emoji_table else c for c in kept_chars)

    # Clean/normalise Arabic text.
    tweet = clean_str(tweet)

    # Remove multiple spaces (including those introduced around emojis).
    tweet = re.sub(' +', ' ', tweet)

    text[i] = tweet

  return text

In [None]:
# PreProcessing edits its argument in place, so hand it copies and keep the raw arrays intact.
pre_text_train = PreProcessing(np.array(text_train))
pre_text_dev = PreProcessing(np.array(text_dev))

# Raw vs. cleaned version of the first tweet of each split.
print(text_train[0], '\n', pre_text_train[0])
print(text_dev[0], '\n', pre_text_dev[0])

# Character length of the longest cleaned training tweet.
print(max(map(len, pre_text_train)))

بيل غيتس يتلقى لقاح #كوفيد19 من غير تصوير الابرة و لا السيرنجة و لا الدواء و لابس بولو صيفي في عز الشتاء و يقول ان إحدى مزايا عمر ال 65 عامًا هي انه مؤهل للحصول على اللقاح ... يعنى ما كان يحتاج اللقاح لو كان عمره اصغر من 65 🤔 https://t.co/QQKFFUNwBn 
 بيل غيتس يتلقى لقاح #كوفيد19 من غير تصوير الابرة و لا السيرنجة و لا الدواء و لابس بولو صيفي في عز الشتاء و يقول ان إحدى مزايا عمر ال 65 عاما هي انه مؤهل للحصول على اللقاح .. يعنى ما كان يحتاج اللقاح لو كان عمره اصغر من 65 🤔
#مريم_رجوي: <LF>حظر خامنئي المجرم شراء #لقاح_كورونا يعد مجزرة متعمدة بحق الشعب الإيراني<LF><LF>نقل موقع مريم رجوي موقف رئيسة الجمهورية المنتخبة للمقاومة الإيرانية من تصريحات خامنئي المجرم حول حظر استيراد لقاح كورونا من الولايات المتحدة و بريطانيا و فرنسا. <LF>#اللقاح_حق_للناس https://t.co/AYXMbXjVKn 
 #مريم_رجوي: <LF>حظر خامنئي المجرم شراء #لقاح_كورونا يعد مجزرة متعمدة بحق الشعب الإيراني<LF><LF>نقل موقع مريم رجوي موقف رئيسة الجمهورية المنتخبة للمقاومة الإيرانية من تصريحات خامنئي المجرم حول حظر استيراد لقاح كورونا من ال

### Save preprocessed data

In [None]:
# Write the cleaned training tweets to disk, one tweet per line, UTF-8 encoded.
with open('our_processed_train.txt', 'w', encoding='utf8') as out_file:
    out_file.writelines('%s\n' % tweet for tweet in pre_text_train)

### Building dictionaries to turn a category to number & vice versa

In [None]:
# Fixed mapping from category name to the class index used by the category classifier.
category2id = {'others':0, 'rumors':1, 'restrictions':2, 'celebrity':3, 'personal':4, 'info_news':5, 'requests':6, 'advice':7, 'unrelated':8, 'plan':9}
# Inverse lookup: class index -> category name (used to decode predictions).
id2category = {index: name for name, index in category2id.items()}

# The loop variables deliberately avoid the name `id`: a module-level loop leaks its
# variables, which would shadow the builtin `id` for the rest of the notebook.
for category_id, category_name in id2category.items():
  print(category_id, category_name)

print(category2id['celebrity'])

0 others
1 rumors
2 restrictions
3 celebrity
4 personal
5 info_news
6 requests
7 advice
8 unrelated
9 plan
3


# Model building & training

### Ideas to try
1) bi-directional
2) pre-training
3) multi-layers
4) BERT
5) transformers notebook
6) packed_padded_sequences
7) pre-trained embedding

### Building the datasets

In [None]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
class Dataset(torch.utils.data.Dataset):
  """Tokenised tweets paired with one integer label (or id) per tweet.

  Relies on two notebook-level objects, `arabert_prep` and `tokenizer`.
  NOTE(review): neither is defined in the visible cells - confirm where they are created.
  """

  def __init__(self, x, y, oversample=False):
    """Preprocess and tokenise every text up front and store the labels as a tensor.

    Args:
      x: collection of texts; must provide `.copy()`.
      y: collection of integers, one per text; must provide `.copy()`.
      oversample: accepted but never read in this class.
    """
    texts = x.copy()
    labels = y.copy()

    # First pass: AraBERT-specific preprocessing; second pass: tokenisation.
    # Every sample is padded/truncated to 512 tokens and returned as PyTorch tensors.
    prepared = list(map(arabert_prep.preprocess, texts))
    self.X = [
        tokenizer(sample, padding='max_length', max_length=512, truncation=True, return_tensors="pt")
        for sample in prepared
    ]
    self.Y = torch.tensor(labels)
    self.len = len(prepared)

    # Print how many samples carry each label value.
    print(sorted(Counter(self.Y.numpy()).items()))

  def __len__(self):
    """Number of samples."""
    return self.len

  def __getitem__(self, idx):
    """Return the (tokenised input, label) pair stored at `idx`."""
    return self.X[idx], self.Y[idx]

In [None]:
# Training sets. Stance labels are shifted by +1 before being used as class ids;
# categories are mapped to ids through `category2id`.
stance_train_dataset = Dataset(pre_text_train, stance_train + 1, oversample=True)
category_train_dataset = Dataset(
    pre_text_train,
    [category2id[category] for category in category_train],
    oversample=True,
)

# Development sets, built the same way with the default `oversample` value.
stance_dev_dataset = Dataset(pre_text_dev, stance_dev + 1)
category_dev_dataset = Dataset(
    pre_text_dev,
    [category2id[category] for category in category_dev],
)

[(0, 438), (1, 1012), (2, 5538)]
[(0, 167), (1, 79), (2, 18), (3, 975), (4, 1025), (5, 3616), (6, 112), (7, 67), (8, 323), (9, 606)]
[(0, 70), (1, 126), (2, 804)]
[(0, 17), (1, 15), (2, 2), (3, 145), (4, 128), (5, 545), (6, 20), (7, 10), (8, 36), (9, 82)]


### The arabert model

In [None]:
from transformers import BertModel
class BertClassifier(nn.Module):
    """Pre-trained BERT encoder followed by dropout and a linear classification head.

    Args:
        output_dim: number of classes, i.e. the size of the returned logit vector.
        dropout: dropout probability applied to the pooled encoder output.
        pretrained_name: Hugging Face checkpoint the encoder is loaded from.
    """

    def __init__(self, output_dim, dropout=0.5,
                 pretrained_name='aubmindlab/bert-base-arabertv02-twitter'):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's config instead of hard-coding 768,
        # so other checkpoints can be plugged in through `pretrained_name`.
        self.linear = nn.Linear(self.bert.config.hidden_size, output_dim)

    def forward(self, input_id, mask):
        """Return unnormalised class logits for a batch.

        Args:
            input_id: token ids passed to the encoder as `input_ids`.
            mask: attention mask passed to the encoder as `attention_mask`.
        """
        # With return_dict=False the encoder returns a tuple; the second item is the pooled output.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)

        # Return raw logits. The previous ReLU clamped every negative logit to 0
        # before the loss; nn.CrossEntropyLoss expects unconstrained logits.
        return self.linear(dropout_output)

### Use GPU if any

In [None]:
# Device handle read as a global by the train / evaluate / predict helpers below:
# the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Training function

In [None]:
def train(model, train_dataset, train_dataloader, criterion, optimizer):
  """Run one training epoch and report its metrics.

  Args:
    model: called as `model(input_ids, attention_mask)`; must return class logits.
    train_dataset: unused; kept for call compatibility (metrics are normalised by
      the number of samples the dataloader actually yields).
    train_dataloader: yields `(inputs, labels)` where `inputs` has the keys
      'input_ids' and 'attention_mask'.
    criterion: loss called as `criterion(logits, labels)`; assumed to return the
      batch mean, as `nn.CrossEntropyLoss()` does by default.
    optimizer: optimizer stepped once per batch.

  Returns:
    (mean loss per sample, accuracy, macro F1) over the epoch.
  """
  model.train()

  total_loss_train = 0.0
  total_correct_train = 0
  n_seen = 0

  # for f1 score
  y_true, y_pred = [], []

  for train_input, train_label in tqdm(train_dataloader):

    train_label = train_label.to(device)
    mask = train_input['attention_mask'].to(device)
    # Drop the extra dimension at index 1 of the stored token ids.
    input_id = train_input['input_ids'].squeeze(1).to(device)

    output = model(input_id, mask)
    batch_loss = criterion(output, train_label.long())
    predicted = output.argmax(dim=1)

    # The criterion yields a per-batch mean, so weight it by the batch size.
    # Summing raw batch means and dividing by the sample count (as before)
    # under-reports the loss by a factor of the batch size.
    batch_size = train_label.size(0)
    total_loss_train += batch_loss.item() * batch_size
    total_correct_train += (predicted == train_label).sum().item()
    n_seen += batch_size

    y_true += train_label.tolist()
    y_pred += predicted.tolist()

    model.zero_grad()
    batch_loss.backward()
    optimizer.step()

  f1_macro_train = f1_score(y_true, y_pred, average='macro')

  n_seen = max(n_seen, 1)  # avoid dividing by zero on an empty dataloader
  return total_loss_train/n_seen, total_correct_train/n_seen, f1_macro_train

### Evaluating function

In [None]:
def evaluate(model, test_dataset, test_dataloader, criterion):
  """Evaluate `model` on a labelled dataloader without updating any weights.

  Args:
    model: called as `model(input_ids, attention_mask)`; must return class logits.
    test_dataset: unused; kept for call compatibility (metrics are normalised by
      the number of samples the dataloader actually yields).
    test_dataloader: yields `(inputs, labels)` where `inputs` has the keys
      'input_ids' and 'attention_mask'.
    criterion: loss called as `criterion(logits, labels)`; assumed to return the
      batch mean, as `nn.CrossEntropyLoss()` does by default.

  Returns:
    (mean loss per sample, accuracy, macro F1).
  """
  model.eval()

  total_loss_test = 0.0
  total_correct_test = 0
  n_seen = 0

  # for f1 score
  y_true, y_pred = [], []

  with torch.no_grad():

    for test_input, test_label in test_dataloader:

      test_label = test_label.to(device)
      mask = test_input['attention_mask'].to(device)
      # Drop the extra dimension at index 1 of the stored token ids.
      input_id = test_input['input_ids'].squeeze(1).to(device)

      output = model(input_id, mask)
      batch_loss = criterion(output, test_label.long())
      predicted = output.argmax(dim=1)

      # Weight the per-batch mean by the batch size so the final value is a
      # true per-sample mean (the old sum-of-means / n_samples was too small
      # by a factor of the batch size).
      batch_size = test_label.size(0)
      total_loss_test += batch_loss.item() * batch_size
      total_correct_test += (predicted == test_label).sum().item()
      n_seen += batch_size

      y_true += test_label.tolist()
      y_pred += predicted.tolist()

  f1_macro_test = f1_score(y_true, y_pred, average='macro')

  n_seen = max(n_seen, 1)  # avoid dividing by zero on an empty dataloader
  return total_loss_test/n_seen, total_correct_test/n_seen, f1_macro_test

### Training and evaluating the model each epoch

In [None]:
def train_evaluate(model, train_dataset, val_dataset, learning_rate, epochs, model_name):
  """Fine-tune `model` with class-balanced sampling, validating after every epoch.

  A checkpoint is written to `Models/BERT/<model_name>/` whenever the validation
  macro F1 improves on the best value seen during this call.

  Args:
    model: classifier to train; moved to the GPU when CUDA is available.
    train_dataset: dataset supporting `dataset[:]`, returning `(inputs, labels)`
      with `labels` a 1-D tensor of non-negative integer class ids.
    val_dataset: dataset used for validation.
    learning_rate: learning rate of the Adam optimizer created here.
    epochs: number of train/validate rounds.
    model_name: sub-folder of `Models/BERT/` that receives the checkpoints.
  """
  import os  # local import: only needed to create the checkpoint folder

  # oversample: draw each sample with probability inversely proportional to the
  # size of its class, so every class is seen about equally often.
  _, y_train = train_dataset[:]
  y_train = torch.as_tensor(y_train).long()
  # bincount is indexed by label value, so the lookup below stays correct even
  # when some class id in the range has no samples.
  class_sample_count = torch.bincount(y_train)
  print('counts', class_sample_count.numpy())

  samples_weight = 1.0 / class_sample_count[y_train].double()
  print(samples_weight)

  sampler = torch.utils.data.WeightedRandomSampler(samples_weight, len(samples_weight))

  # dataloaders
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=2, sampler=sampler)
  val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=2)

  use_cuda = torch.cuda.is_available()

  criterion = nn.CrossEntropyLoss()
  optimizer = Adam(model.parameters(), lr= learning_rate)

  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  # torch.save does not create missing folders.
  os.makedirs(f'Models/BERT/{model_name}', exist_ok=True)

  best_f1_macro = 0

  for epoch_num in range(epochs):

    loss_train, acc_train, f1_macro_train = train(model, train_dataset, train_dataloader, criterion, optimizer)
    loss_val, acc_val, f1_macro_val = evaluate(model, val_dataset, val_dataloader, criterion)

    # Save only when validation macro F1 improves; the save used to sit outside
    # this `if`, writing a full checkpoint every epoch.
    if f1_macro_val > best_f1_macro:
      best_f1_macro = f1_macro_val
      torch.save(model.state_dict(), f'Models/BERT/{model_name}/F1{f1_macro_val: .4f} Acc{acc_val: .4f}.pt')

    print(
        f'Epochs: {epoch_num + 1} \
        | Train Loss: {loss_train: .4f} \
        | Train Accuracy: {acc_train : .4f} \
        | Train F1_macro: {f1_macro_train: .4f} \
        | Val Loss: {loss_val: .4f} \
        | Val Accuracy: {acc_val: .4f} \
        | Val F1_macro: {f1_macro_val: .4f} \
        ')

### Training the stance classifier

In [None]:
# Hyper-parameters of the stance classifier (3 output classes).
EPOCHS, OUTPUT_DIM, LR = 10, 3, 1e-6

model = BertClassifier(output_dim=OUTPUT_DIM)

train_evaluate(
    model,
    train_dataset=stance_train_dataset,
    val_dataset=stance_dev_dataset,
    learning_rate=LR,
    epochs=EPOCHS,
    model_name='Stance',
)

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02-twitter were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dens

counts [ 438 1012 5538]
tensor([0.0002, 0.0002, 0.0002,  ..., 0.0002, 0.0002, 0.0002],
       dtype=torch.float64)


100%|██████████| 3494/3494 [12:55<00:00,  4.50it/s]


Epochs: 1         | Train Loss:  0.4485         | Train Accuracy:  0.5950         | Train F1_macro:  0.5965         | Val Loss:  0.3053         | Val Accuracy:  0.7560         | Val F1_macro:  0.5861         


100%|██████████| 3494/3494 [12:55<00:00,  4.51it/s]


Epochs: 2         | Train Loss:  0.2996         | Train Accuracy:  0.7665         | Train F1_macro:  0.7667         | Val Loss:  0.2696         | Val Accuracy:  0.7840         | Val F1_macro:  0.6299         


100%|██████████| 3494/3494 [12:55<00:00,  4.50it/s]


Epochs: 3         | Train Loss:  0.2374         | Train Accuracy:  0.8253         | Train F1_macro:  0.8258         | Val Loss:  0.2815         | Val Accuracy:  0.7960         | Val F1_macro:  0.6397         


100%|██████████| 3494/3494 [12:56<00:00,  4.50it/s]


Epochs: 4         | Train Loss:  0.1952         | Train Accuracy:  0.8659         | Train F1_macro:  0.8655         | Val Loss:  0.2980         | Val Accuracy:  0.7960         | Val F1_macro:  0.6503         


100%|██████████| 3494/3494 [12:56<00:00,  4.50it/s]


Epochs: 5         | Train Loss:  0.1633         | Train Accuracy:  0.8904         | Train F1_macro:  0.8898         | Val Loss:  0.2881         | Val Accuracy:  0.7980         | Val F1_macro:  0.6459         


100%|██████████| 3494/3494 [12:55<00:00,  4.51it/s]


Epochs: 6         | Train Loss:  0.1412         | Train Accuracy:  0.9104         | Train F1_macro:  0.9099         | Val Loss:  0.2933         | Val Accuracy:  0.8030         | Val F1_macro:  0.6497         


100%|██████████| 3494/3494 [12:55<00:00,  4.50it/s]


Epochs: 7         | Train Loss:  0.1148         | Train Accuracy:  0.9254         | Train F1_macro:  0.9247         | Val Loss:  0.2994         | Val Accuracy:  0.8070         | Val F1_macro:  0.6521         


100%|██████████| 3494/3494 [12:56<00:00,  4.50it/s]


Epochs: 8         | Train Loss:  0.1039         | Train Accuracy:  0.9337         | Train F1_macro:  0.9331         | Val Loss:  0.2811         | Val Accuracy:  0.8170         | Val F1_macro:  0.6406         


100%|██████████| 3494/3494 [12:56<00:00,  4.50it/s]


Epochs: 9         | Train Loss:  0.0949         | Train Accuracy:  0.9399         | Train F1_macro:  0.9402         | Val Loss:  0.2913         | Val Accuracy:  0.8210         | Val F1_macro:  0.6449         


100%|██████████| 3494/3494 [12:56<00:00,  4.50it/s]


Epochs: 10         | Train Loss:  0.0796         | Train Accuracy:  0.9493         | Train F1_macro:  0.9493         | Val Loss:  0.3111         | Val Accuracy:  0.8160         | Val F1_macro:  0.6342         


### Training the category classifier

In [None]:
# Hyper-parameters of the category classifier (10 output classes, one per entry of category2id).
EPOCHS, OUTPUT_DIM, LR = 10, 10, 1e-6

model = BertClassifier(output_dim=OUTPUT_DIM)

train_evaluate(
    model,
    train_dataset=category_train_dataset,
    val_dataset=category_dev_dataset,
    learning_rate=LR,
    epochs=EPOCHS,
    model_name='Category',
)

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02-twitter were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dens

counts [ 167   79   18  975 1025 3616  112   67  323  606]
tensor([0.0010, 0.0003, 0.0003,  ..., 0.0003, 0.0003, 0.0003],
       dtype=torch.float64)


100%|██████████| 3494/3494 [12:56<00:00,  4.50it/s]


Epochs: 1         | Train Loss:  1.0841         | Train Accuracy:  0.2497         | Train F1_macro:  0.2284         | Val Loss:  1.0489         | Val Accuracy:  0.2280         | Val F1_macro:  0.2009         


100%|██████████| 3494/3494 [12:56<00:00,  4.50it/s]


Epochs: 2         | Train Loss:  0.8174         | Train Accuracy:  0.5268         | Train F1_macro:  0.5082         | Val Loss:  0.8198         | Val Accuracy:  0.3600         | Val F1_macro:  0.2971         


100%|██████████| 3494/3494 [12:56<00:00,  4.50it/s]


Epochs: 3         | Train Loss:  0.6383         | Train Accuracy:  0.6234         | Train F1_macro:  0.6099         | Val Loss:  0.7425         | Val Accuracy:  0.3930         | Val F1_macro:  0.3356         


100%|██████████| 3494/3494 [12:55<00:00,  4.50it/s]


Epochs: 4         | Train Loss:  0.5180         | Train Accuracy:  0.6896         | Train F1_macro:  0.6787         | Val Loss:  0.7184         | Val Accuracy:  0.4270         | Val F1_macro:  0.3474         


100%|██████████| 3494/3494 [12:55<00:00,  4.51it/s]


Epochs: 5         | Train Loss:  0.4471         | Train Accuracy:  0.7265         | Train F1_macro:  0.7195         | Val Loss:  0.6439         | Val Accuracy:  0.5110         | Val F1_macro:  0.3783         


100%|██████████| 3494/3494 [12:55<00:00,  4.51it/s]


Epochs: 6         | Train Loss:  0.3680         | Train Accuracy:  0.7881         | Train F1_macro:  0.7777         | Val Loss:  0.6895         | Val Accuracy:  0.4750         | Val F1_macro:  0.3611         


100%|██████████| 3494/3494 [12:56<00:00,  4.50it/s]


Epochs: 7         | Train Loss:  0.3282         | Train Accuracy:  0.8024         | Train F1_macro:  0.7961         | Val Loss:  0.6666         | Val Accuracy:  0.4970         | Val F1_macro:  0.3743         


100%|██████████| 3494/3494 [12:56<00:00,  4.50it/s]


Epochs: 8         | Train Loss:  0.2887         | Train Accuracy:  0.8280         | Train F1_macro:  0.8227         | Val Loss:  0.6343         | Val Accuracy:  0.5260         | Val F1_macro:  0.3820         


100%|██████████| 3494/3494 [12:56<00:00,  4.50it/s]


Epochs: 9         | Train Loss:  0.2475         | Train Accuracy:  0.8487         | Train F1_macro:  0.8450         | Val Loss:  0.6447         | Val Accuracy:  0.5270         | Val F1_macro:  0.3891         


100%|██████████| 3494/3494 [12:55<00:00,  4.50it/s]


Epochs: 10         | Train Loss:  0.2285         | Train Accuracy:  0.8605         | Train F1_macro:  0.8562         | Val Loss:  0.6288         | Val Accuracy:  0.5410         | Val F1_macro:  0.3924         


### Continue category training

In [None]:
# Five more epochs on the category model left in `model` by the previous training cell.
# train_evaluate builds a new Adam optimizer on every call, so optimizer state starts afresh.
EPOCHS, OUTPUT_DIM, LR = 5, 10, 1e-6

train_evaluate(
    model,
    train_dataset=category_train_dataset,
    val_dataset=category_dev_dataset,
    learning_rate=LR,
    epochs=EPOCHS,
    model_name='Category',
)

counts [ 167   79   18  975 1025 3616  112   67  323  606]
tensor([0.0010, 0.0003, 0.0003,  ..., 0.0003, 0.0003, 0.0003],
       dtype=torch.float64)


100%|██████████| 3494/3494 [13:59<00:00,  4.16it/s]


Epochs: 1         | Train Loss:  0.2046         | Train Accuracy:  0.8749         | Train F1_macro:  0.8716         | Val Loss:  0.6243         | Val Accuracy:  0.5750         | Val F1_macro:  0.3937         


100%|██████████| 3494/3494 [14:03<00:00,  4.14it/s]


Epochs: 2         | Train Loss:  0.1945         | Train Accuracy:  0.8799         | Train F1_macro:  0.8745         | Val Loss:  0.6441         | Val Accuracy:  0.5340         | Val F1_macro:  0.3859         


100%|██████████| 3494/3494 [14:02<00:00,  4.15it/s]


Epochs: 3         | Train Loss:  0.1656         | Train Accuracy:  0.9004         | Train F1_macro:  0.8953         | Val Loss:  0.6521         | Val Accuracy:  0.5440         | Val F1_macro:  0.3781         


100%|██████████| 3494/3494 [14:02<00:00,  4.14it/s]


Epochs: 4         | Train Loss:  0.1623         | Train Accuracy:  0.8988         | Train F1_macro:  0.8961         | Val Loss:  0.6393         | Val Accuracy:  0.5540         | Val F1_macro:  0.3987         


100%|██████████| 3494/3494 [14:03<00:00,  4.14it/s]


Epochs: 5         | Train Loss:  0.1466         | Train Accuracy:  0.9073         | Train F1_macro:  0.9061         | Val Loss:  0.6562         | Val Accuracy:  0.5630         | Val F1_macro:  0.3907         


# Prediction for test set (no gold labels given)

## Reading, visualizing and preprocessing

In [None]:
# Load the test split and report its number of rows.
test = pd.read_csv('Dataset/test.csv')
print(len(test))

2000


In [None]:
# Preview the first ten test rows.
test.head(10)

Unnamed: 0,id,text
0,0,بيل غيتس يعلن عن تلقيه الجرعة الأولى من اللقاح...
1,1,محتاجين لقاح يقاوم الإفراط في التفكير
2,2,- إحنا نخلي لقاح فايزر الأمريكي <LF>للمسئولين ...
3,3,متحدث الصحة: أخذ لقاح كورونا ليس شرطاً للسماح ...
4,4,"131,939 جرعة من لقاح #كوفيد19 تم تقديمها خلال ..."
5,5,أكثر من ١٠٠ مليون شخص حول العالم أخذوا لقاح ال...
6,6,أمير #الرياض يتلقى الجرعة الأولى من #لقاح_كورو...
7,7,@USER @USER لقاح مجاني https://t.co/chOtH9LG3V
8,8,لو سمعتوا كلام صديقي الطبيب العربي بعد حصوله ع...
9,9,"قال الرئیس أردوغان ""تركيا في المرتبة الثالثة ب..."


In [None]:
# Tweet ids and raw texts of the test split as NumPy arrays.
id_test = np.array(test['id'])
text_test = np.array(test['text'])
print(id_test.shape, text_test.shape)

(2000,) (2000,)


In [None]:
# Print the first ten (id, raw text) pairs of the test split.
for i in range(10):
  print(id_test[i], text_test[i])

0 بيل غيتس يعلن عن تلقيه الجرعة الأولى من اللقاح ضد فيروس #كورونا<LF><LF>- قال بيل غيتس "إحدى مزايا بلوغي عمر الـ 65 عامًا هي أنني مؤهل للحصول على لقاح COVID-19" https://t.co/JGZhBysT8M
1 محتاجين لقاح يقاوم الإفراط في التفكير
2 - إحنا نخلي لقاح فايزر الأمريكي <LF>للمسئولين وكبار الشخصيات ورجال الأعمال<LF>عشان بيكسبوا كتير و هيقدروا يدفعوا كويس <LF>- ونخلي لقاح موديرنا الأمريكي<LF>للدكاترة و التمريض أهو رخيص شوية<LF>- ونخلي لقاح سينوفارم الصيني ولقاح سبوتنيك الروسي للشعب عشان اللقاح رخيص و الشعب بيستحمل 🤣 https://t.co/K25PLmhjxf
3 متحدث الصحة: أخذ لقاح كورونا ليس شرطاً للسماح بالسفر للخارج  https://t.co/eMVUXxjftk @USER
4 131,939 جرعة من لقاح #كوفيد19 تم تقديمها خلال الـ24 ساعة الماضية، ليبلغ عدد الجرعات الكلي 1,797,926 بمعدل توزيع 18.18 جرعة لكل 100 شخص.<LF><LF>#يدا_بيد_نتعافى https://t.co/aQd30YXVzo
5 أكثر من ١٠٠ مليون شخص حول العالم أخذوا لقاح الكورونا https://t.co/Ve3ATxooBa
6 أمير #الرياض يتلقى الجرعة الأولى من #لقاح_كورونا.<LF>-<LF> https://t.co/q6o6DCnuiZ
7 @USER @USER لقاح مجاني

In [None]:
# Clean a copy of the test tweets with the same routine used for train and dev.
pre_text_test = PreProcessing(np.array(text_test))

# Raw vs. cleaned version of the first test tweet.
print(text_test[0], '\n', pre_text_test[0])

# Character length of the longest cleaned test tweet.
print(max(map(len, pre_text_test)))

بيل غيتس يعلن عن تلقيه الجرعة الأولى من اللقاح ضد فيروس #كورونا<LF><LF>- قال بيل غيتس "إحدى مزايا بلوغي عمر الـ 65 عامًا هي أنني مؤهل للحصول على لقاح COVID-19" https://t.co/JGZhBysT8M 
 بيل غيتس يعلن عن تلقيه الجرعة الأولى من اللقاح ضد فيروس #كورونا<LF><LF>- قال بيل غيتس "إحدى مزايا بلوغي عمر الـ 65 عاما هي أنني مؤهل للحصول على لقاح COVID-19"
332


In [None]:
# The test split has no labels, so the tweet ids are stored in the label slot;
# `predict` returns them next to the predictions.
# Note: the Dataset constructor prints one (value, count) pair per distinct id.
test_dataset = Dataset(pre_text_test, id_test)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 1),

### predicting function

In [None]:
def predict(model, test_dataset):
  """Return `(ids, predicted_class_indices)` for every sample in `test_dataset`.

  Args:
    model: called as `model(input_ids, attention_mask)`; must return class logits.
    test_dataset: dataset whose items are `(inputs, id)` pairs, where `inputs`
      has the keys 'input_ids' and 'attention_mask'.

  Returns:
    Two lists in dataloader order: the ids and the argmax class of each sample.
  """
  model.eval()

  loader = torch.utils.data.DataLoader(test_dataset, batch_size=2)
  sample_ids, predictions = [], []

  with torch.no_grad():
    for batch_inputs, batch_ids in tqdm(loader):
      attention = batch_inputs['attention_mask'].to(device)
      # Drop the extra dimension at index 1 of the stored token ids.
      token_ids = batch_inputs['input_ids'].squeeze(1).to(device)

      logits = model(token_ids, attention)

      sample_ids.extend(batch_ids.tolist())
      predictions.extend(logits.argmax(dim=1).tolist())

  return sample_ids, predictions

### Loading the stance model and evaluating on dev set

In [None]:
EPOCHS = 10
OUTPUT_DIM = 3
stance_model = BertClassifier(OUTPUT_DIM) 

stance_model.to(device)
# map_location lets a checkpoint that was saved from a GPU be restored on whatever
# `device` resolved to, including a CPU-only runtime.
stance_model.load_state_dict(
    torch.load('Models/BERT/Stance/Copy of F1 0.6521 Acc 0.8070.pt', map_location=device)
)
val_dataloader = torch.utils.data.DataLoader(stance_dev_dataset, batch_size=2)
criterion = nn.CrossEntropyLoss()

# Re-score the restored stance model on the dev set.
loss_val, acc_val, f1_macro_val = evaluate(stance_model, stance_dev_dataset, val_dataloader, criterion)
print(
    f'Val Loss: {loss_val: .4f} \
    | Val Accuracy: {acc_val: .4f} \
    | Val F1_macro: {f1_macro_val: .4f} \
    ')

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02-twitter were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dens

Val Loss:  0.2994     | Val Accuracy:  0.8070     | Val F1_macro:  0.6521     


### Loading the category model and evaluating on dev set

In [None]:
EPOCHS = 10
OUTPUT_DIM = 10
category_model = BertClassifier(OUTPUT_DIM) 

category_model.to(device)
# map_location lets a checkpoint that was saved from a GPU be restored on whatever
# `device` resolved to, including a CPU-only runtime.
category_model.load_state_dict(
    torch.load('Models/BERT/Category/Copy of F1 0.3987 Acc 0.5540.pt', map_location=device)
)
val_dataloader = torch.utils.data.DataLoader(category_dev_dataset, batch_size=2)
criterion = nn.CrossEntropyLoss()

# Re-score the restored category model on the dev set.
loss_val, acc_val, f1_macro_val = evaluate(category_model, category_dev_dataset, val_dataloader, criterion)
print(
    f'Val Loss: {loss_val: .4f} \
    | Val Accuracy: {acc_val: .4f} \
    | Val F1_macro: {f1_macro_val: .4f} \
    ')

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02-twitter were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dens

Val Loss:  0.6393     | Val Accuracy:  0.5540     | Val F1_macro:  0.3987     


### Stance prediction

In [None]:
# Predict a stance class index (0..2) for every test tweet.
stance_y_id, stance_y_pred = predict(stance_model, test_dataset)

100%|██████████| 1000/1000 [01:03<00:00, 15.68it/s]


In [None]:
# Undo the +1 shift applied to the stance labels when the stance datasets were built.
# NOTE(review): this rebinds `stance_y_pred`, so running the cell twice subtracts 1 twice.
stance_y_pred = np.array(stance_y_pred) - 1

In [None]:
# Count how many test tweets were assigned to each stance value.
print(sorted(Counter(stance_y_pred).items()))

[(-1, 211), (0, 381), (1, 1408)]


### Category prediction

In [None]:
# Predict a category class index for every test tweet.
category_y_id, category_y_pred = predict(category_model, test_dataset)

100%|██████████| 1000/1000 [01:04<00:00, 15.48it/s]


In [None]:
# Translate predicted class indices back into category names.
# NOTE(review): this rebinds `category_y_pred`; on a second run the values are already
# names, while the keys of `id2category` are ints, so the lookup would fail.
category_y_pred = np.array([id2category[p] for p in category_y_pred])

In [None]:
# Count how many test tweets were assigned to each category.
print(sorted(Counter(category_y_pred).items()))

[('advice', 24), ('celebrity', 312), ('info_news', 557), ('others', 47), ('personal', 397), ('plan', 490), ('requests', 46), ('restrictions', 10), ('rumors', 44), ('unrelated', 73)]


### Outputting predictions to a CSV file

In [None]:
# Assemble the submission table, one row per test tweet. Ids come from the category
# pass; both prediction passes iterate the same `test_dataset`.
df = pd.DataFrame(
    dict(id=category_y_id, category=category_y_pred, stance=stance_y_pred)
)

In [None]:
# Preview the first ten rows of the submission table.
df.head(10)

Unnamed: 0,id,category,stance
0,0,celebrity,1
1,1,unrelated,0
2,2,personal,0
3,3,restrictions,0
4,4,restrictions,1
5,5,info_news,1
6,6,celebrity,1
7,7,personal,1
8,8,personal,1
9,9,plan,1


In [None]:
# Write the submission as a comma-separated file in the working directory, without the index column.
df.to_csv('prediction.csv', sep=',', index=False)