# Importing libraries

In [23]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import re
import nltk
from tqdm import tqdm
from sklearn.metrics import f1_score

# Data PreProcessing

In [24]:
train = pd.read_csv('Dataset/train.csv')
text_train, category_train, stance_train = train['text'], train['category'], train['stance']

dev = pd.read_csv('Dataset/dev.csv')
text_dev, category_dev, stance_dev = dev['text'], dev['category'], dev['stance']

In [25]:
train.head(10)

Unnamed: 0,text,category,stance
0,بيل غيتس يتلقى لقاح #كوفيد19 من غير تصوير الاب...,celebrity,1
1,وزير الصحة لحد اليوم وتحديدا هلأ بمؤتمروا الصح...,info_news,1
2,قولكن رح يكونو اد المسؤولية ب لبنان لما يوصل ...,info_news,1
3,#تركيا.. وزير الصحة فخر الدين قوجة يتلقى أول ج...,celebrity,1
4,وئام وهاب يشتم الدول الخليجية في كل طلة اعلامي...,personal,0
5,"لقاح #كورونا في أميركا.. قلق متزايد من ""التوزي...",info_news,0
6,لبنان اشترى مليونان لقاح امريكي اذا شلنا يلي ع...,info_news,1
7,من عوارض لقاح كورونا<LF>هو تهكير حسابك عتويتر<...,personal,0
8,هناك 1780 مليونيراً في لبنان. ماذا لو فُرضت ال...,unrelated,0
9,دعبول حضرتك منو انت وتطلب من قائد دولة إسلامية...,info_news,1


In [26]:
dev.head(10)

Unnamed: 0,text,category,stance
0,#مريم_رجوي: <LF>حظر خامنئي المجرم شراء #لقاح_ك...,info_news,1
1,#الصحة:<LF>•تم إعطاء 259.530 جرعة من لقاح #كور...,plan,1
2,#خادم_الحرمين - حفظه الله - يتلقى الجرعة الأول...,celebrity,1
3,#الصحه_العالميه: لقاحات #كورونا آمنة ولا خوف م...,info_news,1
4,"#وزيرة_الصحة ""#هالة_زايد"" تقول إنه يجرى مراجعة...",info_news,1
5,2️⃣ وانتهى الفريق من الدراسات قبل السريرية ونش...,info_news,1
6,عاجل 🔴 <LF>.<LF><LF>.<LF><LF>وزارة الصحة :<LF>...,plan,1
7,#فيديو | السفير الأميركي لدى السعودية بعد تلقي...,info_news,1
8,تصريحات وبس الحكومة مع السيسي علي حسب اللقطة! ...,info_news,0
9,الاتحاد الاوروبي تفاوض لشراء لقاحات الكورونا م...,info_news,1


In [27]:
text_train, category_train, stance_train = np.array(train['text']), np.array(train['category']), np.array(train['stance'])
text_dev, category_dev, stance_dev = np.array(dev['text']), np.array(dev['category']), np.array(dev['stance'])

print(text_train.shape, category_train.shape, stance_train.shape)
print(text_dev.shape, category_dev.shape, stance_dev.shape)

(6988,) (6988,) (6988,)
(1000,) (1000,) (1000,)


In [28]:
def PreProcessing(text):

    # remove links
    text = [re.sub(r'https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE) for x in text]
    # text = [re.sub(r'https?:\/\/\S*', '', x, flags=re.MULTILINE) for x in text]

    # remove emojis
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    
    text = [emoji_pattern.sub(r'', x) for x in text] # no emoji

    # remove english words
    text = [re.sub(r'\s*[A-Za-z]+\b', '' , x) for x in text]

    # tokenize
    text = [nltk.tokenize.word_tokenize(x) for x in text]

    # # remove stop-words
    # stopwords = set(nltk.corpus.stopwords.words("arabic"))

    # for i in range(len(text)):
    #     text[i] = [word for word in text[i] if word not in stopwords]


    for i in range(len(text)):
        text[i] = [word for word in text[i] if len(word)>2]

    # but anything in empty strings
    for i in range(len(text)):
        if(len(text[i])==0):
            text[i]='<unk>'
    
    return text

In [31]:
from arabert.preprocess import ArabertPreprocessor
from transformers import AutoTokenizer, AutoModelForMaskedLM

model_name="aubmindlab/bert-base-arabertv02-twitter"
arabert_prep = ArabertPreprocessor(model_name=model_name)

text = "ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري"
arabert_prep.preprocess(text)
  
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02-twitter")
model = AutoModelForMaskedLM.from_pretrained("aubmindlab/bert-base-arabertv02-twitter")

def PreProcessing2(text):
    # print(text[10])

    text = [arabert_prep.preprocess(x) for x in text]
    # print(text[10])
    
    text = tokenizer(text)
    # print(len(text["input_ids"][10]),text["input_ids"][10])

    text = torch.nn.utils.rnn.pad_sequence([torch.tensor(sentence) for sentence in text["input_ids"]], batch_first=True, padding_value=0)
    # print(text[10])

    text = model(text).logits
    # print(text[10].shape)

    return text

In [33]:
PreProcessing2(text_train[:512])


In [9]:
# text = text_train[2]
# print(text)
# text = arabert_prep.preprocess(text)
# print(text)
# text = tokenizer(text)
# print(print(text),torch.tensor(text['input_ids']).unsqueeze(0).shape)
# text = model(torch.tensor(text['input_ids']).unsqueeze(0))
# print(text)

In [10]:
print(max(text_train, key=len))
text_train = PreProcessing(text_train)
text_dev = PreProcessing(text_dev)

الامريكيين متهمون بصنع ونشر فيروس كورونا ولذلك لا يمكن الوثوق بهم”<LF>الإمام الخامنئي<LF><LF>#لقاح_آمن	info_news	0
train	حبيبنا وقرة أعيننا سيدي #خادم_الحرمين_الشريفين الملك سلمان حفظه الله يتلقى الجرعة الأولى من لقاح كورنا … نفعه الله به ومتعه بالصحة والعافيه. https://t.co/AJRzC7dCWe	celebrity	1
train	رغم تلقيه جرعتين من لقاح #فايزر.. إصابة كبير حاخامات #تل_أبيب، يسرائيل لاو،  83 عاما، بفيروس #كورونا، حيث انتقلت له العدوى من زوجته بعد مخالطتها مصابا آخر https://t.co/RGI6WTgrxf	celebrity	0
train	تلقيت قبل قليل الجرعة الثانية من لقاح كورونا، وكلي فخر بجهود وطننا الغالي وتوجيهات قيادتنا الرشيدة التي تؤكد أن صحة الإنسان أولاً.  🇸🇦🇸🇦🇸🇦🇸🇦 https://t.co/XGstr9Zvzf	info_news	1
train	شركة صحة": جزيل الشكر للمواطنة ملهية شويرب سعيد العامري، التي تبلغ ١٠٢ عاماً<LF>لكونها قدوة لجميع أفراد المجتمع من خلال <LF> تلقيها أول جرعة من لقاح كوفيد-19 في مركز القوع الصحي #الإمارات_اليوم https://t.co/uBSCd0JZ4Y


In [11]:
with open('processed_train.txt','w', encoding='utf8') as f:
	for i in text_train:
		f.write('%s\n'%i)
print(text_train[0])

['بيل', 'غيتس', 'يتلقى', 'لقاح', 'كوفيد19', 'غير', 'تصوير', 'الابرة', 'السيرنجة', 'الدواء', 'لابس', 'بولو', 'صيفي', 'الشتاء', 'يقول', 'إحدى', 'مزايا', 'عمر', 'عامًا', 'انه', 'مؤهل', 'للحصول', 'على', 'اللقاح', '...', 'يعنى', 'كان', 'يحتاج', 'اللقاح', 'كان', 'عمره', 'اصغر']


In [12]:
def BuildVocab(text, pad='<pad>', unk='<unk>'):

    vocab = set()    
    for x in text:
        vocab |= set(x)

    vocab = [pad, unk] + list(vocab)

    id2word = {i: word for i, word in enumerate(vocab)}
    word2id = {word: i for i, word in id2word.items()}
    vocab_size = len(vocab)

    return vocab_size, vocab, id2word, word2id    

In [13]:
vocab_size, vocab, id2word, word2id = BuildVocab(text_train)
print(vocab_size)

32058


In [14]:
categories = set(category_train)
print(categories)
category2id = {word:i for i, word in enumerate(list(categories))}
print(category2id['celebrity'])

{'others', 'personal', 'restrictions', 'info_news', 'rumors', 'unrelated', 'celebrity', 'advice', 'plan', 'requests'}
6


# Model Building

## LSTM

### Ideas to try
1) bi-directional
2) pre-training
3) multi-layers
4) BERT
5) transformers notebook
6) packed_padded_sequences
7) pre-trained embedding

### Building Model

In [15]:
class Dataset(torch.utils.data.Dataset):

  def __init__(self, x, y, pad='<pad>', unk='<unk>', word2id=word2id):

    x = x.copy()

    # src lengths to be used in pack padded
    self.seq_lengths = torch.LongTensor(list(map(len, x)))

    print(x[0], self.seq_lengths[0])

    for i in range(len(x)):
      x[i] = [word2id[word] if word in word2id else word2id[unk] for word in x[i]]
    
    print(x[0])

    self.X = torch.nn.utils.rnn.pad_sequence([torch.tensor(sentence) for sentence in x], batch_first=True, padding_value=word2id[pad])

    # sort sequeces decreasing in size
    self.seq_lengths, perm_idx = self.seq_lengths.sort(0, descending=True)
    self.X = self.X[perm_idx]

    print(self.X[0])

    print(self.X.shape)

    print(min(self.seq_lengths))
    
    self.Y = torch.tensor(y)
    self.len = len(x)
    self.pad = pad

  def __len__(self):
    return self.len

  def __getitem__(self, idx):
    return self.X[idx], self.Y[idx], self.seq_lengths[idx]

In [16]:
stance_train_dataset = Dataset(text_train, stance_train + 1)
category_train_dataset = Dataset(text_train, [category2id[category] for category in category_train])

stance_dev_dataset = Dataset(text_dev, stance_dev + 1)
category_dev_dataset = Dataset(text_dev, [category2id[category] for category in category_dev])

['بيل', 'غيتس', 'يتلقى', 'لقاح', 'كوفيد19', 'غير', 'تصوير', 'الابرة', 'السيرنجة', 'الدواء', 'لابس', 'بولو', 'صيفي', 'الشتاء', 'يقول', 'إحدى', 'مزايا', 'عمر', 'عامًا', 'انه', 'مؤهل', 'للحصول', 'على', 'اللقاح', '...', 'يعنى', 'كان', 'يحتاج', 'اللقاح', 'كان', 'عمره', 'اصغر'] tensor(32)
[19009, 24724, 1963, 12351, 17173, 11074, 28364, 11439, 28735, 3431, 30322, 20742, 11516, 18377, 24930, 20171, 7087, 12166, 16670, 27485, 27990, 30191, 19956, 28836, 16585, 21888, 30814, 15008, 28836, 30814, 1942, 31652]
tensor([27908, 23445, 22255, 12523, 16578, 13183, 18058,  7816, 14310, 23064,
        12393, 18856, 23126, 13955, 25158, 27186,  1427, 19281, 14591, 29717,
        19131, 30914, 22503,  1963, 10699, 25253, 12351, 26524, 14492, 22503,
        29403, 18659, 29603,  1341, 29876, 23100, 12351,   366, 27996,  9990,
         2718, 29625, 23961, 20627, 11784, 13069,  9861, 14021,  5904, 31739,
        15402,  1334,  6912, 24401, 15860, 20931, 22684, 10761, 10699,  1149,
        12351,  9861, 23599

In [17]:
class LSTM(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, output_dim, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        self.output_dim = output_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, dropout=dropout)

        self.fc_out = nn.Linear(hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_len):
        #src = [src len, batch size]
        
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)
        #embedded = [src len, batch size, emb dim]

        packed_embedded =  torch.nn.utils.rnn.pack_padded_sequence(embedded, src_len.cpu().numpy(), batch_first=False)

        outputs, (hidden, cell) = self.lstm(packed_embedded)
        #outputs = [src len, batch size, hid dim]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        #outputs are always from the top hidden layer

        prediction = self.fc_out(hidden)
        #prediction = [1, batch size, output dim]

        prediction = prediction.squeeze(0)
        #prediction = [batch size, output dim]

        return prediction        

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Training

In [19]:
def train(model, train_dataset, train_dataloader, optimizer, criterion, clip):
    
    model.train()
        
    total_acc_train = 0.0
    total_loss_train = 0.0

    for train_input, train_label, src_len in tqdm(train_dataloader):

        train_input = train_input.to(device).permute(1, 0)
        # print(train_input.shape)
        train_label = train_label.to(device)

        output = model(train_input, src_len)
        # print(output.shape, train_label.shape)
        
        output_dim = output.shape[-1]
        output = output.view(-1, output_dim)
        train_label = train_label.view(-1)

        batch_loss = criterion(output, train_label)

        total_loss_train += batch_loss
        
        acc = torch.sum(torch.argmax(output, -1) == train_label) 
        total_acc_train += acc

        optimizer.zero_grad()

        batch_loss.backward()

        # torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()

    # calculate loss    
    epoch_loss = total_loss_train / len(train_dataset)
    
    # calculate accuracy
    epoch_acc = total_acc_train / len(train_dataset)

    # calculate f1 score
    train_input, train_label, src_len = train_dataset[:]
    train_input = train_input.to(device).permute(1, 0)
    train_label = train_label.to(device)
    output = model(train_input, src_len)

    output_dim = output.shape[-1]
    output = output.view(-1, output_dim)
    train_label = train_label.view(-1)
    y_true, y_pred = train_label, torch.argmax(output, -1)

    f1_macro = f1_score(y_true, y_pred, average='macro')

    return epoch_loss, epoch_acc, f1_macro

In [20]:
def evaluate(model, test_dataset, test_dataloader, criterion):

  model.eval()

  total_acc_test = 0.0
  total_loss_test = 0.0
  
  with torch.no_grad():

    for test_input, test_label, src_len in tqdm(test_dataloader):

      test_input = test_input.to(device).permute(1, 0)
      test_label = test_label.to(device)


      output = model(test_input, src_len)

      batch_loss = criterion(output.view(-1, model.output_dim), test_label.view(-1))

      total_loss_test += batch_loss

      acc = torch.sum(torch.argmax(output, -1)==test_label)
      total_acc_test += acc

    # calculate loss
    total_loss_test /= len(test_dataset)

    # calculate accuracy
    total_acc_test /= len(test_dataset)

    # calculate f1 score
    test_input, test_label, src_len = test_dataset[:]
    test_input = test_input.to(device).permute(1, 0)
    test_label = test_label.to(device)
    output = model(test_input, src_len)

    output_dim = output.shape[-1]
    output = output.view(-1, output_dim)
    test_label = test_label.view(-1)
    y_true, y_pred = test_label, torch.argmax(output, -1)

    f1_macro = f1_score(y_true, y_pred, average='macro')
  
  return total_loss_test, total_acc_test, f1_macro

In [21]:
def train_evaluate(model, train_dataset, dev_dataset, model_name, batch_size=512, epochs=10, learning_rate=0.01, clip=1):

  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)

  dev_dataloader = torch.utils.data.DataLoader(dev_dataset, batch_size=batch_size)

  # criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)
  criterion = nn.CrossEntropyLoss()

  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  
  model = model.to(device)
  criterion = criterion.to(device)

  best_f1_macro = 0

  for epoch_num in range(epochs):

    epoch_loss, epoch_acc, train_f1_macro = train(model, train_dataset, train_dataloader, optimizer, criterion, clip)
    dev_loss, dev_acc, dev_f1_macro = evaluate(model, dev_dataset, dev_dataloader, criterion)

    if dev_f1_macro > best_f1_macro:
      best_f1_macro = dev_f1_macro
      torch.save(model.state_dict(), 'best_'+model_name+'.pt')

    print(f'Train = Epochs: {epoch_num + 1} | Loss: {epoch_loss} | Accuracy: {epoch_acc} | f1_macro : {train_f1_macro}')
    print(f'Dev = Epochs: {epoch_num + 1} | Loss: {dev_loss} | Accuracy: {dev_acc} | f1_macro : {dev_f1_macro}')    

  model.load_state_dict(torch.load('best_'+model_name+'.pt'))

  dev_loss, dev_acc, dev_f1_macro = evaluate(model, dev_dataset, dev_dataloader, criterion)

  print(f'Best Dev = Loss: {dev_loss} | Accuracy: {dev_acc} | f1_macro : {dev_f1_macro}')

In [22]:
INPUT_DIM = vocab_size
OUTPUT_DIM = 3
EMB_DIM = 50 #256
HID_DIM = 50gi #512
DROPOUT = 0.5

stance_model = LSTM(INPUT_DIM, EMB_DIM, HID_DIM, OUTPUT_DIM, DROPOUT).to(device)

def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
stance_model.apply(init_weights)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(stance_model):,} trainable parameters')

SyntaxError: invalid decimal literal (4236606597.py, line 4)

In [None]:
train_evaluate(stance_model, stance_train_dataset, stance_dev_dataset, 'stance_model')

In [None]:
INPUT_DIM = vocab_size
OUTPUT_DIM = 10
EMB_DIM = 50 #256
HID_DIM = 50 #512
DROPOUT = 0.5

category_model = LSTM(INPUT_DIM, EMB_DIM, HID_DIM, OUTPUT_DIM, DROPOUT).to(device)

def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
category_model.apply(init_weights)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(category_model):,} trainable parameters')

In [None]:
train_evaluate(category_model, category_train_dataset, category_dev_dataset, 'category_model')

## Dumped

In [None]:
evaluate(stance_model, stance_dev_dataset)

In [None]:
evaluate(category_model, category_dev_dataset)

In [None]:
def train(model, train_dataset, batch_size=512, epochs=10, learning_rate=0.01):
  
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)

  # criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)
  criterion = nn.CrossEntropyLoss()

  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  
  model = model.to(device)
  criterion = criterion.to(device)

  for epoch_num in range(epochs):
    total_acc_train = 0.0
    total_loss_train = 0.0

    for train_input, train_label, src_len in tqdm(train_dataloader):

      train_input = train_input.to(device).permute(1, 0)
      # print(train_input.shape)
      train_label = train_label.to(device)

      output = model(train_input, src_len)

      # print(output.shape, train_label.shape)
      
      batch_loss = criterion(output.view(-1, model.output_dim), train_label.view(-1))

      total_loss_train += batch_loss
      
      acc = torch.sum(torch.argmax(output, -1) == train_label) 
      total_acc_train += acc

      optimizer.zero_grad()

      batch_loss.backward()

      optimizer.step()
      
    epoch_loss = total_loss_train / len(train_dataset)

    epoch_acc = total_acc_train / len(train_dataset)

    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')


In [None]:
def evaluate(model, test_dataset, batch_size=512):

  test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  model = model.to(device)

  total_acc_test = 0.0
  
  with torch.no_grad():

    for test_input, test_label, src_len in tqdm(test_dataloader):

      test_input = test_input.to(device).permute(1, 0)
      test_label = test_label.to(device)


      output = model(test_input, src_len)

      acc = torch.sum(torch.argmax(output, -1)==test_label)
      total_acc_test += acc
    
    total_acc_test /= len(test_dataset)
  
  print(f'\nDev Accuracy: {total_acc_test}')