# Importing libraries

In [977]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import re
import nltk
from tqdm import tqdm

# Data PreProcessing

In [978]:
train = pd.read_csv('Dataset/train.csv')
text_train, category_train, stance_train = train['text'], train['category'], train['stance']

dev = pd.read_csv('Dataset/dev.csv')
text_dev, category_dev, stance_dev = dev['text'], dev['category'], dev['stance']

In [979]:
train.head(10)

Unnamed: 0,text,category,stance
0,بيل غيتس يتلقى لقاح #كوفيد19 من غير تصوير الاب...,celebrity,1
1,وزير الصحة لحد اليوم وتحديدا هلأ بمؤتمروا الصح...,info_news,1
2,قولكن رح يكونو اد المسؤولية ب لبنان لما يوصل ...,info_news,1
3,#تركيا.. وزير الصحة فخر الدين قوجة يتلقى أول ج...,celebrity,1
4,وئام وهاب يشتم الدول الخليجية في كل طلة اعلامي...,personal,0
5,"لقاح #كورونا في أميركا.. قلق متزايد من ""التوزي...",info_news,0
6,لبنان اشترى مليونان لقاح امريكي اذا شلنا يلي ع...,info_news,1
7,من عوارض لقاح كورونا<LF>هو تهكير حسابك عتويتر<...,personal,0
8,هناك 1780 مليونيراً في لبنان. ماذا لو فُرضت ال...,unrelated,0
9,دعبول حضرتك منو انت وتطلب من قائد دولة إسلامية...,info_news,1


In [980]:
dev.head(10)

Unnamed: 0,text,category,stance
0,#مريم_رجوي: <LF>حظر خامنئي المجرم شراء #لقاح_ك...,info_news,1
1,#الصحة:<LF>•تم إعطاء 259.530 جرعة من لقاح #كور...,plan,1
2,#خادم_الحرمين - حفظه الله - يتلقى الجرعة الأول...,celebrity,1
3,#الصحه_العالميه: لقاحات #كورونا آمنة ولا خوف م...,info_news,1
4,"#وزيرة_الصحة ""#هالة_زايد"" تقول إنه يجرى مراجعة...",info_news,1
5,2️⃣ وانتهى الفريق من الدراسات قبل السريرية ونش...,info_news,1
6,عاجل 🔴 <LF>.<LF><LF>.<LF><LF>وزارة الصحة :<LF>...,plan,1
7,#فيديو | السفير الأميركي لدى السعودية بعد تلقي...,info_news,1
8,تصريحات وبس الحكومة مع السيسي علي حسب اللقطة! ...,info_news,0
9,الاتحاد الاوروبي تفاوض لشراء لقاحات الكورونا م...,info_news,1


In [981]:
text_train, category_train, stance_train = np.array(train['text']), np.array(train['category']), np.array(train['stance'])
text_dev, category_dev, stance_dev = np.array(dev['text']), np.array(dev['category']), np.array(dev['stance'])

print(text_train.shape, category_train.shape, stance_train.shape)
print(text_dev.shape, category_dev.shape, stance_dev.shape)

(6988,) (6988,) (6988,)
(1000,) (1000,) (1000,)


In [982]:
def PreProcessing(text):

    # remove links
    text = [re.sub(r'https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE) for x in text]
    # text = [re.sub(r'https?:\/\/\S*', '', x, flags=re.MULTILINE) for x in text]

    # remove emojis
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    
    text = [emoji_pattern.sub(r'', x) for x in text] # no emoji

    # remove english words
    text = [re.sub(r'\s*[A-Za-z]+\b', '' , x) for x in text]

    # tokenize
    text = [nltk.tokenize.word_tokenize(x) for x in text]

    # # remove stop-words
    # stopwords = set(nltk.corpus.stopwords.words("arabic"))

    # for i in range(len(text)):
    #     text[i] = [word for word in text[i] if word not in stopwords]


    for i in range(len(text)):
        text[i] = [word for word in text[i] if len(word)>2]
    
    return text

In [983]:
text_train = PreProcessing(text_train)
text_dev = PreProcessing(text_dev)

In [984]:
with open('processed_train.txt','w', encoding='utf8') as f:
	for i in text_train:
		f.write('%s\n'%i)
print(text_train[0])

['بيل', 'غيتس', 'يتلقى', 'لقاح', 'كوفيد19', 'غير', 'تصوير', 'الابرة', 'السيرنجة', 'الدواء', 'لابس', 'بولو', 'صيفي', 'الشتاء', 'يقول', 'إحدى', 'مزايا', 'عمر', 'عامًا', 'انه', 'مؤهل', 'للحصول', 'على', 'اللقاح', '...', 'يعنى', 'كان', 'يحتاج', 'اللقاح', 'كان', 'عمره', 'اصغر']


In [985]:
def BuildVocab(text, pad='<pad>', unk='<unk>'):
    vocab = set([pad, unk])
    
    for x in text:
        vocab |= set(x)

    id2word = {i: word for i, word in enumerate(list(vocab))}
    word2id = {word: i for i, word in id2word.items()}
    vocab_size = len(vocab)

    return vocab_size, vocab, id2word, word2id    

In [986]:
vocab_size, vocab, id2word, word2id = BuildVocab(text_train)
print(vocab_size)

32053


In [987]:
categories = set(category_train)
print(categories)
category2id = {word:i for i, word in enumerate(list(categories))}
print(category2id['celebrity'])

{'personal', 'requests', 'advice', 'celebrity', 'restrictions', 'rumors', 'plan', 'info_news', 'others', 'unrelated'}
3


# Model Building

## LSTM

### Ideas to try
1) bi-directional
2) pre-training
3) multi-layers
4) BERT
5) transformers notebook

### Building Model

In [988]:
class Dataset(torch.utils.data.Dataset):

  def __init__(self, x, y, pad='<pad>', unk='<unk>', word2id=word2id):

    x = x.copy()

    # print(x[0])

    for i in range(len(x)):
      x[i] = [word2id[word] if word in word2id else word2id[unk] for word in x[i]]
    
    # print(x[0])

    self.X = torch.nn.utils.rnn.pad_sequence([torch.tensor(sentence) for sentence in x], batch_first=True, padding_value=word2id[pad])

    # print(self.X[0])

    # print(self.X.shape)
    
    self.Y = torch.tensor(y)
    self.len = len(x)
    self.pad = pad

  def __len__(self):
    return self.len

  def __getitem__(self, idx):
    return self.X[idx], self.Y[idx]

In [989]:
stance_train_dataset = Dataset(text_train, stance_train + 1)
category_train_dataset = Dataset(text_train, [category2id[category] for category in category_train])

stance_dev_dataset = Dataset(text_dev, stance_dev + 1)
category_dev_dataset = Dataset(text_dev, [category2id[category] for category in category_dev])

In [990]:
class LSTM(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, output_dim, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        self.output_dim = output_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, dropout=dropout)

        self.fc_out = nn.Linear(hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        #src = [src len, batch size]
        
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)
        #embedded = [src len, batch size, emb dim]

        outputs, (hidden, cell) = self.lstm(embedded)
        #outputs = [src len, batch size, hid dim]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        #outputs are always from the top hidden layer

        prediction = self.fc_out(hidden)
        #prediction = [1, batch size, output dim]

        prediction = prediction.squeeze(0)
        #prediction = [batch size, output dim]

        return prediction        

In [991]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Training

In [992]:

def train(model, train_dataset, batch_size=512, epochs=5, learning_rate=0.01):
  
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)

  # criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)
  criterion = nn.CrossEntropyLoss()

  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  
  model = model.to(device)
  criterion = criterion.to(device)

  for epoch_num in range(epochs):
    total_acc_train = 0.0
    total_loss_train = 0.0

    for train_input, train_label in tqdm(train_dataloader):

      train_input = train_input.to(device).permute(1, 0)
      # print(train_input.shape)
      train_label = train_label.to(device)

      output = model(train_input)

      # print(output.shape, train_label.shape)
      
      batch_loss = criterion(output.view(-1, model.output_dim), train_label.view(-1))

      total_loss_train += batch_loss
      
      acc = torch.sum(torch.argmax(output, -1) == train_label) 
      total_acc_train += acc

      optimizer.zero_grad()

      batch_loss.backward()

      optimizer.step()
      
    epoch_loss = total_loss_train / len(train_dataset)

    epoch_acc = total_acc_train / len(train_dataset)

    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')


In [993]:
INPUT_DIM = vocab_size
OUTPUT_DIM = 3
EMB_DIM = 50 #256
HID_DIM = 50 #512
DROPOUT = 0.0

stance_model = LSTM(INPUT_DIM, EMB_DIM, HID_DIM, OUTPUT_DIM, DROPOUT).to(device)

In [994]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
stance_model.apply(init_weights)

LSTM(
  (embedding): Embedding(32053, 50)
  (lstm): LSTM(50, 50)
  (fc_out): Linear(in_features=50, out_features=3, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

In [995]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(stance_model):,} trainable parameters')

The model has 1,623,203 trainable parameters


In [996]:
train(stance_model, stance_train_dataset)

100%|██████████| 14/14 [00:04<00:00,  3.00it/s]


Epochs: 1 | Train Loss: 0.0015010071219876409         | Train Accuracy: 0.7492845058441162



100%|██████████| 14/14 [00:05<00:00,  2.41it/s]


Epochs: 2 | Train Loss: 0.0012770949397236109         | Train Accuracy: 0.7925014495849609



100%|██████████| 14/14 [00:05<00:00,  2.42it/s]


Epochs: 3 | Train Loss: 0.0012749889865517616         | Train Accuracy: 0.7925014495849609



100%|██████████| 14/14 [00:05<00:00,  2.72it/s]


Epochs: 4 | Train Loss: 0.001277221948839724         | Train Accuracy: 0.7925014495849609



100%|██████████| 14/14 [00:04<00:00,  2.85it/s]

Epochs: 5 | Train Loss: 0.0012790737673640251         | Train Accuracy: 0.7925014495849609






In [997]:
INPUT_DIM = vocab_size
OUTPUT_DIM = 10
EMB_DIM = 50 #256
HID_DIM = 50 #512
DROPOUT = 0.0

category_model = LSTM(INPUT_DIM, EMB_DIM, HID_DIM, OUTPUT_DIM, DROPOUT).to(device)

In [998]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
category_model.apply(init_weights)

LSTM(
  (embedding): Embedding(32053, 50)
  (lstm): LSTM(50, 50)
  (fc_out): Linear(in_features=50, out_features=10, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

In [999]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(category_model):,} trainable parameters')

The model has 1,623,560 trainable parameters


In [1000]:
train(category_model, category_train_dataset)

100%|██████████| 14/14 [00:05<00:00,  2.43it/s]


Epochs: 1 | Train Loss: 0.0034677402582019567         | Train Accuracy: 0.48282769322395325



100%|██████████| 14/14 [00:05<00:00,  2.50it/s]


Epochs: 2 | Train Loss: 0.00305797066539526         | Train Accuracy: 0.5174584984779358



100%|██████████| 14/14 [00:05<00:00,  2.77it/s]


Epochs: 3 | Train Loss: 0.003039069939404726         | Train Accuracy: 0.5174584984779358



100%|██████████| 14/14 [00:05<00:00,  2.78it/s]


Epochs: 4 | Train Loss: 0.003038821741938591         | Train Accuracy: 0.5174584984779358



100%|██████████| 14/14 [00:05<00:00,  2.56it/s]

Epochs: 5 | Train Loss: 0.00303946016356349         | Train Accuracy: 0.5174584984779358






## Evaluation

In [1001]:
def evaluate(model, test_dataset, batch_size=512):

  test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  model = model.to(device)

  total_acc_test = 0.0
  
  with torch.no_grad():

    for test_input, test_label in tqdm(test_dataloader):

      test_input = test_input.to(device).permute(1, 0)
      test_label = test_label.to(device)


      output = model(test_input)

      acc = torch.sum(torch.argmax(output, -1)==test_label)
      total_acc_test += acc
    
    total_acc_test /= len(test_dataset)
  
  print(f'\nDev Accuracy: {total_acc_test}')

In [1004]:
evaluate(stance_model, stance_dev_dataset)

100%|██████████| 2/2 [00:00<00:00,  8.50it/s]


Dev Accuracy: 0.8040000200271606





In [1005]:
evaluate(category_model, category_dev_dataset)

100%|██████████| 2/2 [00:00<00:00, 11.46it/s]


Dev Accuracy: 0.5450000166893005



