<a href="https://colab.research.google.com/github/beltagy97/NADI_Shared_Task/blob/master/BERT_sequence_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing requirements

In [0]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P4


In [0]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |▋                               | 10kB 22.5MB/s eta 0:00:01[K     |█▏                              | 20kB 27.4MB/s eta 0:00:01[K     |█▊                              | 30kB 31.5MB/s eta 0:00:01[K     |██▎                             | 40kB 35.0MB/s eta 0:00:01[K     |███                             | 51kB 36.2MB/s eta 0:00:01[K     |███▌                            | 61kB 38.6MB/s eta 0:00:01[K     |████                            | 71kB 39.3MB/s eta 0:00:01[K     |████▋                           | 81kB 39.6MB/s eta 0:00:01[K     |█████▎                          | 92kB 40.5MB/s eta 0:00:01[K     |█████▉                          | 102kB 40.2MB/s eta 0:00:01[K     |██████▍                         | 112kB 40.2MB/s eta 0:00:01[K     |███████                         | 

In [0]:
!pip install pyarabic

Collecting pyarabic
[?25l  Downloading https://files.pythonhosted.org/packages/ee/59/2c7efe30a789c1dfd3c5c15b9b06fcde8cde67ff1c27adabb78692eb7f7f/PyArabic-0.6.7.tar.gz (103kB)
[K     |███▏                            | 10kB 24.6MB/s eta 0:00:01[K     |██████▍                         | 20kB 31.7MB/s eta 0:00:01[K     |█████████▌                      | 30kB 36.4MB/s eta 0:00:01[K     |████████████▊                   | 40kB 38.2MB/s eta 0:00:01[K     |███████████████▉                | 51kB 20.9MB/s eta 0:00:01[K     |███████████████████             | 61kB 19.3MB/s eta 0:00:01[K     |██████████████████████▏         | 71kB 16.4MB/s eta 0:00:01[K     |█████████████████████████▍      | 81kB 16.0MB/s eta 0:00:01[K     |████████████████████████████▌   | 92kB 16.5MB/s eta 0:00:01[K     |███████████████████████████████▊| 102kB 16.0MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 16.0MB/s 
[?25hBuilding wheels for collected packages: pyarabic
  Building wheel 

In [0]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
model = AutoModel.from_pretrained("asafaya/bert-base-arabic")

In [0]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    "asafaya/bert-base-arabic", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 21, 
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# reading data

In [0]:
labels_dictionary = {'Iraq':0,
'Egypt': 1,
'Algeria':2,
'Yemen':3,
'Saudi_Arabia':4,
'Syria':5,
'United_Arab_Emirates':6,
'Oman':7,
'Jordan':8,
'Tunisia':9,
'Kuwait':10,
'Morocco':11,
'Libya':12,
'Qatar':13,
'Lebanon':14,
'Sudan':15,
'Mauritania':16,
'Palestine':17,
'Somalia':18,
'Bahrain':19,
'Djibouti':20}

In [0]:
import re
import pyarabic.araby as arb
def clean_tweet(tweet):
  result = re.sub(r"http\S+", "", tweet)
  result=re.sub(r"pic\S+", "", result)
  result = re.sub(r"@\S+","",result)
  result = arb.strip_tashkeel(result)
  return result

In [0]:
def read_tweets_labeled(file):
  df = pd.read_csv(file,sep="\t",dtype="string")
  df['#2 tweet_content'] = df['#2 tweet_content'].apply(lambda x : clean_tweet(x))
  df['label'] = df['#3 country_label'].apply(lambda x : labels_dictionary.get(x)) 
  return df

In [0]:
import pandas as pd
data = read_tweets_labeled('drive/My Drive/NADI_release/train_labeled.tsv')

In [0]:
from sklearn.utils import resample
df_majority = data[data['label'] == 9]
qatar = data[data['label'] == 13]
bahrain = data[data['label'] == 19]
dijbouti = data[data['label'] == 20]

In [0]:
qatar_upsampled = resample(qatar, replace=True, n_samples=len(df_majority) - len(qatar))
bahrain_upsampled = resample(bahrain, replace=True, n_samples=len(df_majority) - len(bahrain))
dijbouti_upsampled = resample(dijbouti, replace=True, n_samples=len(df_majority) - len(dijbouti))

In [0]:
data = pd.concat([data, qatar_upsampled,bahrain_upsampled,dijbouti_upsampled])
data.label.value_counts()

1     4473
0     2556
4     2312
2     1491
7     1098
6     1070
5     1070
11    1070
12    1070
3      851
9      750
20     750
19     750
13     750
14     639
8      426
10     420
17     420
18     210
16     210
15     210
Name: label, dtype: int64

In [0]:
data.head()

Unnamed: 0,#1 tweet_ID,#2 tweet_content,#3 country_label,#4 province_label,label
0,Dev_1,الفار العور يشوف فقط كيسي ومايشوف ماتويد,Iraq,iq_Al-Anbar,0
1,Dev_2,ي دينيييي ربنا يستر,Egypt,eg_Alexandria,1
2,Dev_3,أساسا نسبكم قذر ونجس بلاش تتفاخروا بنجاستكم يا...,Iraq,iq_Maysan,0
3,Dev_4,ليست كل المشاعر تحتاج إلى حبيب بعض المشاعر تحت...,Morocco,ma_Oriental,11
4,Dev_5,لأ ني حاضرها هذي لايف,Libya,ly_Al-Jabal-al-Akhdar,12


# testing tokenizer

In [0]:
train_text = data['#2 tweet_content'].values
train_labels = data['label'].values

In [0]:
print(' Original: ', X_train[2])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(X_train[2]))

print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(X_train[2])))

 Original:  خلص يبقى بعرفكيش
Tokenized:  ['خلص', 'يبقى', 'بع', '##رف', '##كي', '##ش']
Token IDs:  [29471, 8081, 2921, 1870, 2179, 1034]


# Tokenization

In [0]:
def Tokenize(train_text,train_labels):
  input_ids = []
  attention_masks = []

  # For every sentence...
  for sent in train_text:
      # `encode_plus` will:
      #   (1) Tokenize the sentence.
      #   (2) Prepend the `[CLS]` token to the start.
      #   (3) Append the `[SEP]` token to the end.
      #   (4) Map tokens to their IDs.
      #   (5) Pad or truncate the sentence to `max_length`
      #   (6) Create attention masks for [PAD] tokens.
      encoded_dict = tokenizer.encode_plus(
                          sent,                      # Sentence to encode.
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                          max_length = 64,           # Pad & truncate all sentences.
                          pad_to_max_length = True,
                          return_attention_mask = True,   # Construct attn. masks.
                          return_tensors = 'pt',     # Return pytorch tensors.
                    )
      
      # Add the encoded sentence to the list.    
      input_ids.append(encoded_dict['input_ids'])
      
      # And its attention mask (simply differentiates padding from non-padding).
      attention_masks.append(encoded_dict['attention_mask'])

  # Convert the lists into tensors.
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  labels = torch.tensor(train_labels)


  print('Original: ', train_text[0])
  print('Token IDs:', input_ids[0])
  return input_ids,attention_masks,labels

stratification

In [0]:
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.05, random_state=0)

for train_index, test_index in sss.split(train_text,train_labels):
  print("TRAIN:", train_index, "TEST:", test_index)
  X_train, X_dev = train_text[train_index], train_text[test_index]
  y_train, y_dev = train_labels[train_index], train_labels[test_index]

TRAIN: [16742  4327  9421 ...  5061 17824 21606] TEST: [13210  6626 12281 ... 18178 11735  9916]
TRAIN: [ 5613 16165 11837 ... 18074 10810 15826] TEST: [ 3826  9467 21081 ...  3708 14141  9694]
TRAIN: [21346 19746  2927 ...  7305  3727  9981] TEST: [22379 20769  6370 ... 18347 16570 22329]
TRAIN: [12113  7315 12257 ... 20089  7791  5682] TEST: [11816 12242   744 ...  5312 15889 13920]
TRAIN: [13533  3875 20777 ...   446 15468 13018] TEST: [ 7358 21925 22340 ... 17827  2705  8357]
TRAIN: [22384  3618  6961 ... 22515  2375  7199] TEST: [21681  9186  9511 ...  2952  4773 10810]
TRAIN: [ 9604   224 11831 ...  3907 11123 15905] TEST: [ 4481  7476 18680 ...  8208 22351  2080]
TRAIN: [13990  1877 21104 ... 14935  9237 15657] TEST: [ 8669 15666  3704 ... 20800 16809  1967]
TRAIN: [18830  1271 18979 ... 10454 22118   755] TEST: [ 3292  2281  7478 ... 21157 19587  8056]
TRAIN: [20307 14357 11161 ...  5908  2843  8784] TEST: [ 8877  1027  3135 ...  3086 22423 22234]


In [0]:
input_ids,attention_masks,labels = Tokenize(X_train,y_train)

Original:  يابني بقول مش بفتحه يابني
Token IDs: tensor([    2, 10173,  1907, 18732,  1944, 11114,  1033, 10173,  1907,     3,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])


In [0]:
input_ids_dev,attention_masks_dev,labels_dev = Tokenize(X_dev,y_dev)

Original:  من اي بلد تتابع صفحتنا؟؟ انا من الاردن ويوجد ادمنز من دول اخرى هلا مدريد 
Token IDs: tensor([    2,  1762,  1887,  4814, 20085, 11501,  3398,   232,   232,  2928,
         1762,  4242, 16312,  4214,  8143,  1762,  2411,  2567, 17642,  5910,
            3,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])


In [0]:
from torch.utils.data import TensorDataset, random_split

train_dataset = TensorDataset(input_ids, attention_masks,labels)
val_dataset = TensorDataset(input_ids_dev, attention_masks_dev,labels_dev)


print('{:>5,} training samples'.format(train_dataset.__len__()))
print('{:>5,} validation samples'.format(val_dataset.__len__()))

21,466 training samples
1,130 validation samples


stop using this cell

In [0]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks,labels)

# Create a train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.95 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

19,950 training samples
1,050 validation samples


In [0]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

# Model layers 

In [0]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (32000, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [0]:
from transformers import get_linear_schedule_with_warmup,AdamW

optimizer = AdamW(model.parameters(),
                  lr = 2e-5, 
                  eps = 1e-8)
epochs = 4

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

In [0]:
import time
import datetime
import random
import numpy as np

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [0]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Training loop

In [0]:
# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    model.to(device)

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        
        
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        

        
        loss, logits = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)

        
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using 
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        

            (loss, logits) = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    40  of    671.    Elapsed: 0:00:17.
  Batch    80  of    671.    Elapsed: 0:00:34.
  Batch   120  of    671.    Elapsed: 0:00:51.
  Batch   160  of    671.    Elapsed: 0:01:08.
  Batch   200  of    671.    Elapsed: 0:01:25.
  Batch   240  of    671.    Elapsed: 0:01:42.
  Batch   280  of    671.    Elapsed: 0:01:59.
  Batch   320  of    671.    Elapsed: 0:02:16.
  Batch   360  of    671.    Elapsed: 0:02:33.
  Batch   400  of    671.    Elapsed: 0:02:50.
  Batch   440  of    671.    Elapsed: 0:03:07.
  Batch   480  of    671.    Elapsed: 0:03:25.
  Batch   520  of    671.    Elapsed: 0:03:42.
  Batch   560  of    671.    Elapsed: 0:03:59.
  Batch   600  of    671.    Elapsed: 0:04:16.
  Batch   640  of    671.    Elapsed: 0:04:34.

  Average training loss: 2.27
  Training epcoh took: 0:04:47

Running Validation...
  Accuracy: 0.41
  Validation Loss: 2.04
  Validation took: 0:00:04

Training...
  Batch    40  of    671.    Elapsed: 0:00:17.
  Batch    80  of    6

# Evaluation on dev set

In [0]:
data = read_tweets_labeled('drive/My Drive/NADI_release/dev_labeled.tsv')

In [0]:
data.head()

Unnamed: 0,#1 tweet_ID,#2 tweet_content,#3 country_label,#4 province_label,label
0,Dev_1,ايسكو لاعب اليوم :) اسيست وهدف,Iraq,iq_Ninawa,0
1,Dev_2,بعد صلاه الفجر بقا,Egypt,eg_Monufia,1
2,Dev_3,إن شاء الله هذه المرة يكون من نصيبي,Algeria,dz_Oran,2
3,Dev_4,ههههههههههههههههه خلي السوداني يزغبك,Yemen,ye_Al-Hudaydah,3
4,Dev_5,كل حاجة محسوبة يا جماعة والله,Egypt,eg_South-Sinai,1


In [0]:
dev_text = data['#2 tweet_content'].values
dev_labels = data['label'].values

In [0]:
input_ids,attention_masks,labels = Tokenize(dev_text,dev_labels)

Original:   ايسكو لاعب اليوم :) اسيست وهدف
Token IDs: tensor([    2,  9839,  2763,  6326,  2089,    30,    13,  8654,  1782, 22622,
            3,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])


Prediction loop

In [0]:
batch_size = 64

prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [0]:
def predict(model,prediction_dataloader):
  # Prediction on test set

  print('Predicting labels for {:,} dev sentences...'.format(len(input_ids)))

  # Put model in evaluation mode
  model.eval()

  # Tracking variables 
  predictions , true_labels = [], []

  # Predict 
  for batch in prediction_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    
    # Telling the model not to compute or store gradients, saving memory and 
    # speeding up prediction
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=None, 
                        attention_mask=b_input_mask)

    logits = outputs[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().tolist()
    label_ids = b_labels.to('cpu').tolist()
    
    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)

  print('    DONE.')
  return predictions,true_labels

In [0]:
from sklearn.metrics import classification_report

def get_report(predictions,true_labels):
  pred = [item for sublist in predictions for item in sublist]

  true_label = [item for sublist in true_labels for item in sublist]

  prediction = []
  for i in range(len(pred)):
    prediction.append(np.argmax(pred[i], axis=0).flatten()[0])

  print(classification_report(true_label, prediction,target_names= list(labels_dictionary.keys())))

In [0]:
pred,actual = predict(model,prediction_dataloader)

Predicting labels for 4,957 dev sentences...
    DONE.


In [0]:
get_report(pred,actual)

                      precision    recall  f1-score   support

                Iraq       0.47      0.53      0.50       636
               Egypt       0.59      0.74      0.66      1070
             Algeria       0.45      0.46      0.45       359
               Yemen       0.48      0.26      0.34       206
        Saudi_Arabia       0.30      0.42      0.35       579
               Syria       0.37      0.15      0.21       265
United_Arab_Emirates       0.18      0.16      0.17       265
                Oman       0.24      0.24      0.24       249
              Jordan       0.23      0.16      0.19       104
             Tunisia       0.35      0.28      0.31       164
              Kuwait       0.00      0.00      0.00        70
             Morocco       0.39      0.25      0.31       249
               Libya       0.27      0.29      0.28       265
               Qatar       0.06      0.05      0.05       104
             Lebanon       0.38      0.14      0.20       110
       