# Check the availbility of GPU

In [1]:
import tensorflow as tf
print(len(tf.config.list_physical_devices('GPU')))
import torch
print(torch.cuda.is_available())

2022-09-20 07:08:22.657395: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-20 07:08:22.754690: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-20 07:08:22.755557: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


1
True


## Install gdown library

In [2]:
!pip install gdown

Collecting gdown
  Downloading gdown-4.5.1.tar.gz (14 kB)
  Installing build dependencies ... [?25l- \ | / - \ done
[?25h  Getting requirements to build wheel ... [?25l- done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
Building wheels for collected packages: gdown
  Building wheel for gdown (pyproject.toml) ... [?25l- done
[?25h  Created wheel for gdown: filename=gdown-4.5.1-py3-none-any.whl size=14933 sha256=a833a5a8eb8baa26fde3406f9cc7f615f7609a1ba6cf25c30be1018dcdf0a519
  Stored in directory: /root/.cache/pip/wheels/3d/ec/b0/a96d1d126183f98570a785e6bf8789fca559853a9260e928e1
Successfully built gdown
Installing collected packages: gdown
Successfully installed gdown-4.5.1
[0m

# Download datasets

In [3]:
!gdown --id 1SRB7w6x_6oVUOzJihlYA5T2VR8u0UJyd
!gdown --id 1zs91kg3MO6FNkmtHFo1bqOF2Iy1F1b4y
!gdown --id 165kzfZDsRTZAAfZKedeZiUlKzMcHNgPd

Downloading...
From: https://drive.google.com/uc?id=1SRB7w6x_6oVUOzJihlYA5T2VR8u0UJyd
To: /kaggle/working/Twitter_train.csv
100%|████████████████████████████████████████| 261k/261k [00:00<00:00, 83.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1zs91kg3MO6FNkmtHFo1bqOF2Iy1F1b4y
To: /kaggle/working/Twitter_test.csv
100%|██████████████████████████████████████| 84.4k/84.4k [00:00<00:00, 53.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=165kzfZDsRTZAAfZKedeZiUlKzMcHNgPd
To: /kaggle/working/Arabic_stop_words.txt
100%|██████████████████████████████████████| 6.48k/6.48k [00:00<00:00, 9.24MB/s]


In [4]:
!pip install pyarabic

[0m

In [5]:
import pyarabic.araby as ar

# import Stemmer
import functools, operator

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

## Load Dataset

In [6]:
import pandas as pd
df = pd.read_csv("./Twitter_train.csv")
seed=42

In [7]:
df.sample(5)

Unnamed: 0,tweet,class
1871,"' """"""ولا كل غايب عن عِنيّا هقول بعيدوفي ناس نا...",neu
552,"' """"""\""""@fahad60801: اللهم رب جبرائيل، وميكائي...",pos
448,' @I__Divo @Pinar_Gika17 طب ايه السبب في انهم ...,neg
194,' لو فعلا عايز تبدأ صفحة جديدة ف حياتك لازم تب...,neu
1248,"' """"""@ahmad_nady انا قلقت من نص ساعة كدة مبحلق...",neg


## Arabic stop words

In [8]:
arabic_stop_words=[]
with open ('./Arabic_stop_words.txt',encoding='utf-8') as f :
    for i in f.readlines() :
        arabic_stop_words.append(i)
        arabic_stop_words[-1]=arabic_stop_words[-1][:-1]


In [9]:
!pip install farasapy

Collecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Installing collected packages: farasapy
Successfully installed farasapy-0.0.14
[0m

In [10]:
import numpy as np
import pandas as pd
import re

#============= Read CSV and apply data preperation =============#


def data_preprocessing (data_frame):
    # clean-up: remove #tags, http links and special symbols
    data_frame['tweet']= data_frame['tweet'].apply(lambda x: x[2:-2])
    data_frame['tweet']= data_frame['tweet'].apply(lambda x: re.sub(r'http\S+', '', x))
    data_frame['tweet'] = data_frame['tweet'].apply(lambda x: re.sub(r'[@|#]\S*', '', x))
    data_frame['tweet'] = data_frame['tweet'].apply(lambda x: re.sub(r'"+', '', x))

    # Remove arabic signs
    data_frame['tweet'] = data_frame['tweet'].apply(lambda x: re.sub(r'([@A-Za-z0-9_ـــــــــــــ]+)|[^\w\s]|#|http\S+', '', x))

    # Remove repeated letters like "الللللللللللللللله" to "الله"
    data_frame['tweet'] = data_frame['tweet'].apply(lambda x: x[0:2] + ''.join([x[i] for i in range(2, len(x)) if x[i]!=x[i-1] or x[i]!=x[i-2]]))

    # remove stop words
    data_frame['tweet'] = data_frame['tweet'].apply(lambda x: '' if x in arabic_stop_words else x)

    from nltk.stem.isri import ISRIStemmer
    df['tweet']=df['tweet'].apply(lambda x:ISRIStemmer().stem(x))

    return data_frame


In [11]:
!pip install emoji

[0m

In [12]:
# st =  Stemmer.Stemmer('arabic')
import string,emoji
def data_cleaning (text):
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'^http?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"https\S+", "", text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub("(\s\d+)","",text)
    text = re.sub(r"$\d+\W+|\b\d+\b|\W+\d+$", "", text)
    text = re.sub("\d+", " ", text)
    text = ar.strip_tashkeel(text)
    text = ar.strip_tatweel(text)
    text = text.replace("#", " ");
    text = text.replace("@", " ");
    text = text.replace("_", " ");
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    em = text
    em_split_emoji = emoji.get_emoji_regexp().split(em)
    em_split_whitespace = [substr.split() for substr in em_split_emoji]
    em_split = functools.reduce(operator.concat, em_split_whitespace)
    text = " ".join(em_split)
    text = re.sub(r'(.)\1+', r'\1', text)
    # text_stem = " ".join([st.stemWord(i) for i in text.split()])
    # text = text +" "+ text_stem
    text = text.replace("آ", "ا")
    text = text.replace("إ", "ا")
    text = text.replace("أ", "ا")
    text = text.replace("ؤ", "و")
    text = text.replace("ئ", "ي")

    return text

In [13]:
df['tweet']=df['tweet'].apply(lambda x: data_cleaning(x))
df=data_preprocessing(df)
df



Unnamed: 0,tweet,class
0,متني الحياه ان الذين يعيشون على الارض ليسوا مل...,pos
1,ري كرسمس كل سنة وانتم طيب,pos
2,انتهى مشوار الخ,neg
3,عارف ابتدى مذاكره من,neg
4,اختصروا الطريق بدلا من اختيار المنصف ثم الانق...,neg
...,...,...
2054,الجمال مبيحتاح اي مكياج لناعم وله خشن جمل ال...,neu
2055,نتمني وجود الفنانة رنا سماحة افضل فنانة صاعدة...,neu
2056,د الهدى فالكاينات ضياء وفم الزمان تبسم وسناء ك...,pos
2057,انت متناقض جدا يا صل,neg


In [14]:
!git clone https://github.com/aub-mind/arabert.git

Cloning into 'arabert'...
remote: Enumerating objects: 595, done.[K
remote: Counting objects: 100% (60/60), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 595 (delta 37), reused 43 (delta 29), pack-reused 535[K
Receiving objects: 100% (595/595), 9.14 MiB | 5.19 MiB/s, done.
Resolving deltas: 100% (338/338), done.


In [15]:
from arabert.preprocess import ArabertPreprocessor

model_name = "aubmindlab/bert-large-arabertv02-twitter"
arabert_prep = ArabertPreprocessor(model_name=model_name)

df['tweet']=df['tweet'].apply(lambda x: arabert_prep.preprocess(x))


text = "ولن نبالغ إذا قلنا: إن 'هاتف' أو 'كمبيوتر المكتب' في زمننا هذا ضروري"
arabert_prep.preprocess(text)
# # "و+ لن نبالغ إذا قل +نا : إن ' هاتف ' أو ' كمبيوتر ال+ مكتب ' في زمن +نا هذا ضروري"

"ولن نبالغ إذا قلنا : إن ' هاتف ' أو ' كمبيوتر المكتب ' في زمننا هذا ضروري"

## Label Encoder

In [16]:
from sklearn import preprocessing
# Apply label encoding over the labels
lable_encoder = preprocessing.LabelEncoder()
encoded_labels =lable_encoder.fit_transform(df["class"])
df['class']=encoded_labels
df

Unnamed: 0,tweet,class
0,متني الحياه ان الذين يعيشون على الارض ليسوا مل...,2
1,ري كرسمس كل سنة وانتم طيب,2
2,انتهى مشوار الخ,0
3,عارف ابتدى مذاكره من,0
4,اختصروا الطريق بدلا من اختيار المنصف ثم الانقل...,0
...,...,...
2054,الجمال مبيحتاح اي مكياج لناعم وله خشن جمل الطا...,1
2055,نتمني وجود الفنانة رنا سماحة افضل فنانة صاعدة ...,1
2056,د الهدى فالكاينات ضياء وفم الزمان تبسم وسناء ك...,2
2057,انت متناقض جدا يا صل,0


In [17]:
df['length']=df['tweet'].apply(lambda x:len(x.split(' ')))
df['length'].max()

36

## Train Test Split

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation=train_test_split(df['tweet'], df['class'], test_size=0.2, random_state=seed)
X_validation

1298    معناها مش المعني الظاهر معناها نفسها في الجنة ...
591                                                     ا
1318                                              كل اخير
1067                    هنا العاصمة لميس الحديدي تودع عبر
29                              انا نهي سنفورة القهوة ادي
                              ...                        
1033    صباح اورد من احمد انا بحب سكس انا بحب الزمالك ...
674     حياة بالقرب من اله حياة مطمينة محفوفة بالتوفيق...
1771                           عليش نسال فيكن معناها ي بص
322                                         شارع الجاردنز
1299                              ن نفسي اتولد مخلص ه تعل
Name: tweet, Length: 412, dtype: object

# Trying some machine learning models

## TF_IDF

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_ngram(n_gram,X_train,X_val):
    vectorizer = TfidfVectorizer(ngram_range=(n_gram,n_gram))
    x_train_vec = vectorizer.fit_transform(X_train)
    x_test_vec = vectorizer.transform(X_val)
    return x_train_vec,x_test_vec

In [20]:
# Applying tfidf with 1-gram, 2-gram and 3-gram
tfidf_1g_transformation_train,tfidf_1g_transformation_validation= tfidf_ngram(1,X_train,X_validation)
tfidf_2g_transformation_train,tfidf_2g_transformation_validation= tfidf_ngram(2,X_train,X_validation)

## Machine learning models

In [21]:
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

models=[SVC(),XGBClassifier(),RandomForestClassifier(),DecisionTreeClassifier(),LogisticRegression()]
for m in models :
    m.fit(tfidf_2g_transformation_train,y_train)
    print(m.score(tfidf_2g_transformation_train,y_train))
    print(m.score(tfidf_2g_transformation_validation,y_validation))

0.9769277474195507
0.3422330097087379
0.4632665452337584
0.3446601941747573
0.9775349119611415
0.35436893203883496
0.9775349119611415
0.3446601941747573
0.9769277474195507
0.36650485436893204


# Trying to use some pre-trained models from hugging face website 

## Install transformers

In [22]:
!pip install transformers

[0m

## Model and Tokenizer initialization

In [23]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

#============= Initialize Arabic Bert =============#
#load your pre_trained model with all its weights
# model_name= 'aubmindlab/bert-base-arabertv02'
model_name='UBC-NLP/MARBERT' #top
# model_name='asafaya/bert-base-arabic'
# model_name='AraBERTv0.2-Twitter-base'
# model_name='aubmindlab/bert-large-arabertv2'
# model_name='aubmindlab/bert-base-arabertv02-twitter'
# model_name='aubmindlab/bert-large-arabertv02-twitter'
# model_name='aubmindlab/aragpt2-base'

# model_name='aubmindlab/bert-base-arabertv2'
tokenizer =AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
# model=AutoModel.from_pretrained(model_name,output_hidden_states=True)

Downloading:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/701 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624M [00:00<?, ?B/s]

Some weights of the model checkpoint at UBC-NLP/MARBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at U

You can uncomment ay of the other models to get differnet accuraces

In [24]:
# Tokenize the sentences using bert tokenizer
df["bert_tokens"] = df.tweet.apply(lambda x: tokenizer(x).tokens())
df["bert_tokens_ids"] = df.tweet.apply(lambda x: tokenizer(x).tokens())
df["encoded"] = df.tweet.apply(lambda x: tokenizer.encode_plus(x,return_tensors='pt')['input_ids'])
df

Unnamed: 0,tweet,class,length,bert_tokens,bert_tokens_ids,encoded
0,متني الحياه ان الذين يعيشون على الارض ليسوا مل...,2,27,"[[CLS], متني, الحياه, ان, الذين, يعيشون, على, ...","[[CLS], متني, الحياه, ان, الذين, يعيشون, على, ...","[[tensor(2), tensor(68713), tensor(3946), tens..."
1,ري كرسمس كل سنة وانتم طيب,2,6,"[[CLS], ري, كرس, ##مس, كل, سنة, وانتم, طيب, [S...","[[CLS], ري, كرس, ##مس, كل, سنة, وانتم, طيب, [S...","[[tensor(2), tensor(2536), tensor(35685), tens..."
2,انتهى مشوار الخ,0,3,"[[CLS], انتهى, مشوار, الخ, [SEP]]","[[CLS], انتهى, مشوار, الخ, [SEP]]","[[tensor(2), tensor(7609), tensor(13606), tens..."
3,عارف ابتدى مذاكره من,0,4,"[[CLS], عارف, ابتدى, مذاكره, من, [SEP]]","[[CLS], عارف, ابتدى, مذاكره, من, [SEP]]","[[tensor(2), tensor(3323), tensor(45008), tens..."
4,اختصروا الطريق بدلا من اختيار المنصف ثم الانقل...,0,20,"[[CLS], اختصر, ##وا, الطريق, بدلا, من, اختيار,...","[[CLS], اختصر, ##وا, الطريق, بدلا, من, اختيار,...","[[tensor(2), tensor(22181), tensor(1958), tens..."
...,...,...,...,...,...,...
2054,الجمال مبيحتاح اي مكياج لناعم وله خشن جمل الطا...,1,10,"[[CLS], الجمال, مبيح, ##تاح, اي, مكياج, لنا, #...","[[CLS], الجمال, مبيح, ##تاح, اي, مكياج, لنا, #...","[[tensor(2), tensor(4770), tensor(68899), tens..."
2055,نتمني وجود الفنانة رنا سماحة افضل فنانة صاعدة ...,1,11,"[[CLS], نتمني, وجود, الفنانة, رنا, سماحة, افضل...","[[CLS], نتمني, وجود, الفنانة, رنا, سماحة, افضل...","[[tensor(2), tensor(39939), tensor(3715), tens..."
2056,د الهدى فالكاينات ضياء وفم الزمان تبسم وسناء ك...,2,16,"[[CLS], د, الهدى, فالك, ##اينات, ضياء, وف, ##م...","[[CLS], د, الهدى, فالك, ##اينات, ضياء, وف, ##م...","[[tensor(2), tensor(125), tensor(4880), tensor..."
2057,انت متناقض جدا يا صل,0,5,"[[CLS], انت, متناقض, جدا, يا, صل, [SEP]]","[[CLS], انت, متناقض, جدا, يا, صل, [SEP]]","[[tensor(2), tensor(2030), tensor(27008), tens..."


## Padding and attention mask

In [25]:
from keras_preprocessing.sequence import pad_sequences

# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway.
# In the original paper, the authors used a length of 512.
MAX_LEN = 64
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in df['bert_tokens']]
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

In [26]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# Use train_test_split to split our data into train and validation sets for training
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, encoded_labels,
                                                            random_state=seed, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=seed, test_size=0.1)
# Convert all of our data into torch tensors, the required datatype for our model

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 64

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop,
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size)

## Set optimizer parameters

In [27]:
import torch.optim as optim

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],'weight_decay_rate': 0.01},
                                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],'weight_decay_rate': 0.0}]
# This variable contains all of the hyperparemeter information our training loop needs
# optimizer = optim.BertAdam(optimizer_grouped_parameters,lr=2e-5,warmup=.1)
# optimizer = optim.AdamW(optimizer_grouped_parameters,lr=5e-6)
optimizer = optim.AdamW(optimizer_grouped_parameters,lr=.00001)

# Training

In [28]:
from tqdm import trange
import numpy as np
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
t = []

# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs
epochs = 11

# Transfer the model to GPU
model.to("cuda")

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):


  # Training

  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    b_input_ids, b_input_mask, b_labels = batch
    b_labels = b_labels.type(torch.LongTensor)   # casting to long
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()

    # Forward pass
    loss = model(b_input_ids.to("cuda"), token_type_ids=None, attention_mask=b_input_mask.to("cuda"), labels=b_labels.to("cuda"))["loss"]
    train_loss_set.append(loss.item())

    # Backward pass
    loss.backward()

    # Update parameters and take a step using the computed gradient
    optimizer.step()


    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    # batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    b_labels = b_labels.type(torch.LongTensor)   # casting to long
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids.to("cuda"), token_type_ids=None, attention_mask=b_input_mask.to("cuda"))

    # Move logits and labels to CPU
    logits = logits["logits"].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
  if (eval_accuracy/nb_eval_steps) > 0.77 :
    break

Epoch:   0%|          | 0/11 [00:00<?, ?it/s]

Train loss: 1.0426638804633042


Epoch:   9%|▉         | 1/11 [00:11<01:59, 11.94s/it]

Validation Accuracy: 0.6517857142857143
Train loss: 0.7408206976693252


Epoch:  18%|█▊        | 2/11 [00:22<01:42, 11.41s/it]

Validation Accuracy: 0.6981026785714286
Train loss: 0.49888533970405313


Epoch:  27%|██▋       | 3/11 [00:34<01:29, 11.24s/it]

Validation Accuracy: 0.7098214285714286
Train loss: 0.34121368671285696


Epoch:  36%|███▋      | 4/11 [00:45<01:18, 11.16s/it]

Validation Accuracy: 0.6741071428571428
Train loss: 0.28795710051881857


Epoch:  45%|████▌     | 5/11 [00:56<01:06, 11.12s/it]

Validation Accuracy: 0.6997767857142857
Train loss: 0.2097340326370864


Epoch:  55%|█████▍    | 6/11 [01:07<00:55, 11.09s/it]

Validation Accuracy: 0.7098214285714286
Train loss: 0.10513628068669088


Epoch:  64%|██████▎   | 7/11 [01:18<00:44, 11.07s/it]

Validation Accuracy: 0.7215401785714286
Train loss: 0.08320913778553748


Epoch:  73%|███████▎  | 8/11 [01:29<00:33, 11.05s/it]

Validation Accuracy: 0.6568080357142857
Train loss: 0.06793176466277961


Epoch:  82%|████████▏ | 9/11 [01:40<00:22, 11.04s/it]

Validation Accuracy: 0.7416294642857143
Train loss: 0.04434245683509728


Epoch:  91%|█████████ | 10/11 [01:51<00:11, 11.04s/it]

Validation Accuracy: 0.7299107142857143
Train loss: 0.06369137057456477


Epoch: 100%|██████████| 11/11 [02:02<00:00, 11.11s/it]

Validation Accuracy: 0.7081473214285714





# Prepare testset with the same preprocessing

In [29]:
#============= Read CSV and apply data preperation =============#
df_submit = pd.read_csv("./Twitter_test.csv")

df_submit["tweet"] = df_submit.tweet.apply(lambda x: data_cleaning(x))
df_submit=data_preprocessing(df_submit)

df_submit['tweet']=df_submit['tweet'].apply(lambda x: arabert_prep.preprocess(x))

# Tokenize the sentences using bert tokenizer
df_submit["bert_tokens"] = df_submit.tweet.apply(lambda x: tokenizer(x).tokens())



In [30]:
bert_tokens_submit = df_submit["bert_tokens"]

In [31]:
# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway. 
# In the original paper, the authors used a length of 512.
MAX_LEN = 64
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids_submit = [tokenizer.convert_tokens_to_ids(x) for x in bert_tokens_submit]
# Pad our input tokens
input_ids_submit = pad_sequences(input_ids_submit, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks_submit = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids_submit:
    seq_mask = [float(i>0) for i in seq]
    attention_masks_submit.append(seq_mask)

In [32]:
# Convert all of our data into torch tensors, the required datatype for our model
inputs_submit = torch.tensor(input_ids_submit)
masks_submit = torch.tensor(attention_masks_submit)

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory
batch_size = 64
submit_data = TensorDataset(inputs_submit, masks_submit)

# do not use shuffle, we need the preds to be in same order
submit_dataloader = DataLoader(submit_data, batch_size=batch_size)#, shuffle=True)

In [33]:
# Put the model in an evaluation state
model.eval()

# Transfer model to GPU
model.to("cuda")

outputs = []
for input, masks in submit_dataloader:
    torch.cuda.empty_cache() # empty the gpu memory

    # Transfer the batch to gpu
    input = input.to('cuda')
    masks = masks.to('cuda')

    # Run inference on the batch
    output = model(input, attention_mask=masks)["logits"]

    # Transfer the output to CPU again and convert to numpy
    output = output.cpu().detach().numpy()

    # Store the output in a list
    outputs.append(output)

# Concatenate all the lists within the list into one list
outputs = [x for y in outputs for x in y]

# Inverse transform the label encoding
pred_flat = np.argmax(outputs, axis=1).flatten()
output_labels = lable_encoder.inverse_transform(pred_flat)

In [34]:
submission = pd.DataFrame({"Id":np.arange(1, len(output_labels)+1), "class":output_labels})
# save (submission)
submission.to_csv("submission.csv", index=False)