# install requirements

In [1]:
import torch

# Select the compute device: CUDA when PyTorch reports a GPU, otherwise the CPU.
# `device` is assigned in both branches.
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    # Only the name of device index 0 is reported, even if several GPUs were counted above.
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    # IPython shell escape: prints the nvidia-smi status table into the cell output.
    !nvidia-smi

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB
Tue Jun 29 17:17:51 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.04   Driver Version: 450.119.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    27W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------

In [2]:
# Install the notebook's third-party dependencies through IPython shell escapes.
# optuna and transformers are pinned to exact versions.
!pip install optuna==2.3.0
!pip install transformers==4.2.1
# NOTE(review): the next three packages are unpinned, so a fresh run may pull
# different releases than the ones this notebook was executed with — consider pinning.
!pip install farasapy
!pip install pyarabic
!pip install aaransia
# Clone the AraBERT repository; `arabert.preprocess` is imported from it in the training section.
!git clone https://github.com/aub-mind/arabert

Collecting optuna==2.3.0
  Downloading optuna-2.3.0.tar.gz (258 kB)
[K     |████████████████████████████████| 258 kB 4.4 MB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Building wheels for collected packages: optuna
  Building wheel for optuna (PEP 517) ... [?25ldone
[?25h  Created wheel for optuna: filename=optuna-2.3.0-py3-none-any.whl size=359773 sha256=6a65c0dc07b66984c177c1ffd490dbb50304c11db0adae66543e29d7b6d39708
  Stored in directory: /root/.cache/pip/wheels/38/61/9e/955ab1890f6cab231b1d756db63f36c711968a324296e0b649
Successfully built optuna
Installing collected packages: optuna
  Attempting uninstall: optuna
    Found existing installation: optuna 2.7.0
    Uninstalling optuna-2.7.0:
      Successfully uninstalled optuna-2.7.0
Successfully installed optuna-2.3.0
Collecting transformers==4.2.1
  Downloading transformers-4.2.1-py3-none-any.whl (1

In [5]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split

# load and preprocess data

In [6]:
class Dataset:
    """Plain holder for a train split, a test split and the list of labels.

    The three constructor arguments are stored unchanged as attributes of the
    same name.

    NOTE: the training section later runs `from torch.utils.data import Dataset`,
    which rebinds this name; instances created before that import keep working,
    but re-running a cell that calls `Dataset(train, test, label_list)` after
    that import will resolve to the torch class instead.
    """

    def __init__(self, train, test, label_list):
        self.train, self.test, self.label_list = train, test, label_list

In [7]:
# Column names shared by every DataFrame in this notebook:
# DATA_COLUMN holds the raw text, LABEL_COLUMN the integer class label.
DATA_COLUMN = "text"
LABEL_COLUMN = "label"

In [8]:
# One source file per class (presumably positive / negative / objective /
# sarcastic, going by the file names — verify against the dataset README).
pos_url = "https://raw.githubusercontent.com/moroccanSA-NER/SA-Moroccan/master/MSTD_pos.txt"
neg_url = "https://raw.githubusercontent.com/moroccanSA-NER/SA-Moroccan/master/MSTD_neg.txt"
obj_url = "https://raw.githubusercontent.com/moroccanSA-NER/SA-Moroccan/master/MSTD_obj.txt"
sar_url = "https://raw.githubusercontent.com/moroccanSA-NER/SA-Moroccan/master/MSTD_sar.txt"


def _load_class(url, label):
    """Read the single-column file at `url` and tag every row with `label`."""
    frame = pd.read_csv(url, names=[DATA_COLUMN])
    frame[LABEL_COLUMN] = label
    return frame


data_pos = _load_class(pos_url, 1)
data_neg = _load_class(neg_url, -1)
data_obj = _load_class(obj_url, 0)
data_sar = _load_class(sar_url, 2)

# Stack the four per-class frames into one frame with a fresh 0..n-1 index,
# then show how many rows each label has.
df = pd.concat([data_pos, data_neg, data_obj, data_sar], ignore_index=True)
print(df[LABEL_COLUMN].value_counts())

 0    6281
-1    2672
 2    2188
 1     734
Name: label, dtype: int64


In [9]:
import re
import warnings
from aaransia import transliterate, SourceLanguageError

# Silence every Python warning from here on.
warnings.filterwarnings('ignore')
# Row count of label 0 (a label lookup on the value_counts index, not a
# positional one). It is used below as the size every other class is
# upsampled to. The misspelt name is kept because later code reads it.
lenghth = df[LABEL_COLUMN].value_counts().loc[0]

def reverse_translate(text):
  """Create a variant of `text` by transliterating it 'ar' -> 'ma' -> 'ar'.

  Space-separated words that start with a mention, hashtag, ``.com`` address
  or http(s) URL are removed before transliteration. Hashtags and links found
  in the original text are then re-attached, unmodified, after the
  transliterated text. Mentions are not re-attached, because `pattern2` does
  not include the ``@`` alternative.

  Args:
    text: the input string.

  Returns:
    The round-tripped text followed by the preserved hashtags/links, all
    separated by single spaces.
  """
  # NOTE(review): `[www\.]?` is a character class (one optional 'w' or '.'),
  # not an optional "www." group; the following `\S+` makes both forms match
  # the same strings, so the patterns are left untouched.
  pattern = r"@\S+|#\S+|[www\.]?\S+\.com|https?:\/\/\S+"
  pattern2 = r"#\S+|[www\.]?\S+\.com|https?:\/\/\S+"
  clean_text = ' '.join(
      word
      for word in text.split(' ')
      if not re.match(pattern, word)
  )
  matches = re.findall(pattern2, text)
  clean_translated = transliterate(
      clean_text,
      source='ar', target='ma',
      universal=True
  )
  clean_reversed = transliterate(
      clean_translated,
      source='ma', target='ar',
      universal=True
  )
  # Join with a space so the last transliterated word is not fused with the
  # first re-attached hashtag/link (the previous code concatenated them
  # directly). Empty parts are skipped to avoid stray leading/trailing spaces.
  restored = ' '.join(part for part in (clean_reversed, *matches) if part)
  return restored

def upsample_label(df, label):
  """Append augmented copies of the rows of `label` to `df`.

  The rows currently carrying `label` are copied, their text is rewritten with
  `reverse_translate`, and duplicates of the rewritten text are dropped. The
  number of rows still missing to reach the module-level target `lenghth`
  decides how many are appended:

  * enough distinct rewritten rows: exactly the missing number is appended, so
    the class never grows past `lenghth`;
  * too few: all distinct rows are appended, followed by up to `gap` of them a
    second time. The caller loops until the target is reached.

  Args:
    df: frame containing DATA_COLUMN and LABEL_COLUMN.
    label: the label value to upsample.

  Returns:
    A new frame with a fresh 0..n-1 index, or `df` itself when the class is
    already at or above the target.
  """
  # Copy the selection so the text rewrite below never writes into a slice of
  # the caller's frame.
  augmented = df[df[LABEL_COLUMN] == label].copy()
  size = lenghth - len(augmented)
  if size <= 0:
    # Nothing is missing; appending here would unbalance the classes.
    return df

  augmented[DATA_COLUMN] = augmented[DATA_COLUMN].apply(reverse_translate)
  augmented = augmented.drop_duplicates(subset=[DATA_COLUMN])
  unik = len(augmented)

  if unik >= size:
    # Cap at the missing amount; previously every distinct row was appended,
    # which could overshoot `lenghth`.
    return pd.concat([df, augmented.iloc[:size]], ignore_index=True)

  gap = size - unik
  # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
  return pd.concat([df, augmented, augmented.iloc[:gap]], ignore_index=True)

# Upsample each of the three non-zero labels: run at least one pass, then keep
# going until the label has at least `lenghth` rows.
for label in (-1, 1, 2):
  while True:
    df = upsample_label(df, label)
    if (df[LABEL_COLUMN] == label).sum() >= lenghth:
      break

In [10]:
# Show the per-label row counts after upsampling.
print(df[LABEL_COLUMN].value_counts())

-1    6281
 2    6281
 1    6281
 0    6281
Name: label, dtype: int64


In [11]:
# Display the upsampled frame.
df

Unnamed: 0,text,label
0,#عباس_مكروه_الجزاير_ايهاب_محبوبنا يحيا اهابوو ...,1
1,ما عرفتش علاش لكن مقدمة الاخبار في Medi1Tv كات...,1
2,كيعجبني اللي كايفرض حريتو الدينية علاش واخا ما...,1
3,@abdelhamid_25 تبارك الله عليك هههه يدوز المنت...,1
4,@duniabatma والله حتى دمعو عينيا مني شفت كادو ...,1
...,...,...
25119,1ع/1ء/ء016 سباه الخير يا بلادي.علاش الينا كانب...,2
25120,كادهكوا هيت انا مختالف. كندهك هينت نتوما كاملي...,2
25121,هياكم اله قبل الانترنيت الناس كاتقرا عن ابن خل...,2
25122,فاش كتقول لينا شي مرا راها هامل كنقولو مبروك ف...,2


In [12]:
# Shuffle and split the frame 80/20 with a fixed seed.
# NOTE(review): `df` was upsampled with transliterated copies of its own rows
# before this split, so variants of the same source text can land in both the
# train and the test split — confirm this overlap is acceptable.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)
# Bundle both splits with the distinct label values found in `df`.
dataset = Dataset(df_train, df_test, label_list=list(df[LABEL_COLUMN].unique()))

In [13]:
# Peek at the first rows of the training split.
dataset.train.head()

Unnamed: 0,text,label
7048,@mkhoumania انا تفرجت من الدقيقة 40..نسيت كاع ...,0
16777,اليوم راكوم معرودين عندي كولكم لغدا :) الا الو...,1
17099,اه ااعر و دابا اي هاجت دارتها في مسيرتها الفني...,1
20434,ايلي، تال دابا عاد كاتڭولها! تعيد و تعاود ان ش...,1
12341,تسورو يخرجو الناس لشارع لاهتجاج بالالاف االملك...,-1


# training

In [14]:
from arabert.preprocess import ArabertPreprocessor
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
import logging
import torch
import optuna 

In [15]:
# Configure root logging at WARNING level and create a logger named after this module.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

In [16]:
# Checkpoint identifier used for the text preprocessor, the tokenizer and the model.
model_name = 'aubmindlab/bert-base-arabertv02'
task_name = 'classification'
# Length every encoded example is truncated and padded to in BERTDataset.
max_len = 256

In [17]:
# Build the AraBERT preprocessor from the checkpoint's short name
# (the part of `model_name` after the last "/").
arabert_prep = ArabertPreprocessor(model_name.rsplit("/", 1)[-1])

# Run every text of both splits through the preprocessor, overwriting the
# text column in place.
dataset.train[DATA_COLUMN] = dataset.train[DATA_COLUMN].apply(arabert_prep.preprocess)
dataset.test[DATA_COLUMN] = dataset.test[DATA_COLUMN].apply(arabert_prep.preprocess)

In [18]:
# Peek at the training split again, now with preprocessed text.
dataset.train.head()

Unnamed: 0,text,label
7048,[مستخدم] انا تفرجت من الدقيقة 40 . . نسيت كاع ...,0
16777,اليوم راكوم معرودين عندي كولكم لغدا : ) الا ال...,1
17099,اه ااعر و دابا اي هاجت دارتها في مسيرتها الفني...,1
20434,ايلي ، تال دابا عاد كات ولها ! تعيد و تعاود ان...,1
12341,تسورو يخرجو الناس لشارع لاهتجاج بالالاف االملك...,-1


In [19]:
class BERTDataset(Dataset):
    """Dataset that encodes one text per item into fixed-length features.

    Args:
        text: indexable collection of input strings.
        target: indexable collection of raw labels, aligned with `text`.
        model_name: identifier passed to `AutoTokenizer.from_pretrained`.
        max_len: length every encoded example is truncated and padded to.
        label_map: mapping from raw label to the class index stored in the
            returned features.
    """

    def __init__(self, text, target, model_name, max_len, label_map):
        # Zero-argument super() resolves to the parent class. The previous
        # `super(BERTDataset).__init__()` built an unbound super object and
        # initialised that object instead of the parent.
        super().__init__()
        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        """Return the number of texts."""
        return len(self.text)

    def __getitem__(self, item):
        """Encode the text at position `item` and return it as `InputFeatures`.

        The text is whitespace-normalised, encoded with special tokens,
        truncated to `max_len`, and right-padded with the tokenizer's pad id.
        The attention mask is 1 for real tokens and 0 for padding.
        """
        text = str(self.text[item])
        # Collapse any run of whitespace into a single space.
        text = " ".join(text.split())

        input_ids = self.tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation='longest_first'
        )

        attention_mask = [1] * len(input_ids)

        # Pad ids and mask on the right up to the fixed sequence length.
        padding_length = self.max_len - len(input_ids)
        input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)

        return InputFeatures(
            input_ids=input_ids,
            attention_mask=attention_mask,
            label=self.label_map[self.target[item]],
        )

In [20]:
# Give each raw label a contiguous class index, following the order of
# `dataset.label_list`.
label_map = dict(zip(dataset.label_list, range(len(dataset.label_list))))
print(label_map)

# Wrap both splits as tokenizing datasets that share the same settings.
train_dataset = BERTDataset(
    text=dataset.train[DATA_COLUMN].to_list(),
    target=dataset.train[LABEL_COLUMN].to_list(),
    model_name=model_name,
    max_len=max_len,
    label_map=label_map,
)
test_dataset = BERTDataset(
    text=dataset.test[DATA_COLUMN].to_list(),
    target=dataset.test[LABEL_COLUMN].to_list(),
    model_name=model_name,
    max_len=max_len,
    label_map=label_map,
)

{1: 0, -1: 1, 0: 2, 2: 3}


Downloading:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/825k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/381 [00:00<?, ?B/s]

In [22]:
def model_init():
    """Load a fresh sequence-classification model from `model_name`.

    The classification head gets one output per entry in the module-level
    `label_map`.
    """
    n_classes = len(label_map)
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=n_classes,
        return_dict=True,
    )

In [23]:
def compute_metrics(p): #p should be of type EvalPrediction
  """Compute macro F1 / precision / recall and accuracy for one evaluation.

  Args:
    p: object exposing `predictions` (per-class scores, one row per example)
      and `label_ids` (class indices as produced by `label_map`).

  Returns:
    Dict with 'macro_f1', 'macro_f1_pos_neg', 'macro_precision',
    'macro_recall' and 'accuracy'.
  """
  preds = np.argmax(p.predictions, axis=1)
  assert len(preds) == len(p.label_ids)

  # `label_ids` hold class indices assigned by `label_map`, not the raw labels.
  # Translate the raw negative (-1) and positive (1) labels to their indices.
  # Passing the raw values selected a non-existent class -1 and whichever
  # class happened to have index 1.
  pos_neg_ids = [label_map[-1], label_map[1]]

  macro_f1_pos_neg = f1_score(p.label_ids,preds,average='macro',labels=pos_neg_ids)
  macro_f1 = f1_score(p.label_ids,preds,average='macro')
  macro_precision = precision_score(p.label_ids,preds,average='macro')
  macro_recall = recall_score(p.label_ids,preds,average='macro')
  acc = accuracy_score(p.label_ids,preds)
  return {
      'macro_f1' : macro_f1,
      'macro_f1_pos_neg' : macro_f1_pos_neg,
      'macro_precision': macro_precision,
      'macro_recall': macro_recall,
      'accuracy': acc
  }

In [24]:
# Training configuration; outputs go under ./train.
# NOTE(review): every setting below is assigned after the object is built, so
# any processing TrainingArguments does in its constructor does not see these
# values — confirm each one takes effect in the installed transformers version.
training_args = TrainingArguments("./train")
# NOTE(review): looks superseded by `evaluation_strategy`, which is set further down — confirm.
training_args.evaluate_during_training = True
training_args.adam_epsilon = 1e-8
training_args.lr_scheduler_type = 'cosine'
training_args.fp16 = True
# 16 examples per device with 2 accumulation steps; the step arithmetic in the
# next cell divides by their product (32).
training_args.per_device_train_batch_size = 16
training_args.per_device_eval_batch_size = 16
training_args.gradient_accumulation_steps = 2
training_args.num_train_epochs= 8
training_args.evaluation_strategy = EvaluationStrategy.EPOCH
# training_args.logging_steps = 200
# Larger than the 5024 total steps printed below.
training_args.save_steps = 100000
# training_args.save_steps = 
#training_args.eval_steps = 
training_args.disable_tqdm = True
# print("Logging Step:", training_args.logging_steps)
# print("Eval Step:",training_args.eval_steps)

In [25]:
# Steps per epoch: training rows divided (flooring) by the per-device batch
# size and then by the accumulation steps.
steps_per_epoch = (
    len(dataset.train)
    // training_args.per_device_train_batch_size
    // training_args.gradient_accumulation_steps
)
# Steps across all epochs; used to size the warmup search range.
total_steps = training_args.num_train_epochs * steps_per_epoch
print(steps_per_epoch)
print(total_steps)

628
5024


In [26]:
# Build the Trainer. `model_init` is passed instead of a model instance.
trainer = Trainer(
    args=training_args,
    train_dataset=train_dataset, 
    # NOTE(review): the test split doubles as the evaluation set, so the
    # hyperparameter search below is scored on it — confirm no separate
    # validation split is intended.
    eval_dataset=test_dataset, 
    model_init=model_init,
    compute_metrics=compute_metrics,
)

Downloading:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

In [27]:
def my_hp_space(trial):
    """Define the hyperparameter distributions sampled for one trial.

    Args:
        trial: object exposing `suggest_float`, `suggest_categorical` and
            `suggest_int`.

    Returns:
        Dict with 'learning_rate', 'seed' and 'warmup_steps' suggestions.
        Warmup ranges from 0 to 10% of the module-level `total_steps`, in
        increments of 5%.
    """
    # suggest_int expects integer bounds and step, but the raw products are
    # floats. Truncate them the same way `search_space` does so both describe
    # the same warmup grid.
    max_warmup = int(total_steps * 0.1)
    warmup_step = int(total_steps * 0.1 * 0.5)
    return {
        # NOTE(review): the upper bound 7e-5 is inclusive here, while the
        # `search_space` grid stops at 6e-5 — confirm which is intended.
        "learning_rate": trial.suggest_float("learning_rate", 2e-5, 7e-5, step=1e-5),
        "seed": trial.suggest_categorical("seed", [0, 1, 42, 666, 123, 12345]),
        "warmup_steps": trial.suggest_int("warmup_steps", 0, max_warmup, step=warmup_step),
    }

# Explicit grid handed to the grid sampler; the keys match those returned by my_hp_space.
search_space = {
    # np.arange excludes the stop value, so the grid ends at 6e-5.
    "learning_rate":  list(np.arange(2e-5, 7e-5, 1e-5)),
    "seed":  [0, 1, 42, 666, 123, 12345],
    # 0, 5% and 10% of total_steps, truncated to integers.
    "warmup_steps": list(range(0, int((total_steps)*0.1)+1, int(total_steps*0.1*0.5)))
}
# Display the resulting grid.
search_space

{'learning_rate': [2e-05,
  3.0000000000000004e-05,
  4.000000000000001e-05,
  5.000000000000001e-05,
  6.000000000000001e-05],
 'seed': [0, 1, 42, 666, 123, 12345],
 'warmup_steps': [0, 251, 502]}

In [28]:
def my_objective(metrics):
    """Return the value the hyperparameter search optimises.

    Args:
        metrics: mapping of evaluation metric names to values; must contain
            'eval_macro_f1'.

    Returns:
        The value stored under 'eval_macro_f1'.
    """
    objective_key = 'eval_macro_f1'
    return metrics[objective_key]

In [29]:
# Study name passed to the hyperparameter search below.
name = "sa-arabert-base-v2"

In [None]:
# Run the hyperparameter search, maximising the value returned by my_objective.
# NOTE(review): `search_space` spans 5 x 6 x 3 = 90 combinations while
# n_trials is 5 — confirm that only a handful of grid points are meant to be
# evaluated.
best_run = trainer.hyperparameter_search(direction="maximize",
                                         hp_space=my_hp_space,
                                         compute_objective=my_objective,
                                         n_trials=5,
                                         pruner=optuna.pruners.NopPruner(),
                                         sampler=optuna.samplers.GridSampler(search_space),
                                         study_name=name,
                                         load_if_exists=False
                                         )