# install requirements

In [None]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

In [None]:
!pip install optuna==2.3.0
!pip install transformers==4.2.1
!pip install farasapy
!pip install pyarabic
!pip install aaransia
!git clone https://github.com/aub-mind/arabert

In [None]:
import sys
# Import explicitly: looking the module up in `sys.modules` alone raises
# KeyError on a fresh kernel where nothing has imported pkg_resources yet.
import pkg_resources

# Installed `tokenizers` version as pkg_resources currently reports it.
pkg_resources.get_distribution('tokenizers').version

In [None]:
import importlib

# Re-execute pkg_resources before asking again -- presumably so that the
# packages installed earlier in this session are reflected (TODO confirm).
refreshed_pkg_resources = importlib.reload(sys.modules['pkg_resources'])

refreshed_pkg_resources.get_distribution('tokenizers').version

In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split

# load and preprocess data

In [None]:
class Dataset:
    """Plain container bundling the train split, the test split and the labels.

    NOTE: `torch.utils.data.Dataset` is imported under the same name further
    down in this notebook and shadows this class from that point on.

    Attributes:
        train: training split, stored as given.
        test: held-out split, stored as given.
        label_list: the label values, stored as given.
    """

    def __init__(self, train, test, label_list):
        # No validation or copying: the arguments are kept by reference.
        self.train, self.test, self.label_list = train, test, label_list

In [None]:
# Names of the text column and the label column used by every DataFrame below.
DATA_COLUMN, LABEL_COLUMN = "text", "label"

In [None]:
pos_url = "https://raw.githubusercontent.com/moroccanSA-NER/SA-Moroccan/master/MSTD_pos.txt"
neg_url = "https://raw.githubusercontent.com/moroccanSA-NER/SA-Moroccan/master/MSTD_neg.txt"
obj_url = "https://raw.githubusercontent.com/moroccanSA-NER/SA-Moroccan/master/MSTD_obj.txt"
sar_url = "https://raw.githubusercontent.com/moroccanSA-NER/SA-Moroccan/master/MSTD_sar.txt"


def _read_labelled(url, label):
    """Read one corpus file into a single text column and tag every row with `label`.

    NOTE(review): `read_csv` runs with its default ',' separator here, so a
    line containing a comma would not land in one column -- confirm against
    the files.
    """
    frame = pd.read_csv(url, names=[DATA_COLUMN])
    frame[LABEL_COLUMN] = label
    return frame


# Label encoding: 1 = pos file, -1 = neg file, 0 = obj file, 2 = sar file.
data_pos = _read_labelled(pos_url, 1)
data_neg = _read_labelled(neg_url, -1)
data_obj = _read_labelled(obj_url, 0)
data_sar = _read_labelled(sar_url, 2)

# Stack the four per-class frames and show how many rows each label has.
df = pd.concat([data_pos, data_neg, data_obj, data_sar], ignore_index=True)
print(df[LABEL_COLUMN].value_counts())

In [None]:
import re
import warnings
from aaransia import transliterate, SourceLanguageError

# Silence every warning from here on.
warnings.filterwarnings('ignore')

# Upsampling target: the number of rows carrying label 0.
label_counts = df[LABEL_COLUMN].value_counts()
lenghth = label_counts.loc[0]

def reverse_translate(text):
    """Paraphrase `text` by transliterating it from 'ar' to 'ma' and back.

    Space-separated words that start like a mention, hashtag or link are
    removed before the round trip so the transliterator never sees them.
    Hashtags and links (but not mentions) found in the original text are
    re-attached after the round-tripped text.

    Args:
        text: the input string.

    Returns:
        The round-tripped text followed by the preserved hashtags/links, all
        separated by single spaces.
    """
    # NOTE(review): `[www\.]?` is a character class (one optional 'w' or '.'),
    # not an optional "www." prefix; left as is because `\S+` already
    # absorbs such a prefix.
    pattern = r"@\S+|#\S+|[www\.]?\S+\.com|https?:\/\/\S+"
    pattern2 = r"#\S+|[www\.]?\S+\.com|https?:\/\/\S+"

    kept_words = [word for word in text.split(' ') if not re.match(pattern, word)]
    clean_text = ' '.join(kept_words)
    matches = re.findall(pattern2, text)

    clean_translated = transliterate(
        clean_text,
        source='ar', target='ma',
        universal=True,
    )
    clean_reversed = transliterate(
        clean_translated,
        source='ma', target='ar',
        universal=True,
    )

    # Join with a separator so the first preserved token is not glued onto the
    # last transliterated word; empty parts are skipped to avoid stray spaces.
    return ' '.join(part for part in [clean_reversed, *matches] if part)

def upsample_label(df, label):
    """Append transliteration-augmented copies of one class to `df`.

    The rows of `df` whose LABEL_COLUMN equals `label` are passed through
    `reverse_translate`, de-duplicated on DATA_COLUMN and appended. If the
    unique augmented rows do not cover the shortfall, they are repeated once
    from the top. No more rows are added than are missing to reach the
    module-level target `lenghth`.

    Args:
        df: DataFrame with DATA_COLUMN and LABEL_COLUMN columns.
        label: the label value to upsample.

    Returns:
        A new DataFrame with the extra rows appended, or `df` itself when the
        class already has at least `lenghth` rows.
    """
    # Work on a copy so assigning the augmented text never writes into `df`.
    class_rows = df[df[LABEL_COLUMN] == label].copy()

    missing = lenghth - len(class_rows)
    if missing <= 0:
        # Already at (or above) the target: appending would unbalance the data.
        return df

    class_rows[DATA_COLUMN] = class_rows[DATA_COLUMN].apply(reverse_translate)
    augmented = class_rows.drop_duplicates(subset=[DATA_COLUMN])

    # Unique rows first, then the same rows again if still short. `head` caps
    # the addition at exactly the number of rows that are missing.
    extra = pd.concat([augmented, augmented]).head(missing)

    # DataFrame.append no longer exists in pandas 2.x; concat is the equivalent.
    return pd.concat([df, extra], ignore_index=True)

# Upsample every label other than 0. One pass is always run for each label;
# further passes follow for as long as the class is still below `lenghth`.
for label in (-1, 1, 2):
    while True:
        df = upsample_label(df, label)
        if len(df[df[LABEL_COLUMN] == label]) >= lenghth:
            break

In [None]:
# Rows per label in `df` at this point of the notebook.
print(df[LABEL_COLUMN].value_counts())

In [None]:
# 80/20 shuffled split with a fixed seed so the partition is reproducible.
# NOTE(review): `df` already contains the upsampled rows at this point, so
# augmented copies of one original text can end up on both sides of the split.
df_train, df_test = train_test_split(df, shuffle=True, random_state=42, test_size=0.2)

# Label values in the order they first occur in `df`.
seen_labels = list(df[LABEL_COLUMN].unique())
dataset = Dataset(train=df_train, test=df_test, label_list=seen_labels)

In [None]:
# Peek at the first rows of the training split.
dataset.train.head()

# training

In [None]:
from arabert.preprocess import ArabertPreprocessor
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
import logging
import torch
import optuna 

In [None]:
# Pretrained checkpoint identifier; the part after the "/" is also what the
# AraBERT preprocessor is constructed with.
model_name = 'aubmindlab/bert-base-arabertv02'
# Not referenced anywhere else in the visible part of this notebook.
task_name = 'classification'
# Fixed length that every encoded example is truncated / padded to.
max_len = 256

In [None]:
# The preprocessor is constructed with the checkpoint name minus its
# organisation prefix.
arabert_prep = ArabertPreprocessor(model_name.split("/")[-1])

# Overwrite the text column of both splits with the preprocessed text.
for split_frame in (dataset.train, dataset.test):
    split_frame[DATA_COLUMN] = split_frame[DATA_COLUMN].apply(arabert_prep.preprocess)

In [None]:
# Peek at the training split again, now with the preprocessed text column.
dataset.train.head()

In [None]:
class BERTDataset(Dataset):
    """Dataset that encodes one text per item into fixed-length features.

    Args:
        text: indexable collection of input texts.
        target: indexable collection of raw labels, aligned with `text`.
        model_name: identifier handed to `AutoTokenizer.from_pretrained`.
        max_len: length every encoded sequence is truncated / padded to.
        label_map: mapping from raw label value to class index.
    """

    def __init__(self, text, target, model_name, max_len, label_map):
        # Zero-argument super() runs the base initialiser; the one-argument
        # form `super(BERTDataset)` only builds an unbound super object.
        super().__init__()
        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        """Return the number of texts."""
        return len(self.text)

    def __getitem__(self, item):
        """Encode the text at position `item`.

        Returns:
            An `InputFeatures` holding `input_ids` and `attention_mask`, both of
            length `max_len`, and the class index of the item's label.
        """
        # Collapse every run of whitespace into a single space.
        text = " ".join(str(self.text[item]).split())

        input_ids = self.tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation='longest_first',
        )
        # Real tokens are attended to (1); padding added below is not (0).
        attention_mask = [1] * len(input_ids)

        # Right-pad both sequences up to the fixed length.
        padding_length = self.max_len - len(input_ids)
        input_ids = input_ids + [self.tokenizer.pad_token_id] * padding_length
        attention_mask = attention_mask + [0] * padding_length

        return InputFeatures(
            input_ids=input_ids,
            attention_mask=attention_mask,
            label=self.label_map[self.target[item]],
        )

In [None]:
# Give each raw label value a class index, following the order of label_list.
label_map = dict(zip(dataset.label_list, range(len(dataset.label_list))))
print(label_map)


def _as_bert_dataset(frame):
    """Wrap the text and label columns of one split in a BERTDataset."""
    return BERTDataset(
        frame[DATA_COLUMN].to_list(),
        frame[LABEL_COLUMN].to_list(),
        model_name,
        max_len,
        label_map,
    )


train_dataset = _as_bert_dataset(dataset.train)
test_dataset = _as_bert_dataset(dataset.test)

In [None]:
# Show a handful of training texts only; displaying the whole list floods the
# cell output and bloats the saved notebook.
train_dataset.text[:5]

In [None]:
def model_init():
    """Factory handed to the Trainer as `model_init`.

    Returns:
        The `model_name` checkpoint loaded for sequence classification with
        one output per entry of `label_map`.
    """
    n_classes = len(label_map)
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=n_classes,
        return_dict=True,
    )

In [None]:
def compute_metrics(p):  # p should be of type EvalPrediction
    """Compute macro-averaged classification metrics for one evaluation pass.

    Args:
        p: object exposing `predictions` (per-class scores, one row per
            example) and `label_ids` (true class indices).

    Returns:
        dict with the keys 'macro_f1', 'macro_f1_pos_neg', 'macro_precision',
        'macro_recall' and 'accuracy'.
    """
    preds = np.argmax(p.predictions, axis=1)
    assert len(preds) == len(p.label_ids)

    # `label_ids` hold the class indices assigned by `label_map`, not the raw
    # sentiment values, so the negative (-1) and positive (1) classes have to
    # be translated before restricting the F1 score to them.
    pos_neg_ids = [label_map[-1], label_map[1]]

    macro_f1_pos_neg = f1_score(p.label_ids, preds, average='macro', labels=pos_neg_ids)
    macro_f1 = f1_score(p.label_ids, preds, average='macro')
    macro_precision = precision_score(p.label_ids, preds, average='macro')
    macro_recall = recall_score(p.label_ids, preds, average='macro')
    acc = accuracy_score(p.label_ids, preds)
    return {
        'macro_f1': macro_f1,
        'macro_f1_pos_neg': macro_f1_pos_neg,
        'macro_precision': macro_precision,
        'macro_recall': macro_recall,
        'accuracy': acc,
    }

In [None]:
training_args = TrainingArguments("./train")

# Settings applied on top of the defaults, in this order, after construction.
# NOTE(review): values assigned this way bypass whatever checks the
# constructor performs -- confirm that is intended (e.g. `fp16` when the
# notebook falls back to the CPU).
_training_overrides = [
    ("evaluate_during_training", True),
    ("adam_epsilon", 1e-8),
    ("lr_scheduler_type", 'cosine'),
    ("fp16", True),
    ("per_device_train_batch_size", 16),
    ("per_device_eval_batch_size", 16),
    ("gradient_accumulation_steps", 2),
    ("num_train_epochs", 8),
    ("evaluation_strategy", EvaluationStrategy.EPOCH),
    ("save_steps", 100000),
    ("disable_tqdm", True),
]
for _option, _value in _training_overrides:
    setattr(training_args, _option, _value)

In [None]:
# Examples consumed per optimizer update: per-device batch size times the
# number of accumulated batches.
examples_per_update = (
    training_args.per_device_train_batch_size
    * training_args.gradient_accumulation_steps
)
steps_per_epoch = len(dataset.train) // examples_per_update
total_steps = steps_per_epoch * training_args.num_train_epochs
print(steps_per_epoch)
print(total_steps)

In [None]:
# The model is supplied through `model_init` rather than as an instance.
# NOTE(review): the test split doubles as the evaluation set here -- confirm
# that no separate validation split is intended.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
def my_hp_space(trial):
    """Declare the hyperparameters sampled for one Optuna trial.

    Args:
        trial: the Optuna trial to draw suggestions from.

    Returns:
        dict with the suggested 'learning_rate', 'seed' and 'warmup_steps'.
    """
    # Integer suggestions need integer bounds. They are derived exactly like
    # the grid in `search_space` (0 .. 10% of `total_steps`, in steps of 5%),
    # and the step is kept at >= 1 for very small `total_steps`.
    warmup_max = int(total_steps * 0.1)
    warmup_step = max(1, int(total_steps * 0.1 * 0.5))
    return {
        "learning_rate": trial.suggest_float("learning_rate", 2e-5, 7e-5, step=1e-5),
        "seed": trial.suggest_categorical("seed", [0, 1, 42, 666, 123, 12345]),
        "warmup_steps": trial.suggest_int("warmup_steps", 0, warmup_max, step=warmup_step),
    }

# Grid handed to the GridSampler. Warm-up covers 0 .. 10% of `total_steps` in
# increments of 5%; the increment is kept at >= 1 because range() rejects a
# zero step, which would otherwise happen for very small `total_steps`.
_warmup_max = int(total_steps * 0.1)
_warmup_step = max(1, int(total_steps * 0.1 * 0.5))

search_space = {
    "learning_rate": list(np.arange(2e-5, 7e-5, 1e-5)),
    "seed": [0, 1, 42, 666, 123, 12345],
    "warmup_steps": list(range(0, _warmup_max + 1, _warmup_step)),
}
search_space

In [None]:
def my_objective(metrics):
    """Select the value the hyperparameter search optimises.

    Args:
        metrics: mapping of evaluation metric names to values; it must
            contain the key 'eval_macro_f1'.

    Returns:
        The value stored under 'eval_macro_f1'.
    """
    objective_key = 'eval_macro_f1'
    return metrics[objective_key]

In [None]:
# Study name passed to the hyperparameter search below.
name = "sa-arabert-base-v2"

In [None]:
# Run the search. Keyword arguments that `hyperparameter_search` does not
# consume itself (pruner, sampler, study_name, storage, load_if_exists) are
# passed along to `optuna.create_study`.
best_run = trainer.hyperparameter_search(
    direction="maximize",
    hp_space=my_hp_space,
    compute_objective=my_objective,
    n_trials=1,
    pruner=optuna.pruners.NopPruner(),
    sampler=optuna.samplers.GridSampler(search_space),
    study_name=name,
    # `storage` had no value, which made the whole cell a SyntaxError. None
    # keeps the study in memory; supply a database URL (for example
    # "sqlite:///optuna.db") to persist trials, and do not hard-code
    # credentials in it.
    storage=None,
    load_if_exists=False,
)