#Fine-tune AraBERT with Fast-BERT library
https://github.com/kaushaltrivedi/fast-ber

##Get Requirements

In [None]:
!git clone https://github.com/aub-mind/arabert
!pip install PyArabic farasapy fast-bert

##Prepare the data

In [2]:
import pandas as pd
#from py4j.java_gateway import JavaGateway
from farasa.segmenter import FarasaSegmenter
from arabert.preprocess_arabert import preprocess
from sklearn.model_selection import train_test_split

# !pkill "java"
# gateway = JavaGateway.launch_gateway(classpath='./FarasaSegmenterJar.jar')
# farasa_segmenter = gateway.jvm.com.qcri.farasa.segmenter.Farasa()

farasa_segmenter = FarasaSegmenter(interactive=True)

df_AJGT = pd.read_excel('./arabert/AJGT.xlsx',header=0)

DATA_COLUMN = 'text'
LABEL_COLUMN = 'label'

df_AJGT = df_AJGT[['Feed', 'Sentiment']]
df_AJGT.columns = [DATA_COLUMN, LABEL_COLUMN]

label_map = {
    'Negative' : 0,
    'Positive' : 1
}

df_AJGT[DATA_COLUMN] = df_AJGT[DATA_COLUMN].apply(lambda x: preprocess(x, do_farasa_tokenization=True , farasa=farasa_segmenter, use_farasapy = True))
# df_AJGT[LABEL_COLUMN] = df_AJGT[LABEL_COLUMN].apply(lambda x: label_map[x])

train_AJGT, test_AJGT = train_test_split(df_AJGT, test_size=0.2,random_state=42)
!mkdir data
train_AJGT.to_csv("data/train.csv",index=True,columns=train_AJGT.columns,sep=',',header=True)
test_AJGT.to_csv("data/dev.csv",index=True,columns=test_AJGT.columns,sep=',',header=True)
with open('data/labels.csv','w') as f:
  f.write("\n".join(df_AJGT['label'].unique()))

perform system check...
check java version...
Your java version is 11.0 which is compatiple with Farasa 
check toolkit binaries...
some binaries are not existed..
downloading zipped binaries...
100%|██████████| 200M/200M [00:06<00:00, 30.6MiB/s]
extracting...
toolkit binaries are downloaded and extracted.
Dependencies seem to be satisfied..
[37minitializing [SEGMENT] task in [32mINTERACTIVE [37mmode...


  "Be careful with large lines as they may break on interactive mode. You may switch to Standalone mode for such cases."


task [SEGMENT] is initialized interactively.


In [5]:
with open('data/labels.csv','w') as f:
  f.write("\n".join(df_AJGT['label'].unique()))

##Create a DataBunch Object:
see https://github.com/kaushaltrivedi/fast-bert#text-classification

In [9]:
from fast_bert.data_cls import BertDataBunch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('aubmindlab/bert-base-arabert')

databunch = BertDataBunch('./data/', './data/',
                          tokenizer=tokenizer,
                          train_file='train.csv',
                          val_file='dev.csv',
                          label_file='labels.csv',
                          text_col='text',
                          label_col='label',
                          batch_size_per_gpu=16,
                          max_seq_length=256,
                          multi_gpu=True,
                          multi_label=False,
                          model_type='bert',
                          )

##Create the Learner Object
see https://github.com/kaushaltrivedi/fast-bert#2-create-a-learner-object

In [27]:
import logging
import torch

from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

device_cuda = torch.device("cuda")
metrics = [{'name': 'accuracy', 'function': accuracy}]

!mkdir 'output'
learner = BertLearner.from_pretrained_model(
						databunch,
						pretrained_path='aubmindlab/bert-base-arabert',
						metrics=metrics,
						device=device_cuda,
						logger=logger,
						output_dir='output',
						finetuned_wgts_path=None,
						warmup_steps=30,
						multi_gpu=False,
						is_fp16=False,
						multi_label=False,
						logging_steps=0)

mkdir: cannot create directory ‘output’: File exists


INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/aubmindlab/bert-base-arabert/config.json from cache at /root/.cache/torch/transformers/91c3e98e149f6e88215bffd705e4ef9bd8a355f4c317973e4f3868c6f93fa24a.6eb3bdeb0de95f2f47fb89640edd8008987e27ff2fafa62e0210100371359306
INFO:transformers.configuration_utils:Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",

##Start Training and Validating

In [28]:
learner.fit(epochs=5,
			lr=2e-5,
			validate=True, 	# Evaluate the model after each epoch
			schedule_type="warmup_linear",
			optimizer_type="adamw")

INFO:__main__:***** Running training *****
INFO:__main__:  Num examples = 1440
INFO:__main__:  Num Epochs = 5
INFO:__main__:  Total train batch size (w. parallel, distributed & accumulation) = 16
INFO:__main__:  Gradient Accumulation steps = 1
INFO:__main__:  Total optimization steps = 450


INFO:__main__:Running evaluation
INFO:__main__:  Num examples = 360
INFO:__main__:  Batch size = 32


INFO:__main__:eval_loss after epoch 1: 0.21195762356122336: 
INFO:__main__:eval_accuracy after epoch 1: 0.9111111111111111: 
INFO:__main__:lr after epoch 1: 1.7142857142857142e-05
INFO:__main__:train_loss after epoch 1: 0.42563389043013256
INFO:__main__:

INFO:__main__:Running evaluation
INFO:__main__:  Num examples = 360
INFO:__main__:  Batch size = 32


INFO:__main__:eval_loss after epoch 2: 0.2557687560717265: 
INFO:__main__:eval_accuracy after epoch 2: 0.9111111111111111: 
INFO:__main__:lr after epoch 2: 1.2857142857142859e-05
INFO:__main__:train_loss after epoch 2: 0.1966747651911444
INFO:__main__:

INFO:__main__:Running evaluation
INFO:__main__:  Num examples = 360
INFO:__main__:  Batch size = 32


INFO:__main__:eval_loss after epoch 3: 0.2621495487789313: 
INFO:__main__:eval_accuracy after epoch 3: 0.9333333333333333: 
INFO:__main__:lr after epoch 3: 8.571428571428571e-06
INFO:__main__:train_loss after epoch 3: 0.09381214525136683
INFO:__main__:

INFO:__main__:Running evaluation
INFO:__main__:  Num examples = 360
INFO:__main__:  Batch size = 32


INFO:__main__:eval_loss after epoch 4: 0.2937438429022829: 
INFO:__main__:eval_accuracy after epoch 4: 0.9333333333333333: 
INFO:__main__:lr after epoch 4: 4.2857142857142855e-06
INFO:__main__:train_loss after epoch 4: 0.03164816682951318
INFO:__main__:

INFO:__main__:Running evaluation
INFO:__main__:  Num examples = 360
INFO:__main__:  Batch size = 32


INFO:__main__:eval_loss after epoch 5: 0.3140492020174861: 
INFO:__main__:eval_accuracy after epoch 5: 0.9222222222222223: 
INFO:__main__:lr after epoch 5: 0.0
INFO:__main__:train_loss after epoch 5: 0.020089184989531834
INFO:__main__:



(450, 0.15357163053833775)

##You can see the output using tensorboard

In [None]:
%load_ext tensorboard
%tensorboard --logdir './output/tensorboard'