# Mount Your Drive

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# As the recorded output shows, this prompts for an authorization code before mounting.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
#copy the BERT model to Colab
# Recursively copies the AraBERT_models folder from the mounted Drive into the current
# working directory; later cells read the model from ./AraBERT_models/tf_arabert.
!cp -r "/content/drive/My Drive/AraBERT_models/" ./

# Installing Farasa and pyarabic

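To do Farasa segmenting you will need the Farasa segmenter JAR, named `FarasaSegmenterJar.jar`. The original instructions place it in the same directory as the preprocess.py file; this notebook loads it from the current working directory as `./FarasaSegmenterJar.jar`, so make sure the file is present there under that exact name.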

(you can get the Farasa segmenter from http://qatsdemo.cloudapp.net/farasa/register.html)

In [3]:
#install java on colab (needed for Farasa)
import os       
# install_java: installs the headless OpenJDK 8 package through apt (output silenced),
# sets JAVA_HOME to the OpenJDK 8 directory, then prints the version of the `java`
# binary that the shell resolves.
# NOTE(review): the recorded output of this cell reports OpenJDK 11, not 8 -- the bare
# `java` command is resolved through PATH rather than JAVA_HOME; confirm which JVM is
# actually intended for Farasa.
def install_java():
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !java -version       #check java version
install_java()
# Python-side dependencies: py4j (used below to launch the Java gateway) and pyarabic.
# Versions are not pinned here; the recorded output shows py4j 0.10.9 and PyArabic 0.6.6.
!pip install py4j
!pip install pyarabic 

openjdk version "11.0.6" 2020-01-14
OpenJDK Runtime Environment (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1)
OpenJDK 64-Bit Server VM (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1, mixed mode, sharing)
Collecting py4j
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 2.8MB/s 
[?25hInstalling collected packages: py4j
Successfully installed py4j-0.10.9
Collecting pyarabic
[?25l  Downloading https://files.pythonhosted.org/packages/b8/77/da852ee13bce3affc55b746cebc0fdc0fc48628dbc5898ce489112cd6bd1/PyArabic-0.6.6.tar.gz (101kB)
[K     |████████████████████████████████| 102kB 2.3MB/s 
[?25hBuilding wheels for collected packages: pyarabic
  Building wheel for pyarabic (setup.py) ... [?25l[?25hdone
  Created wheel for pyarabic: filename=PyArabic-0.6.6-cp36-none-any.whl size=106208 sha256=6290721a6eb1f51c1108b7d5358a41e37195

In [0]:
# This command is useful when the Java runtime hangs after a runtime restart (Colab issue).
# It kills every running process named "java".
!pkill "java"

# TensorFlow implementation

In [0]:
# Clone the AraBERT repository into ./arabert; later cells import tokenization,
# preprocess_arabert and run_classifier from it and read ./arabert/AJGT.xlsx.
!git clone https://github.com/aub-mind/arabert

If the cell below hangs, interrupt it, run `!pkill "java"` to kill the stuck Java process, and then re-run the cell.

In [0]:
import tensorflow as tf
from arabert import tokenization
from arabert.preprocess_arabert import preprocess
from py4j.java_gateway import JavaGateway

!pkill "java"

gateway = JavaGateway.launch_gateway(classpath='./FarasaSegmenterJar.jar')
farasa = gateway.jvm.com.qcri.farasa.segmenter.Farasa()

#Configure the path to the araBERT folder
ARABERT_PATH = "./AraBERT_models/tf_arabert"

In [0]:
#test BERT tokenizer
# Build the tokenizer from the vocab.txt file inside the AraBERT model folder.
bert_tokenizer = tokenization.FullTokenizer(ARABERT_PATH+"/vocab.txt")

In [18]:
# Run a sample sentence through preprocess() with Farasa segmentation enabled.
# In the printed result, prefixes and suffixes are split off and marked with '+'.
text = "الدراسات النظرية للتصميم الحديث"
text_prep = preprocess(text, do_farasa_tokenization=True , farasa=farasa)
print(text_prep)

ال+ دراس +ات ال+ نظري +ة ل+ ال+ تصميم ال+ حديث


In [19]:
# Tokenize the pre-segmented sample with the AraBERT vocabulary; the token list is the cell output.
bert_tokenizer.tokenize(text_prep)

['ال+',
 'دراس',
 '+ات',
 'ال+',
 'نظري',
 '+ة',
 'ل+',
 'ال+',
 'تصميم',
 'ال+',
 'حديث']

## TensorFlow Training

**ENABLE TPU RUNTIME!!!**

Fine-tune AraBERT and measure its sentiment-analysis score on a dataset such as AJGT (Arabic Jordanian General Tweets):

K. M. Alomari, H. M. ElSherif, and K. Shaalan, “Arabic tweets sentimental analysis using machine learning,” in Proceedings of the International Conference on Industrial, Engineering and Other Applications of Applied Intelligent Systems, pp. 602–610, Montreal, Canada, June 2017.

In [28]:
import os
import sys
import json
import nltk
import random
import logging
import tensorflow as tf
import pandas as pd

from glob import glob
from tensorflow.keras.utils import Progbar
from tqdm import  tqdm

sys.path.append("arabert")

import arabert
from arabert import modeling, optimization, tokenization
from arabert.run_classifier import input_fn_builder, model_fn_builder

from sklearn.model_selection import train_test_split

from google.colab import auth
auth.authenticate_user()

# Send everything logged under the 'tensorflow' logger through one timestamped
# stream handler, replacing whatever handlers were attached before.
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)
stream_handler.setFormatter(logging.Formatter('%(asctime)s :  %(message)s'))
log.handlers = [stream_handler]

# Colab exposes the TPU endpoint through COLAB_TPU_ADDR; without it, run off-TPU.
if 'COLAB_TPU_ADDR' not in os.environ:
  log.warning('Not connected to TPU runtime')
  USE_TPU = False
else:
  log.info("Using TPU runtime")
  USE_TPU = True
  TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']

  with tf.Session(TPU_ADDRESS) as session:
    log.info('TPU address is ' + TPU_ADDRESS)
    # Hand the credentials stored in /content/adc.json to the TPU session for GCS access.
    with open('/content/adc.json', 'r') as adc_file:
      auth_info = json.load(adc_file)
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)

2020-03-11 12:09:35,885 :  Not connected to TPU runtime


In [23]:
# Input data pipeline config
# The `#@param` markers below are Colab form annotations; keep them on the same line
# as the value they describe.
TRAIN_BATCH_SIZE = 32 #@param {type:"integer"} #You can probably 
                                              #increase when using TPUS
MAX_SEQ_LENGTH = 512 #@param {type:"integer"} #reduce if running on GPU

# Training procedure config
EVAL_BATCH_SIZE = 64 
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 6 #@param {type:"integer"}
# Fraction of the total training steps used as warm-up steps.
WARMUP_PROPORTION = 0.1 #@param {type:"number"}
# Passed to the TPU config as the number of shards.
NUM_TPU_CORES = 8
PREDICT_BATCH_SIZE = 8


# Model files, resolved relative to the AraBERT model folder.
CONFIG_FILE = os.path.join(ARABERT_PATH, "config.json")
INIT_CHECKPOINT = os.path.join(ARABERT_PATH,"arabert_model.ckpt")#change the model name when you use arabertv0.1

# Directory where the estimator writes fine-tuning checkpoints.
OUTPUT_DIR_PER_MODEL = "./finetuned_model"
bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)

# NOTE(review): the recorded output of this cell shows INIT_CHECKPOINT equal to
# ARABERT_PATH, which does not match the join above -- the saved output looks stale;
# re-run to confirm.
log.info("Using checkpoint: {}".format(INIT_CHECKPOINT))

print("ARABERT_PATH: "+ARABERT_PATH)
print("CONFIG_FILE: "+CONFIG_FILE)
print("INIT_CHECKPOINT: "+INIT_CHECKPOINT)

2020-03-11 12:08:05,119 :  Using checkpoint: ./AraBERT_models/tf_arabert


ARABERT_PATH: ./AraBERT_models/tf_arabert
CONFIG_FILE: ./AraBERT_models/tf_arabert/config.json
INIT_CHECKPOINT: ./AraBERT_models/tf_arabert


In [0]:
# Column names used for the dataset throughout the rest of the notebook.
DATA_COLUMN = 'text'
LABEL_COLUMN = 'label'

# Load the AJGT spreadsheet, keep only the tweet text and its sentiment, and rename
# them by name. rename() returns a fresh frame, so the column assignment below is
# not performed on a column-subset view of the raw spreadsheet.
df_AJGT = (
    pd.read_excel('./arabert/AJGT.xlsx', header=0)[['Feed', 'Sentiment']]
    .rename(columns={'Feed': DATA_COLUMN, 'Sentiment': LABEL_COLUMN})
)

# Apply the Farasa-based preprocessing to every tweet (same call as the sample
# sentence earlier in the notebook).
df_AJGT[DATA_COLUMN] = df_AJGT[DATA_COLUMN].apply(
    lambda tweet: preprocess(tweet, do_farasa_tokenization=True , farasa=farasa))

# Fixed 80/20 split; random_state keeps it reproducible across runs.
train_AJGT, test_AJGT = train_test_split(df_AJGT, test_size=0.2,random_state=42)

In [0]:
def _row_to_input_example(row):
  """Wrap one dataframe row as a single-sentence InputExample.

  guid is a bookkeeping ID that is unused in this example, and text_b is None
  because each example carries only one text.
  """
  return arabert.run_classifier.InputExample(
      guid=None,
      text_a=row["text"],
      text_b=None,
      label=row["label"])

# Same conversion for both splits; each result is a Series of InputExample objects.
train_InputExamples = train_AJGT.apply(_row_to_input_example, axis=1)
test_InputExamples = test_AJGT.apply(_row_to_input_example, axis=1)

In [32]:
# Distinct label values in order of first appearance; this list is handed to
# convert_examples_to_features for both splits.
labels = list(df_AJGT.label.unique())
print(labels)

# Convert both splits to model features using MAX_SEQ_LENGTH and the AraBERT tokenizer.
# The recorded output shows input_ids zero-padded after the [SEP] token.
train_features = arabert.run_classifier.convert_examples_to_features(train_InputExamples, labels, MAX_SEQ_LENGTH, bert_tokenizer)
test_features = arabert.run_classifier.convert_examples_to_features(test_InputExamples, labels, MAX_SEQ_LENGTH, bert_tokenizer)

2020-03-11 12:10:02,955 :  From /content/arabert/run_classifier.py:777: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.


2020-03-11 12:10:02,967 :  Writing example 0 of 1440
INFO:tensorflow:Writing example 0 of 1440
2020-03-11 12:10:02,971 :  *** Example ***
INFO:tensorflow:*** Example ***
2020-03-11 12:10:02,976 :  guid: None
INFO:tensorflow:guid: None
2020-03-11 12:10:02,979 :  tokens: [CLS] سبحان الله ب+ حمد +ه عدد خلق +ه رضى نفس +ه زن +ه عرش +ه مداد كلم +ات +ه [SEP]
INFO:tensorflow:tokens: [CLS] سبحان الله ب+ حمد +ه عدد خلق +ه رضى نفس +ه زن +ه عرش +ه مداد كلم +ات +ه [SEP]
2020-03-11 12:10:02,982 :  input_ids: 29756 36006 12695 448 3945 129 5367 4095 129 4444 6746 129 630 129 5383 129 21336 6025 1012 129 29758 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

['Positive', 'Negative']


2020-03-11 12:10:03,689 :  Writing example 0 of 360
INFO:tensorflow:Writing example 0 of 360
2020-03-11 12:10:03,692 :  *** Example ***
INFO:tensorflow:*** Example ***
2020-03-11 12:10:03,694 :  guid: None
INFO:tensorflow:guid: None
2020-03-11 12:10:03,696 :  tokens: [CLS] و+ الله حرام و+ الله موتو +ه ل+ شعب ال+ اردني من و ##ين بدن +ا نجيب ال+ كو من و ##ين يا الله ارحم ##و من في ال+ ارض يرحمك ##م من في ال+ سماء الله حرام [SEP]
INFO:tensorflow:tokens: [CLS] و+ الله حرام و+ الله موتو +ه ل+ شعب ال+ اردني من و ##ين بدن +ا نجيب ال+ كو من و ##ين يا الله ارحم ##و من في ال+ ارض يرحمك ##م من في ال+ سماء الله حرام [SEP]
2020-03-11 12:10:03,697 :  input_ids: 29756 897 12695 16006 897 12695 22398 129 816 4928 3000 31462 857 117 8268 3106 124 22786 3000 813 857 117 8268 900 12695 12271 1005 857 781 3000 2889 41768 1002 857 781 3000 17867 12695 16006 29758 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [34]:
# Training schedule derived from the dataset size: total optimisation steps, the
# warm-up portion of them, and the number of steps that make up one epoch.
num_train_steps = int(len(train_features) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
num_steps_per_epoch = len(train_features) // TRAIN_BATCH_SIZE

print("num train steps: {}".format(num_train_steps))
print("num warmup steps: {}".format(num_warmup_steps))
print("num_steps_per_epoch: {}".format(num_steps_per_epoch))

model_fn = model_fn_builder(
  bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
  # Size the classifier from the label list the features were built with, instead of
  # a hard-coded 2, so the head always matches the dataset.
  num_labels=len(labels),
  init_checkpoint=INIT_CHECKPOINT,
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=USE_TPU,
  use_one_hot_embeddings=USE_TPU
)

# TPU_ADDRESS only exists when a TPU runtime was detected, so resolve it conditionally.
tpu_cluster_resolver = None
if USE_TPU:
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)

# Save a checkpoint once per epoch and keep all of them (keep_checkpoint_max=0).
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=OUTPUT_DIR_PER_MODEL,
    save_checkpoints_steps=num_steps_per_epoch,
    keep_checkpoint_max=0,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=num_steps_per_epoch,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=USE_TPU,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE,
    predict_batch_size=PREDICT_BATCH_SIZE)

train_input_fn = input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True)

# Drop the final partial batch only on TPU, where batch shapes must be fixed. Off-TPU,
# keep it so the input pipeline does not exclude test examples.
# NOTE(review): on TPU the trailing len(test_features) % EVAL_BATCH_SIZE examples are
# still dropped; padding the test set would be needed to score all of them.
test_input_fn = input_fn_builder(
  features=test_features,
  seq_length=MAX_SEQ_LENGTH,
  is_training=False,
  drop_remainder=USE_TPU)

2020-03-11 12:10:40,673 :  Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x7f040e78d620>) includes params argument, but params are not passed to Estimator.
2020-03-11 12:10:40,683 :  Using config: {'_model_dir': './finetuned_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 45, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 0, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f040e8c29b0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '

num train steps: 270
num warmup steps: 27
num_steps_per_epoch: 45


## Train the model

In [0]:
# Fine-tune the estimator on the training input pipeline, with num_train_steps
# passed as the max_steps limit.
print(f'Beginning Training!')
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

## Evaluate the model on all saved checkpoint files

In [0]:
print('Beginning Evaluation!')

# Each saved checkpoint has a "<prefix>.index" file; evaluate() takes the prefix.
INDEX_SUFFIX = '.index'
eval_model_files = tf.gfile.Glob(os.path.join(OUTPUT_DIR_PER_MODEL, '*' + INDEX_SUFFIX))

# Strip the suffix, then order by the global step after the last '-' in the prefix
# so checkpoints are evaluated in training order.
checkpoint_prefixes = sorted(
    (index_file[:-len(INDEX_SUFFIX)] for index_file in eval_model_files),
    key=lambda prefix: int(prefix.split('-')[-1]))

if not checkpoint_prefixes:
  tf.logging.warning("No checkpoints found in %s; nothing to evaluate.", OUTPUT_DIR_PER_MODEL)

# On TPU the number of evaluation steps must be given explicitly. Off-TPU, steps=None
# evaluates until the input pipeline is exhausted, so the step count itself never
# cuts the test set short.
eval_steps = len(test_features) // EVAL_BATCH_SIZE if USE_TPU else None

for eval_checkpoint in tqdm(checkpoint_prefixes):
  result = estimator.evaluate(input_fn=test_input_fn, steps=eval_steps, checkpoint_path=eval_checkpoint)
  tf.logging.info("***** Eval results *****")
  for key in sorted(result.keys()):
    tf.logging.info("  %s = %s", key, str(result[key]))