<a href="https://colab.research.google.com/github/YahyaGrb/Arabi_Poem_Hacathon/blob/main/AraBERT_finteune_poem_fill_mask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers datasets

In [None]:
import numpy as np
import tensorflow
from transformers import AutoTokenizer, DataCollatorWithPadding, TFAutoModelForMaskedLM

In [None]:
# Log in to the Hugging Face Hub from the command line. Setup instructions:
# https://huggingface.co/docs/transformers/model_sharing?highlight=login#setup
!huggingface-cli login #grant access to my private datasets
# When prompted, paste an access token created at https://huggingface.co/settings/tokens

# Select a pretrained model

In [None]:
# Candidate checkpoints: exactly one `checkpoint = ...` line should be active;
# the alternatives are kept commented out for quick switching.
# checkpoint = "aubmindlab/bert-base-arabertv02" # classical Arabic, could be closer to poems
checkpoint = "distilbert-base-multilingual-cased" # not very good performance in a preliminary test
# checkpoint = "CAMeL-Lab/bert-base-arabic-camelbert-ca"
# Load the tokenizer and the TensorFlow masked-LM model for the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForMaskedLM.from_pretrained(checkpoint)

In [None]:
# Display the loaded tokenizer object to inspect its settings.
tokenizer

In [None]:
# Call the model once on its own dummy inputs so that its weights exist,
# then print the Keras summary of the model.
model(model.dummy_inputs)  # Build the model
model.summary()

In [None]:
# Quick sanity check: ask the pretrained model to fill in one masked token.
import numpy as np
import tensorflow as tf

# Build the prompt from the tokenizer's own mask token instead of a literal
# "[MASK]": the lookup below relies on `tokenizer.mask_token_id`, so the prompt
# must use the matching string or switching `checkpoint` would break this cell.
text = f"قفا نبك من ذِكرى حبيب و {tokenizer.mask_token}"
inputs = tokenizer(text, return_tensors="np")
token_logits = model(**inputs).logits
# Find the location of the mask token and extract its logits
mask_positions = np.argwhere(inputs["input_ids"] == tokenizer.mask_token_id)
if mask_positions.shape[0] == 0:
    # Fail with a readable message rather than an opaque IndexError.
    raise ValueError(f"No {tokenizer.mask_token} token found in: {text!r}")
mask_token_index = mask_positions[0, 1]  # (row, column) of the first mask -> column
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the mask candidates with the highest logits
# We negate the array before argsort to get the largest, not the smallest, logits
top_5_tokens = np.argsort(-mask_token_logits)[:5].tolist()

# Print the prompt once per candidate, with the mask replaced by the decoded token.
for token in top_5_tokens:
    print(f">>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}")

In [None]:
from datasets import load_dataset

# Download the poem dataset from the Hugging Face Hub, authenticating with the
# locally stored credentials (`use_auth_token=True`).
# NOTE(review): newer `datasets` releases appear to prefer `token=` over
# `use_auth_token=` — confirm against the installed version.
dataset = load_dataset("Yah216/Poem_APCD_text_only", use_auth_token=True)

## Prepare the data

In [None]:
# Display the dataset object to see its splits and columns.
dataset

In [None]:
# Shuffle the training split with a fixed seed and print the first three
# verses (column "البيت") as a quick look at the raw text.
sample = dataset["train"].shuffle(seed=42).select(range(3))
for row in sample:
    print(f"\n'>>> البيت: {row['البيت']}'")

## Tokenize the data

In [None]:
def tokenize_function(examples):
    """Tokenize the verse column ("البيت") of a batch of examples.

    Uses the module-level ``tokenizer``. When that tokenizer is a "fast" one,
    a ``"word_ids"`` entry is added holding, for each row of the batch, the
    result of ``word_ids(row_index)`` on the encoding.

    Args:
        examples: batch mapping that contains the key "البيت".

    Returns:
        The tokenizer's encoding for the batch, optionally extended with
        ``"word_ids"``.
    """
    encoded = tokenizer(examples["البيت"])
    if not tokenizer.is_fast:
        # Slow tokenizers do not expose word ids; return the plain encoding.
        return encoded
    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = list(map(encoded.word_ids, range(row_count)))
    return encoded


# Apply `tokenize_function` to the dataset and drop the raw text column,
# which is no longer needed once the text has been tokenized.
# batched=True hands `tokenize_function` a list of verses per call instead of
# one verse at a time.
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=["البيت"]
)
tokenized_datasets

## Create data chunks

In [None]:
# Number of tokens per training chunk; read by `group_texts` below.
chunk_size = 128

In [None]:
# prepare chunking
def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Args:
        examples: mapping from column name to a list of per-example lists.
            All columns are assumed to have the same total length once
            flattened (the length of the first column is used for every column).

    Returns:
        dict with the same columns, each split into consecutive chunks of
        exactly ``chunk_size`` items (``chunk_size`` is a module-level
        constant). A trailing remainder shorter than ``chunk_size`` is
        dropped. A ``"labels"`` column is added as a shallow copy of the
        ``"input_ids"`` chunks.
    """
    from itertools import chain

    # Concatenate all texts. chain.from_iterable is linear in the number of
    # items, whereas sum(list_of_lists, []) copies the accumulator on every
    # addition and is therefore quadratic.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Compute length of concatenated texts (taken from the first column)
    first_column = next(iter(concatenated_examples))
    total_length = len(concatenated_examples[first_column])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split every column into chunks of chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Re-chunk every split with `group_texts`, using 8 worker processes.
# NOTE(review): `num_proc = 8` is hard-coded; it may exceed the number of CPU
# cores available on the runtime — confirm it suits the target machine.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc = 8)
lm_datasets

In [None]:
# Decode one chunk back to text to check what a training example looks like.
tokenizer.decode(lm_datasets["train"][5]["input_ids"])

# Split the data and prepare the data collator

In [None]:
# Down-sample the chunked training split: a fixed number of training chunks and
# 5% of that number as held-out evaluation chunks, with a fixed seed so the
# split is repeatable.
# NOTE(review): assumes the chunked train split holds at least
# train_size + test_size rows — confirm for the dataset in use.
train_size = 400000
test_size = int(0.05 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator for masked language modeling, configured with a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
def _split_to_tf_dataset(split_name, shuffle):
    """Convert one split of `downsampled_dataset` into a batched TensorFlow dataset.

    Both splits share the same columns, collator and batch size; only the
    split name and the shuffle flag differ.
    """
    return downsampled_dataset[split_name].to_tf_dataset(
        columns=["input_ids", "attention_mask", "labels"],
        collate_fn=data_collator,
        shuffle=shuffle,
        batch_size=32,
    )


# Shuffle the training data only; the evaluation data keeps its order.
tf_train_dataset = _split_to_tf_dataset("train", shuffle=True)
tf_eval_dataset = _split_to_tf_dataset("test", shuffle=False)

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook; the training
# callback defined later pushes to a Hub repository.
notebook_login()

In [None]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

# Number of epochs run by the training cell (`model.fit(..., epochs = 3, ...)`).
# The learning-rate schedule has to span every optimizer step of the whole run:
# sizing it for a single epoch makes the rate decay to its final value after
# epoch 1, so the remaining epochs would barely update the model.
num_epochs = 3
steps_per_epoch = len(tf_train_dataset)  # number of batches in one epoch
num_train_steps = steps_per_epoch * num_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
# No loss is passed to compile().
# NOTE(review): presumably the model then falls back to its own internal loss — confirm.
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
# NOTE(review): the global policy is set after the model has been created and
# compiled; Keras layers normally pick up the policy when they are constructed,
# so this may not affect the already-built model — confirm whether the policy
# should be set before the model is loaded.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Callback for `model.fit` that pushes the model and tokenizer to this Hub repository.
callback = PushToHubCallback(
    "Yah216/DistilBERT-finetuned-ACDP", tokenizer=tokenizer
)

# Evaluate perplexity

In [None]:
import math

# Evaluate on the held-out split and report perplexity, i.e. exp(loss).
# This cell comes before the training cell, so on a top-to-bottom run the
# number is the baseline of the model before fine-tuning.
# `model.evaluate` is expected to return a single scalar loss here, because
# `math.exp` is applied to its result directly.
eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

In [None]:
# Fine-tune for three epochs, using the held-out split as validation data.
# The optimizer's learning-rate schedule is sized by `num_train_steps`; that
# value should cover all the epochs run here.
history = model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs = 3,
                    callbacks=[callback]
                    )

In [None]:
# Save the fine-tuned model locally under a path relative to the working directory.
model.save_pretrained("model/DistilBERT-finetuned-ACDP")


# Test the fine-tuned model

In [None]:
# Probe the model with a new prompt that contains exactly one masked token.
# The prompt is built from the tokenizer's own mask token instead of a literal
# "[MASK]": the lookup below relies on `tokenizer.mask_token_id`, so the prompt
# must use the matching string for whichever checkpoint is loaded.
text = f" إليك عني صديقي لندهب إلى بيت {tokenizer.mask_token} "
inputs = tokenizer(text, return_tensors="np")
token_logits = model(**inputs).logits
# Find the location of the mask token and extract its logits
mask_positions = np.argwhere(inputs["input_ids"] == tokenizer.mask_token_id)
if mask_positions.shape[0] == 0:
    # Fail with a readable message rather than an opaque IndexError.
    raise ValueError(f"No {tokenizer.mask_token} token found in: {text!r}")
mask_token_index = mask_positions[0, 1]  # (row, column) of the first mask -> column
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the mask candidates with the highest logits
# We negate the array before argsort to get the largest, not the smallest, logits
top_5_tokens = np.argsort(-mask_token_logits)[:5].tolist()

# Print the prompt once per candidate, with the mask replaced by the decoded token.
for token in top_5_tokens:
    print(f">>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}")