In [1]:
# Transformers installation
! pip install transformers datasets
!pip install evaluate
!pip install accelerate -U --quiet
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git



In [22]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import load_dataset

ds = load_dataset("Abdelkareem/arabic_tweets_classification")



  0%|          | 0/1 [00:00<?, ?it/s]

Then take a look at an example:

In [4]:
ds["train"][0]

{'Date': '2021-03-02 00:00:00',
 'Time': '06:48:15',
 'Date Time': 'Tue Mar 02 06:48:15 +0000 2021',
 'URL': 'https://twitter.com/AlArabiya/status/1366641482610970626',
 'Tweet Text': 'سي إن إن تستعد إدارة الرئيس بايدن لفرض عقوبات على روسيا على خلفية تسميم المعارض لكرملين أليكسينافالني وسجنه العربية',
 'Cleaned Text': 'سي ان ان تستعد اداره الرئيس بايدن لفرض عقوبات علي روسيا علي خلفيه تسميم المعارض لكرملين اليكسينافالني وسجنه العربيه',
 'User Name': 'AlArabiya',
 'Location': None,
 'Replied Tweet ID ': None,
 'Replied Tweet User ID': None,
 'Replied Tweet User name': None,
 'Coordinates': None,
 'Retweet Count': 4.0,
 'Favorite Count': 20,
 'Favorited': 'False',
 'Label': 'Ham'}

In [5]:
ds['train'].features

{'Date': Value(dtype='string', id=None),
 'Time': Value(dtype='string', id=None),
 'Date Time': Value(dtype='string', id=None),
 'URL': Value(dtype='string', id=None),
 'Tweet Text': Value(dtype='string', id=None),
 'Cleaned Text': Value(dtype='string', id=None),
 'User Name': Value(dtype='string', id=None),
 'Location': Value(dtype='string', id=None),
 'Replied Tweet ID ': Value(dtype='float64', id=None),
 'Replied Tweet User ID': Value(dtype='float64', id=None),
 'Replied Tweet User name': Value(dtype='string', id=None),
 'Coordinates': Value(dtype='float64', id=None),
 'Retweet Count': Value(dtype='float64', id=None),
 'Favorite Count': Value(dtype='int64', id=None),
 'Favorited': Value(dtype='string', id=None),
 'Label': Value(dtype='string', id=None)}

In [6]:
columns_to_rmv = [x for x in ds["train"].features if "Cleaned Text" not in x and "Label" not in x ]
columns_to_rmv

['Date',
 'Time',
 'Date Time',
 'URL',
 'Tweet Text',
 'User Name',
 'Location',
 'Replied Tweet ID ',
 'Replied Tweet User ID',
 'Replied Tweet User name',
 'Coordinates',
 'Retweet Count',
 'Favorite Count',
 'Favorited']

In [7]:
ds = ds.remove_columns(columns_to_rmv)
ds

DatasetDict({
    train: Dataset({
        features: ['Cleaned Text', 'Label'],
        num_rows: 13240
    })
})

In [8]:
ds = ds.rename_column("Cleaned Text","text")
ds = ds.rename_column("Label","label")

In [9]:
import pandas as pd
ds = ds.filter(lambda example: not pd.isna(example["text"]))



In [10]:
ds = ds.class_encode_column("label")
ds



DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 13239
    })
})

In [11]:
ds["train"][0]

{'text': 'سي ان ان تستعد اداره الرئيس بايدن لفرض عقوبات علي روسيا علي خلفيه تسميم المعارض لكرملين اليكسينافالني وسجنه العربيه',
 'label': 0}

In [12]:
import pandas as pd
ds = ds.filter(lambda example: not pd.isna(example["text"]))
ds = ds.filter(lambda example: not pd.isna(example["label"]))



## Preprocess

The next step is to load a DistilBERT tokenizer to preprocess the `text` field:

In [13]:
# Load model directly
from transformers import AutoTokenizer, AutoModel

#load your pre_trained model with all its weights
model_name= 'aubmindlab/bert-base-arabertv02'
tokenizer =AutoTokenizer.from_pretrained(model_name)

Create a preprocessing function to tokenize `text` and truncate sequences to be no longer than DistilBERT's maximum input length:

In [14]:
def tokenize_function(examples):
    return tokenizer(examples["text"],padding="max_length",truncation=True)


tokenize_ds = ds.map(tokenize_function,batched=True)



To apply the preprocessing function over the entire dataset, use 🤗 Datasets [map](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map) function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once:

In [15]:
tokenize_ds = tokenize_ds["train"].train_test_split(0.2)

Now create a batch of examples using [DataCollatorWithPadding](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DataCollatorWithPadding). It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [16]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluate

Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):

In [17]:
import evaluate

accuracy = evaluate.load("accuracy")

Then create a function that passes your predictions and labels to [compute](https://huggingface.co/docs/evaluate/main/en/package_reference/main_classes#evaluate.EvaluationModule.compute) to calculate the accuracy:

In [18]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training.

## Train

Before you start training your model, create a map of the expected ids to their labels with `id2label` and `label2id`:

In [19]:
id2label = {0: "Spam", 1: "Ham"}
label2id = {"Spam": 0, "Ham": 1}

<Tip>

If you aren't familiar with finetuning a model with the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer), take a look at the basic tutorial [here](https://huggingface.co/docs/transformers/main/en/tasks/../training#train-with-pytorch-trainer)!

</Tip>

You're ready to start training your model now! Load DistilBERT with [AutoModelForSequenceClassification](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForSequenceClassification) along with the number of expected labels, and the label mappings:

In [20]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer,AutoModel
model_name= 'aubmindlab/bert-base-arabertv02'

model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2,from_tf=False)

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-bas

At this point, only three steps remain:

1. Define your training hyperparameters in [TrainingArguments](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments). The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) will evaluate the accuracy and save the training checkpoint.
2. Pass the training arguments to [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) along with the model, dataset, tokenizer, data collator, and `compute_metrics` function.
3. Call [train()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train) to finetune your model.

In [21]:
training_args = TrainingArguments(
    output_dir="arabic_tweets_spam_or_ham",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenize_ds["train"],
    eval_dataset=tokenize_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Cloning https://huggingface.co/Abdelkareem/arabic_tweets_spam_or_ham into local empty directory.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0421,0.010654,0.997356
2,0.0122,0.018922,0.996601


TrainOutput(global_step=1324, training_loss=0.021635985266406012, metrics={'train_runtime': 2122.6023, 'train_samples_per_second': 9.979, 'train_steps_per_second': 0.624, 'total_flos': 5573218374635520.0, 'train_loss': 0.021635985266406012, 'epoch': 2.0})

<Tip>

[Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) applies dynamic padding by default when you pass `tokenizer` to it. In this case, you don't need to specify a data collator explicitly.

</Tip>

Once training is completed, share your model to the Hub with the [push_to_hub()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.push_to_hub) method so everyone can use your model:

In [23]:
trainer.push_to_hub()

Upload file runs/Jul10_14-23-37_8dccb8cbb04d/events.out.tfevents.1688999026.8dccb8cbb04d.1784.0:   0%|        …

To https://huggingface.co/Abdelkareem/arabic_tweets_spam_or_ham
   ace392e..1753c71  main -> main

   ace392e..1753c71  main -> main

To https://huggingface.co/Abdelkareem/arabic_tweets_spam_or_ham
   1753c71..9f3310b  main -> main

   1753c71..9f3310b  main -> main



'https://huggingface.co/Abdelkareem/arabic_tweets_spam_or_ham/commit/1753c715dff6438586118d9c69993de7d4481111'

<Tip>

For a more in-depth example of how to finetune a model for text classification, take a look at the corresponding
[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)
or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb).

</Tip>

## Inference

Great, now that you've finetuned a model, you can use it for inference!

Grab some text you'd like to run inference on:

In [30]:
text = "الحرامي الوسخ الكلب الحيوان"

The simplest way to try out your finetuned model for inference is to use it in a [pipeline()](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.pipeline). Instantiate a `pipeline` for sentiment analysis with your model, and pass your text to it:

In [31]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model="Abdelkareem/arabic_tweets_spam_or_ham")
classifier(text)

[{'label': 'LABEL_0', 'score': 0.9618229269981384}]

You can also manually replicate the results of the `pipeline` if you'd like:

Tokenize the text and return PyTorch tensors:

In [32]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Abdelkareem/arabic_tweets_spam_or_ham")
inputs = tokenizer(text, return_tensors="pt")

Pass your inputs to the model and return the `logits`:

In [55]:
from transformers import AutoModelForSequenceClassification
import torch

model = AutoModelForSequenceClassification.from_pretrained("Abdelkareem/arabic_tweets_spam_or_ham")
with torch.no_grad():
    logits = model(**inputs).logits

Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label:

In [57]:
predicted_class_id = logits.argmax().item()
predicted_label = id2label[predicted_class_id]



In [58]:
predicted_label

'Spam'

In [59]:
predicted_class_id

0