In [None]:
# Transformers installation
! pip install torch -q
! pip install transformers[torch] datasets -q
# To install from source instead of the last release, comment the command above and uncomment the following one.
! pip install git+https://github.com/huggingface/transformers.git

In [None]:
import pickle
import os
# Load the pickled Arabic corpus from the Kaggle input directory into `arabic_docs`.
# NOTE(review): pickle.load can execute arbitrary code contained in the file — only load this pickle if its source is trusted.
with open("/kaggle/input/wikilingua-arabic-summarisation/arabic.pkl", 'rb') as pickle_file:
    arabic_docs=pickle.load(pickle_file)

In [None]:
import pandas as pd
from datasets import Dataset

# arabic_docs is a two-level mapping (outer key -> section name -> section record).
# Keep one row per section record, holding its document and its summary.
flat_data = [
    {
        'input_text': section_data['document'],
        'target_text': section_data['summary'],
    }
    for sections in arabic_docs.values()
    for section_data in sections.values()
]

# Tabular view of the flattened records
df = pd.DataFrame(flat_data)

# Wrap the DataFrame in a 🤗 Datasets object
summarization_dataset = Dataset.from_pandas(df)

# Show the dataset object itself as the cell output
summarization_dataset


In [None]:
# Peek at the first (key, value) entry of arabic_docs.
# NOTE(review): the len(...) value below is not the cell's last expression, so a notebook will not display it — confirm whether it was meant to be shown.
len(list(arabic_docs.items())[0])
list(arabic_docs.items())[0]

In [None]:
# Shape of the flattened dataset (before it is split).
summarization_dataset.shape

# Summarization

Summarization creates a shorter version of a document or an article that captures all the important information. Along with translation, it is another example of a task that can be formulated as a sequence-to-sequence task. 

The task illustrated in this tutorial is supported by the following model architectures:

<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->

[BART](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/bart), [BigBird-Pegasus](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/bigbird_pegasus), [Blenderbot](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/blenderbot), [BlenderbotSmall](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/blenderbot-small), [Encoder decoder](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/encoder-decoder), [FairSeq Machine-Translation](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/fsmt), [GPTSAN-japanese](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/gptsan-japanese), [LED](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/led), [LongT5](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/longt5), [M2M100](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/m2m_100), [Marian](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/marian), [mBART](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/mbart), [MT5](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/mt5), [MVP](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/mvp), [NLLB](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/nllb), [NLLB-MOE](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/nllb-moe), [Pegasus](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/pegasus), [PEGASUS-X](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/pegasus_x), [PLBart](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/plbart), [ProphetNet](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/prophetnet), [SwitchTransformers](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/switch_transformers), 
[T5](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/t5), [XLM-ProphetNet](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/xlm-prophetnet)

<!--End of the generated tip-->

Before you begin, make sure you have all the necessary libraries installed:

```bash
pip install transformers datasets evaluate rouge_score
```

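We encourage you to log in to your Hugging Face account so you can upload and share your model with the community (for example with `huggingface_hub.notebook_login()`); when prompted, enter your token to log in. Logging in is optional for this notebook, because the training arguments below set `push_to_hub=False`.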

In [None]:
# Extra packages, presumably needed by the ROUGE metric loaded in the Evaluate section — confirm.
! pip install nltk rouge_score

## Split the dataset into train and test sets

Split the dataset into a train and test set with the [train_test_split](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.train_test_split) method:

In [None]:
# Split the Arabic dataset into train (80%) and test (20%) subsets.
# NOTE(review): this sklearn helper is not used in this cell; the split below is the 🤗 Datasets method.
from sklearn.model_selection import train_test_split

# Only split while the variable still holds the unsplit Dataset, so re-running
# this cell does not try to split the already-split result again.
# A fixed seed makes the train/test partition reproducible across runs.
if isinstance(summarization_dataset, Dataset):
    summarization_dataset = summarization_dataset.train_test_split(test_size=0.2, seed=42)
print("done!")

In [None]:
# Inspect the training split.
summarization_dataset['train']

## Preprocess

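In this notebook we use AraBART (the `moussaKam/AraBART` checkpoint), loaded with `BarthezTokenizer` and `MBartForConditionalGeneration`.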

In [None]:
# Upgrade transformers and huggingface-hub to their latest releases.
# NOTE(review): upgrading packages after other libraries were already imported in this kernel may require a kernel restart to take effect — confirm.
!pip install --upgrade transformers huggingface-hub

In [None]:
from transformers import DataCollatorForSeq2Seq, BarthezTokenizer, MBartForConditionalGeneration

# Load the tokenizer and the seq2seq model from the same "moussaKam/AraBART" checkpoint.
tokenizer = BarthezTokenizer.from_pretrained("moussaKam/AraBART")
model = MBartForConditionalGeneration.from_pretrained("moussaKam/AraBART")

# Collator that batches examples for seq2seq training; it is given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
def generate_summary(text):
    """Generate an abstractive summary of ``text`` with the notebook's model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: The document to summarize, as a single string.

    Returns:
        The decoded summary string, with special tokens removed.
    """
    # Tokenize the input text and put it on the model's device, so generation
    # still works after the model has been moved (e.g. to a GPU).
    input_ids = tokenizer.encode(text, return_tensors='pt').to(model.device)

    generation_kwargs = dict(num_beams=4, max_length=100, early_stopping=True)

    # Not every tokenizer exposes a language-code table. Force the Arabic
    # language token only when the tokenizer actually provides one, instead of
    # failing with AttributeError/KeyError when it does not.
    lang_code_to_id = getattr(tokenizer, "lang_code_to_id", None)
    if lang_code_to_id and "ar_AR" in lang_code_to_id:
        generation_kwargs["forced_bos_token_id"] = lang_code_to_id["ar_AR"]

    # Generate the summary
    summary_ids = model.generate(input_ids, **generation_kwargs)

    # A single input sequence was passed, so the first output row is the summary.
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

print('done !')

In [None]:
# Maximum token lengths for source documents and target summaries.
max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Uses the module-level ``tokenizer``.

    Args:
        examples: A batch mapping with "input_text" and "target_text" entries.

    Returns:
        The tokenized inputs, with the tokenized summaries' ids stored under "labels".
    """
    model_inputs = tokenizer(examples["input_text"], max_length=max_input_length, truncation=True)

    # Tokenize the targets through `text_target`, which replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["target_text"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print('done !')

To apply the preprocessing function over the entire dataset, use 🤗 Datasets [map](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map) method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once:

In [None]:
# Apply preprocess_function to the dataset in batches; the result carries the tokenized fields the function returns.
tokenized_datasets = summarization_dataset.map(preprocess_function, batched=True)

In [None]:
# Inspect the tokenized dataset.
tokenized_datasets

## Evaluate

In [None]:
# NOTE(review): `datasets.load_metric` was deprecated in favor of the separate `evaluate` library
# (`evaluate.load("rouge")`) and is, as far as I know, no longer present in recent `datasets`
# releases — confirm against the installed version.
from datasets import load_metric

# ROUGE metric object used by compute_metrics.
rouge = load_metric("rouge")

In [None]:
import numpy as np


def _rouge_value(score):
    """Return a plain number from either a numeric score or an aggregate score with a `.mid.fmeasure`."""
    mid = getattr(score, "mid", None)
    return score if mid is None else mid.fmeasure


def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation batch.

    Uses the module-level ``tokenizer`` and ``rouge`` objects.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.

    Returns:
        A dict mapping each ROUGE key and "gen_len" to a value rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some models return a tuple whose first element holds the generated ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and is not a real token id, so swap it for
    # the pad id before decoding (in predictions as well as labels).
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # NOTE(review): the rouge_score package's default tokenization appears to keep only
    # Latin alphanumerics, which may make scores on Arabic text meaningless — confirm.
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # The metric may return aggregate objects rather than plain floats; reduce
    # each one to its mid F-measure so it can be rounded below.
    result = {name: _rouge_value(score) for name, score in result.items()}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

print('done !')

## Train

In [None]:
# These two classes are not imported in any earlier visible cell, so import them here.
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# NOTE(review): newer transformers releases rename `evaluation_strategy` to `eval_strategy`
# and the Trainer's `tokenizer` argument to `processing_class` — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="my_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    push_to_hub=False,
    report_to="tensorboard",  # Set this to "none" if you want to disable all integrations
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    # Use the tokenized splits: the raw splits hold only text columns, with no
    # input ids or labels for the model to consume.
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Hugging Face uses Weights & Biases (wandb) to log training.
# We will not need it, so we disable it.
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Call the method: a bare `trainer.train` only references the bound method and never starts training.
trainer.train()

In [None]:
# Pushing to the Hub is left disabled; the model is only saved locally.
#trainer.push_to_hub()
# NOTE(review): this saves to './mymodel', which differs from the training output_dir "my_model" — confirm that is intended.
trainer.save_model('./mymodel')

## Inference

In [None]:
# Sample Arabic passage used as the input for the inference cells below.
text = "الرياض هي عاصمة المملكة العربية السعودية وأكبر مدينة فيها. تعد الرياض مركزاً حضرياً حديثاً توفر العديد من المرافق والخدمات. تشتهر المدينة بأبراجها الحديثة ومراكز التسوق الفاخرة. يمكن للزوار استكشاف التاريخ الغني للمنطقة من خلال زيارة المتاحف والمعالم الثقافية. الرياض تجمع بين التقاليد العربية والحياة الحديثة، مما يجعلها وجهة مثيرة ومتنوعة للسياح."

This model was built using AraBART. Apparently the `pipeline` API does not support this model.
The model will be built again during this week using mT5.
I spent a lot of time building it, and I have many other things to do, so I am sharing this version of the TP and will update you with the mT5 version as soon as possible this week.

I hope you'll understand.


In [None]:
import torch

# Encode the sample `text` as a PyTorch tensor and place it on the model's
# device in a single step, so the tensor and the model live on the same device.
input_ids = tokenizer.encode(text, return_tensors="pt").to(model.device)
print('done !')

In [None]:
# Display the input passage.
text

In [None]:
# Generate summary
summary_ids = model.generate(input_ids)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print('The orginal text :', text)
print('\n')
print("Generated Summary:", summary)
print('\n \n This summary was made using the abstractive summarization (text generation).')