https://huggingface.co/docs/transformers/en/model_doc/encoder-decoder

# Model

In [4]:
from transformers import BertTokenizer, GPT2Tokenizer, EncoderDecoderModel, GenerationConfig
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, Trainer, EarlyStoppingCallback
from datasets import Dataset, DatasetDict, load_metric
import evaluate
import torch
import os
from os import listdir
from os.path import isfile, join
import json
import re
import numpy as np
import pandas as pd

In [5]:
torch.cuda.is_available()

True

In [2]:
# # max_split_size_mb nya bisa dikecilin klo GPU nya OOM
# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

In [19]:
folder_data = "/kaggle/input/dataset-article/"
train_set = pd.read_csv(folder_data+'final_train_set.csv')
valid_set = pd.read_csv(folder_data+'final_valid_set.csv')
test_set = pd.read_csv(folder_data+'final_test_set.csv')

train_set = train_set.sample(n=3000, random_state=88).reset_index(drop=True)
valid_set = valid_set.sample(n=1000, random_state=88).reset_index(drop=True)
test_set = test_set.sample(n=1000, random_state=88).reset_index(drop=True)

In [21]:
train_set.shape, valid_set.shape, test_set.shape

((3000, 2), (1000, 2), (1000, 2))

In [22]:
train_set.duplicated().sum(), valid_set.duplicated().sum(), test_set.duplicated().sum()

(0, 0, 0)

In [23]:
train_set.head()

Unnamed: 0,final_clean_article,combined_clean_summary
0,Manajer Arsenal Arsene Wenger mewanti-wanti Re...,Manajer Arsenal Arsene Wenger menegaskan ia ak...
1,Obafemi Martins menjadi bintang pada laga Newc...,Obafemi Martins memberi kontribusi besar saat ...
2,Kejaksaan Tinggi Bengkulu memeriksa Wali Kota ...,Wali Kota Bengkulu Chalik Effendi diperiksa Ke...
3,Markas Besar TNI memutasi 66 perwira tinggi da...,Alih tugas 66 perwira itu terdiri dari 47 perw...
4,Keputusan pemerintah menunda surat keputusan b...,Jemaah Ahmadiyah menggelar doa dan sujud syuku...


In [24]:
valid_set.head()

Unnamed: 0,final_clean_article,combined_clean_summary
0,Pemerintah masih terkesan setengah hati dalam ...,"Pengamat politik Andi Malarangeng menilai, keb..."
1,Kalangan pekerja hiburan malam melakukan unjuk...,Pekerja hiburan malam berunjuk rasa di DPRD DK...
2,Semua mata dan perhatian tengah terfokus pada ...,Sejak awal Fraksi Partai Persatuan Pembangunan...
3,Tim Terpadu Penanggulangan Penyalahgunaan Baha...,Sejumlah SPBU diduga telah menyelewengkan jata...
4,Ratusan wartawan media cetak dan elektronik be...,Wartawan dari berbagai media cetak dan elektro...


In [25]:
test_set.head()

Unnamed: 0,final_clean_article,combined_clean_summary
0,"Gara-gara menganiaya pembantu rumah tangga, Ny...","Yuli, warga Jalan Pemuda IV, Rawamangun, Jakar..."
1,Buku kumpulan lukisan karya almarhum Hendra Gu...,Koleksi lukisan almarhum Hendra Gunawan diterb...
2,"Dalam rangka menyambut bulan Ramadhan, SCTV mu...",Sinetron mini seri Kembang Padang Kelabu diper...
3,"Ismail Tiris, Salokon Risun, dan Korenus Intor...",Tiga anggota Tentara Pembebasan Nasional Papua...
4,Bank Indonesia (BI) harus segera mengisi kekos...,Kekosongan sejumlah posisi Deputi Gubernur Ban...


In [35]:
train_dataset = Dataset.from_pandas(train_set)
valid_dataset = Dataset.from_pandas(valid_set)
test_dataset = Dataset.from_pandas(test_set)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'val': valid_dataset,
    'test': test_dataset
})

## Define Model

In [26]:
tokenizer = BertTokenizer.from_pretrained("cahya/bert2gpt-indonesian-summarization")
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token
model = EncoderDecoderModel.from_pretrained("cahya/bert2gpt-indonesian-summarization")

tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/230k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.26k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.08G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [32]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [33]:
prefix = ""
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["final_clean_article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    labels = tokenizer(text_target=examples["combined_clean_summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [36]:
tokenized_dataset = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

## Training

In [54]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    weight_decay=0.05,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    # logging_dir='./logs',
    # logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="rouge2",
    greater_is_better=True,
)

In [55]:
rouge = evaluate.load('rouge')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Directly decode predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=False)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=False)

    # Rouge expects newline-separated text
    decoded_preds = ["\n".join(decoded_pred.split()) for decoded_pred in decoded_preds]
    decoded_labels = ["\n".join(decoded_label.split()) for decoded_label in decoded_labels]

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return {k: round(v, 4) for k, v in result.items()}

In [56]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [57]:
torch.cuda.empty_cache()

In [58]:
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,2.729598,0.3873,0.2376,0.351,0.3874
2,1.641600,2.828375,0.4041,0.2538,0.3692,0.404
3,1.104100,2.989086,0.3755,0.2231,0.3379,0.3755
4,0.726700,3.079499,0.3721,0.2205,0.3342,0.3718
5,0.726700,3.214618,0.3804,0.2291,0.3435,0.38


Non-default generation parameters: {'max_length': 40, 'min_length': 20, 'early_stopping': True, 'num_beams': 10, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 40, 'min_length': 20, 'early_stopping': True, 'num_beams': 10, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 40, 'min_length': 20, 'early_stopping': True, 'num_beams': 10, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 40, 'min_length': 20, 'early_stopping': True, 'num_beams': 10, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 40, 'min_length': 20, 'early_stopping': True, 'num_beams': 10, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
There were missing keys in the checkpoint model loaded: ['decoder.lm_head.weight'].


TrainOutput(global_step=1875, training_loss=1.0151575520833334, metrics={'train_runtime': 4994.3108, 'train_samples_per_second': 6.007, 'train_steps_per_second': 0.751, 'total_flos': 6587742868254720.0, 'train_loss': 1.0151575520833334, 'epoch': 5.0})

In [59]:
eval_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print(eval_results)



{'eval_loss': 2.5003066062927246, 'eval_rouge1': 0.4422, 'eval_rouge2': 0.2853, 'eval_rougeL': 0.402, 'eval_rougeLsum': 0.4423, 'eval_runtime': 758.4168, 'eval_samples_per_second': 1.319, 'eval_steps_per_second': 0.33, 'epoch': 5.0}


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [60]:
# Save the model
model.save_pretrained("/kaggle/working/bert2gpt_indo_sum")

# Save the tokenizer
tokenizer.save_pretrained("/kaggle/working/bert2gpt_indo_sum/tokenizer")

Non-default generation parameters: {'max_length': 40, 'min_length': 20, 'early_stopping': True, 'num_beams': 10, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


('/kaggle/working/bert2gpt_indo_sum/tokenizer/tokenizer_config.json',
 '/kaggle/working/bert2gpt_indo_sum/tokenizer/special_tokens_map.json',
 '/kaggle/working/bert2gpt_indo_sum/tokenizer/vocab.txt',
 '/kaggle/working/bert2gpt_indo_sum/tokenizer/added_tokens.json')

In [62]:
!zip -r bert2gpt_indo_sum.zip /kaggle/working/bert2gpt_indo_sum

updating: kaggle/working/bert2gpt_indo_sum/ (stored 0%)
  adding: kaggle/working/bert2gpt_indo_sum/generation_config.json (deflated 41%)
  adding: kaggle/working/bert2gpt_indo_sum/tokenizer/ (stored 0%)
  adding: kaggle/working/bert2gpt_indo_sum/tokenizer/vocab.txt (deflated 51%)
  adding: kaggle/working/bert2gpt_indo_sum/tokenizer/special_tokens_map.json (deflated 54%)
  adding: kaggle/working/bert2gpt_indo_sum/tokenizer/tokenizer_config.json (deflated 75%)
  adding: kaggle/working/bert2gpt_indo_sum/model.safetensors (deflated 7%)
  adding: kaggle/working/bert2gpt_indo_sum/config.json (deflated 76%)


## Inference

In [None]:
# # Load the tokenizer
# tokenizer = BertTokenizer.from_pretrained("/content/drive/MyDrive/model/bert2gpt_indo_sum/tokenizer")

# # Load the model
# model = EncoderDecoderModel.from_pretrained("/content/drive/MyDrive/model/bert2gpt_indo_sum")
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [63]:
def clean_article(text):
    # Remove phrases like "Liputan6.com, [City Name]:"
    text = re.sub(r'Liputan6\. com, [A-Za-z\s]+:', '', text)

    # Remove words followed by a date in the format (dd/mm) or (d/m)
    text = re.sub(r'\w+\s*\(\d{1,2}/\d{1,2}\)', '', text)

    # Remove text inside parentheses that follow a specific pattern (e.g., (UPI/Reporter Name)) and optionally with 'dan' conjunction
    text = re.sub(r'\([A-Z]+/[A-Za-z\s]+(?: dan [A-Za-z\s]+)?\)\.', '', text)

    # Remove text inside square brackets that starts with 'baca:'
    text = re.sub(r'\[baca: .*?\]', '', text)

    # Remove URLs starting with http or https
    text = re.sub(r'https?://\S+', '', text)

    # Remove leading and trailing whitespaces from the text
    return text.strip()

In [64]:
%%time
ARTICLE_TO_SUMMARIZE = """Bank Mandiri, sebagaimana bank umumnya, menyediakan layanan kartu debit bagi nasabahnya. Kartu debit Mandiri dapat digunakan oleh nasabah untuk melakukan berbagai transaksi di mesin ATM atau mesin EDC. Fungsi dari Kartu Debit Mandiri ini sangat beragam, mulai dari tarik tunai, setor tunai, transfer uang, cek saldo rekening, hingga membayar berbagai tagihan melalui mesin ATM.
Penting bagi Anda yang ingin membuka rekening tabungan di Bank Mandiri untuk memahami jenis Kartu Debit Mandiri agar tidak salah memilih. Setiap kartu debit Mandiri memiliki kelebihan dan kekurangannya masing-masing, sehingga penting bagi nasabah untuk memilih yang sesuai dengan kebutuhan dan preferensi mereka.

Dalam memilih jenis Kartu Debit Mandiri, nasabah perlu mempertimbangkan kebutuhan dan gaya hidup mereka. Apakah mereka membutuhkan manfaat tambahan seperti asuransi atau akses ke airport lounge, ataukah mereka menginginkan kartu debit yang sederhana namun praktis. Dengan mengetahui jenis Kartu Debit Mandiri yang sesuai, nasabah dapat memaksimalkan manfaat yang mereka dapatkan dari penggunaan kartu debit tersebut.
"""

# generate summary
input_ids = tokenizer.encode(clean_article(ARTICLE_TO_SUMMARIZE), return_tensors='pt')
summary_ids = model.generate(input_ids.to(model.device),
            min_length=20,
            max_length=128,
            num_beams=10,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True,
            no_repeat_ngram_size=2,
            use_cache=True,
            do_sample = True,
            temperature = 0.1,
            top_k = 50,
            top_p = 0.95)
# start time dan end time
summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary_text)

bank mandiri menyediakan layanan kartu debit bagi nasabah untuk melakukan berbagai transaksi di mesin atm atau mesin edc.
CPU times: user 750 ms, sys: 16 ms, total: 766 ms
Wall time: 819 ms


In [None]:
# parent_folder = "/content/" # Pake yang ada di folder "clean_data" di Google drive
# test_set = pd.read_csv(parent_folder+'final_test_set.csv')

In [65]:
df_sample = test_set.sample(1)

In [66]:
df_sample["final_clean_article"].values

array(['Kepala Kepolisian Daerah Metro Jaya Inspektur Jenderal Polisi Sofjan Jacoeb menilai para pelaku kejahatan semakin nekat dan tak segan-segan melukai serta membunuh korban. Sebab itu, ia meminta masyarakat menggiatkan pengamanan swakarsa dan menjaga lingkungan masing-masing. Penilaian tersebut Sofjan sampaikan di Jakarta, baru-baru ini. Menurut Sofjan, musibah yang menimpa Brigadir Polisi Mursito, anggota Kepolisian Resor Jakarta Barat yang tewas saat mengawal uang nasabah Bank Central Asia, dapat menjadi cermin kebrutalan penjahat . Di samping itu, Sofjan menegaskan, insiden tersebut juga menjadi tantangan bagi kepolisian untuk memberikan rasa aman kepada masyarakat.'],
      dtype=object)

In [67]:
%%time
ARTICLE_TO_SUMMARIZE = """Kepala Kepolisian Daerah Metro Jaya Inspektur Jenderal Polisi Sofjan Jacoeb menilai para pelaku kejahatan semakin nekat dan tak segan-segan melukai serta membunuh korban. Sebab itu, ia meminta masyarakat menggiatkan pengamanan swakarsa dan menjaga lingkungan masing-masing. Penilaian tersebut Sofjan sampaikan di Jakarta, baru-baru ini. Menurut Sofjan, musibah yang menimpa Brigadir Polisi Mursito, anggota Kepolisian Resor Jakarta Barat yang tewas saat mengawal uang nasabah Bank Central Asia, dapat menjadi cermin kebrutalan penjahat . Di samping itu, Sofjan menegaskan, insiden tersebut juga menjadi tantangan bagi kepolisian untuk memberikan rasa aman kepada masyarakat."""

# generate summary
input_ids = tokenizer.encode(clean_article(ARTICLE_TO_SUMMARIZE), return_tensors='pt')
summary_ids = model.generate(input_ids.to(model.device),
            min_length=20,
            max_length=128,
            num_beams=10,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True,
            no_repeat_ngram_size=2,
            use_cache=True,
            do_sample = True,
            temperature = 0.1,
            top_k = 50,
            top_p = 0.95)
# start time dan end time
summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary_text)

kapolda metro jaya irjen pol. sofjan jacoeb menilai para pelaku kejahatan semakin nekat dan tak segan - segan melukai serta membunuh korban.
CPU times: user 684 ms, sys: 2.02 ms, total: 686 ms
Wall time: 684 ms
