In [None]:
pip install accelerate -U

In [None]:
# Transformers installation
! pip install transformers datasets

## Load the dataset (`osama39/BNHG`)

In [None]:
from datasets import load_dataset

# Load only the "train" split of the osama39/BNHG dataset from the Hugging Face Hub.
# Alternative (disabled): load local CSV files instead by passing
#   data_files = {"train":"train.csv","validation":"validation.csv","test":"test.csv"}
billsum = load_dataset("osama39/BNHG",split="train")  # a train/test split is derived later with train_test_split

In [None]:
billsum.shape

In [None]:
billsum

In [None]:
# Mean number of whitespace-separated words in the "Article" field over all examples
total_examples = len(billsum)
total_words = sum(len(example["Article"].split()) for example in billsum)
average_words_per_example = total_words / total_examples

print("Average number of words per example in the 'Article' column:", average_words_per_example)

In [None]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=3):
    """Display a random sample of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object that supports ``len()``, indexing with a list of row
            positions, and exposes a ``features`` mapping (column name ->
            feature type).
        num_examples: Number of distinct rows to show.

    Raises:
        AssertionError: If ``num_examples`` exceeds ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so no retry loop is needed to
    # keep the picks distinct (and it cannot spin forever if asserts are disabled).
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Show human-readable class names instead of integer class ids.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [None]:
# Display a random sample of examples from the dataset
show_random_elements(billsum)

In [None]:
df = billsum.to_pandas()

In [None]:
df.columns

In [None]:
df.head(5)

In [None]:
df["Article"][0]

In [None]:
!pip install git+https://github.com/Hasan-Mesbaul-Ali-Taher/normalizer
from normalizer import normalize
import re

In [None]:
def preprocess_text(text):
    """Normalize ``text`` and tidy its punctuation and whitespace.

    Steps, in order:
      1. Pass the text through the external ``normalize`` function (requested
         via its arguments: NFKC unicode normalization, URLs and emoji
         replaced by a space, punctuation left untouched).
      2. Collapse a run of one repeated punctuation character into a single
         character ("!!!" -> "!"). The punctuation set is every ASCII
         punctuation character plus the Bengali danda.
      3. Collapse every whitespace run into one space and trim the ends.
      4. For every stretch of punctuation groups (optionally separated by
         whitespace) keep only the LAST group, together with the whitespace
         directly in front of it, and drop the whitespace that follows the
         stretch ("a ! ? b" -> "a ?b", "a, b" -> "a,b").

    NOTE(review): step 4 glues the next word onto the punctuation mark and
    keeps the last mark rather than the first. That looks unintended, but the
    behaviour is preserved here because downstream data depends on it --
    confirm the intent before changing it.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    # Normalize the text using the external normalizer
    text = normalize(text, unicode_norm="NFKC", punct_replacement= None, url_replacement=" ", emoji_replacement=" ", apply_unicode_norm_last=True)

    # Step 2 (first pass): same-character runs of the most common marks.
    text = re.sub(r'([.,?!|।^_`#@{{|}}~।#$%\&()*+,-./:;<=>?@])\1+', r'\1', text)

    # Step 3: single spaces between words, nothing at either end.
    text = ' '.join(text.split())

    # Step 2 (second pass): drop a punctuation character that repeats the
    # previously kept character. The backslash is written as an explicit
    # escape; spelling it as "\&" is an invalid escape sequence that newer
    # Python versions warn about.
    dedup_punctuation = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~।"
    kept = []
    prev_char = ''
    for char in text:
        stripped = char.strip()
        if stripped == prev_char.strip() and stripped in dedup_punctuation:
            continue
        kept.append(char)
        prev_char = char
    text = ''.join(kept)

    # Step 4: group 1 is overwritten on every repetition, so after the match
    # it holds only the last "<whitespace><punctuation>" group of the stretch.
    pattern = r'(\s*[\]\[\।\!\@\#\$\%\^\&\*\(\)\_\+\-\=\[\]\{\}\|\;\'\:\"\,\.\/\<\>\?\.\,\?\!\|\।\^\_\`\#\@\{\{\|\}\}\~\।\#\$\%\\\&\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@]+)+\s*'
    return re.sub(pattern, r'\1', text).strip()

In [None]:
# Clean every article in place; the function is passed directly, no wrapper needed.
df['Article'] = df['Article'].apply(preprocess_text)


In [None]:
df["Article"][0]

In [None]:
df.head(5)

In [None]:
from datasets import Dataset

# Convert the Pandas DataFrame back to a Hugging Face dataset
billsum = Dataset.from_pandas(df)

In [None]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that a
# Restart & Run All reproduces the same split (and therefore comparable metrics).
billsum = billsum.train_test_split(test_size=0.2, seed=42)

In [None]:
billsum["train"].shape

In [None]:
billsum

In [None]:
billsum["test"][0]

In [None]:
billsum["test"].shape

In [None]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast,AlbertTokenizer

In [None]:
from transformers import AutoTokenizer

checkpoint = "csebuetnlp/banglat5"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the seq2seq model for `checkpoint`. device_map='auto' asks the library
# to place the weights itself, which is presumably why the explicit move to
# `device` below is left disabled.
# NOTE(review): automatic placement can spread the model over several GPUs;
# confirm this is what is wanted when training with the Trainer.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map = 'auto')
# model.to(device)

In [None]:
billsum

## Preprocess

In [None]:
prefix = "summarize the Article as Headlines."
def preprocess_function(examples):
    """Tokenize one batch of examples for headline generation.

    The model input is the task prefix, category, aspect and article text
    joined with "[SEP]" markers; the target is the headline.

    No padding is applied here. Padding inside ``map`` would pad every row to
    the longest row of the whole map batch and would fill the labels with the
    pad token id, so the loss would also be computed on padding. Padding is
    left to the seq2seq data collator, which pads per training batch and
    fills label padding with -100 so those positions are ignored by the loss.

    Args:
        examples: Batch mapping with "Category", "Aspect", "Article" and
            "Headlines" columns (lists of strings).

    Returns:
        The tokenizer output for the inputs with an added "labels" entry
        holding the token ids of the headlines.
    """
    # Concatenate inputs with [SEP] markers
    inputs = [prefix + "[SEP]"+ category + " [SEP] " + aspect  +  " [SEP] " + doc
              for category,aspect, doc in zip( examples["Category"],
                                               examples["Aspect"],
                                               examples["Article"])]

    # Tokenize inputs (truncate only; the collator pads dynamically)
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # Tokenize labels (truncate only; the collator pads them with -100)
    labels = tokenizer(examples["Headlines"], max_length=128, truncation=True)["input_ids"]

    # Add labels to model inputs
    model_inputs["labels"] = labels
    return model_inputs


In [None]:
tokenized_billsum = billsum.map(preprocess_function, batched=True)

In [None]:
tokenized_billsum

In [None]:
from datasets import concatenate_datasets

# Merge both splits so the statistic covers every tokenized example
concatenated_dataset = concatenate_datasets([tokenized_billsum["train"], tokenized_billsum["test"]])

# Mean length of the tokenized model input ("input_ids") per example
total_examples = len(concatenated_dataset)
total_tokens = sum(len(example["input_ids"]) for example in concatenated_dataset)
average_token_length = total_tokens / total_examples

print("Average token length for the 'Article' column:", average_token_length)


In [None]:
from transformers import DataCollatorForSeq2Seq

# Pass the model object, not the checkpoint name: the collator uses the model
# to build decoder_input_ids from the labels, and a plain string silently
# disables that step.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
pip install transformers datasets rouge_score evaluate

## Evaluate

In [None]:
import evaluate

rouge = evaluate.load("rouge")

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation run.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also be a tuple whose first item is the
            array of generated ids.

    Returns:
        Dict of ROUGE scores plus ``"gen_len"``, all rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored/padded positions and cannot be decoded, so map it
    # back to the pad token id in both arrays before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE's default tokenizer keeps only [a-z0-9] characters, which removes
    # all Bengali text and drives every score towards 0. Split on whitespace
    # instead. (The English-only stemmer is not used with a custom tokenizer.)
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        tokenizer=lambda text: text.split(),
    )

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
import os

from huggingface_hub import login

# Never hardcode an access token in a notebook: anyone who can read the file
# can use it. Any token that was ever committed here must be treated as leaked
# and revoked on the Hub.
# The token is read from the HF_TOKEN environment variable; when it is not
# set, login() falls back to prompting for it interactively.
login(token=os.environ.get("HF_TOKEN"))

## Train

In [None]:
from transformers import EarlyStoppingCallback


In [None]:
# Training configuration for the fine-tuning run.
batch_size = 4  # per-device batch size, used for both training and evaluation
model_name = checkpoint.split("/")[-1]  # last path component of the checkpoint id
args = Seq2SeqTrainingArguments(
    # Output directory; with push_to_hub=True this is presumably also the Hub repo name.
    f"{model_name}-headline_WithIp-category",
    # Evaluate and save once per epoch. The two strategies are kept identical,
    # which load_best_model_at_end relies on.
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,  # keep at most 3 checkpoints on disk
    num_train_epochs=5,
    lr_scheduler_type = 'linear',
    predict_with_generate=True,  # evaluation uses generate(), so compute_metrics receives generated ids
    fp16=False,
    push_to_hub=True,
    logging_steps=10,
    report_to = "none",  # no external experiment tracker
    # Reload the best checkpoint at the end, where "best" means the highest
    # "rougeL" value reported by compute_metrics.
    load_best_model_at_end= True,
    metric_for_best_model = "rougeL",
    greater_is_better = True,
)

In [None]:
# Assemble the trainer: trains on the "train" split and evaluates on the
# "test" split with compute_metrics.
# NOTE(review): the "test" split is also the split used for early stopping and
# best-model selection, so scores reported on it are optimistic; a separate
# validation split would avoid that.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    # NOTE(review): newer transformers releases prefer `processing_class` over
    # `tokenizer` -- confirm against the installed version.
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop when the monitored metric has not improved for 3 consecutive evaluations.
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],

)

In [None]:
trainer.train()

In [None]:
trainer.push_to_hub()