<a href="https://colab.research.google.com/github/ageraustine/text-summariser/blob/master/Text_Summariser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install keras_nlp
!pip install datasets
!pip install rouge-score

In [None]:
import tensorflow_datasets as tfds
import os
import shutil
import zipfile
import pandas as pd
import tensorflow as tf
from tensorflow import keras 
import math
import transformers
import keras_nlp
import rouge_score

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive/ so that the dataset archive stored
# under MyDrive (see zipcsv_dir below) is readable from this runtime.
drive.mount('/content/drive/')

                              Unzip The Dataset

In [4]:
# Unpack the WikiHow archive from the mounted Drive into a local working
# directory, skipping the extraction when the CSV is already there.
dataset_dir = os.getcwd() + '/dataset'
zipcsv_dir = '/content/drive/MyDrive/ml/nlp/datasets/wikihowAll.zip'

# Look for the CSV inside dataset_dir (where extractall() writes it) instead of
# a hard-coded /content path, so the guard still works when the working
# directory is not /content.
if not os.path.exists(os.path.join(dataset_dir, 'wikihowAll.csv')):
  with zipfile.ZipFile(zipcsv_dir, "r") as zp:
    zp.extractall(dataset_dir)

                                     Load The Dataset

In [5]:
# --- Data -------------------------------------------------------------------
# Fraction (not percent) of the loaded rows; the same value is used for both
# the train and the test subset when the dataset is split.
TRAIN_TEST_SPLIT = 0.1

# --- Sequence lengths (in tokens) -------------------------------------------
MAX_INPUT_LENGTH = 1024  # articles are truncated to this many tokens
MIN_TARGET_LENGTH = 5  # lower bound on the length of a generated summary
MAX_TARGET_LENGTH = 128  # upper bound on the length of a summary

# --- Training ---------------------------------------------------------------
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
MAX_EPOCHS = 1
MODEL_CHECKPOINT = "t5-small"  # checkpoint name used for tokenizer and model

# Only the first 10000 rows of the extracted CSV are read.
csv_path = f"{dataset_dir}/wikihowAll.csv"
dataset_df = pd.read_csv(csv_path, nrows=10000)

In [6]:
# Keep only the summary ('headline') and article ('text') columns.
# Rows missing either field are dropped *before* the string cast: astype(str)
# would otherwise turn a missing value into the literal text "nan", which would
# then be tokenized and trained on like a real article or summary.
# The index is reset so the frame keeps a plain RangeIndex after the drop and
# Dataset.from_pandas does not carry the old index along as an extra column.
filtered_df = (
    dataset_df[['headline', 'text']]
    .dropna()
    .astype(str)
    .reset_index(drop=True)
)

In [7]:
from datasets import Dataset
 
# Wrap the DataFrame as a Hugging Face Dataset and split it in one expression.
# Both subsets are sized by TRAIN_TEST_SPLIT, so the rest of the rows is left
# out of the resulting train/test pair.
raw_dataset = Dataset.from_pandas(filtered_df).train_test_split(
    train_size=TRAIN_TEST_SPLIT,
    test_size=TRAIN_TEST_SPLIT,
)

                                   Data Preprocessing

In [None]:
from transformers import AutoTokenizer

# MODEL_CHECKPOINT comes from the configuration cell. It is deliberately not
# re-assigned here: a second assignment would silently override the configured
# value for the tokenizer, the prefix selection and the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [9]:
# The listed T5 checkpoints get the "summarize: " task prefix prepended to
# every input; any other checkpoint is used without a prefix.
prefix = (
    "summarize: "
    if MODEL_CHECKPOINT in ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b")
    else ""
)

In [10]:
def tokenize_data(data):
    """Tokenize one batch of rows for sequence-to-sequence training.

    Args:
        data: A batch mapping with a 'text' entry (the articles) and a
            'headline' entry (the reference summaries), each a list of strings.

    Returns:
        The tokenizer output for the prefixed articles, truncated to
        MAX_INPUT_LENGTH, with an extra "labels" entry holding the input ids
        of the headlines truncated to MAX_TARGET_LENGTH.
    """
    inputs = [prefix + doc for doc in data['text']]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # Tokenize the targets through the `text_target` argument. This replaces
    # the deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=data['headline'], max_length=MAX_TARGET_LENGTH, truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [None]:
# Apply tokenize_data to every split; batched=True hands it a batch of rows
# per call instead of a single row.
tokenized_datasets = raw_dataset.map(
    function=tokenize_data,
    batched=True,
)

                              Defining the Model

In [None]:
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# Load the seq2seq model for the same checkpoint name that the tokenizer was
# loaded from.
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [13]:
# Collator that turns lists of tokenized examples into TensorFlow batches; it
# is handed both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors="tf",
)

In [None]:
# Upper bound on the number of test examples used for the generation-based
# evaluation dataset.
NUM_GENERATION_SAMPLES = 200


def _to_tf_dataset(split, shuffle):
    """Convert one tokenized split into a batched tf.data.Dataset.

    Args:
        split: A tokenized dataset split.
        shuffle: Whether the resulting dataset is shuffled.

    Returns:
        The result of `split.to_tf_dataset` with BATCH_SIZE, the three model
        columns, and `data_collator` as the collate function.
    """
    return split.to_tf_dataset(
        batch_size=BATCH_SIZE,
        columns=["input_ids", "attention_mask", "labels"],
        shuffle=shuffle,
        collate_fn=data_collator,
    )


train_dataset = _to_tf_dataset(tokenized_datasets["train"], shuffle=True)
test_dataset = _to_tf_dataset(tokenized_datasets["test"], shuffle=False)

# Random subset of the test split for the generation metric. The count is
# capped at the split size so a small test split does not make select() fail
# on out-of-range indices.
_generation_split = tokenized_datasets["test"].shuffle()
generation_dataset = _to_tf_dataset(
    _generation_split.select(
        range(min(NUM_GENERATION_SAMPLES, len(_generation_split)))
    ),
    shuffle=False,
)

                            Building and Compiling The Model

In [None]:
# Adam optimizer using the learning rate from the configuration cell.
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# NOTE(review): no loss is passed to compile(); presumably the model falls back
# to its own built-in loss -- confirm against the transformers TF model docs.
model.compile(optimizer=optimizer)

                                    Training and Evaluation

In [16]:
import keras_nlp
import rouge_score

# ROUGE-L metric object; metric_fn below calls it on the decoded label and
# prediction strings.
rouge_l = keras_nlp.metrics.RougeL()

def metric_fn(eval_predictions):
    """Compute the ROUGE-L F1 score for one evaluation pass.

    Args:
        eval_predictions: A `(predictions, labels)` pair of token-id arrays.
            Negative label ids mark masked positions.

    Returns:
        A dict with a single "RougeL" entry holding the F1 score.
    """
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # The metric object keeps state between calls. Clear it first so each
    # evaluation pass is scored on its own instead of being averaged with the
    # passes of earlier epochs.
    rouge_l.reset_state()
    result = rouge_l(decoded_labels, decoded_predictions)
    # Only the F1 score is reported; other aggregates are available in `result`.
    result = {"RougeL": result["f1_score"]}
    return result


In [None]:
from transformers.keras_callbacks import KerasMetricCallback

# Callback that evaluates metric_fn on generated predictions for
# generation_dataset.
metric_callback = KerasMetricCallback(
    metric_fn=metric_fn,
    eval_dataset=generation_dataset,
    predict_with_generate=True,
)

callbacks = [metric_callback]

# The test split doubles as the validation data for now.
model.fit(
    x=train_dataset,
    validation_data=test_dataset,
    epochs=MAX_EPOCHS,
    callbacks=callbacks,
)





Inference

In [None]:
from transformers import pipeline

# Wrap the fine-tuned model and its tokenizer in a summarization pipeline.
summarizer = pipeline(
    task="summarization",
    model=model,
    tokenizer=tokenizer,
    framework="tf",
)

# Summarize the first article of the test split, bounded by the configured
# minimum and maximum target lengths.
first_test_article = raw_dataset["test"][0]["text"]
summarizer(
    first_test_article,
    min_length=MIN_TARGET_LENGTH,
    max_length=MAX_TARGET_LENGTH,
)