Mount Google Drive, which holds the datasets and receives the model checkpoints and score files.

In [1]:
from google.colab import drive

# Everything this notebook reads or writes lives under this mount point.
DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


**Packages**

In [2]:
! pip install datasets --quiet
! pip install evaluate --quiet
! pip install rouge_score --quiet
! pip install sacrebleu --quiet
! pip install transformers --quiet
! pip install -q sentencepiece --quiet
! pip install summarizer --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 KB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 KB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from datasets import load_dataset
import evaluate

from pprint import pprint

**Data**

In [4]:
# Read the train / validation / test CSV splits from the mounted Drive.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/W266FinalProject/Datasets/xl_sum_sample_train.csv',
        '/content/drive/MyDrive/W266FinalProject/Datasets/xl_sum_sample_val.csv',
        '/content/drive/MyDrive/W266FinalProject/Datasets/xl_sum_sample_test.csv',
    )
)

In [5]:
# Print the (rows, columns) shape of every split; the labels are unchanged.
for label, frame in (('train shape', train_df),
                     ('val size', val_df),
                     ('test size', test_df)):
    print(f'{label}: {frame.shape}')

train shape: (1000, 2)
val size: (100, 2)
test size: (100, 2)


In [6]:
# Preview the first two training rows.
train_df.head(2)

Unnamed: 0,text,summary
0,By Rebecca Ricks & Johnny O'SheaBBC Spotlight ...,"During the spring, at the height of the Covid-..."
1,"By Rachel SchraerBBC Reality Check So, why did...","The parents of five-year-old Tafida Raqeeb, wh..."


In [35]:
def get_length(text):
    """Return ``len(text)``; for a string this is its character count."""
    n_items = len(text)
    return n_items

# Character-length statistics of the training articles.
# The lengths are computed once and reused, instead of re-running
# `.apply(get_length)` over the whole column for every statistic.
article_lengths = train_df['text'].apply(get_length)

article_ave = article_lengths.mean()
article_max = article_lengths.max()
article_min = article_lengths.min()
article_std = article_lengths.std()

print(f'average article size: {article_ave}')
print(f'max article size: {article_max}')
print(f'min article size: {article_min}')
print(f'stdev article size: {article_std}')

average article size: 2685.591
max article size: 14526
min article size: 300
stdev article size: 1815.5251965914597


In [36]:
# Character-length statistics of the training summaries.
# The lengths are computed once and reused, instead of re-running
# `.apply(get_length)` over the whole column for every statistic.
summary_lengths = train_df['summary'].apply(get_length)

summ_ave = summary_lengths.mean()
summ_max = summary_lengths.max()
summ_min = summary_lengths.min()
summ_std = summary_lengths.std()

print(f'average summary size: {summ_ave}')
print(f'max summary size: {summ_max}')
print(f'min summary size: {summ_min}')
print(f'stdev summary size: {summ_std}')

average summary size: 129.662
max summary size: 497
min summary size: 45
stdev summary size: 40.11242653795408


**T5 Model**

##### 1.) Load and set up model

In [7]:
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

# The model weights and the tokenizer are loaded from the same checkpoint name.
T5_CHECKPOINT = "t5-base"
t5model = TFT5ForConditionalGeneration.from_pretrained(T5_CHECKPOINT)
t5tokenizer = T5Tokenizer.from_pretrained(T5_CHECKPOINT)

Downloading (…)lve/main/config.json: 0.00B [00:00, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/892M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [37]:
## Set up model params

# Number of examples per generator batch.
batch_size = 16
# Token length that the target side is padded / truncated to.
target_max_length = 128
# Token length that the encoder-side text is padded / truncated to
# (it is passed as `max_length` for the first tokenizer call in preprocess_data).
summary_max_length = 512

In [38]:
def preprocess_data(text_pairs, tokenizer, model, summary_max_length=128, target_max_length=128):
    """Tokenize a batch of (article, summary) pairs into model inputs and labels.

    Args:
        text_pairs: Sequence of two-item sequences ``[article_text, summary_text]``,
            one entry per example.
        tokenizer: Object exposing ``batch_encode_plus``; the notebook passes
            the T5 tokenizer.
        model: Object exposing ``_shift_right``; the notebook passes the T5 model.
        summary_max_length: Token length the *article* inputs are padded /
            truncated to (the parameter name is kept for compatibility).
        target_max_length: Token length the summaries are padded / truncated to.

    Returns:
        A tuple ``([input_ids, attention_mask, decoder_input_ids], label_ids)``.
        ``input_ids`` and ``attention_mask`` are int32 NumPy arrays built from
        the articles, ``label_ids`` is a NumPy array built from the summaries,
        and ``decoder_input_ids`` is ``model._shift_right(label_ids)``.
    """
    # Split the pairs column-wise. Indexing `text_pairs[0]` would instead
    # return only the first example's [article, summary] pair, so every batch
    # would encode those same two strings as both inputs and targets.
    orig_text = [pair[0] for pair in text_pairs]
    target_text = [pair[1] for pair in text_pairs]

    orig_encoded = tokenizer.batch_encode_plus(
        orig_text,
        max_length=summary_max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf'
    )

    orig_input_ids = np.array(orig_encoded["input_ids"], dtype="int32")
    orig_attention_masks = np.array(orig_encoded["attention_mask"], dtype="int32")

    target_encoded = tokenizer.batch_encode_plus(
        target_text,
        max_length=target_max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )

    # NOTE(review): padded label positions are returned as-is, so a loss
    # computed on `label_ids` also scores padding -- confirm this is intended.
    label_ids = np.array(target_encoded['input_ids'])
    decoder_input_ids = model._shift_right(label_ids)

    return [orig_input_ids, orig_attention_masks, decoder_input_ids], label_ids

In [46]:
import tensorflow as tf

class SummarizationDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that tokenizes one batch of article/summary rows per step.

    Each batch takes ``batch_size`` rows of ``dataframe`` (columns ``'text'``
    and ``'summary'``) in the current row order and passes them to
    ``preprocess_data``. Only full batches are produced: with
    ``n_examples // batch_size`` batches, any remainder rows are not served
    in a given epoch.
    """

    def __init__(self,
                 tokenizer,
                 model,
                 n_examples,
                 dataframe,
                 summary_max_length=128,
                 target_max_length=64,
                 batch_size=16,
                 shuffle=True):
        """Store the configuration and build the initial row order.

        Args:
            tokenizer: Tokenizer forwarded to ``preprocess_data``.
            model: Model forwarded to ``preprocess_data``.
            n_examples: Number of rows of ``dataframe`` to draw from; rows are
                addressed positionally, so this must not exceed ``len(dataframe)``.
            dataframe: DataFrame with ``'text'`` and ``'summary'`` columns.
            summary_max_length: Forwarded to ``preprocess_data``.
            target_max_length: Forwarded to ``preprocess_data``.
            batch_size: Number of rows per batch.
            shuffle: If true, the row order is re-permuted in ``on_epoch_end``.
        """
        super().__init__()

        self.tokenizer = tokenizer
        self.model = model
        self.n_examples = n_examples
        self.dataframe = dataframe
        self.summary_max_length = summary_max_length
        self.target_max_length = target_max_length
        self.batch_size = batch_size
        self.shuffle = shuffle

        # 0-based positional row indices into `dataframe`; on_epoch_end
        # shuffles them when shuffling is enabled.
        self.row_order = np.arange(self.n_examples)
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch."""
        return self.n_examples // self.batch_size

    def __getitem__(self, idx):
        """Return the tokenized batch number ``idx``.

        Returns:
            Whatever ``preprocess_data`` returns for the rows of this batch.
        """
        batch_start = idx * self.batch_size
        batch_end = (idx + 1) * self.batch_size

        # Select only this batch's rows; tokenizing the whole dataframe on
        # every call would make each step see the same data regardless of idx.
        batch_rows = self.row_order[batch_start:batch_end]
        batch_frame = self.dataframe.iloc[batch_rows]

        text_pairs = batch_frame[['text', 'summary']].values.astype(str).tolist()

        batch_data = preprocess_data(
            text_pairs,
            self.tokenizer,
            self.model,
            self.summary_max_length,
            self.target_max_length
        )

        return batch_data

    def __call__(self):
        """Yield every batch once, then trigger the end-of-epoch reshuffle."""
        for i in range(self.__len__()):
            yield self.__getitem__(i)

            if i == self.__len__()-1:
                self.on_epoch_end()

    def on_epoch_end(self):
        """Re-permute the row order when shuffling is enabled."""
        if self.shuffle:
            self.row_order = np.random.permutation(self.row_order)

In [47]:
from tensorflow.keras import layers

def build_t5_training_wrapper_model(t5_model, summary_max_length, target_max_length):
    """Wrap a T5 model in a compiled Keras functional model usable with ``fit``.

    Args:
        t5_model: Callable model invoked as
            ``t5_model(input_ids, attention_mask=..., decoder_input_ids=...)``;
            element ``[0]`` of its output is used as the logits.
        summary_max_length: Sequence length of the ``input_ids`` and
            ``attention_mask`` inputs.
        target_max_length: Sequence length of the decoder input.

    Returns:
        A compiled ``tf.keras`` model taking
        ``[input_ids, attention_mask, decoder_input_ids]`` and returning the
        logits, with Adam, sparse categorical cross-entropy on logits, and an
        accuracy metric.
    """
    # `shape` must be a tuple: `(n)` without a trailing comma is just the int n.
    input_ids = layers.Input(shape=(summary_max_length,), dtype=tf.int32, name='input_ids')
    attention_mask = layers.Input(shape=(summary_max_length,), dtype=tf.int32, name='attention_mask')
    # The layer is named 'labels' but carries the decoder input ids; the name
    # is left unchanged so the model's layer names stay the same.
    decoder_input_ids = layers.Input(shape=(target_max_length,), dtype=tf.int32, name='labels')

    t5_logits = t5_model(input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)[0]

    model = tf.keras.models.Model(inputs=[input_ids, attention_mask, decoder_input_ids],
                                  outputs=[t5_logits])
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    return model

##### 2.) Train model

In [48]:
# Build the trainable Keras wrapper around the loaded T5 model.
model_wrapper = build_t5_training_wrapper_model(
    t5_model=t5model,
    summary_max_length=summary_max_length,
    target_max_length=target_max_length,
)

In [49]:
# Settings shared by the training and validation generators.
_generator_settings = dict(
    tokenizer=t5tokenizer,
    model=t5model,
    summary_max_length=summary_max_length,
    target_max_length=target_max_length,
    batch_size=batch_size,
)

# The training generator is constructed first, then the validation one.
train_data_generator = SummarizationDataGenerator(
    n_examples=len(train_df),
    dataframe=train_df,
    **_generator_settings
)

valid_data_generator = SummarizationDataGenerator(
    n_examples=len(val_df),
    dataframe=val_df,
    **_generator_settings
)

In [50]:
# Checkpoints go to a Drive folder keyed by the two max-length settings; each
# file name embeds the epoch number and the validation accuracy.
checkpoint_dir = f'/content/drive/MyDrive/W266FinalProject/model_checkpoints/t5_summaxlength{summary_max_length}_targmaxlength{target_max_length}/'
checkpoint_filepath = checkpoint_dir + 't5_weights.{epoch:02d}-{val_accuracy:.2f}.hdf5'

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,  # store weights only, not the full model
    filepath=checkpoint_filepath,
)

In [51]:
# Fine-tune for up to 10 epochs, validating each epoch and writing checkpoints
# through the callback.
model_wrapper.fit(
    train_data_generator,
    epochs=10,
    callbacks=[model_checkpoint_callback],
    validation_data=valid_data_generator,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: ignored

##### 3.) Test model

In [13]:
# Load the ROUGE metric; its `compute` method is called per example in the scoring loop.
rouge = evaluate.load('rouge')

Downloading builder script: 0.00B [00:00, ?B/s]

In [14]:
# Load the chrF metric; its `compute` method is called per example in the scoring loop.
chrf = evaluate.load("chrf")

Downloading builder script: 0.00B [00:00, ?B/s]

In [15]:
# Rebuild the wrapper so saved weights can be loaded into it. The builder
# takes both length settings; `max_length` is never defined in this notebook,
# so the two configured lengths are passed explicitly.
model_wrapper = build_t5_training_wrapper_model(t5model, summary_max_length, target_max_length)

In [16]:
# Load previously saved weights into the wrapper model.
# NOTE(review): this path points at a `t5_finetuned4` folder, which does not
# follow the `t5_summaxlength..._targmaxlength...` pattern used by the
# checkpoint callback in this notebook -- confirm it is the intended run.
# This assignment also overwrites the `checkpoint_filepath` template used by
# the training callback.
checkpoint_filepath = '/content/drive/MyDrive/W266FinalProject/model_checkpoints/t5_finetuned4/t5_weights.01-0.96.hdf5'

model_wrapper.load_weights(checkpoint_filepath)

In [17]:
# Check the size of the test split before scoring.
test_df.shape

(100, 2)

In [18]:
# Per-example metric values, accumulated over the whole test set.
r1, r2, rL, rLs, chrfs = [], [], [], [], []

# Walk the article and reference-summary columns in lock-step; this avoids
# chained `test_df[col][i]` lookups.
for article, reference in zip(test_df['text'], test_df['summary']):

    # Truncate the article to the same encoder length used for training.
    # (`max_length` is not defined anywhere in this notebook.)
    inputs = t5tokenizer(article,
                         max_length=summary_max_length,
                         truncation=True,
                         return_tensors="tf")

    # NOTE(review): no generation settings are passed, so the library defaults
    # decide the output length and decoding strategy -- confirm this is intended.
    summary_ids = t5model.generate(inputs["input_ids"])

    candidate = t5tokenizer.batch_decode(summary_ids,
                                         skip_special_tokens=True,
                                         clean_up_tokenization_spaces=False)

    # Both metrics receive a one-element list of predictions and references.
    ref = [reference]

    rouge_results = rouge.compute(predictions=candidate,
                                  references=ref)

    r1.append(rouge_results['rouge1'])
    r2.append(rouge_results['rouge2'])
    rL.append(rouge_results['rougeL'])
    rLs.append(rouge_results['rougeLsum'])

    chrf_results = chrf.compute(predictions=candidate,
                                references=ref)
    chrfs.append(chrf_results['score'])



0


In [19]:
# Mean of each per-example metric list, printed with the same labels as before.
for metric_label, metric_values in (('rouge1', r1),
                                    ('rouge2', r2),
                                    ('rougeL', rL),
                                    ('rougeLs', rLs),
                                    ('chrf', chrfs)):
    print(f'{metric_label} average :', np.mean(metric_values))

rouge1 average : 0.028553139222582
rouge2 average : 0.002839225903546292
rougeL average : 0.02551033565346264
rougeLs average : 0.02551033565346264
chrf average : 3.64307443010981


In [20]:
# One column per metric, one row per test example.
# NOTE(review): the 'rogueL' / 'rogueLs' keys look like misspellings of
# "rouge"; they are kept as-is because they become the CSV column names.
data = {
    'rouge1': r1,
    'rouge2': r2,
    'rogueL': rL,
    'rogueLs': rLs,
    'chrf': chrfs,
}

scores = pd.DataFrame.from_dict(data)

# Write the per-example scores to Drive without the index column.
scores.to_csv(
    r'/content/drive/MyDrive/W266FinalProject/model_results/t5_finetuned4_scores.csv',
    index=False,
)