To save in google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


**Packages**

In [2]:
! pip install datasets --quiet
! pip install evaluate --quiet
! pip install rouge_score --quiet
! pip install transformers --quiet
! pip install -q sentencepiece --quiet
! pip install summarizer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from datasets import load_dataset
import evaluate

#let's make longer output readable without horizontal scrolling
from pprint import pprint

**Necessary Functions**

In [4]:
rouge = evaluate.load('rouge')

**Data**

In [5]:
dataset = load_dataset("csebuetnlp/xlsum", "english")



  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# EDA
len(dataset['train'])

306522

In [7]:
# EDA
dataset['train'][1]

{'id': 'uk-scotland-highlands-islands-11069985',
 'url': 'https://www.bbc.com/news/uk-scotland-highlands-islands-11069985',
 'title': 'Huge tidal turbine installed at Orkney test site',
 'summary': 'The massive tidal turbine AK1000 has been installed in 35m (114.8ft) of water at a test site in Orkney.',
 'text': 'Atlantis Resources unveiled the marine energy device at Invergordon ahead of it being shipped to Kirkwall. Trials on the device will now be run at the European Marine Energy Centre test site off Eday. The device stands 22.5m (73ft) tall, weighs 1,300 tonnes and has two sets of blades on a single unit. It could generate enough power for 1,000 homes.'}

In [8]:
# category = []
title = []
article = []
summary = []

for data in dataset['train']:
    title.append(data['title']) 
    article.append(data['text'])
    summary.append(data['summary'])

In [9]:
d = {'title': title, 'article': article, 'summary': summary}
df = pd.DataFrame(data=d)
df.head(5)

Unnamed: 0,title,article,summary
0,Weather alert issued for gale force winds in W...,The Met Office has issued a yellow weather war...,Winds could reach gale force in Wales with sto...
1,Huge tidal turbine installed at Orkney test site,Atlantis Resources unveiled the marine energy ...,The massive tidal turbine AK1000 has been inst...
2,Leeds stabbing: Man attacked outside betting shop,Police were called to the scene outside the Co...,A man has been stabbed in broad daylight outsi...
3,Could killing of Iranian general help Trump ge...,Anthony ZurcherNorth America reporter@awzurche...,It was inevitable that the fallout from the US...
4,Coronavirus: 'I've moved out to protect my fam...,By Debbie JacksonBBC Scotland But while most o...,Week four of social distancing is starting to ...


**T5**

In [10]:
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

t5model = TFT5ForConditionalGeneration.from_pretrained("t5-base")
t5tokenizer = T5Tokenizer.from_pretrained("t5-base")

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [11]:
import tensorflow as tf

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

In [12]:
num_epochs = 100
batch_size=32

In [None]:
for epoch in range(num_epochs):
  input_ids = None
  batch_df = df.sample(n=batch_size)
  for i in batch_df.index:
      encoded = t5tokenizer.encode_plus(df['article'][i], return_tensors='tf')
      if input_ids is None:
          input_ids = encoded['input_ids']
      else:
          input_ids = tf.concat([input_ids, encoded['input_ids']], axis=1)

  with tf.GradientTape() as tape:
      outputs = t5model(input_ids=input_ids)
      logits = outputs.inputs
      loss = loss_fn(input_ids, logits)
  gradients = tape.gradient(loss, t5model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, t5model.trainable_variables))

Token indices sequence length is longer than the specified maximum sequence length for this model (762 > 512). Running this sequence through the model will result in indexing errors


#### T5 - summarize on val and score
###### This needs to be edited to actually run on the val set

In [None]:
t5_r1 = []
t5_r2 = []
t5_rL = []
t5_rLs = []

for i in df.index:

    T5ARTICLE_TO_SUMMARIZE = 'summarize: ' + df['article'][i]

    inputs = t5tokenizer(T5ARTICLE_TO_SUMMARIZE, 
                         #max_length=1024, 
                         truncation=True, 
                         return_tensors="tf")

    summary_ids = t5model.generate(inputs["input_ids"], 
                                  # ADD HYPER PARAMETERS HERE 
                                    num_beams = 4,
                                    no_repeat_ngram_size = 3,  
                                    min_length = 10
                                  )
    
    candidate = t5tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    #pprint(candidate[0], compact=True)
    
    ref = [df['summary'][i]]
    
    results = rouge.compute(predictions=candidate,
                            references=ref)
    
    t5_r1.append(results['rouge1'])
    t5_r2.append(results['rouge2'])
    t5_rL.append(results['rougeL'])
    t5_rLs.append(results['rougeLsum'])
    
    if i in np.arange(0, 2200, 100):
        data = {'rouge1': t5_r1, 'rouge2': t5_r2, 'rogueL': t5_rL, 'rogueLs': t5_rLs}
        scores = pd.DataFrame(data)
        scores.to_csv(r'T5_scores.csv', index=False)
        print(i)

In [None]:
print('rouge1 average :', np.mean(t5_r1))
print('rouge2 average :', np.mean(t5_r2))
print('rougeL average :', np.mean(t5_rL))
print('rougeLs average :', np.mean(t5_rLs))

In [None]:
data = {'rouge1': t5_r1, 'rouge2': t5_r2, 'rogueL': t5_rL, 'rogueLs': t5_rLs}

scores = pd.DataFrame(data)

scores.to_csv(r'/content/drive/MyDrive/W266FinalProject/T5_scores_hyps.csv', index=False)