Mount Google Drive so that results can be saved to it.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive/; the final cell writes its CSV under this mount point.
drive.mount('/content/drive/')

**Packages**

In [None]:
! pip install datasets --quiet
! pip install evaluate --quiet
! pip install rouge_score --quiet
! pip install transformers --quiet
! pip install -q sentencepiece --quiet
! pip install summarizer

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from datasets import load_dataset
import evaluate

#let's make longer output readable without horizontal scrolling
from pprint import pprint

**Necessary Functions**

In [None]:
# Load the ROUGE metric through `evaluate`; `rouge.compute(...)` is used by the
# baseline and T5 cells to score candidate summaries against references.
rouge = evaluate.load('rouge')

**Data**

In [None]:
# Load the "english" configuration of the csebuetnlp/xlsum dataset.
dataset = load_dataset("csebuetnlp/xlsum", "english")

In [None]:
# EDA: number of examples in the training split
len(dataset['train'])

In [None]:
# EDA: look at one training example (position 1) to see which fields it has
dataset['train'][1]

In [None]:
# Walk the training split once, keeping the three text fields of each example,
# then fan the collected rows out into one list per field.
train_rows = [
    (row['title'], row['text'], row['summary'])
    for row in dataset['train']
]

title = [fields[0] for fields in train_rows]
article = [fields[1] for fields in train_rows]
summary = [fields[2] for fields in train_rows]

In [None]:
# Assemble the per-field lists into a DataFrame with one row per training example.
d = dict(title=title, article=article, summary=summary)
df = pd.DataFrame(d)

# Preview the first five rows.
df.head(5)

Get a sample for training

In [None]:
# Draw a reproducible random subset of the training data.
# A fixed `random_state` makes Restart-and-Run-All pick the same rows every time,
# so the ROUGE averages computed below are comparable between runs.
# `min(...)` keeps the cell from raising when fewer than 1000 rows are available.
# The original row labels are kept (no reset_index), so `df.index` is NOT 0..n-1.
df = df.sample(n=min(1000, len(df)), random_state=42)

**Baseline**

In [None]:
# Lead-3 baseline: use the first three sentences of each article as its summary
# and score that against the reference summary with ROUGE.
base_r1, base_r2, base_rL, base_rLs = [], [], [], []

for article_text, reference in zip(df['article'], df['summary']):
    # Treat an ellipsis as a sentence break, then split sentences on ". ".
    sentences = article_text.replace('...', '. ').split('. ')

    # Re-join the leading three sentences and close with a full stop.
    lead3 = ". ".join(sentences[:3]) + "."

    rouge_scores = rouge.compute(predictions=[lead3], references=[reference])

    # Record each ROUGE variant in its own running list.
    for bucket, key in (
        (base_r1, 'rouge1'),
        (base_r2, 'rouge2'),
        (base_rL, 'rougeL'),
        (base_rLs, 'rougeLsum'),
    ):
        bucket.append(rouge_scores[key])

In [None]:
# Report the mean of every baseline ROUGE variant over the sample.
for label, values in (
    ('rouge1 average: ', base_r1),
    ('rouge2 average: ', base_r2),
    ('rougeL average: ', base_rL),
    ('rougeLs average:', base_rLs),
):
    print(label, np.mean(values))

**T5**

In [None]:
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

# Load the pretrained "t5-base" checkpoint as a TensorFlow model, plus the tokenizer
# from the same checkpoint so token ids line up with the model's vocabulary.
t5model = TFT5ForConditionalGeneration.from_pretrained("t5-base")
t5tokenizer = T5Tokenizer.from_pretrained("t5-base")

In [None]:
import tensorflow as tf

# Cross-entropy over integer class ids, computed from raw logits (from_logits=True).
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Adam optimizer with a learning rate of 1e-5.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

In [None]:
input_ids = None
for i in df.index:
    encoded = t5tokenizer.encode_plus(df['article'][i], return_tensors='tf')
    if input_ids is None:
        input_ids = encoded['input_ids']
    else:
        input_ids = tf.concat([input_ids, encoded['input_ids']], axis=1)

In [None]:
num_epochs = 100

In [None]:
for epoch in range(num_epochs):
    with tf.GradientTape() as tape:
        outputs = t5model(input_ids=input_ids)
        logits = outputs.inputs
        loss = loss_fn(input_ids, logits)
    gradients = tape.gradient(loss, t5model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, t5model.trainable_variables))

#### T5 - summarize on val
###### TODO: this currently scores the training sample; it needs to be edited to actually run on the val set

In [None]:
# Generate a T5 summary for every article in the sample and score it with ROUGE
# against the reference summary. Partial scores are written to disk periodically
# so a crash or disconnect does not lose the whole run.
t5_r1 = []
t5_r2 = []
t5_rL = []
t5_rLs = []

CHECKPOINT_EVERY = 100  # write partial scores after this many articles

# Count processed articles by position. The DataFrame index holds the original
# row labels of the sampled rows, so it cannot be used as a progress counter.
for n_done, (article_text, reference) in enumerate(zip(df['article'], df['summary']), start=1):

    # T5 expects the task prefix in front of the text to summarize.
    T5ARTICLE_TO_SUMMARIZE = 'summarize: ' + article_text

    inputs = t5tokenizer(T5ARTICLE_TO_SUMMARIZE,
                         truncation=True,
                         return_tensors="tf")

    # Generation hyperparameters go here.
    summary_ids = t5model.generate(inputs["input_ids"],
                                   num_beams=4,
                                   no_repeat_ngram_size=3,
                                   min_length=10)

    candidate = t5tokenizer.batch_decode(summary_ids,
                                         skip_special_tokens=True,
                                         clean_up_tokenization_spaces=False)

    results = rouge.compute(predictions=candidate,
                            references=[reference])

    t5_r1.append(results['rouge1'])
    t5_r2.append(results['rouge2'])
    t5_rL.append(results['rougeL'])
    t5_rLs.append(results['rougeLsum'])

    if n_done % CHECKPOINT_EVERY == 0:
        # Column names (including the 'rogue' spelling) match the final CSV
        # written at the end of the notebook.
        partial_scores = pd.DataFrame(
            {'rouge1': t5_r1, 'rouge2': t5_r2, 'rogueL': t5_rL, 'rogueLs': t5_rLs}
        )
        partial_scores.to_csv(r'T5_scores.csv', index=False)
        print(n_done)

In [None]:
# Report the mean of every T5 ROUGE variant over the evaluated articles.
for label, values in (
    ('rouge1 average :', t5_r1),
    ('rouge2 average :', t5_r2),
    ('rougeL average :', t5_rL),
    ('rougeLs average :', t5_rLs),
):
    print(label, np.mean(values))

In [None]:
# Collect the per-article T5 ROUGE scores into a table, one column per variant,
# and persist it to the mounted Google Drive folder.
score_columns = ('rouge1', 'rouge2', 'rogueL', 'rogueLs')
score_lists = (t5_r1, t5_r2, t5_rL, t5_rLs)
data = dict(zip(score_columns, score_lists))

scores = pd.DataFrame(data)

output_path = r'/content/drive/MyDrive/W266FinalProject/T5_scores_hyps.csv'
scores.to_csv(output_path, index=False)