## 1. Data retrieval

In [5]:
import torch
import gzip
import json
import pandas as pd
import csv
import io

from operator import itemgetter
from summarizer import Summarizer, TransformerSummarizer



### Data directory and batch size selection

In [6]:
# Root folder of the locally downloaded Newsroom release -- adjust to your setup
data_base_path = './data/newsroom-release/release/'
wordpiece_cased_path = 'bert-base-cased-vocab.txt'

# The training split is intentionally left out of this notebook:
# train_path = data_base_path + 'train.jsonl.gz' DONT USE THIS
validation_path = f'{data_base_path}dev.jsonl.gz'
# NOTE(review): this is the same dev split as validation_path -- confirm that
# evaluating on 'dev.jsonl.gz' (rather than a separate test file) is intended.
test_path = f'{data_base_path}dev.jsonl.gz'

# Number of samples the DataLoader yields per iteration
batch_size = 1

In [7]:
class NewsroomDataset(torch.utils.data.Dataset):
    '''
    Map-style dataset over one gzip-compressed JSON-lines file, restricted to
    the records whose 'density_bin' field equals `category`.

    Attributes:
        category: value of 'density_bin' that was kept, i.e. 'extractive'
        data: pandas DataFrame with one row per kept record
    '''
    def __init__(self, path, category: str):
        '''
        Args:
            path: path to the gzip-compressed JSON-lines file
            category: only records with this 'density_bin' value are kept
        '''
        self.category = category
        records = []
        with gzip.open(path) as f:
            for ln in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                if not ln.strip():
                    continue
                obj = json.loads(ln)
                # Filter while reading so records of other categories are never
                # held in memory; .get() also means a file without any
                # 'density_bin' field yields an empty dataset, not a KeyError.
                if obj.get('density_bin') == category:
                    records.append(obj)
        self.data = pd.DataFrame(records)

    def __len__(self):
        '''Number of records of the selected category.'''
        return len(self.data)

    def __getitem__(self, idx):
        '''Return the record at position `idx` as a plain {column: value} dict.'''
        return dict(self.data.iloc[idx, :])

In [8]:
# Keep only the 'extractive' samples and serve them batch_size at a time
test_dset = NewsroomDataset(path=test_path, category="extractive")
testloader = torch.utils.data.DataLoader(dataset=test_dset, batch_size=batch_size)

## 2. Initialize model and do predictions

In [9]:
from nltk.tokenize import sent_tokenize # For Lede-3

Define functions for making predictions and writing to file

In [83]:
def make_predictions(transformer_type, transformer_model_key, n_predictions, lower_case=True):
    """Summarize samples from the global `testloader` with a transformer model.

    Alongside every model prediction a Lede-3 baseline (the first three
    sentences of the article) is collected for the same sample.

    Args:
        transformer_type: passed to TransformerSummarizer as `transformer_type`.
        transformer_model_key: passed to TransformerSummarizer as
            `transformer_model_key`.
        n_predictions: maximum number of samples to predict. Exactly this many
            rows are returned unless the loader is exhausted first or samples
            are skipped because of out-of-memory errors.
        lower_case: lower-case both the article and the reference summary
            before predicting.

    Returns:
        (results, lede3_preds): `results` is a list of
        (predicted_summary, reference_summary) tuples; `lede3_preds` is a list
        of (list_of_up_to_three_sentences, reference_summary) tuples.

    Raises:
        RuntimeError: any runtime error from the model that is not an
            out-of-memory error.
    """
    model = TransformerSummarizer(transformer_type=transformer_type,
                                  transformer_model_key=transformer_model_key)
    results = []      # (model prediction, reference summary)
    lede3_preds = []  # (Lede-3 sentences, reference summary)

    for i, batch_df in enumerate(testloader):
        # Check the budget before predicting: the previous post-increment
        # `i == n_predictions` test produced n_predictions + 1 rows.
        if len(results) >= n_predictions:
            break

        txt, summary = itemgetter('text', 'summary')(batch_df)
        # Each field arrives as a sequence with one string per sample in the
        # batch; joining assumes batch_size == 1, otherwise the samples of a
        # batch would be concatenated into one text.
        txt = ''.join(txt)
        summary = ''.join(summary)

        if lower_case:
            txt = txt.lower()
            summary = summary.lower()

        try:
            pred = model(txt)
        except RuntimeError as exception:
            if "out of memory" not in str(exception):
                # Unrelated failure: do not hide it.
                raise
            print("WARNING: out of memory")
            if hasattr(torch.cuda, 'empty_cache'):
                torch.cuda.empty_cache()
            # Skip this sample. Falling through would append a stale `pred`
            # from the previous iteration (or raise NameError on the first).
            continue

        results.append((pred, summary))

        # Lede-3
        lede3 = sent_tokenize(txt)[:3]
        lede3_preds.append((lede3, summary))

        if i % 10 == 0:
            print(f"prediction: {i}\n")

    return results, lede3_preds

def save_to_file(results, name, column_headers: list, dialect=None):
    """Write a header row followed by `results` to a UTF-8 encoded CSV file.

    Args:
        results: iterable of rows; each row is an iterable of cell values.
        name: path of the file to create (an existing file is overwritten).
        column_headers: cell values of the first (header) row.
        dialect: optional csv dialect; the csv module's default 'excel'
            dialect is used when None.
    """
    # newline='' is required by the csv module: the writer emits its own line
    # terminators, and without it platforms that translate '\n' would corrupt
    # row endings and newlines embedded in quoted cells.
    with io.open(name, 'w', encoding="utf-8", newline='') as out:
        csv_out = csv.writer(out, dialect='excel' if dialect is None else dialect)
        csv_out.writerow(column_headers)
        csv_out.writerows(results)



In [81]:
# Peek at the first batch: pull out its 'text' and 'summary' fields and show
# the summary as the cell output.
t, s = itemgetter('text', 'summary')(next(iter(testloader)))
s

['India provides I.B.M. with its fastest-growing market and a crucial base for delivering services to much of the world.']

Run predictions for every model and save them to CSV files

In [84]:
# Prediction budget handed to make_predictions for every model
n_predictions = 150

# Transformer family names (first argument of make_predictions)
BERT = 'Bert'
GPT2_NAME = 'GPT2'

# Model keys (second argument of make_predictions, also the CSV file stem)
BERT_LARGE = 'bert-large-uncased'
BERT_BASE = 'bert-base-uncased'
GPT2 = 'gpt2-medium'
GPT2_L = 'gpt2-large'

# File stem used for the Lede-3 baseline results
LEDE = 'lede3'

# (transformer family, model key) pairs to run
CLASSIFIERS = [
    (BERT, BERT_LARGE),
    (BERT, BERT_BASE),
    (GPT2_NAME, GPT2),
    (GPT2_NAME, GPT2_L),
]



# Run every configured model and store its predictions as '<model key>.csv'.
for i, (transformer_type, model_key) in enumerate(CLASSIFIERS):
    model_preds, lede_preds = make_predictions(transformer_type,
                                               model_key,
                                               n_predictions)
    save_to_file(model_preds, f'{model_key}.csv', ['prediction', 'actual'])

    if i == 0:
        # The Lede-3 baseline is collected on every run; store it only once.
        # Each prediction is a list of sentences, so join it into one string.
        # Previously only the first row was touched and every prediction ended
        # up in the CSV as a Python list repr ("['...', '...']").
        lede_rows = [
            (pred if isinstance(pred, str) else ' '.join(pred), actual)
            for pred, actual in lede_preds
        ]
        save_to_file(lede_rows, f'{LEDE}.csv', ['prediction', 'actual'], dialect='excel')


prediction: 0

prediction: 10

prediction: 20

prediction: 30

prediction: 40

prediction: 50

prediction: 60

prediction: 70

prediction: 80

prediction: 90

prediction: 100

prediction: 110

prediction: 120

prediction: 130

prediction: 140

prediction: 150

prediction: 0

prediction: 10

prediction: 20

prediction: 30

prediction: 40

prediction: 50

prediction: 60

prediction: 70

prediction: 80

prediction: 90

prediction: 100

prediction: 110

prediction: 120

prediction: 130

prediction: 140

prediction: 150



Some weights of GPT2Model were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias', 'h.12.attn.masked_bias', 'h.13.attn.masked_bias', 'h.14.attn.masked_bias', 'h.15.attn.masked_bias', 'h.16.attn.masked_bias', 'h.17.attn.masked_bias', 'h.18.attn.masked_bias', 'h.19.attn.masked_bias', 'h.20.attn.masked_bias', 'h.21.attn.masked_bias', 'h.22.attn.masked_bias', 'h.23.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


prediction: 0

prediction: 10

prediction: 20

prediction: 30

prediction: 40

prediction: 50

prediction: 60

prediction: 70

prediction: 80

prediction: 90

prediction: 100

prediction: 110

prediction: 120

prediction: 130

prediction: 140

prediction: 150



Some weights of GPT2Model were not initialized from the model checkpoint at gpt2-large and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias', 'h.12.attn.masked_bias', 'h.13.attn.masked_bias', 'h.14.attn.masked_bias', 'h.15.attn.masked_bias', 'h.16.attn.masked_bias', 'h.17.attn.masked_bias', 'h.18.attn.masked_bias', 'h.19.attn.masked_bias', 'h.20.attn.masked_bias', 'h.21.attn.masked_bias', 'h.22.attn.masked_bias', 'h.23.attn.masked_bias', 'h.24.attn.masked_bias', 'h.25.attn.masked_bias', 'h.26.attn.masked_bias', 'h.27.attn.masked_bias', 'h.28.attn.masked_bias', 'h.29.attn.masked_bias', 'h.30.attn.masked_bias', 'h.31.attn.masked_bias', 'h.32.attn.masked_bias', 'h.33.attn.masked_bias', 'h.34.attn.masked_bias', 'h.35.attn.masked_bi

prediction: 0

prediction: 10

prediction: 20

prediction: 30

prediction: 40

prediction: 50

prediction: 60

prediction: 70

prediction: 80

prediction: 90

prediction: 100

prediction: 110

prediction: 120

prediction: 130

prediction: 140

prediction: 150



In [85]:
# Same model list as used for prediction, plus the Lede-3 baseline, so the
# evaluation below also reads 'lede3.csv'.
CLASSIFIERS = [
    (BERT, BERT_LARGE),
    (BERT, BERT_BASE),
    (GPT2_NAME, GPT2),
    (GPT2_NAME, GPT2_L),
    (LEDE, LEDE),
]

## 3. Performance evaluation and results

Get mean Rouge-1, Rouge-2 and Rouge-L scores

In [86]:
import rouge
from rouge import Rouge

# One column per (family, model key) pair in CLASSIFIERS.
recall_df = pd.DataFrame(columns=CLASSIFIERS)
precision_df = recall_df.copy()
f1_df = recall_df.copy()

# NOTE: this rebinds `rouge` from the imported module to a scorer instance.
rouge = Rouge()

# Averaged ROUGE scores per model as (family name, scores DataFrame) pairs,
# in CLASSIFIERS order.
dfs = []
for name, filename in CLASSIFIERS:
    df = pd.read_csv(f"{filename}.csv")

    # An empty prediction or summary is read back as NaN (a float), which the
    # scorer cannot process; drop such rows and report how many were skipped.
    pairs = df.iloc[:, :2].dropna()
    n_dropped = len(df) - len(pairs)
    if n_dropped:
        print(f"WARNING: {filename}.csv: skipped {n_dropped} row(s) with an empty prediction or summary")
    if pairs.empty:
        raise ValueError(f"{filename}.csv contains no scorable rows")

    # Hand the scorer plain lists of strings instead of pandas Series.
    hyps = pairs.iloc[:, 0].astype(str).tolist()
    refs = pairs.iloc[:, 1].astype(str).tolist()
    scores = rouge.get_scores(hyps, refs, avg=True)

    dfs.append((name, pd.DataFrame(scores)))
    

In [104]:
# Spot-check a single stored GPT2-large prediction.
df = pd.read_csv("gpt2-large.csv")
# Evaluated but not displayed: only the last expression of a cell is shown.
''.join(next(iter(testloader))['summary'])
df.iat[6, 0]

"nikki reed in twilight and ian somerhalder in the vampire diaries\r\n\r\nby nate jones & alison schwartz\r\n\r\nupdated 08/01/2014 at 02:45 pm edt\r\n\r\nthe recent revelations that ian somerhalder of\r\n\r\nhas thrown us into a tizzy. here are two actors, from two completely different vampire franchises, dating and doing romance stuff – as if someone had turned fan fiction into real life. we know they've got sparks in real life, but would they still work on screen? in the hug-happy world of\r\n\r\n, part-time record producer crosby is a lovable mess, but compared to some of veronica's exes, he's the very picture of stability."

In [87]:
import plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

def plot_rouges(rdfs, titles=('Rouge-1', 'Rouge-2', 'Rouge-L'), model_names=None):
    """Build one plotly table figure per ROUGE score frame.

    Args:
        rdfs: iterable of DataFrames with one row per model. Columns named
            'f', 'p' and 'r' are placed in that order to match the table
            header (f1-score, precision, recall).
        titles: figure titles, paired with `rdfs` by position.
        model_names: labels for the rows of every frame; defaults to the five
            models evaluated in this notebook.

    Returns:
        List of plotly Figure objects, one per (frame, title) pair.

    Raises:
        ValueError: if the number of model names differs from the number of
            rows of a frame (raised by DataFrame.insert).
    """
    if model_names is None:
        model_names = ['BERT-large', 'BERT-base', 'GPT2-medium', 'GPT2-large', 'Lede-3']

    header_order = ['f', 'p', 'r']
    figs = []
    for df, title in zip(rdfs, titles):
        # Work on a copy: inserting into the caller's frame made a second call
        # fail because the 'Model name' column already existed.
        table = df.copy()
        if set(header_order).issubset(table.columns):
            # Align the score columns with the hard-coded header below.
            extras = [c for c in table.columns if c not in header_order]
            table = table[header_order + extras]
        table.insert(0, 'Model name', list(model_names))

        fig = go.Figure(data=[go.Table(
            header=dict(
                values=['<b>Model name</b>', '<b>f1-score</b>', '<b>precision</b>', '<b>recall</b>']
            ),
            cells=dict(
                # Transposed so that each entry holds one table column
                values=table.T,
                fill_color='white',
            )
        )])
        # The closing tag was previously written as a second opening '<b>'.
        fig.update_layout(title_text=f"<b>{title}</b>", margin={'t': 50})

        figs.append(fig)
    return figs



In [88]:
# One frame per ROUGE metric: a row per model (in `dfs` order), values scaled
# to percentages and rounded to two decimals.
r1_df = pd.DataFrame([round(scores['rouge-1'] * 100, 2) for _, scores in dfs])
r2_df = pd.DataFrame([round(scores['rouge-2'] * 100, 2) for _, scores in dfs])
r_df = pd.DataFrame([round(scores['rouge-l'] * 100, 2) for _, scores in dfs])
figs = plot_rouges([r1_df, r2_df, r_df])

# A plain loop instead of a list comprehension: show() returns None, so the
# comprehension only added a '[None, None, None]' cell output.
for rouge_fig in figs:
    rouge_fig.show()

[None, None, None]

In [122]:
# Distribution of the number of sentences per reference summary.
max_summaries = 2000  # cap on how many summaries are inspected

lens = []  # sentence count of each inspected summary
for batch_df in testloader:
    # Stop before processing so that exactly max_summaries are collected
    # (the previous post-append check collected one extra).
    if len(lens) >= max_summaries:
        break
    summary = ''.join(batch_df['summary'])
    lens.append(len(sent_tokenize(summary)))

# The plotted value is a sentence count, not a sentence length; label it so.
fig = px.histogram(pd.DataFrame({'sentences per summary': lens}))
fig.update_layout(
    xaxis_title_text='Number of sentences per summary'
)