## 1. Data retrieval

In [37]:
import torch
import gzip
import json
import pandas as pd
import csv
import io

from operator import itemgetter
from summarizer import Summarizer, TransformerSummarizer

### Data directory and batch size selection

In [2]:
# Modify this to wherever you locally downloaded the data
data_base_path = './data/newsroom-release/release/'
wordpiece_cased_path = 'bert-base-cased-vocab.txt'

# train_path = data_base_path + 'train.jsonl.gz' DONT USE THIS
validation_path = data_base_path + 'dev.jsonl.gz'
test_path = data_base_path + 'dev.jsonl.gz'

batch_size = 1

In [3]:
class NewsroomDataset(torch.utils.data.Dataset):
    '''
    Attributes:
        batch_size: Batch size to be taken on single getitem
        file: path to the dataset file
        category: category of the data summarization. i.e. 'extractive'
    '''
    def __init__(self, path, category: str):
        self.category = category
        data = []
        with gzip.open(path) as f:
            for ln in f:
                obj = json.loads(ln)
                data.append(obj)
        data = pd.DataFrame(data)
        # Take only samples with certain category
        self.data = data.loc[data['density_bin'] == self.category, :]
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return dict(self.data.iloc[idx, :])

In [4]:
test_dset = NewsroomDataset(test_path, "extractive")
testloader = torch.utils.data.DataLoader(test_dset, batch_size=batch_size)   

### Verify that the dataloader works like we want

In [5]:
tset = next(iter(testloader))
text, summary = itemgetter('text', 'summary')(tset)
print(f"Text-to-summarize: {text}, \n\nsummarization: {summary}")

Text-to-summarize: ['BANGALORE, India, June 4 \x97 The world\'s biggest computer services company could not have chosen a more appropriate setting to lay out its strategy for staying on top.\n\nA building housing I.B.M.\'s software laboratory and application service teams on the company\'s corporate campus in Bangalore, India.\n\nOn Tuesday, on the expansive grounds of the Bangalore Palace, a colonial-era mansion once inhabited by a maharajah, the chairman and chief executive of I.B.M., Samuel J. Palmisano, will address 10,000 Indian employees. He will share the stage with A. P. J. Abdul Kalam, India\'s president, and Sunil Mittal, chairman of the country\'s largest cellular services provider, Bharti Tele-Ventures. An additional 6,500 employees will look in on the town hall-style meeting by satellite from other Indian cities.\n\nOn the same day, Mr. Palmisano and other top executives will meet here with investment analysts and local customers to showcase I.B.M.\'s global integration ca

## 2. Initialize model and do predictions

In [5]:
from nltk.tokenize import sent_tokenize # For Lede-3

Define functions for making predictions and writing to file

In [52]:
def make_predictions(transformer_type, transformer_model_key, n_predictions, lower_case=True):
    model = TransformerSummarizer(transformer_type=transformer_type,
                                 transformer_model_key=transformer_model_key)
    results = [] # Predictions for the BERT
    lede3_preds = [] # Lede 3 predictions
    
    for i, batch_df in enumerate(testloader):
        txt, summary = itemgetter('text', 'summary')(batch_df)
        txt = ''.join(txt)
        summary = ''.join(txt)
        
        if lower_case:
            txt = txt.lower()
            summary = summary.lower()
        
#         print(model.tokenizer)
        pred = model(txt)
        results.append((pred, summary))

        # Lede-3
        lede3 = sent_tokenize(txt)[:3]
        lede3_preds.append((lede3, summary))

        if i % 10 == 0:
            print(f"prediction: {i}\n")

        if i == n_predictions:
            break
    
    return results, lede3_preds

def save_to_file(results, name, column_headers: list, dialect=None):
    # Save model to file
    with io.open(name, 'w', encoding="utf-8") as out:
        if dialect is None:
            csv_out = csv.writer(out)
        else:
            csv_out = csv.writer(out, dialect=dialect)
        csv_out.writerow(column_headers)
        for row in results:
            csv_out.writerow(row)



Do predictions and save to file

In [59]:
[[''.join(x)] for x in lede[0]]

[["bangalore, india, june 4 \x97 the world's biggest computer services company could not have chosen a more appropriate setting to lay out its strategy for staying on top.a building housing i.b.m.'s software laboratory and application service teams on the company's corporate campus in bangalore, india."],
 ['bangalore, india, june 4 \x97 the world\'s biggest computer services company could not have chosen a more appropriate setting to lay out its strategy for staying on top.\n\na building housing i.b.m.\'s software laboratory and application service teams on the company\'s corporate campus in bangalore, india.\n\non tuesday, on the expansive grounds of the bangalore palace, a colonial-era mansion once inhabited by a maharajah, the chairman and chief executive of i.b.m., samuel j. palmisano, will address 10,000 indian employees. he will share the stage with a. p. j. abdul kalam, india\'s president, and sunil mittal, chairman of the country\'s largest cellular services provider, bharti t

In [60]:
n_predictions = 150
BERT = 'Bert'
XLNet = 'XLNet'

BERT_LARGE = 'bert-large-uncased'
BERT_BASE = 'bert-base-uncased'
XLNET = 'xlnet-base-cased' # TO DO
LEDE = 'lede3'
CLASSIFIERS = [(BERT, BERT_LARGE), (BERT, BERT_BASE)]

for i, clf in enumerate(CLASSIFIERS):
    model, lede = make_predictions(clf[0],
                                   clf[1],
                                   n_predictions)
    save_to_file(model, f'{clf[1]}.csv', ['prediction', 'actual'])
    
    if i == 0:
        # Get Lede-3 to format that csv.writerows wants 
        lede[0] = [[''.join(x)] for x in lede[0]]
        save_to_file(lede, f'{LEDE}.csv', ['prediction', 'actual'], dialect='excel')


prediction: 0

prediction: 10

prediction: 20

prediction: 30

prediction: 40

prediction: 50

prediction: 60

prediction: 70

prediction: 80

prediction: 90

prediction: 100

prediction: 110

prediction: 120

prediction: 130

prediction: 140

prediction: 150

prediction: 0

prediction: 10

prediction: 20

prediction: 30

prediction: 40

prediction: 50

prediction: 60

prediction: 70

prediction: 80

prediction: 90

prediction: 100

prediction: 110

prediction: 120

prediction: 130

prediction: 140

prediction: 150



## 3. Performance evaluation and results

Get mean Rouge-1, Rouge-2 and Rouge-L scores

In [70]:
model, lede = make_predictions(clf[0],
                               clf[1],
                               n_predictions)
lede[0] = [[''.join(x)] for x in lede[0]]
save_to_file(lede, f'{LEDE}.csv', ['prediction', 'actual'], dialect='excel')

prediction: 0

prediction: 10

prediction: 20

prediction: 30

prediction: 40

prediction: 50

prediction: 60

prediction: 70

prediction: 80

prediction: 90

prediction: 100

prediction: 110

prediction: 120

prediction: 130

prediction: 140

prediction: 150



In [96]:
import rouge
from rouge import Rouge

CLASSIFIERS.append((LEDE, LEDE))
recall_df = pd.DataFrame(columns=CLASSIFIERS)
precision_df = recall_df.copy()
f1_df = recall_df.copy()

rouge = Rouge()
dfs = []
for df_name in CLASSIFIERS:
    filename = df_name[1]
    name = df_name[0]
    
    df = pd.read_csv(f"{filename}.csv")
    if name==LEDE:
        scores = rouge.get_scores(df.iloc[:, 0], df.iloc[:, 1], avg=True)
    else:
        scores = rouge.get_scores(df.iloc[:, 0], df.iloc[:, 1], avg=True)
    dfs.append((name, pd.DataFrame(scores)))
    

In [175]:
import plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

def plot_rouges(rdfs, titles=['Rouge-1', 'Rouge-2', 'Rouge-L']):
    figs = []
    for df, title in zip(rdfs, titles):
        df.insert(0, 'Model name', ['BERT-large', 'BERT-base', 'Lede-3'])
        fig = go.Figure(data=[go.Table(
            header=dict(
                values=['<b>Model name</b>', '<b>f1-score</b>', '<b>precision</b>', '<b>recall</b>']
            ),
            cells=dict(
                values=df.T,
                fill_color='white',
            )
        )])
        fig.update_layout(title_text=f"<b>{title}<b>")
        fig.update_layout({'margin':{'t':50}})
        
        figs.append(fig)
    return figs



In [176]:
r1_df = pd.DataFrame([round(m['rouge-1'], 3) for n,m in dfs])
r2_df = pd.DataFrame([round(m['rouge-2'], 3) for n,m in dfs])
r_df = pd.DataFrame([round(m['rouge-l'], 3) for n,m in dfs])
figs = plot_rouges([r1_df, r2_df, r_df], )

[f.show() for f in figs]

[None, None, None]

In [None]:
r1_fig = go.Figure(data=[go.Table(
    header=dict(
        values=['<b>Model name</b>', '<b>f1-score</b>', '<b>precision</b>', '<b>recall</b>']
    ),
    cells=dict(
        values=r1_df.T,
        fill_color='white',
    )
)])
r1_fig.update_layout(title_text="Rouge-1")
r1_fig.update_layout({'margin':{'t':50}})
r1_fig.show()

Calculate Lede-3 sentences and scores (leading 3 sentences as summarization)