Imports

In [1]:
import pandas as pd
import numpy as np
import torch, transformers
import nltk
import sklearn
import gzip, json

## Data pipeline

Specify paths

In [2]:
# Modify this to wherever you locally downloaded the data
data_base_path = './data/newsroom-release/release/'

# One gzip-compressed JSON-lines file per split.
train_path = data_base_path + 'train.jsonl.gz'
validation_path = data_base_path + 'dev.jsonl.gz'
# The test split must point at its own file, distinct from validation_path,
# so that evaluation is never run on the dev data by accident.
test_path = data_base_path + 'test.jsonl.gz'

Create dataset

In [3]:
class NewsroomDataset(torch.utils.data.Dataset):
    """Streams records from a gzip-compressed JSON-lines file in fixed-size chunks.

    The file is opened once in ``__init__`` and read sequentially: every call to
    ``load_data`` (and therefore every ``__getitem__``) continues from where the
    previous call stopped, so the dataset is stateful and single-pass.

    Args:
        batch_size: Maximum number of records returned by one ``load_data`` call.
        path: Path to the ``.jsonl.gz`` file to read.
    """

    def __init__(self, batch_size, path):
        self.batch_size = batch_size
        # Binary mode; json.loads accepts the raw bytes of each line.
        self.file = gzip.open(path)

    def __len__(self):
        # NOTE(review): this is the number of records per read, not the number
        # of records or chunks in the file. Anything that relies on len()
        # (e.g. a DataLoader sampler) will request this many indices per pass;
        # confirm that is the intent.
        return self.batch_size

    def __getitem__(self, idx):
        # ``idx`` is ignored: reads are strictly sequential from the open file.
        return self.load_data()

    def load_data(self):
        """Read and parse up to ``batch_size`` further records from the file.

        Returns:
            list: Parsed JSON objects, in file order. The list is shorter than
            ``batch_size`` when the end of the file is reached, and empty once
            the file is exhausted.
        """
        data = []

        # Check the size *before* reading so no line is consumed and thrown
        # away at a chunk boundary.
        while len(data) < self.batch_size:
            ln = self.file.readline()
            if not ln:  # end of file: return the partial chunk instead of None
                break
            data.append(json.loads(ln))

        return data

    def close(self):
        """Close the underlying file; safe to call more than once."""
        file = getattr(self, 'file', None)
        if file is not None and not file.closed:
            file.close()

The dataset is too big to load into memory at once, so we read it in mini-batches and feed them through a `DataLoader` (which can also parallelize loading).

In [4]:
# Each read from the dataset yields up to 100 parsed records from the training file
train_dset = NewsroomDataset(batch_size=100, path=train_path)
# Wrap the dataset in a DataLoader using the loader's default settings
trainloader = torch.utils.data.DataLoader(dataset=train_dset)

Get a batch for visualization

In [5]:
# Pull a single batch from the loader and tabulate it for inspection
first_batch = next(iter(trainloader))
train_df = pd.DataFrame(first_batch)
train_df

Unnamed: 0,url,archive,title,date,text,summary,compression,coverage,density,compression_bin,coverage_bin,density_bin
0,[http://www.nytimes.com/2006/06/04/sports/socc...,[http://web.archive.org/web/20060618204254id_/...,[Surge in Racist Mood Raises Concerns on Eve o...,[20060618204254],"[HAMBURG, Germany, June 3  As he left the soc...",[A surge in discriminatory behavior toward bla...,"[tensor(137.4706, dtype=torch.float64)]","[tensor(1., dtype=torch.float64)]","[tensor(7.8235, dtype=torch.float64)]",[high],[high],[mixed]
1,[http://www.nytimes.com/2005/12/24/politics/24...,[http://web.archive.org/web/20060620043011id_/...,"[Spy Agency Mined Vast Data Trove, Officials R...",[20060620043011],"[WASHINGTON, Dec. 23 - The National Security A...","[The volume of information harvested, without ...","[tensor(33.6364, dtype=torch.float64)]","[tensor(0.9091, dtype=torch.float64)]","[tensor(4.7273, dtype=torch.float64)]",[medium],[medium],[mixed]
2,[http://www.nytimes.com/2006/04/23/business/yo...,[http://web.archive.org/web/20060909062911id_/...,[Investors vs. Pfizer: Guess Who Has the Guns?],[20060909062911],[IF outsized executive pay has indeed become a...,[The battle between Pfizer Inc.'s investors an...,"[tensor(33.8800, dtype=torch.float64)]","[tensor(1., dtype=torch.float64)]","[tensor(11.7200, dtype=torch.float64)]",[medium],[high],[extractive]
3,[http://www.nydailynews.com/archives/gossip/19...,[http://web.archive.org/web/20080313232743id_/...,[REX FLEXED PECS FOR SKIN PICS],[20080313232743],[BY A.J. BENZA & MICHAEL LEWITTES\n\nIf Simon ...,"[If Simon Rex looks a little familiar, it may ...","[tensor(11.8941, dtype=torch.float64)]","[tensor(0.9882, dtype=torch.float64)]","[tensor(38.9882, dtype=torch.float64)]",[low],[high],[extractive]
4,[http://www.nydailynews.com/archives/entertain...,[http://web.archive.org/web/20080314003027id_/...,[POPEYE-WORTHY PIE. PHYLLO DOUGH WRAPS SPINACH...,[20080314003027],[Spinach has terrorized generations of veggie-...,[POPEYE-WORTHY PIE. PHYLLO DOUGH WRAPS SPINACH...,"[tensor(3.9326, dtype=torch.float64)]","[tensor(0.9213, dtype=torch.float64)]","[tensor(36.6292, dtype=torch.float64)]",[low],[medium],[extractive]
...,...,...,...,...,...,...,...,...,...,...,...,...
95,[http://www.nytimes.com/1982/01/06/books/the-p...,[http://web.archive.org/web/20131114095011id_/...,[THE POP LIFE - NYTimes.com],[20131114095011],[THE worst concert tragedy in the history of r...,[THE worst concert tragedy in the history of r...,"[tensor(4.2846, dtype=torch.float64)]","[tensor(0.9921, dtype=torch.float64)]","[tensor(93.6798, dtype=torch.float64)]",[low],[high],[extractive]
96,[http://www.msnbc.com/all/roadmap-financial-re...,[http://web.archive.org/web/20131114223718id_/...,[A 'roadmap' for financial reform],[20131114223718],[The sweeping financial reform law known as Do...,"[Financial reform still has a long way to go, ...","[tensor(45., dtype=torch.float64)]","[tensor(0.8571, dtype=torch.float64)]","[tensor(1.8095, dtype=torch.float64)]",[high],[medium],[mixed]
97,[http://www.theguardian.com/artanddesign/2012/...,[http://web.archive.org/web/20131115220701id_/...,[Mind over mattress: Yoko Ono remembers the be...,[20131115220701],[Reading this on mobile? Watch the bed-in vide...,"[In 1969, Ono and John Lennon took to their be...","[tensor(8.6000, dtype=torch.float64)]","[tensor(0.6400, dtype=torch.float64)]","[tensor(1.2000, dtype=torch.float64)]",[low],[low],[abstractive]
98,[http://www.bostonglobe.com/arts/movies/2013/1...,[http://web.archive.org/web/20131121001136id_/...,[Taken aback by modern-day time travel],[20131121001136],[Has time travel in the movies gotten easier o...,[Has time travel in the movies gotten easier o...,"[tensor(3.8207, dtype=torch.float64)]","[tensor(0.9724, dtype=torch.float64)]","[tensor(42.2414, dtype=torch.float64)]",[low],[high],[extractive]


## Tokenization

Import the dependencies and initialize tokenizer and model

In [114]:
# BERT tokenizer and encoder used to embed sentences
from transformers import BertTokenizer, BertModel
# nltk.tokenize provides sent_tokenize, used below to split text into sentences
from nltk import tokenize
# K-Means is used below to cluster the sentence embeddings
from sklearn.cluster import KMeans
# itemgetter pulls several keys out of a record in one call
from operator import itemgetter
# Fetch the 'punkt' data package (presumably the model sent_tokenize relies on — see NLTK docs)
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\teemu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [91]:
# Name of the pretrained checkpoint; shared by tokenizer and model so they stay in sync
bert_model = 'bert-base-cased'
# Tokenizer built from that checkpoint name
bert_tokenizer = BertTokenizer.from_pretrained(bert_model)
# Encoder built from the same checkpoint name; used later to embed sentences
bert_base_model = BertModel.from_pretrained(bert_model)

Split the text into sentences and drop fragments that are too short

In [181]:
def tokenize_sentences(sentences, min_chars=2):
    """Split the first text in ``sentences`` into a list of sentence strings.

    Args:
        sentences: Iterable of text strings. Only the first element is
            processed; any further elements are ignored.
        min_chars: Sentences whose length is ``<= min_chars`` characters are
            discarded as too short. Defaults to 2.

    Returns:
        list of str: The sentences of the first text that are longer than
        ``min_chars``. An empty list is returned when ``sentences`` is empty.
    """
    # Only the first text is ever used, so avoid tokenizing the rest.
    texts = iter(sentences)
    try:
        first_text = next(texts)
    except StopIteration:
        return []  # nothing to split

    split = tokenize.sent_tokenize(first_text)  # Split to sentences
    return [s for s in split if len(s) > min_chars]  # Remove too short sentences

Test the K-Means clustering with a single sample

In [205]:
# Take the first record of a batch: its article text and its reference summary.
# Note: every iter(trainloader) reads further into the dataset's open file,
# so re-running this cell picks a different sample.
sample_train, sample_test = itemgetter('text', 'summary')(next(iter(trainloader))[0])

# Split the article into sentences
sample_sentences = tokenize_sentences(sample_train)

# Tokenize: pad to the longest sentence and truncate over-long ones so that
# every row fits the model's maximum input length
sample_train_tokens = bert_tokenizer(
    sample_sentences, return_tensors='pt', padding='longest', truncation=True
)

# Keep the sentences as an array so they can be indexed by position later
sample_sentences = np.array(sample_sentences)

# Get one pooled BERT embedding per sentence; gradients are not needed for inference
with torch.no_grad():
    model_output = bert_base_model(**sample_train_tokens)
pooler_output = model_output.pooler_output.numpy()  # Get numpy array

# Split the reference summary into sentences
sample_sentences_test = tokenize_sentences(sample_test)

# Use as many centroids as the reference summary has sentences, but at least
# one and never more than the number of article sentences being clustered
k = max(1, min(len(sample_sentences_test), len(sample_sentences)))
# Fixed seed so the clustering (and the chosen sentences) is reproducible
kmeans = KMeans(n_clusters=k, random_state=0).fit(pooler_output)

Find the sentence closest to each centroid

In [207]:
def best_n_summaries(n_summaries, centroids, pooler_outputs, model_outputs: np.ndarray):
    """Returns n most likely sentences for summarization, per centroid.

    For every centroid, the rows of ``pooler_outputs`` are ranked by Euclidean
    distance to that centroid (ties keep their original order) and the
    corresponding entries of ``model_outputs`` are selected.

    Args:
        n_summaries: Number of closest entries to pick for each centroid.
        centroids: Iterable of centroid vectors.
        pooler_outputs: Array-like of embedding vectors, one row per entry of
            ``model_outputs`` (expected shape: ``(n_entries, dim)``).
        model_outputs: Array of candidates (e.g. sentences) aligned with
            ``pooler_outputs``.

    Returns:
        list: One item per centroid. When ``n_summaries == 1`` the item is the
        single closest entry itself; otherwise it is an array of the
        ``n_summaries`` closest entries, nearest first.
    """
    pooler_outputs = np.asarray(pooler_outputs)
    model_outputs = np.asarray(model_outputs)

    summarizations = []

    for centroid in centroids:  # Go through centroids
        # Distance from every embedding to this centroid
        distances = np.linalg.norm(pooler_outputs - centroid, axis=1)

        # Positions sorted by increasing distance; stable so ties keep index order
        order = np.argsort(distances, kind="stable")
        closest = order[:n_summaries]

        if n_summaries == 1:
            # Return the entry itself rather than a length-1 array
            summarization = model_outputs[closest[0]]
        else:
            # Index with an integer array (not a tuple) so several positions
            # select several entries instead of being read as one
            # multi-axis index
            summarization = model_outputs[closest]
        summarizations.append(summarization)

    return summarizations

# Cluster centres found by K-Means, one per cluster
centroid = kmeans.cluster_centers_
# Pick the single sentence nearest to each centre
summaries = best_n_summaries(
    n_summaries=1,
    centroids=centroid,
    pooler_outputs=pooler_output,
    model_outputs=sample_sentences,
)

Compare the sentences found nearest the centroids with the actual summary

In [208]:
def print_top_n_summaries(summaries, test_summary):
    """Print the generated summaries as a numbered list, then the reference.

    Args:
        summaries: Iterable of generated summaries; each is printed on its own
            line, numbered from 1.
        test_summary: The reference summary, printed as-is after the list.
    """
    print("Generated summaries: \n")
    # Plain loop: this is done purely for the printing side effect
    for position, summary in enumerate(summaries, start=1):
        print(f"{position}. {summary}")
    print(f"\n\nActual summary: \n {test_summary}")
    
# Show the generated sentences next to the reference summary sentences
print_top_n_summaries(summaries=summaries, test_summary=sample_sentences_test)

Generated summaries: 

1. In 1971, Tomlinson was working on a program to transfer data files between two or more computers.


Actual summary: 
 ['Ray Tomlinson, who with his engineering colleagues at BBN gave the world electronic mail, died on Saturday.']
