Imports

In [1]:
import pandas as pd
import numpy as np
import torch, transformers
import nltk
import sklearn
import gzip, json

Inspect memory usage

In [2]:
import sys
def sizeof_fmt(num, suffix='B'):
    ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

for name, size in sorted(((name, sys.getsizeof(value)) for name, value in globals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                           _i2:  588.0 B
                           _oh:  240.0 B
                           Out:  240.0 B
                            _i:  159.0 B
                           _i1:  159.0 B
                    sizeof_fmt:  136.0 B
                       __doc__:  113.0 B
                           _ih:   96.0 B
                            In:   96.0 B
                   __builtin__:   80.0 B


## Data pipeline

Specify paths

In [3]:
# Modify this to wherever you locally downloaded the data
data_base_path = './data/newsroom-release/release/'
wordpiece_cased_path = 'bert-base-cased-vocab.txt'

# train_path = data_base_path + 'train.jsonl.gz' DONT USE THIS
validation_path = data_base_path + 'dev.jsonl.gz'
test_path = data_base_path + 'dev.jsonl.gz'

Create dataset

In [4]:
class NewsroomDataset(torch.utils.data.Dataset):
    '''
    Attributes:
        batch_size: Batch size to be taken on single getitem
        file: path to the dataset file
        category: category of the data summarization. i.e. 'extractive'
    '''
    def __init__(self, path, category: str):
        self.category = category
        data = []
        with gzip.open(path) as f:
            for ln in f:
                obj = json.loads(ln)
                data.append(obj)
        data = pd.DataFrame(data)
        # Take only samples with certain category
        self.data = data.loc[data['density_bin'] == self.category, :]
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return dict(self.data.iloc[idx, :])


Dataset is too big to load to memory - create minibatches and parallelize loading with DataLoader

In [5]:
test_dset = NewsroomDataset(test_path, "extractive")
testloader = torch.utils.data.DataLoader(test_dset, batch_size=2)    

Get batch for visualization

## Tokenization

Import the dependencies and initialize tokenizer and model

In [23]:
from transformers import BertTokenizer, BertModel
from tokenizers import (BertWordPieceTokenizer)
from sklearn.cluster import KMeans
from operator import itemgetter
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [7]:
bert_model = 'bert-base-cased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_model)
bert_wordpiece_tokenizer = BertWordPieceTokenizer(wordpiece_cased_path)
bert_base_model = BertModel.from_pretrained(bert_model)

Split text to sentences, and tokenize sentences

In [8]:
def tokenize_sentences(sentences):
    """Tokenizes and preprocesses to sentences"""
    # Split to sentences, Remove too short sentences
    sents = tokenizer.sent_tokenize(sentences)
    sents = [bert_wordpiece_tokenizer.encode(s).tokens for s in sents] 
    return sents

def best_n_summaries(centroids, pooler_outputs, model_outputs: np.ndarray, n_summaries=1):
    '''
    Returns n most likely sentences for summarization
    
    Attributes:
        n_summaries: How many sentences to choose from a single centroid (for debugging)
        centroids: Centroids of the K-clusters
        pooler_outputs: embeddings from the model's [CLS] token
        model_outputs: Text tokenized to sentences. Used for retrieving sentences from embeddings
    '''
    assert n_summaries < pooler_outputs.shape[0], "n_summaries must be less than sentences in the trainset"
    
    summarizations = []
        
    for k in centroids: # Go through centroids
        indices = [] # Get indices for each pooler output
        values = [] # Get distances to the centroid k
        
        # Go through pooler outputs, and find sentence closest to centroid
        for i, sample in enumerate(pooler_outputs):
            current_dist = np.linalg.norm(sample - k)
            values.append(current_dist)
            indices.append(i)

        # Sort indices based on values
        _, indices = zip(*sorted(zip(values, indices)))
        indices = list(indices)
        
        summarization = model_outputs[indices[:n_summaries]]
        summarizations.append(''.join(summarization))
        
    return summarizations

In [9]:
bert_wordpiece_tokenizer.encode(x).tokens for x in text_tokens[0]

SyntaxError: invalid syntax (<ipython-input-9-51be42eddef0>, line 1)

## Test metrics

Import dependencies

In [10]:
from rouge import Rouge

Create prediction loop, test rouge score for each batch

In [11]:
rouge = Rouge()
scores = []
MAX_LENGTH = 128 # Pad to this length
bert_base_model.eval()

for i, batch in enumerate(testloader):
    ### TODO ###
    # currently we only get single sample from batch, we need better indxing from the batch
    b = batch
    # Get text and summary from batch
    text, summary = itemgetter('text', 'summary')(batch)
    # Preprocess
    summary_tokens = [tokenize_sentences(s) for s in summary]
    text_tokens = [tokenize_sentences(t) for t in text]
    
    # Create empty tensor, append new embeddings
    text_embeddings = torch.zeros((1, MAX_LENGTH), dtype=torch.int)
    for text_token in text_tokens:
        embeds = bert_tokenizer(text=text_token, 
                                         max_length=MAX_LENGTH,
                                         return_tensors='pt', 
                                         padding='max_length')['input_ids']
        text_embeddings = torch.cat((text_embeddings, embeds))
    
    # Create embeddings
    print(f"text_embeddings: {text_embeddings.shape}")
    print(text_embeddings)
    with torch.no_grad():
        model_out = bert_base_model(text_embeddings)
    
    embeddings = model_out.pooler_output.detach().numpy()
    
    print(f"embeddings: {embeddings}")
    # Do K-means clustering
    k = len(summary_tokens)
    kmeans = KMeans(n_clusters=k).fit(embeddings)
    # Find corresponding summary sentences
    centroids = kmeans.cluster_centers_
    text_summaries = best_n_summaries(centroids=centroids, 
                                      pooler_outputs=embeddings, 
                                      model_outputs=np.array(text_tokens))
    
    # Evaluation
    print(f"text_summaries: {text_summaries}")
    score = rouge.get_scores(text_summaries, summary_tokens)
    scores.append(score)
    break

ValueError: too many values to unpack (expected 2)

In [22]:
text[0]

'BANGALORE, India, June 4 \x97 The world\'s biggest computer services company could not have chosen a more appropriate setting to lay out its strategy for staying on top.\n\nA building housing I.B.M.\'s software laboratory and application service teams on the company\'s corporate campus in Bangalore, India.\n\nOn Tuesday, on the expansive grounds of the Bangalore Palace, a colonial-era mansion once inhabited by a maharajah, the chairman and chief executive of I.B.M., Samuel J. Palmisano, will address 10,000 Indian employees. He will share the stage with A. P. J. Abdul Kalam, India\'s president, and Sunil Mittal, chairman of the country\'s largest cellular services provider, Bharti Tele-Ventures. An additional 6,500 employees will look in on the town hall-style meeting by satellite from other Indian cities.\n\nOn the same day, Mr. Palmisano and other top executives will meet here with investment analysts and local customers to showcase I.B.M.\'s global integration capabilities in a brie

In [None]:
[t for t in text]

In [None]:
e = torch.zeros((1, 128))
o = bert_tokenizer(text=text_tokens[1], return_tensors='pt', max_length=128, padding='max_length', return_length=True)['input_ids']
e = torch.cat((e, o))
o = bert_tokenizer(text=text_tokens[0], return_tensors='pt', max_length=128, padding='max_length', return_length=True)['input_ids']
torch.cat((e, o)).shape

Test the K-Means clustering with single sample

In [None]:
sample_train, sample_test = itemgetter('text', 'summary')(next(iter(trainloader)))

sample_sentences = tokenize_sentences(sample_train)

# Tokenize
sample_train_tokens = bert_tokenizer(sample_sentences, return_tensors='pt', padding='longest')

# Preprocess
sample_sentences = np.array(sample_sentences)

## Get BERT CLS embeddings
model_output = bert_base_model(**sample_train_tokens)
pooler_output = model_output.pooler_output.detach().numpy() # Get numpy array

# Cluster embeddings to find centroids
sample_sentences_test = tokenize_sentences(sample_test)

# Choose same amount of centroids than the actual summary has
k = len(sample_sentences_test)
kmeans = KMeans(n_clusters=k).fit(pooler_output)

Find closest token to the centroid

In [None]:


# centroid = kmeans.cluster_centers_
# summaries = best_n_summaries(3, centroid, pooler_output, sample_sentences)

Compare found centroid and actual summarization

In [None]:
def print_top_n_summaries(summaries, test_summary):
    print("Generated summaries: \n")
    [print(f"{i+1}. {summary}") for i, summary in enumerate(summaries)]
    print(f"\n\nActual summary: \n {test_summary}")
    
print_top_n_summaries(summaries, sample_sentences_test)