In [1]:
import torch
from torch.utils.data import Dataset, DataLoader, SequentialSampler
import pandas as pd
import numpy as np
import h5py 
import pickle
import os
import sys
from torch.utils.data.dataloader import default_collate

np.set_printoptions(threshold=sys.maxsize)

In [2]:
data_dir = "./preprocessed/"
train_path = data_dir + 'train_data.hdf5'
test_path = data_dir + 'test_data.hdf5'
valid_path = data_dir + 'valid_data.hdf5'

f = h5py.File(train_path, 'r')
for key in f.keys():
    print(key)
    print(f[key].shape)

# train_src = f['src']
# train_clss = f['clss']
# print(train_src.shape)  
# print(train_clss.shape) 
# print(f['tgt'].shape) 



clss
(287083, 512)
segs
(287083, 512)
src
(287083, 512)
src_sent_labels
(287083, 512)
tgt
(287083, 512)


In [3]:
class SummarisationDataset(Dataset):
    def __init__(self, path, subset=None):
        if path.endswith('.hdf5'):
            self.dataset = h5py.File(path, 'r')
            self.keys = list(self.dataset.keys()) # ['src', 'tgt', 'src_sent_labels', 'segs', 'clss']           
        
    def __len__(self):
        return self.dataset[self.keys[-1]].shape[0]
        
    def __getitem__(self, idx):
        out = {}
        for k in self.keys:
            out[k] = self.dataset[k][idx, :]
#         out['token_len'] = np.count_nonzero(self.dataset[k][idx, :]+1)       
            out['token_len_src'] = np.sum(self.dataset['src'][idx, :] != -1)
            out['token_len_tgt'] = np.sum(self.dataset['tgt'][idx, :] != -1)
            out['sent_len'] = np.sum(self.dataset['clss'][idx, :] != -1)

        return out

In [4]:
# test dataset class
bert_dataset = SummarisationDataset(path = './preprocessed/train_data.hdf5')
print(len(bert_dataset))

print(bert_dataset[0].keys())
# print(bert_dataset[0]['src'])
print(bert_dataset[0]['token_len_src'])

batch_list = [bert_dataset[idx] for idx in range(3)]
print(len(batch_list))
print(batch_list)

287083
dict_keys(['clss', 'token_len_src', 'token_len_tgt', 'sent_len', 'segs', 'src', 'src_sent_labels', 'tgt'])
512
3
[{'clss': array([ 0.000e+00,  4.000e+01,  6.600e+01,  1.050e+02,  1.420e+02,
        1.710e+02,  2.120e+02,  2.390e+02,  2.510e+02,  2.730e+02,
        2.880e+02,  3.500e+02,  3.880e+02,  4.130e+02,  4.260e+02,
        4.560e+02,  5.060e+02,  5.490e+02,  5.690e+02,  5.930e+02,
        6.120e+02,  6.210e+02,  6.440e+02,  6.900e+02,  7.210e+02,
        7.510e+02,  7.930e+02,  8.100e+02,  8.410e+02,  8.700e+02,
        9.000e+02,  9.150e+02,  9.470e+02,  9.810e+02,  1.012e+03,
        1.068e+03,  1.055e+03,  1.097e+03,  1.116e+03,  1.142e+03,
        1.165e+03,  1.211e+03,  1.243e+03,  1.266e+03,  1.297e+03,
        1.339e+03,  1.396e+03,  1.412e+03,  1.424e+03,  1.437e+03,
        1.475e+03,  1.514e+03,  1.548e+03,  1.592e+03,  1.604e+03,
        1.627e+03,  1.516e+03,  1.548e+03,  1.568e+03,  1.582e+03,
        1.485e+03,  1.511e+03,  1.534e+03,  1.559e+03,  1.582e+03,

In [33]:
def collate_fn(batch_list):
    '''
    batch_list = list of dictionary retrieved from SummarisationDataset
    data[i].keys() = dict_keys(['clss', 'segs', 'src', 'src_sent_labels', 'tgt'])
    
    return : dict()
    '''
    batch = default_collate(batch_list) 
    max_src_token_len = max(batch['token_len_src'])
    max_tgt_token_len = max(batch['token_len_tgt'])
    max_sent_len = max(batch['sent_len'])
    
    src_token_keys = ['src', 'segs']
    for k in src_token_keys:
        batch[k] = batch[k][:, :max_src_token_len]
        
    tgt_token_keys = ['tgt']
    for k in tgt_token_keys:
        batch[k] = batch[k][:, :max_tgt_token_len]
        
    sent_keys = ['clss', 'src_sent_labels']
    for k in sent_keys:
        batch[k] = batch[k][:, :max_sent_len]
    
    print(batch['src'].shape)
    return batch



In [34]:
batch = collate_fn(batch_list)

for k, v in batch.items():
    print(k, v)

torch.Size([3, 895])
clss tensor([[  0.,  21.,  38.,  56.,  83.,  96., 134., 159., 170., 186., 195., 238.,
         253., 286., 309., 340., 350., 374., 390., 407., 422.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.],
        [  0.,  24.,  57.,  91., 110., 126., 150., 164., 179., 192., 204., 215.,
         229., 254., 266., 295., 314., 341., 371., 387., 407., 428., 451.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,
          -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.,  -1.],
        [  0.,  19.,  39.,  60.,  88., 109., 126., 155., 174., 186., 201., 224.,
         257., 266., 290., 332., 352., 366., 388., 397., 420., 441., 456., 492.,
         524., 543., 564., 580., 590., 609., 627., 644., 664., 685., 701., 728.,
         750., 772., 781., 795., 816., 841., 855., 864., 880.]],
       dtype=torch.float64)
token_len_src tensor([43

In [2]:
from transformers import BertTokenizer, BertModel
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader, SequentialSampler
from torch.nn import functional as F
from torch import nn, optim

In [5]:
if torch.cuda.is_available():
  print('cuda is available')
  device = 'cuda'
  print('device is set to cuda')

cuda is available
device is set to cuda


In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

class Bert(nn.Module):
    def __init__(self, large=False, temp_dir, finetune=False):
        super(Bert, self).__init__()
        if(large):
            self.model = BertModel.from_pretrained('bert-large-uncased', cache_dir=temp_dir)
        else:
            self.model = BertModel.from_pretrained('bert-base-uncased', cache_dir=temp_dir)
        self.finetune = finetune

    def forward(self, x, segs, mask): # TODO: mask = ?
        if(self.finetune):
            top_vec, _ = self.model(x, segs, attention_mask=mask)
        else:
            self.eval()
            with torch.no_grad():
                top_vec, _ = self.model(x, segs, attention_mask=mask)
        return top_vec
    
class DecoderLayer(nn.Module):
    def __init__(self, d_model, heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        '''
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout)

        self.context_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        mask = self._get_attn_subsequent_mask(MAX_SIZE)
        # Register self.mask as a buffer in TransformerDecoderLayer, so
        # it gets TransformerDecoderLayer's cuda behavior automatically.
        self.register_buffer('mask', mask)
        '''
        

class AbsSum(nn.Module):
    def __init__ (self):
        #encoder
        super(AbsSum, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        
        # decoder
        self.fc = nn.Linear(768, 1)
        
    def forward(self, ids, mask, token_type_ids):
        _, pooledOut = self.bert(ids, attention_mask = mask,
                                token_type_ids=token_type_ids)
        bertOut = self.bert_drop(pooledOut)
        output = self.fc(bertOut)
        
        '''
        def forward(self, src, tgt, segs, clss, mask_src, mask_tgt, mask_cls):
            # src = tensor, shape n*d (n = batch_size, d = dimension of src from dataset __get_item__)
            tensor n * d (n = , d = )
            mask = src ==-1
            top_vec = self.bert(src, segs, mask_src)
            dec_state = self.decoder.init_decoder_state(src, top_vec)
            decoder_outputs, state = self.decoder(tgt[:, :-1], top_vec, dec_state)
            return decoder_outputs, None
        '''
        
        return output

In [29]:
m = 100
d = 10

values = torch.randn(m, d)
keys = torch.randn(m, d)
queries = torch.randn(1, d)

print(f'queries = {queries}')

def attention(keys, values, queries):
    assert keys.shape[1] = queries.shape[1]
    assert keys.shape[0] = values.shape[0]
    
    softmax = nn.Softmax(dim=1)
    s = torch.mm(queries, keys.t()) # 1*m
    # print(s)

    attention_function = softmax(s)
    assert abs(1 - attention_function.sum()) < 0.01 # sum of softmax = 1

    attended_features = torch.mm(attention_function, values) # 1*d
    print(attended_features)
    
    return attended_features

attended_features = attention(keys, values, queries)

queries = tensor([[-0.8669,  1.1669,  0.2882, -0.9251, -0.1397,  0.4915,  1.3211, -0.1668,
          0.5268,  0.3499]])
tensor([[-0.3530, -0.0149, -0.2216,  0.0997, -0.1941, -0.0403, -0.2078, -0.0354,
         -0.1380,  0.1135]])
