In [1]:
import tqdm
import torch
import ast
import torch.optim as optim
from performer_pytorch import PerformerEncDec
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader
import pandas
import math

In [2]:
## constants

# Training schedule.
NUM_BATCHES = 10         # short run; the long-run value previously used was int(1e5)
BATCH_SIZE = 64          # previously 32
LEARNING_RATE = 1e-4
GENERATE_EVERY = 100     # a sample is generated every this many training steps

# Vocabulary size and sequence lengths handed to the model constructor.
NUM_TOKENS = 28996 + 2   # earlier small-scale value: 16 + 2
ENC_SEQ_LEN = 512        # earlier small-scale value: 32
DEC_SEQ_LEN = 256 + 1    # earlier small-scale value: 64 + 1

In [None]:
# helpers

def cycle():
    """Endlessly yield synthetic batches as ``(src, tgt, src_mask, tgt_mask)``.

    ``src`` holds random token ids drawn from ``[2, NUM_TOKENS)`` with shape
    ``(BATCH_SIZE, ENC_SEQ_LEN)``. ``tgt`` is a column of ones followed by two
    copies of ``src``. Both masks are all-True. Every tensor is moved to the
    GPU, so CUDA must be available.
    """
    while True:
        random_ids = torch.randint(2, NUM_TOKENS, (BATCH_SIZE, ENC_SEQ_LEN))
        src = random_ids.long().cuda()

        # Leading column of ones, then the source repeated twice.
        start_column = torch.ones((BATCH_SIZE, 1)).long().cuda()
        tgt = torch.cat((start_column, src, src), 1)

        src_mask = torch.ones(BATCH_SIZE, ENC_SEQ_LEN).bool().cuda()
        tgt_mask = torch.ones(BATCH_SIZE, tgt.shape[1]).bool().cuda()
        yield (src, tgt, src_mask, tgt_mask)

In [3]:
class SummaryDataset(Dataset):
    """Pre-tokenised source/target pairs loaded eagerly from a CSV file.

    The CSV must provide the columns ``src_txt_tokens``, ``tgt_txt_tokens``,
    ``src_txt_att_mask`` and ``tgt_txt_att_mask``. Every cell holds the text
    form of a Python list of ints (e.g. ``"[101, 1488, 0]"``). Within one
    column all lists must have the same length so the column can be stacked
    into a single 2-D tensor.
    """

    # Token id placed in front of every target sequence.
    START_TOKEN = 1

    def __init__(self, filename):
        """Read ``filename`` and convert the four columns to tensors.

        Args:
            filename: path of the CSV file to load.
        """
        summary_data = pandas.read_csv(filename)

        def column_to_tensor(name):
            # Parse each cell back into a list and hand torch a plain nested
            # list rather than a pandas Series of Python objects.
            return torch.tensor(summary_data[name].apply(ast.literal_eval).tolist())

        self.X = column_to_tensor('src_txt_tokens')
        self.Y = column_to_tensor('tgt_txt_tokens')
        self.X_mask = column_to_tensor('src_txt_att_mask')
        self.Y_mask = column_to_tensor('tgt_txt_att_mask')

    def __len__(self):
        """Number of examples (rows of the target tensor)."""
        return self.Y.shape[0]

    def __getitem__(self, index):
        """Return ``(src, src_mask, tgt, tgt_mask, index)`` for one example.

        ``tgt`` is the stored target with ``START_TOKEN`` prepended, and
        ``tgt_mask`` is the stored mask with a leading ``True``. Both masks
        are boolean tensors. ``index`` is passed through unchanged so a batch
        can be traced back to its rows.
        """
        src = self.X[index]
        src_msk = self.X_mask[index].bool()

        tgt_tokens = self.Y[index]
        tgt_mask = self.Y_mask[index]

        # Build the one-element prefixes with the dtype of the rows they are
        # joined to. A float `torch.ones(1)` prefix would turn the token ids
        # into floats (or fail on PyTorch versions without type promotion in
        # torch.cat).
        tgt = torch.cat((tgt_tokens.new_full((1,), self.START_TOKEN), tgt_tokens), 0)
        tgt_msk = torch.cat((tgt_mask.new_ones(1), tgt_mask), 0).bool()

        return (src, src_msk, tgt, tgt_msk, index)

In [None]:
# Load the training-token CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific, and
# `summary_data_csv` is not referenced by any other cell in this notebook.
summary_data_csv = pandas.read_csv("/home/ayan/ayan_fed_home/data/python_files/my_summ_data/datasets/train_tokens.csv")

In [None]:
# Load the object stored under the key "/samples/samples" of the HDF5 file.
# NOTE(review): the path is absolute and machine-specific, and
# `summary_data_h5` is not referenced by any other cell in this notebook.
summary_data_h5 = pandas.read_hdf("/home/ayan/ayan_fed_home/data/python_files/my_summ_data/datasets/train_tokens.h5", "/samples/samples")

In [4]:
# Build the training dataset. The constructor reads the whole CSV and converts
# all four token/mask columns to tensors up front.
summary_dataset = SummaryDataset("/home/ayan/ayan_fed_home/data/python_files/my_summ_data/datasets/train_tokens.csv")

In [5]:
# Shuffled mini-batches of BATCH_SIZE examples drawn from `summary_dataset`.
# Each batch carries five fields: src, src_mask, tgt, tgt_mask, row index.
summary_dataloader = DataLoader(summary_dataset, batch_size = BATCH_SIZE, shuffle = True)

In [None]:
# instantiate model

# Build the encoder-decoder Performer and move it to the GPU. Encoder and
# decoder use the same vocabulary size (NUM_TOKENS), 8 heads and
# nb_features=64; the encoder has depth 1 and the decoder depth 3.
# NOTE(review): the exact meaning of the *_reversible, *_nb_features and
# *_feature_redraw_interval arguments is defined by performer_pytorch;
# confirm against its documentation before tuning them.
model = PerformerEncDec(
    dim=512,
    enc_num_tokens=NUM_TOKENS,
    enc_depth=1,
    enc_heads=8,
    enc_max_seq_len=ENC_SEQ_LEN,
    enc_reversible=True,
    enc_feature_redraw_interval=1000,
    enc_nb_features = 64,
    dec_num_tokens=NUM_TOKENS,
    dec_depth=3,
    dec_heads=8,
    dec_max_seq_len=DEC_SEQ_LEN,
    dec_reversible=True,
    dec_feature_redraw_interval=1000,
    dec_nb_features=64
).cuda()

In [None]:
# optimizer

# Adam over all model parameters.
# NOTE: this rebinds `optim`, which the import cell bound to the `torch.optim`
# module; after this cell `optim` is the optimizer instance.
optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Gradient scaler used together with autocast() in the training cells.
scaler = GradScaler()

In [None]:
# Number of training steps for one pass over the data. Ask the loader for its
# batch count instead of hard-coding the dataset size: with the default
# drop_last=False this equals ceil(len(dataset) / BATCH_SIZE) and stays
# correct if the CSV or the batch size changes.
rng = len(summary_dataloader)

In [None]:
# Train for up to `rng` steps over one shuffled pass of `summary_dataloader`,
# using autocast + GradScaler for mixed precision. Every GENERATE_EVERY steps
# one example is decoded and compared with its reference target.

# Loop-invariant: resolve the device and place the model once.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

progress = tqdm.tqdm(range(rng), mininterval = 10., desc = 'training')

# zip() hands each step the next batch of a single pass over the loader.
# Calling next(iter(summary_dataloader)) per step would build a new iterator
# every time and only ever return the first batch of a fresh shuffle.
# Each batch has five fields; the last one (the row index) is not needed here.
for i, (src, src_mask, tgt, tgt_mask, _) in zip(progress, summary_dataloader):

    # Generation below switches to eval mode, so re-enable training each step.
    model.train()

    src = src.long().to(device)
    src_mask = src_mask.to(device)
    tgt = tgt.long().to(device)
    tgt_mask = tgt_mask.to(device)

    with autocast():
        loss = model(src, tgt, enc_mask = src_mask, dec_mask = tgt_mask)

    scaler.scale(loss).backward()

    print(f'{i}: {loss.item()}')

    scaler.step(optim)
    scaler.update()
    optim.zero_grad()

    if i != 0 and i % GENERATE_EVERY == 0:

        model.eval()
        src, src_mask, tgt, tgt_mask, _ = next(iter(summary_dataloader))

        # Keep a single example and put everything on the model's device.
        src = src[:1].long().to(device)
        src_mask = src_mask[:1].to(device)
        ref = tgt[:1, 1:].long().to(device)      # target without its start token
        ref_mask = tgt_mask[:1, 1:].to(device)
        start_tokens = torch.ones((1, 1), dtype=torch.long, device=device)

        # Decode exactly as many tokens as the reference target holds, then
        # count mismatches against the target on its unmasked positions.
        sample = model.generate(src, start_tokens, ref.shape[1], enc_mask=src_mask)
        incorrects = ((ref != sample) & ref_mask).sum()

        print(f"input:  ", src)
        print(f"predicted output:  ", sample)
        print(f"incorrects: {incorrects}")

In [None]:
# Second training cell: same procedure as a single-epoch mixed-precision run.
# Up to `rng` optimisation steps are taken over one shuffled pass of
# `summary_dataloader`; every GENERATE_EVERY steps one example is decoded and
# compared with its reference target.

# Resolve the device and place the model once, not on every step.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

progress = tqdm.tqdm(range(rng), mininterval = 10., desc = 'training')

# Iterate the loader itself so every step sees a new batch of the same pass.
# The dataset yields five fields per item; the trailing row index is unused.
for i, (src, src_mask, tgt, tgt_mask, _) in zip(progress, summary_dataloader):

    # Generation below switches to eval mode, so re-enable training each step.
    model.train()

    src = src.long().to(device)
    src_mask = src_mask.to(device)
    tgt = tgt.long().to(device)
    tgt_mask = tgt_mask.to(device)

    with autocast():
        loss = model(src, tgt, enc_mask = src_mask, dec_mask = tgt_mask)

    scaler.scale(loss).backward()

    print(f'{i}: {loss.item()}')

    scaler.step(optim)
    scaler.update()
    optim.zero_grad()

    if i != 0 and i % GENERATE_EVERY == 0:

        model.eval()
        src, src_mask, tgt, tgt_mask, _ = next(iter(summary_dataloader))

        # Keep one example. The inputs must live on the same device as the
        # model before generate() is called.
        src = src[:1].long().to(device)
        src_mask = src_mask[:1].to(device)
        ref = tgt[:1, 1:].long().to(device)      # target without its start token
        ref_mask = tgt_mask[:1, 1:].to(device)
        start_tokens = torch.ones((1, 1), dtype=torch.long, device=device)

        # Decode as many tokens as the reference holds and count mismatches
        # against the target (not the source) on unmasked positions.
        sample = model.generate(src, start_tokens, ref.shape[1], enc_mask=src_mask)
        incorrects = ((ref != sample) & ref_mask).sum()

        print(f"input:  ", src)
        print(f"predicted output:  ", sample)
        print(f"incorrects: {incorrects}")

In [26]:
# Ask PyTorch to release GPU memory it is caching but not currently using.
# NOTE(review): tensors that are still referenced (e.g. `model`) stay allocated.
torch.cuda.empty_cache()

In [2]:
# Raw training table as a DataFrame; later cells pick rows from it by position
# to rebuild individual batches by hand.
# NOTE(review): the path is absolute and machine-specific.
summary_data_t = pandas.read_csv("/home/ayan/ayan_fed_home/data/python_files/my_summ_data/datasets/train_tokens.csv")

In [7]:
# Same settings as the constants cell at the top of the notebook, repeated so
# the single-batch experiment below can run without it.
BATCH_SIZE = 64
LEARNING_RATE = 1e-4
GENERATE_EVERY = 100

# Vocabulary size and encoder/decoder sequence lengths for the model.
NUM_TOKENS = 28996 + 2
ENC_SEQ_LEN = 512
DEC_SEQ_LEN = 256 + 1

In [8]:
# Fresh encoder-decoder Performer on the GPU, built with the same arguments as
# the earlier model cell: depth 1 encoder, depth 3 decoder, 8 heads and
# nb_features=64 on both sides.
# NOTE(review): argument semantics are defined by performer_pytorch; confirm
# against its documentation before changing them.
model = PerformerEncDec(
    dim=512,
    enc_num_tokens=NUM_TOKENS,
    enc_depth=1,
    enc_heads=8,
    enc_max_seq_len=ENC_SEQ_LEN,
    enc_reversible=True,
    enc_feature_redraw_interval=1000,
    enc_nb_features = 64,
    dec_num_tokens=NUM_TOKENS,
    dec_depth=3,
    dec_heads=8,
    dec_max_seq_len=DEC_SEQ_LEN,
    dec_reversible=True,
    dec_feature_redraw_interval=1000,
    dec_nb_features=64
).cuda()

unable to import cuda code for auto-regressive Performer. will default to the memory inefficient non-cuda version
unable to import cuda code for auto-regressive Performer. will default to the memory inefficient non-cuda version
unable to import cuda code for auto-regressive Performer. will default to the memory inefficient non-cuda version


In [9]:
# New Adam optimizer and gradient scaler for the freshly built model.
# NOTE: `optim` here is the optimizer instance, not the `torch.optim` module
# alias created in the import cell.
optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
scaler = GradScaler()

In [10]:
# Pick the GPU when one is available, place the model there and switch it to
# training mode before any single-batch run.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# src, src_mask, tgt, tgt_mask, indices = next(iter(summary_dataloader))

def single_run(src, src_mask, tgt, tgt_mask):
    """Run one mixed-precision optimisation step on a single batch.

    Uses the notebook-level ``model``, ``scaler`` and ``optim``. The inputs
    are moved to the GPU (masks are cast to bool first), the loss is computed
    under ``autocast()``, back-propagated through the gradient scaler and
    applied; gradients are cleared afterwards. The tensor shapes and the loss
    are printed along the way. Returns nothing.
    """
    src, tgt = src.cuda(), tgt.cuda()
    src_mask, tgt_mask = src_mask.bool().cuda(), tgt_mask.bool().cuda()

    with autocast():
        loss = model(src, tgt, enc_mask = src_mask, dec_mask = tgt_mask)

    # Report the shapes of what was fed to the model, then the raw loss tensor.
    shapes = [t.shape for t in (src, src_mask, tgt, tgt_mask)]
    print(*shapes)
    print(loss)

    scaler.scale(loss).backward()
    print(f'loss: {loss.item()}')

    # Apply the update and reset gradients for the next call.
    scaler.step(optim)
    scaler.update()
    optim.zero_grad()

In [25]:
# Rebuild one specific batch by hand: select 64 rows of the raw table by
# position, parse their token/mask cells and run a single training step.
samples = summary_data_t.iloc[[103819, 142689,  13439, 112329,  32677, 275564,  90784,  18786,  91488,
        195170, 249460, 228088,  16303,  37961, 204093, 115905, 192251, 179068,
        196260, 136432, 164435, 158048, 255970,   5525, 272237, 246354,  51773,
        105490, 178813, 130813, 229303, 163523, 212595, 220619, 242735, 269087,
         58262, 277290, 221507, 172436, 140651, 237822,  53877, 252209,  57175,
         41408, 273560, 124275, 278838, 109448, 247685, 235827,  18839, 122117,
        272651, 223672,   1371, 108826,  43890, 262758,  52505, 251462, 232237,
          7040]]

# Source side: parse the stored list strings and stack them as-is.
x = torch.tensor(samples['src_txt_tokens'].apply(ast.literal_eval).tolist())
xm = torch.tensor(samples['src_txt_att_mask'].apply(ast.literal_eval).tolist())

# Target side: every token row and every mask row gets a leading 1.
y = torch.tensor([[1] + tokens for tokens in samples['tgt_txt_tokens'].apply(ast.literal_eval)])
ym = torch.tensor([[1] + mask for mask in samples['tgt_txt_att_mask'].apply(ast.literal_eval)])

single_run(x, xm, y, ym)

torch.Size([64, 512]) torch.Size([64, 512]) torch.Size([64, 257]) torch.Size([64, 257])
tensor(10.3939, device='cuda:0', grad_fn=<NllLoss2DBackward0>)
loss: 10.393864631652832


In [24]:
# A second list of 64 row indices. It appears to have been pasted from a
# printed index tensor: the original cell kept the closing ")" of that repr
# without the opening "tensor(", so it could not be parsed. Stored under a
# name so it can be reused, and still displayed as the cell's value.
second_batch_rows = [179310, 129559, 266582,  29970, 123514, 138101,  29450,  28677,  66329,
        172908,  86640,  19724, 221796, 260803, 268974,  52179,  54780, 149421,
          3628,  44860, 154409, 116264,  65573, 138291,  44087,  78587, 177545,
        202400,  67990, 250929, 139273,  88409,  69672, 168532,   3151, 104263,
        199137,   8649, 160722, 119363, 162033, 126911, 280892, 273120, 266279,
        173412,  56441,  28284,  12484, 103112,  42305, 211312, 219730, 196961,
        145970, 280487, 284682, 167988,  74578, 216525, 128626,  14263, 279115,
        221270]
second_batch_rows

257


[1,
 101,
 1488,
 1465,
 1177,
 18778,
 4164,
 1766,
 2474,
 1120,
 6927,
 1633,
 1107,
 2388,
 1107,
 2223,
 1884,
 118,
 5048,
 1705,
 133,
 186,
 135,
 1131,
 2373,
 1482,
 112,
 188,
 18046,
 1106,
 11778,
 1123,
 4035,
 23655,
 2737,
 2269,
 4196,
 133,
 186,
 135,
 1131,
 2536,
 170,
 23609,
 7340,
 1186,
 187,
 10658,
 2168,
 1372,
 1105,
 17200,
 1111,
 2495,
 20064,
 1116,
 133,
 186,
 135,
 1118,
 1103,
 1159,
 1131,
 1355,
 1106,
 11078,
 1513,
 117,
 1131,
 1108,
 9588,
 1105,
 25194,
 117,
 26562,
 19120,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0

In [None]:
# Parse every source-token cell of the full table from its string form into a
# Python list; the result is a pandas Series of lists.
# NOTE: this rebinds `x`, which the hand-built batch cell set to a tensor.
x = summary_data_t['src_txt_tokens'].apply(ast.literal_eval)