In [2]:
import numpy as np
import time
import random
import torch
from x_transformers.x_transformers import XTransformer
import torch

from run_experiment import *
from generate_data import *

## Variables

In [3]:
from sklearn.model_selection import ParameterGrid

# Experiment configuration: every value a reader might tune lives in this cell.

# Label forwarded to test_model(..., tag=TAG) and embedded in the log-file name.
TAG = 'improve_score_2paper'

TASK_NAME = 'listops'
TRAIN_SIZE = 90_000
VAL_SIZE = 5_000    # also used as the batch size of the validation loader
TEST_SIZE = 10_000  # also used as the batch size of the test loader
NUM_INITS = 3       # independent initialisations trained per model configuration

NUM_BATCHES = int(2.3e5)  # passed to train_validate_model as num_batches
BATCH_SIZE = 128
LEARNING_RATE = 3e-4      # Adam learning rate used by the main run
GENERATE_EVERY = 5000     # passed to train_validate_model as generate_every
# NOTE(review): presumably 10 digits + 5 operator/bracket tokens + 2 special tokens —
# TODO confirm against the tokenizer in generate_data.
ENC_NUM_TOKENS = 10+5+2
DEC_NUM_TOKENS = 12
ENC_SEQ_LEN = 1150
DEC_SEQ_LEN = 1

# Single source of truth for the input length: the model-name tag (INPUT_LEN) and the
# encoder's maximum length in the grid below are both derived from ENC_SEQ_LEN, so
# changing the sequence length cannot leave the model, loaders and file names out of sync.
INPUT_LEN = ENC_SEQ_LEN

# Grid of XTransformer keyword arguments. 'depth,heads' is a paired entry: each run
# unpacks the tuple into enc_/dec_ depth and heads before constructing the model.
model_parameters = ParameterGrid({
    'dim': [256],
    'tie_token_embeds': [True],
    'return_tgt_loss': [True],
    'enc_num_tokens': [ENC_NUM_TOKENS],
    'depth,heads': [(1, 1)],
    'enc_max_seq_len': [ENC_SEQ_LEN],
    'dec_num_tokens': [DEC_NUM_TOKENS],
    'dec_max_seq_len': [DEC_SEQ_LEN],
    'enc_num_memory_tokens': [0],
})

print('Total runs: ', NUM_INITS * len(model_parameters))

Total runs:  3


In [4]:
# for i, p in enumerate(model_parameters):
#     print(i, p)

#### Generate data

In [5]:
# class listops_generator:
#     def __init__(self, max_depth=2):
#         self.src_mask = torch.ones(BATCH_SIZE, ENC_SEQ_LEN).bool()
#         self.tgt_mask = torch.ones(BATCH_SIZE, DEC_SEQ_LEN+1).bool()
#         self.max_depth = max_depth
    
#     def __next__(self):
#         X = np.zeros([BATCH_SIZE, ENC_SEQ_LEN]).astype(int)
#         y = np.ones([BATCH_SIZE, 2]).astype(int) * 2
#         for i in range(BATCH_SIZE):
#             t = generate_tree(self.max_depth)
#             tokens, value = to_tokens(t), to_value(t) 
#             X[i, 0:len(tokens)], y[i, 1:] = tokens, value+2
#             del t

#         return torch.tensor(X), torch.tensor(y), self.src_mask, self.tgt_mask         


# generator = listops_generator()
# generate_data(generator, task_name=TASK_NAME, train_size=TRAIN_SIZE, test_size=TEST_SIZE, val_size=VAL_SIZE)

#### Gridsearch params

In [10]:
# Optimiser class for the sweep; the training cell instantiates it once per grid
# point as optimizer(model.parameters(), **optim_param).
optimizer = torch.optim.SGD

# Materialise the lr x momentum grid as a list so it can be counted and re-iterated.
optim_params = [
    *ParameterGrid({'lr': [0.0008, 0.0004], 'momentum': [0.2, 0.4]})
]

print(len(optim_params))
optim_params

4


[{'lr': 0.0008, 'momentum': 0.2},
 {'lr': 0.0008, 'momentum': 0.4},
 {'lr': 0.0004, 'momentum': 0.2},
 {'lr': 0.0004, 'momentum': 0.4}]

In [11]:
# One-off data repair, kept for reference and not part of the normal run:
# load each saved ListOps split, cast it to an integer dtype with .astype(int),
# and overwrite the same .npy file in place.
# NOTE(review): presumably this is the fix for the "Expected ... Long, Int; but got
# torch.cuda.DoubleTensor" RuntimeError recorded in the Run cell's saved output — confirm
# the arrays on disk are integer before re-running training.
# import numpy as np

# names = ['data/load_listops/load_listops_train_X.npy',
#         'data/load_listops/load_listops_train_y.npy',
#         'data/load_listops/load_listops_test_X.npy',
#         'data/load_listops/load_listops_test_y.npy']

# for name in names: 
#     x = np.load(name).astype(int)
#     np.save(name, x)

In [None]:
# SGD sweep: for every entry of `optim_params`, train and test one freshly
# initialised model, appending progress and timings to `print_file`.
#
# NOTE(review): gen_val and gen_test both read 'load_listops/load_listops_test', so
# validation appears to run on the test split — confirm whether a separate
# validation split should be used here.
gen_train = data_loader(
    task_name='load_listops/load_listops_train',
    batch_size=BATCH_SIZE,
    enc_seq_len=ENC_SEQ_LEN,
    dec_seq_len=DEC_SEQ_LEN,
)
gen_val = data_loader(
    task_name='load_listops/load_listops_test',
    batch_size=VAL_SIZE,
    enc_seq_len=ENC_SEQ_LEN,
    dec_seq_len=DEC_SEQ_LEN,
)
gen_test = data_loader(
    task_name='load_listops/load_listops_test',
    batch_size=TEST_SIZE,
    enc_seq_len=ENC_SEQ_LEN,
    dec_seq_len=DEC_SEQ_LEN,
)

print_file = f'logs/{TASK_NAME}_{TAG}_cout_logs2.txt'
t = time.time()

# Only the first model configuration of the grid is swept. Its paired
# 'depth,heads' entry is removed and copied to both the encoder and decoder keys.
param = next(iter(model_parameters))
param['enc_depth'], param['enc_heads'] = param['dec_depth'], param['dec_heads'] = param.pop('depth,heads')

with torch.cuda.device(1):
    for i, optim_param in enumerate(optim_params):
        # Start a new section in the log for this optimiser setting.
        with open(print_file, 'a') as f:
            f.write(f'\n\n{optim_param}\n')

        for init_num in range(1):
            model = XTransformer(**param).cuda()

            model_name = (
                f"{TASK_NAME}{INPUT_LEN}_dim{param['dim']}"
                f"d{param['enc_depth']}h{param['enc_heads']}"
                f"M{param['enc_num_memory_tokens']}l{param['enc_max_seq_len']}"
                f"_v{init_num}_{optim_param}"
            )

            optim = optimizer(model.parameters(), **optim_param)
            train_validate_model(
                model,
                train_generator=gen_train,
                val_generator=gen_val,
                optim=optim,
                model_name=model_name,
                dec_seq_len=DEC_SEQ_LEN,
                num_batches=NUM_BATCHES,
                generate_every=GENERATE_EVERY,
                print_file=print_file,
            )
            test_model(
                model,
                gen_test,
                model_name,
                param,
                TASK_NAME,
                tag=str(optim_param),
                dec_seq_len=param['dec_max_seq_len'],
            )

            # Log the wall-clock time of this run, then restart the timer.
            with open(print_file, 'a') as f:
                f.write(f'\nTotal time: {time.time() - t}\n')
            t = time.time()

### Run

In [12]:
# s, t, _, _ = next(gen_train)
# s[0], t[0]

In [7]:
# Main experiment: NUM_INITS independent initialisations of every configuration in
# `model_parameters`, each trained with Adam and then evaluated with test_model.
#
# NOTE(review): the saved output of this cell ends in a RuntimeError about
# DoubleTensor indices reaching an embedding; the commented-out `.astype(int)`
# repair cell appears to address it — confirm the data on disk before re-running.
# NOTE(review): gen_val and gen_test both read 'load_listops/load_listops_test' —
# confirm that validating on the test split is intended.
gen_train = data_loader(
    task_name='load_listops/load_listops_train',
    batch_size=BATCH_SIZE,
    enc_seq_len=ENC_SEQ_LEN,
    dec_seq_len=DEC_SEQ_LEN,
)
gen_val = data_loader(
    task_name='load_listops/load_listops_test',
    batch_size=VAL_SIZE,
    enc_seq_len=ENC_SEQ_LEN,
    dec_seq_len=DEC_SEQ_LEN,
)
gen_test = data_loader(
    task_name='load_listops/load_listops_test',
    batch_size=TEST_SIZE,
    enc_seq_len=ENC_SEQ_LEN,
    dec_seq_len=DEC_SEQ_LEN,
)

t = time.time()
with torch.cuda.device(1):
    for init_num in range(NUM_INITS):
        print('\n\n\nInit number ', init_num)
        # Iterating the grid yields a fresh dict each time, so expanding the paired
        # 'depth,heads' key below does not affect later initialisations.
        for i, param in enumerate(model_parameters):
            print(param)
            param['enc_depth'], param['enc_heads'] = param['dec_depth'], param['dec_heads'] = param.pop('depth,heads')

            # Progress through the grid for the current initialisation.
            print(i / len(model_parameters) * 100, '%')
            model = XTransformer(**param).cuda()

            model_name = (
                f"{TASK_NAME}{INPUT_LEN}_dim{param['dim']}"
                f"d{param['enc_depth']}h{param['enc_heads']}"
                f"M{param['enc_num_memory_tokens']}l{param['enc_max_seq_len']}"
                f"_v{init_num}"
            )

            optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
            train_validate_model(
                model,
                train_generator=gen_train,
                val_generator=gen_val,
                optim=optim,
                model_name=model_name,
                dec_seq_len=DEC_SEQ_LEN,
                num_batches=NUM_BATCHES,
                generate_every=GENERATE_EVERY,
            )
            test_model(
                model,
                gen_test,
                model_name,
                param,
                TASK_NAME,
                tag=TAG,
                dec_seq_len=param['dec_max_seq_len'],
            )

            # Report the wall-clock time of this run, then restart the timer.
            print('Total time: ', time.time() - t)
            t = time.time()




Init number  0
{'dec_max_seq_len': 1, 'dec_num_tokens': 12, 'depth,heads': (1, 1), 'dim': 256, 'enc_max_seq_len': 1150, 'enc_num_memory_tokens': 0, 'enc_num_tokens': 17, 'return_tgt_loss': True, 'tie_token_embeds': True}
0.0 %


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.DoubleTensor instead (while checking arguments for embedding)

### Test!

In [5]:
# Smoke test: build the FIRST grid configuration and push a doubled training batch
# through the encoder alone to inspect the shape of the returned embeddings.
# NOTE(review): the saved output below this cell (dim 128, batch of 64) does not
# match the current config cell (dim 256, BATCH_SIZE 128), so it looks stale.
init_num = 0

gen_train = data_loader(
    task_name='load_listops/load_listops_train',
    batch_size=BATCH_SIZE,
    enc_seq_len=ENC_SEQ_LEN,
    dec_seq_len=DEC_SEQ_LEN,
)
gen_val = data_loader(
    task_name='load_listops/load_listops_test',
    batch_size=VAL_SIZE,
    enc_seq_len=ENC_SEQ_LEN,
    dec_seq_len=DEC_SEQ_LEN,
)
gen_test = data_loader(
    task_name='load_listops/load_listops_test',
    batch_size=TEST_SIZE,
    enc_seq_len=ENC_SEQ_LEN,
    dec_seq_len=DEC_SEQ_LEN,
)

param = next(iter(model_parameters))
print(param)
# Replace the paired 'depth,heads' entry with explicit encoder/decoder keys.
param['enc_depth'], param['enc_heads'] = param['dec_depth'], param['dec_heads'] = param.pop('depth,heads')

model = XTransformer(**param).cuda()

model_name = (
    f"{TASK_NAME}_dim{param['dim']}"
    f"d{param['enc_depth']}h{param['enc_heads']}"
    f"M{param['enc_num_memory_tokens']}l{param['enc_max_seq_len']}"
    f"_v{init_num}"
)

optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

src, tgt, _, _ = next(gen_train)

print(model.encoder.max_seq_len, model.encoder.num_memory_tokens)
# Concatenating the batch with itself along dim 0 doubles the batch size.
model.encoder(torch.cat([src, src], dim=0), return_embeddings=True).shape

{'dec_max_seq_len': 1, 'dec_num_tokens': 12, 'depth,heads': (1, 1), 'dim': 128, 'enc_max_seq_len': 1150, 'enc_num_memory_tokens': 0, 'enc_num_tokens': 17, 'return_tgt_loss': True, 'tie_token_embeds': True}
1150 0


torch.Size([64, 1149, 128])

In [21]:
# Smoke test: build the LAST grid configuration and push a doubled training batch
# through the encoder alone to inspect the shape of the returned embeddings.
# NOTE(review): the saved output below this cell (depth,heads (2, 4), dim 32,
# max length 400, 128 memory tokens) comes from a different grid than the one
# currently defined in the config cell, so it looks stale.
init_num = 0

gen_train = data_loader(
    task_name='load_listops/load_listops_train',
    batch_size=BATCH_SIZE,
    enc_seq_len=ENC_SEQ_LEN,
    dec_seq_len=DEC_SEQ_LEN,
)
gen_val = data_loader(
    task_name='load_listops/load_listops_test',
    batch_size=VAL_SIZE,
    enc_seq_len=ENC_SEQ_LEN,
    dec_seq_len=DEC_SEQ_LEN,
)
gen_test = data_loader(
    task_name='load_listops/load_listops_test',
    batch_size=TEST_SIZE,
    enc_seq_len=ENC_SEQ_LEN,
    dec_seq_len=DEC_SEQ_LEN,
)

param = list(model_parameters)[-1]
print(param)
# Replace the paired 'depth,heads' entry with explicit encoder/decoder keys.
param['enc_depth'], param['enc_heads'] = param['dec_depth'], param['dec_heads'] = param.pop('depth,heads')

model = XTransformer(**param).cuda()

model_name = (
    f"{TASK_NAME}_dim{param['dim']}"
    f"d{param['enc_depth']}h{param['enc_heads']}"
    f"M{param['enc_num_memory_tokens']}l{param['enc_max_seq_len']}"
    f"_v{init_num}"
)

optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

src, tgt, _, _ = next(gen_train)

print(model.encoder.max_seq_len, model.encoder.num_memory_tokens)
# Concatenating the batch with itself along dim 0 doubles the batch size.
model.encoder(torch.cat([src, src], dim=0), return_embeddings=True).shape

{'dec_max_seq_len': 1, 'dec_num_tokens': 12, 'depth,heads': (2, 4), 'dim': 32, 'enc_max_seq_len': 400, 'enc_num_memory_tokens': 128, 'enc_num_tokens': 17, 'return_tgt_loss': True, 'tie_token_embeds': True}
400 128


torch.Size([64, 528, 32])

In [23]:
# One-off target reshaping, kept for reference and not part of the normal run:
# turn each saved y array into a single column, prepend a column filled with 2,
# and overwrite the same .npy file in place.
# The leading 2 mirrors the `* 2` first column built by the commented-out
# listops_generator earlier in this notebook — presumably a decoder start token; TODO confirm.
# Caveats if this is ever re-enabled:
#  - np.ones(...) is float64 by default, so the stacked array is saved as floats and
#    needs an integer cast afterwards (see the commented-out .astype(int) repair cell).
#  - It is not idempotent: a second run would reshape the two-column array back into
#    one column and prepend again.
# y_test = np.load('data/load_listops/load_listops_test_y.npy').reshape((-1,1))
# y_test = np.hstack((np.ones((y_test.shape[0], 1)) * 2, y_test))
# np.save('data/load_listops/load_listops_test_y.npy', y_test)

# y_train = np.load('data/load_listops/load_listops_train_y.npy').reshape((-1,1))
# y_train = np.hstack((np.ones((y_train.shape[0], 1)) * 2, y_train))
# np.save('data/load_listops/load_listops_train_y.npy', y_train)