In [6]:
import numpy as np
import pandas as pd
import random
import time
import sys
import os

# sys.path.append('algotrade/NN')
# from algotrade.NN.x_transformers.x_transformers import *
from sklearn.model_selection import train_test_split

import torch

# from algotrade.NN.run_experiment import *
# from algotrade.NN.generate_data import *

## Variables

In [7]:
from sklearn.model_selection import ParameterGrid

TAG = 'test'

TASK_NAME = 'price'
TRAIN_SIZE = 100_000
VAL_SIZE = 2_000
TEST_SIZE = 10_000
NUM_INITS = 4


NUM_BATCHES = int(4e5)
BATCH_SIZE = 128
GENERATE_EVERY  = 10000
NUM_TOKENS = 10 + 2
ENC_SEQ_LEN = 24
DEC_SEQ_LEN = 48

INPUT_LEN = 24

#### Load data

In [8]:
from sklearn.linear_model import LinearRegression

WINDOW_SIZES = [7, 14, 28, 56, 224, 700, 1400]

def get_increase_pct(prices, horizon=30):
    res = np.vstack([prices.shift(-i).values for i in range(horizon)])
    max_value_in_horizon = np.nanmax(res, axis=0)
    return max_value_in_horizon / prices
    

def add_ewm(df, col_name='Close', window_sizes=WINDOW_SIZES):
    columns = []
    for ws in window_sizes:
        ewm = df[col_name].ewm(span=ws).mean()
        df.loc[:, f'ewm_{col_name}_{ws}'] = ewm
        columns.append(f'ewm_{col_name}_{ws}')
    return columns


def add_delta_pct(df, columns, col_name='Close'):
    names = []
    for col in columns:
        name = f'{col}_delta_pct'
        df.loc[:, name] = -(df[col_name] - df[col]) / df[col_name]
        df.loc[df[col_name] == 0, name] = 0
        names.append(name)
    return names


def add_exp_trend(df, price_col='Close'):
    xs = np.array(list(df.index))
    ys = np.log(df[price_col].values)

    model = LinearRegression()
    model.fit(xs.reshape(-1, 1), ys.reshape(-1, 1))
    preds = model.predict(xs.reshape(-1, 1))

    a = model.coef_[0]
    b = preds[0] - xs[0] * a
    df.loc[:, 'trend'] = np.exp(preds)
    return a, b


def add_linear_trend(df, price_col='Close'):
    xs = np.array(list(df.index))
    ys = df[price_col].values

    model = LinearRegression()
    model.fit(xs.reshape(-1, 1), ys.reshape(-1, 1))
    preds = model.predict(xs.reshape(-1, 1))

    a = model.coef_[0]
    b = preds[0] - xs[0] * a
    df.loc[:, 'trend'] = preds
    return a, b

In [25]:
def preprocess(data_path = 'algotrade/data/gemini_BTCUSD_1hr.csv',
            save_path = 'algotrade/data/BTCUSD/',
            train_size = 0.7,
            val_size = 0.1,
            test_size = 0.2,
            shuffle = False,
            # name = 'BTCUSD',
            trend = 'exp',
            price_col = 'Close',
            start_token=-10_000):
    
    df = pd.read_csv(data_path)
    df = df.reset_index()
    df.columns = df.iloc[0]
    df = df.iloc[1:].sort_values('Date').reset_index(drop=True)
    for col in df.columns[-5:]:
        df.loc[:, col] = df[col].astype(float)

    price_df = df
    price_df['DT'] = pd.to_datetime(price_df.Date)
    price_df['Date'] = price_df.DT.dt.date

    if trend == 'exp':
        a, b = add_exp_trend(price_df)
    else:
        a, b = add_linear_trend(price_df)

    ewm_cols = add_ewm(price_df, price_col)
    delta_cols = add_delta_pct(price_df, ewm_cols + ['trend'], price_col)

    vol_ewm_cols = add_ewm(price_df, 'Volume')
    vol_delta_cols = add_delta_pct(price_df, vol_ewm_cols, 'Volume')

    price_df[f'next_{price_col}'] = price_df[price_col].shift(-1)
    delta_pct_col = add_delta_pct(price_df, [f'next_{price_col}'], price_col)[0]
    next_price_col = f'next_{price_col}'

    feature_columns = [price_col] + delta_cols + vol_delta_cols
    target_columns = [delta_pct_col, next_price_col]

    features = price_df[feature_columns].values[:-1]
    target_ = price_df[target_columns].values[:-1]
    target = np.ones((1 + target_.shape[1], target_.shape[0]))*start_token
    target[1:, :] = target_.T

    X_train, X_test, y_train, y_test = train_test_split(features, target.T, train_size=train_size, shuffle=False)
    X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, train_size=val_size/(1-train_size), shuffle=False)

    # print('X_train.shape, X_test.shape, X_val.shape, y_train.shape, y_test.shape, y_val.shape')
    # print(X_train.shape, X_test.shape, X_val.shape, y_train.shape, y_test.shape, y_val.shape)

    names = ['X_train', 'X_test', 'X_val', 'y_train', 'y_test', 'y_val']
    vars = [X_train, X_test, X_val, y_train, y_test, y_val]
    os.system(f'mkdir {save_path}')
    # os.system(f'mkdir {save_path+name}')
    for name, var in zip(names, vars):
        print(name, var.shape)
        np.save(save_path+name+'.npy', var)
    return price_df[:-1]

In [26]:
res = preprocess(data_path = '../data/gemini_BTCUSD_1hr.csv',
            save_path = '../data/BTCUSD/',
            train_size = 0.7,
            val_size = 0.1,
            test_size = 0.2,
            shuffle = False,
            # name = 'BTCUSD',
            trend = 'exp',
            price_col = 'Close')

X_train (36982, 16)
X_test (10567, 16)
X_val (5283, 16)
y_train (36982, 3)
y_test (10567, 3)
y_val (5283, 3)


In [29]:
# res[['Close', 'next_Close']]

In [65]:
X = torch.tensor(np.load('../data/BTCUSD/X_train.npy'))

In [68]:
X[0]

tensor([245.0000,  -0.0000,  -0.0000,  -0.0000,  -0.0000,  -0.0000,  -0.0000,
         -0.0000,   0.9505,  -0.0000,  -0.0000,  -0.0000,  -0.0000,  -0.0000,
         -0.0000,  -0.0000], dtype=torch.float64)

In [72]:
tgt_len = 24

X.shape

torch.Size([36982, 16])

In [69]:
slices = [X[i: tgt_len+i] for i in range(X.shape[0] - tgt_len)]

In [86]:
X_train = torch.cat(slices).reshape(-1, tgt_len, X.shape[1])
X_train = X_train.transpose(0,1)
X_train.shape

torch.Size([24, 36958, 16])

In [87]:
X_train[:, 0]

tensor([[ 2.4500e+02, -0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00,
         -0.0000e+00, -0.0000e+00, -0.0000e+00,  9.5053e-01, -0.0000e+00,
         -0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00,
         -0.0000e+00],
        [ 2.4500e+02, -0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00,
         -0.0000e+00, -0.0000e+00, -0.0000e+00,  9.5070e-01, -3.7019e-01,
         -4.0104e-01, -4.1647e-01, -4.2418e-01, -4.2996e-01, -4.3128e-01,
         -4.3158e-01],
        [ 2.4492e+02,  1.8539e-04,  2.0186e-04,  2.0989e-04,  2.1385e-04,
          2.1678e-04,  2.1745e-04,  2.1760e-04,  9.5151e-01, -3.9881e-02,
         -7.1568e-02, -8.9048e-02, -9.8179e-02, -1.0520e-01, -1.0681e-01,
         -1.0718e-01],
        [ 2.4425e+02,  1.8578e-03,  2.0444e-03,  2.1342e-03,  2.1781e-03,
          2.2104e-03,  2.2177e-03,  2.2194e-03,  9.5703e-01, -1.6261e-01,
         -1.9498e-01, -2.1277e-01, -2.2206e-01, -2.2919e-01, -2.3083e-01,
         -2.3121e-01],
        [ 2.4499e+02

In [8]:
class data_loader:
    def __init__(self, mode, path='data', tgt_len=24, batch_size=32, none_mask=True):
        X, y = np.load(f'{path}/X_{mode}.npy'), np.load(f'{path}/y_{mode}.npy')
        # self.X, self.y = torch.tensor(self.X).cuda(), \
        #                     torch.tensor(self.y).cuda()
        X, y = torch.tensor(X), torch.tensor(y)
        slices_x = [X[i: tgt_len+i] for i in range(X.shape[0] - tgt_len)]
        slices_y = [y[i: tgt_len+i] for i in range(X.shape[0] - tgt_len)]

        self.X, self.y = torch.cat(slices_x).reshape(-1, tgt_len, X.shape[1]),\
                         torch.cat(slices_y).reshape(-1, tgt_len, X.shape[1])
        self.data_size = self.X.shape[0]
        self.data_ptr = 0

        self.batch_size = batch_size

    def __next__(self):
        X = self.X[self.data_ptr: self.data_ptr+self.batch_size]
        y = self.y[self.data_ptr: self.data_ptr+self.batch_size]
        
        if not self.none_mask:
            sm = self.src_masks[self.data_ptr: self.data_ptr+self.batch_size]
            sm = torch.tensor(sm).cuda()
        else:
            sm = None
            
        self.data_ptr = (self.data_ptr + self.batch_size) % self.data_size

        return X, y, sm, self.tgt_mask

In [63]:
# class data_loader:
#     def __init__(self, mode, path='data', batch_size=32, none_mask=True):
#         X, y = np.load(f'{path}/X_{mode}.npy'), np.load(f'{path}/y_{mode}.npy')
#         # self.X, self.y = torch.tensor(self.X).cuda(), \
#         #                     torch.tensor(self.y).cuda()
#         X, y = torch.tensor(X), torch.tensor(y)
#         slices_x = [X[i: tgt_len+i] for i in range(X.shape[0] - tgt_len)]
#         slices_y = [y[i: tgt_len+i] for i in range(X.shape[0] - tgt_len)]

#         self.X, self.y = torch.cat(slices_x).reshape(-1, tgt_len, X.shape[1]),\
#                          torch.cat(slices_y).reshape(-1, tgt_len, X.shape[1])
#         self.data_size = self.X.shape[0]
#         self.data_ptr = 0

#         self.batch_size = batch_size

#     def __next__(self):
#         X = self.X[self.data_ptr: self.data_ptr+self.batch_size]
#         y = self.y[self.data_ptr: self.data_ptr+self.batch_size]
        
#         if not self.none_mask:
#             sm = self.src_masks[self.data_ptr: self.data_ptr+self.batch_size]
#             sm = torch.tensor(sm).cuda()
#         else:
#             sm = None
            
#         self.data_ptr = (self.data_ptr + self.batch_size) % self.data_size

#         return X, y, sm, self.tgt_mask

### Run

In [64]:

gen_train = data_loader(path=f'algotrade/data/BTCUSD', mode='train', batch_size=BATCH_SIZE)
gen_val = data_loader(path=f'algotrade/data/BTCUSD', mode='val', batch_size=BATCH_SIZE)
gen_test = data_loader(path=f'algotrade/data/BTCUSD', mode='test', batch_size=BATCH_SIZE)

FileNotFoundError: [Errno 2] No such file or directory: 'algotrade/data/BTCUSD/X_train.npy'

In [None]:
s, t, _, _ = next(gen_train)

In [None]:
s.shape

torch.Size([128, 15])

In [None]:
class CXTransformer(nn.Module):
    def __init__(
        self,
        *,
        dim,
        tie_token_emb = False,
        **kwargs
    ):
        super().__init__()
        enc_kwargs, kwargs = groupby_prefix_and_trim('enc_', kwargs)
        dec_kwargs, kwargs = groupby_prefix_and_trim('dec_', kwargs)
        
        assert 'dim' not in enc_kwargs and 'dim' not in dec_kwargs, 'dimension of either encoder or decoder must be set with `dim` keyword'
        enc_transformer_kwargs = pick_and_pop(['max_seq_len'], enc_kwargs)
        # enc_transformer_kwargs['num_memory_tokens'] = enc_kwargs.pop('num_memory_tokens', None)

        dec_transformer_kwargs = pick_and_pop(['max_seq_len'], dec_kwargs)

        self.encoder = ContinuousTransformerWrapper(
            **enc_transformer_kwargs,
            attn_layers = Encoder(dim = dim, **enc_kwargs)
        )

        self.decoder = ContinuousTransformerWrapper(
            **dec_transformer_kwargs,
            attn_layers = Decoder(dim = dim, cross_attend = True, **dec_kwargs)
        )

        if tie_token_emb:
            self.decoder.token_emb = self.encoder.token_emb

        self.decoder = AutoregressiveWrapper(self.decoder)

    @torch.no_grad()
    def generate(self, seq_in, seq_out_start, seq_len, src_mask = None, **kwargs):
        encodings = self.encoder(seq_in, return_embeddings = True, mask = src_mask)
        return self.decoder.generate(seq_out_start, seq_len, context = encodings, context_mask = src_mask, **kwargs)

    def forward(self, src, tgt, src_mask = None, tgt_mask = None):
        enc = self.encoder(src, mask = src_mask, return_embeddings = True)

        d = enc.shape[1] - src_mask.shape[1]
        context_mask = src_mask[:, -d:]

        out = self.decoder(tgt, context = enc, mask = tgt_mask, context_mask = context_mask)
        return out

In [None]:
LEARNING_RATE = 0.0007

model_parameters = ParameterGrid({'dim': [128],
    'tie_token_embeds': [True],
    'return_tgt_loss': [True],
    'enc_depth': [2],
    'enc_heads': [4],
    'dec_depth': [2],
    'dec_heads': [4],
    'enc_max_seq_len': [15],
    'dec_max_seq_len': [1],
    # 'enc_num_memory_tokens': [2, 8, 0],
    'dim_in': [None],
    'dim_out': [1],
    'emb_dim': [128],
    'emb_dropout': [0.],
    'use_pos_emb': [True]
})

param = list(model_parameters)[0]

In [None]:
model = CXTransformer(**param)

In [None]:
model(s, t, None)

ValueError: ignored

In [None]:
drive_path = 'drive/MyDrive/stocks_logs/'
print_file = f'{drive_path}{TAG}_logs.txt'
t = time.time()
for init_num in range(NUM_INITS):
    with open(print_file, 'a') as f:
        f.write('\n\nInit number ' + str(init_num)+'\n')
    for i, param in enumerate(list(model_parameters)):
        with open(print_file, 'a') as f:
            f.write('\n\n' + str(param)+'\n')
        param['enc_depth'], param['enc_heads'] = param['depth,heads']
        param['dec_depth'], param['dec_heads'] = param['depth,heads']
        param.pop('depth,heads')

        with open(print_file, 'a') as f:
            f.write(f'{i / len(model_parameters) * 100}%')
        model = XTransformer(**param).cuda()

        model_name = f"{TASK_NAME}{INPUT_LEN}_dim{param['dim']}d{param['enc_depth']}h{param['enc_heads']}M{param['enc_num_memory_tokens']}l{param['enc_max_seq_len']}_{TAG}_v{init_num}"

        optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
        train_validate_model(model, 
                        train_generator=gen_train, 
                        val_generator=gen_val, 
                        optim=optim, 
                        model_name=model_name, 
                        config=param,
                        num_batches=NUM_BATCHES,
                        generate_every=GENERATE_EVERY,
                        print_file=print_file,
                        tag=TAG,
                        overfit_stop=False)
        test_model(model, gen_test, model_name, param, TASK_NAME, tag=TAG, log_path=drive_path+'test_results.csv')
        with open(print_file, 'a') as f:
            f.write(f'\nTotal time: {time.time() - t}\n')
        t = time.time()

RuntimeError: ignored

### Refit models

In [None]:
# import os

# def load_cpt(config, v, task_name, input_length):
#     for fns in os.walk('checkpoints'):
#         model_names = fns[2]
        
#     prefix = '{task_name}_dim{dim}d{d}h{h}M{M}l{l}'
#     name = prefix.format(task_name=task_name,
#                         dim=config['dim'],
#                         d=config['enc_depth'], h=config['enc_heads'], 
#                         M=config['enc_num_memory_tokens'], 
#                         l=input_length)

#     checkpoint_paths = ['checkpoints/' + n for n in model_names if name in n]
#     cpt = torch.load(checkpoint_paths[v])
#     bn, model_state, optim_state = cpt['batch_num'], cpt['state_dict'], cpt['optimizer']

#     model = XTransformer(**config).cuda()
#     model.load_state_dict(model_state)

#     optim = torch.optim.Adam(model.parameters(), lr=0.001)
#     optim.load_state_dict(optim_state)

#     return bn, model, optim


In [None]:
# TAG = 'refit_to_max'
# LEARNING_RATE = 0.001

# path = f"checkpoints/{TASK_NAME}{INPUT_LEN}/"

# for name in next(os.walk(path))[2]:
#     print(name)
#     if name == 'copy24_dim128d2h4M12l12_10tkn_len24_v2_10tkn_len24.pt':
#         continue
#     cpt = torch.load(path+name)
#     print(cpt['batch_num'])
#     delta_batches = NUM_BATCHES - cpt['batch_num'] - 1
#     if delta_batches < 1:
#         continue
    
#     split = name.split('_')
#     config = {'dec_max_seq_len': DEC_SEQ_LEN,
#          'dec_num_tokens': NUM_TOKENS,
#          'dim': int(split[1].split('dim')[1].split('d')[0]),
#          'enc_max_seq_len': int(split[1].split('M')[1].split('l')[1]),
#          'enc_num_memory_tokens': int(split[1].split('M')[1].split('l')[0]),
#          'enc_num_tokens': NUM_TOKENS,
#          'return_tgt_loss': True,
#          'tie_token_embeds': True,
#          'enc_depth': int(split[1][3:].split('d')[1].split('h')[0]),
#          'enc_heads': int(split[1][3:].split('d')[1].split('h')[1].split('M')[0]),
#          'dec_depth': int(split[1][3:].split('d')[1].split('h')[0]),
#          'dec_heads': int(split[1][3:].split('d')[1].split('h')[1].split('M')[0]),
#          'tag': TAG,
#          'task_name': TASK_NAME}
    
    
#     gen_train = data_loader(path=f'data{INPUT_LEN}', task_name=f'{TASK_NAME}_train', batch_size=BATCH_SIZE)
#     gen_val = data_loader(path=f'data{INPUT_LEN}', task_name=f'{TASK_NAME}_val', batch_size=VAL_SIZE)
#     gen_test = data_loader(path=f'data{INPUT_LEN}', task_name=f'{TASK_NAME}_test', batch_size=TEST_SIZE)


#     print_file = f'logs/{TASK_NAME}_{TAG}_memory_logs.txt'
#     t = time.time()
#     with torch.cuda.device(0):
#         with open(print_file, 'a') as f:
#             f.write('\n\n' + str(config)+'\n')
#             f.write(str(delta_batches) + ' batches to go.\n')

#         print('\n\n' + str(config)+'\n')
#         print(str(delta_batches) + ' batches to go.\n')
#         model_name = name
#         model = XTransformer(**config).cuda()
#         optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
        
#         model.load_state_dict(cpt['state_dict'])
#         optim.load_state_dict(cpt['optimizer'])

#         train_validate_model(model, 
#                             train_generator=gen_train, 
#                             val_generator=gen_val, 
#                             optim=optim, 
#                             model_name=model_name, 
#                             config=config,
#                             num_batches=delta_batches,
#                             generate_every=GENERATE_EVERY,
#                             print_file=print_file,
#                             tag=TAG,
#                             overfit_stop=False)
#         test_model(model, gen_test, model_name, config, TASK_NAME, tag=TAG)

#         with open(print_file, 'a') as f:
#             f.write(f'\nTotal time: {time.time() - t}\n')
#         t = time.time()

In [None]:
test_model(model, gen_test, model_name, config, TASK_NAME, tag=TAG)

In [None]:
# gen_train = data_loader(task_name=f'{TASK_NAME}_train', batch_size=BATCH_SIZE, enc_seq_len=INPUT_LEN, dec_seq_len=DEC_SEQ_LEN)
# gen_val = data_loader(task_name=f'{TASK_NAME}_val', batch_size=VAL_SIZE, enc_seq_len=INPUT_LEN, dec_seq_len=DEC_SEQ_LEN)
# gen_test = data_loader(task_name=f'{TASK_NAME}_test', batch_size=TEST_SIZE, enc_seq_len=INPUT_LEN, dec_seq_len=DEC_SEQ_LEN)


# print_file = f'logs/{TASK_NAME}_{TAG}_memory_logs.txt'
# t = time.time()
# with torch.cuda.device(0):
#     for init_num in range(NUM_INITS):
#         with open(print_file, 'a') as f:
#             f.write('\n\nInit number ' + str(init_num)+'\n')
#         for i, param in enumerate(list(model_parameters)):
#             with open(print_file, 'a') as f:
#                 f.write('\n\n' + str(param)+'\n')
#             param['enc_depth'], param['enc_heads'] = param['depth,heads']
#             param['dec_depth'], param['dec_heads'] = param['depth,heads']
#             param.pop('depth,heads')

#             with open(print_file, 'a') as f:
#                 f.write(f'{i / len(model_parameters) * 100}%')
#             model = XTransformer(**param).cuda()

#             model_name = f"{TASK_NAME}{INPUT_LEN}_dim{param['dim']}d{param['enc_depth']}h{param['enc_heads']}M{param['enc_num_memory_tokens']}l{param['enc_max_seq_len']}_v{init_num}"

#             optim = torch.optim.Adam(model.ффparameters(), lr=LEARNING_RATE)
            
#             bn, model, optim = load_cpt(param, v=init_num, task_name='copy55', input_length=param['enc_max_seq_len'])
#             with open(print_file, 'a') as f:
#                 f.write(f'BN: {bn}\n')
#             if bn < 130_000:
#                 train_validate_model(model, 
#                                     train_generator=gen_train, 
#                                     val_generator=gen_val, 
#                                     optim=optim, 
#                                     model_name=model_name, 
#                                     dec_seq_len=DEC_SEQ_LEN,
#                                     num_batches=NUM_BATCHES,
#                                     generate_every=GENERATE_EVERY,
#                                     print_file=print_file,
#                                     tag=TAG,
#                                     overfit_stop=False,
#                                     head_start=(130_000 - bn)/GENERATE_EVERY)
#                 test_model(model, gen_test, model_name, param, TASK_NAME, tag=TAG, dec_seq_len=param['dec_max_seq_len'])
#             with open(print_file, 'a') as f:
#                 f.write(f'\nTotal time: {time.time() - t}\n')
#             t = time.time()

In [None]:
from run_experiment import save_checkpoint

In [None]:
# save_path = f'checkpoints/{model_name}_b{i}_{TAG}_maxval.pt'
# save_cpt(save_path, model, optim)

# if i // generate_every < head_start:
#     continue

# # early stopping
# smoothed_val_scores = [np.mean(validation_scores[i-WINDOW_SIZE+1:i]) for i in range(WINDOW_SIZE-1, len(validation_scores))]

# if overfit_stop and max(smoothed_val_scores) > max(smoothed_val_scores[-PATIENCE:]):
#     break

### Test!

In [None]:
init_num = 0

gen_train = data_loader(task_name=f'{TASK_NAME}_train', batch_size=BATCH_SIZE, enc_seq_len=ENC_SEQ_LEN, dec_seq_len=DEC_SEQ_LEN)
gen_val = data_loader(task_name=f'{TASK_NAME}_val', batch_size=VAL_SIZE, enc_seq_len=ENC_SEQ_LEN, dec_seq_len=DEC_SEQ_LEN)
gen_test = data_loader(task_name=f'{TASK_NAME}_test', batch_size=TEST_SIZE, enc_seq_len=ENC_SEQ_LEN, dec_seq_len=DEC_SEQ_LEN)


param = list(model_parameters)[5]
print(param)
param['enc_depth'], param['enc_heads'] = param['depth,heads']
param['dec_depth'], param['dec_heads'] = param['depth,heads']
param.pop('depth,heads')

model = XTransformer(**param).cuda()

model_name = f"{TASK_NAME}_dim{param['dim']}d{param['enc_depth']}h{param['enc_heads']}M{param['enc_num_memory_tokens']}l{param['enc_max_seq_len']}_v{init_num}"

optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

src, tgt, _, _ = next(gen_train)

print(model.encoder.max_seq_len, model.encoder.num_memory_tokens)
model.encoder(torch.cat((src, src)), return_embeddings=True).shape