In [1]:
!python --version

Python 3.11.9


In [2]:
import numpy as np

np.__version__

'1.23.5'

In [3]:
import argparse
import os
import torch

In [4]:
from REC.config import Config


local_rank = 0
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)

In [5]:
config_file = ['IDNet/srgnnseqllm.yaml', 'overall/ID.yaml']
config = Config(config_file_list=config_file)
config['device'] = device

  from .autonotebook import tqdm as notebook_tqdm
2025-02-10 05:28:27,225	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [6]:
config


[1;35mGeneral Hyper Parameters:
[0m[1;36mmodel[0m =[1;33m SRGNNSEQLLM[0m
[1;36mseed[0m =[1;33m 2020[0m
[1;36mstate[0m =[1;33m INFO[0m
[1;36muse_modality[0m =[1;33m False[0m
[1;36mreproducibility[0m =[1;33m True[0m
[1;36mcheckpoint_dir[0m =[1;33m saved[0m
[1;36mshow_progress[0m =[1;33m False[0m
[1;36mlog_wandb[0m =[1;33m False[0m
[1;36mdata_path[0m =[1;33m ../dataset/[0m

[1;35mTraining Hyper Parameters:
[0m[1;36mepochs[0m =[1;33m 200[0m
[1;36mtrain_batch_size[0m =[1;33m 64[0m
[1;36moptim_args[0m =[1;33m {'learning_rate': 0.0001, 'weight_decay': 0.1}[0m
[1;36meval_step[0m =[1;33m 1[0m
[1;36mstopping_step[0m =[1;33m 30[0m

[1;35mEvaluation Hyper Parameters:
[0m[1;36meval_batch_size[0m =[1;33m 1024[0m
[1;36mtopk[0m =[1;33m [5, 10][0m
[1;36mmetrics[0m =[1;33m ['Recall', 'NDCG'][0m
[1;36mvalid_metric[0m =[1;33m NDCG@10[0m
[1;36mmetric_decimal_place[0m =[1;33m 7[0m
[1;36meval_type[0m =[1;33m EvaluatorType.R

# Set up Pytorch Distributed

In [7]:
import torch.distributed as dist

os.environ['MASTER_ADDR'] = '127.0.0.2'
os.environ['MASTER_PORT'] = '5447'

dist.init_process_group('nccl', rank=0, world_size=1)



# Check data

In [8]:
print(config['model'])

SRGNNSEQLLM


In [9]:
from REC.data import *
dataload = load_data(config)

In [10]:
train_loader, valid_loader, test_loader = bulid_dataloader(config, dataload)  

In [11]:
alias_inputs, global_alias_inputs, A, items, A_b, global_items, mask, pos, neg = next(iter(train_loader))

In [12]:
mask.shape

torch.Size([64, 10])

In [13]:
global_items[0, global_alias_inputs[1,]]

tensor([41349,  5329, 35775, 54265,     0,     0,     0,     0,     0,     0])

In [14]:
items[1, alias_inputs[1,]]

tensor([41349,  5329, 35775, 54265,     0,     0,     0,     0,     0,     0])

In [15]:
global_alias_inputs.device

device(type='cpu')

# SASREC-GNN-LLM Model Definition

In [58]:
import torch
import torch.nn as nn
from REC.utils import InputType
from REC.model.basemodel import BaseModel
from REC.model.layers import TransformerEncoder
import torch.nn.functional as F
import math
import numpy as np
from vllm import LLM, SamplingParams
import time
import time
import pickle

#torch.set_default_dtype(torch.float64)
class SRGNNSEQLLM(BaseModel):
    input_type = InputType.AUGSEQ
    def __init__(self, config, data):
        super(SRGNNSEQLLM, self).__init__()

        self.device = config['device']
        self.item_num = data.item_num

        # set up GNN
        self.item_hidden_size = config['item_embedding_size']  
        self.gnn_step = config['gnn_step'] # number of hidden GNN

        self.item_embedding = nn.Embedding(self.item_num, self.item_hidden_size)
        self.gnn = GNN(self.item_hidden_size, step=self.gnn_step)

        # load parameters info
        self.seq_n_layers = config['seq_n_layers']
        self.seq_n_heads = config['seq_n_heads']
        self.seq_hidden_size = config['seq_embedding_size']  # same as embedding_size
        self.seq_inner_size = config['seq_inner_size']  # the dimensionality in feed-forward layer  
        self.seq_inner_size *= self.seq_hidden_size
        self.seq_hidden_dropout_prob = config['seq_hidden_dropout_prob']
        self.seq_attn_dropout_prob = config['seq_attn_dropout_prob']
        self.seq_hidden_act = config['seq_hidden_act']
        self.seq_layer_norm_eps = config['seq_layer_norm_eps']
        self.seq_initializer_range = config['seq_initializer_range']
        
        self.max_seq_length = config['MAX_ITEM_LIST_LENGTH'] # problem !!!
        # define layers and loss
        self.position_embedding = nn.Embedding(self.max_seq_length, self.seq_hidden_size)
        
        self.trm_encoder = TransformerEncoder(
            n_layers=self.seq_n_layers,
            n_heads=self.seq_n_heads,
            hidden_size=self.seq_hidden_size,
            inner_size=self.seq_inner_size,
            hidden_dropout_prob=self.seq_hidden_dropout_prob,
            attn_dropout_prob=self.seq_attn_dropout_prob,
            hidden_act=self.seq_hidden_act,
            layer_norm_eps=self.seq_layer_norm_eps
        )

        self.seq_layernorm = nn.LayerNorm(self.seq_hidden_size, eps=self.seq_layer_norm_eps)
        self.seq_dropout = nn.Dropout(self.seq_hidden_dropout_prob)

        # class weight
        self.weight = torch.tensor([[1.0],[-1.0]]).to(self.device)
        self._gnn_reset_parameters()
        self.apply(self._seq_init_weights)

        # set-up lm model
        self.llm_cache = {}
        self.llm_embed_size = config['llm_embed_dim']
        self.graph_llm = FrozenGraphLLM(config['query_model'], config['encoder_model'], device = self.device, gpu_mem_utl = config['llm_gpu_utlization'])
        self.llm_linear = nn.Linear(self.llm_embed_size, self.seq_hidden_size)

        self.cross_attention = MultiHeadCrossAttention(
            n_heads = self.seq_n_heads,
            hidden_size = self.seq_hidden_size,
            hidden_dropout_prob = self.seq_hidden_dropout_prob,
            attn_dropout_prob = self.seq_attn_dropout_prob,
            layer_norm_eps = self.seq_layer_norm_eps,
        )

    def _seq_init_weights(self, module):
        """ Initialize the weights """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.seq_initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
    
    def _gnn_reset_parameters(self):
        stdv = 1.0 / np.sqrt(self.item_hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    def seq_modeling(self, alias_inputs, gnn_output, llm_output, mask):
        llm_output = llm_output.to(alias_inputs.device)
        # transformer: output
        seq_input = []
        llm_input = []
        
        for i in range(len(alias_inputs)):
            seq_inputi = gnn_output[0, alias_inputs[i]]
            seq_input.append(seq_inputi)

            llm_input.append(llm_output[alias_inputs[i]])

        seq_input = torch.stack(seq_input)
        llm_input = self.llm_linear(torch.stack(llm_input))

        # cross-attention seq_input and llm_input
        seq_input = self.cross_attention(seq_input, llm_input, self.seq_get_attention_mask(mask, bidirectional=True))
        print(f'seq_input.shape = {seq_input.shape}')

        # position embedding
        position_ids = torch.arange(mask.size(1), dtype=torch.long, device=mask.device)
        position_ids = position_ids.unsqueeze(0).expand_as(mask)
        position_embedding = self.position_embedding(position_ids)

        seq_input_emb = seq_input + position_embedding
        seq_input_emb = self.seq_layernorm(seq_input_emb)
        seq_input_emb = self.seq_dropout(seq_input_emb)
        extended_attention_mask = self.seq_get_attention_mask(mask, bidirectional=False)

        seq_output_emb = self.trm_encoder(
            seq_input_emb, extended_attention_mask, output_all_encoded_layers=False
        )

        return seq_output_emb[-1]

    def forward(self, input):
        _, global_inputs, _, _, A_b, global_items, mask, tarpos, tarneg = input

        # look up table map id -> embedding vector
        hidden = self.item_embedding(global_items)
        target_pos_embs = self.item_embedding(tarpos)
        target_neg_embs = self.item_embedding(tarneg)

        # Stage 1: apply GNN to get node embeddings (incoporating local interaction signals)
        gnn_output = self.gnn(A_b, hidden)

        # Stage 2: query graph llm model for graph structure details
        batch_key = str(A_b.tolist())

        if batch_key not in self.llm_cache:
            llm_output = self.graph_llm.query(A_b)
            self.llm_cache[batch_key] = 'tmp/llm_output_' + str(time.time()) + '.pkl'
            with open(self.llm_cache[batch_key], 'wb') as f:
                pickle.dump(llm_output, f)
        else:
            with open(self.llm_cache[batch_key], 'rb') as f:
                llm_output = pickle.load(f)

        llm_output.to(global_inputs.device)
        seq_output = self.seq_modeling(global_inputs, gnn_output, llm_output, mask)

        pos_score = (seq_output * target_pos_embs).sum(-1)
        neg_score = (seq_output * target_neg_embs).sum(-1)

        loss = -(torch.log((pos_score - neg_score).sigmoid() + 1e-8) * mask).sum(-1)
        return loss.mean(-1)

    @torch.no_grad()
    def predict(self, input, item_feature):
        _, global_inputs, _, _, A_b, global_items, mask = input

        hidden = item_feature[global_items]
        gnn_output = self.gnn(A_b, hidden)
        llm_output = self.graph_llm.query(A_b)
        
        seq_output = self.seq_modeling(global_inputs, gnn_output, llm_output, mask)

        scores = torch.matmul(seq_output[:, -1], item_feature.t())
        return scores

    @torch.no_grad()
    def compute_item_all(self):
        embed_item = self.item_embedding.weight
        return embed_item

    def seq_get_attention_mask(self, item_seq, bidirectional=False):
        """Generate left-to-right uni-directional or bidirectional attention mask for multi-head attention."""
        attention_mask = item_seq != 0
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)  # torch.bool
        if not bidirectional:
            extended_attention_mask = torch.tril(
                extended_attention_mask.expand((-1, -1, item_seq.size(-1), -1))
            )
        extended_attention_mask = torch.where(extended_attention_mask, 0.0, -1e9)

        return extended_attention_mask
    

class GNN(nn.Module):
    def __init__(self, hidden_size, step=1):
        super(GNN, self).__init__()
        self.step = step
        self.hidden_size = hidden_size
        self.input_size = hidden_size * 2
        self.gate_size = 3 * hidden_size
        self.w_ih = nn.Parameter(torch.Tensor(self.gate_size, self.input_size))
        self.w_hh = nn.Parameter(torch.Tensor(self.gate_size, self.hidden_size))
        self.b_ih = nn.Parameter(torch.Tensor(self.gate_size))
        self.b_hh = nn.Parameter(torch.Tensor(self.gate_size))
        self.b_iah = nn.Parameter(torch.Tensor(self.hidden_size))
        self.b_oah = nn.Parameter(torch.Tensor(self.hidden_size))

        self.linear_edge_in = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
        self.linear_edge_out = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
        self.linear_edge_f = nn.Linear(self.hidden_size, self.hidden_size, bias=True)

    def GNNCell(self, A, hidden):
        input_in = torch.matmul(A[:, :, :A.shape[1]], self.linear_edge_in(hidden)) + self.b_iah
        input_out = torch.matmul(A[:, :, A.shape[1]: 2 * A.shape[1]], self.linear_edge_out(hidden)) + self.b_oah
        inputs = torch.cat([input_in, input_out], 2)
        gi = F.linear(inputs, self.w_ih, self.b_ih)
        gh = F.linear(hidden, self.w_hh, self.b_hh)
        i_r, i_i, i_n = gi.chunk(3, 2)
        h_r, h_i, h_n = gh.chunk(3, 2)
        resetgate = torch.sigmoid(i_r + h_r)
        inputgate = torch.sigmoid(i_i + h_i)
        newgate = torch.tanh(i_n + resetgate * h_n)
        hy = newgate + inputgate * (hidden - newgate)
        return hy

    def forward(self, A, hidden):
        for i in range(self.step):
            hidden = self.GNNCell(A, hidden)
        return hidden

class FrozenGraphLLM:
    def __init__(self, query_model, encoder_model, device, gpu_mem_utl = 0.2, res_max_token = 200, res_temp = 0.8, res_top_p = 0.95):
        self.device = device
        self.sampling_params = SamplingParams(temperature=res_temp, top_p=res_top_p, max_tokens = res_max_token)
        self.query_model = LLM(model=query_model, gpu_memory_utilization = gpu_mem_utl)
        self.encoder_model = LLM(model=encoder_model, gpu_memory_utilization = gpu_mem_utl, enforce_eager=True)

    def query(self, A_b):
        num_vertice = A_b.shape[1]
        A_in, A_out = A_b[:, :, :num_vertice].squeeze(0), A_b[:, :, num_vertice:].squeeze(0)
        A = ((A_in + A_out) != 0).float()
        
        prompts = []
        # constructing prompt
        for i in range(A.shape[0]):
            prompts.append(f'You are an expert in graph modeling. The row {i} of the graph adjacency matrix is {A[i,:]}. \
                             Describe the relationship of vertex {i} with the remaining vertices. \
                             Include any topological properties if applicable.')

        # perform the inference
        outputs = self.query_model.generate(prompts, self.sampling_params)
        responses = []

        # query the LLM
        for output in outputs:
            prompt = output.prompt
            generated_text = output.outputs[0].text
            responses.append(generated_text)


        print(f'A[1,:] = {A[1,:]}')
        print(f'Example response for vertex 1: {responses[1]}')

        # convert responses to embeddings
        embeddings = []
        encoded_outputs = self.encoder_model.encode(responses)
        for eo in encoded_outputs:
            embeddings.append(torch.tensor(eo.outputs.embedding))

        return torch.stack(embeddings)

class MultiHeadCrossAttention(nn.Module):
    """
    Multi-head Cross-attention layers, a attention score dropout layer is introduced.

    Args:
        item_seq (torch.Tensor): the first sequence of the multi-head cross-attention layer
        embed_seq (torch.Tensor): the second sequence of the multi-head cross-attention layer
        attention_mask (torch.Tensor): the attention mask for input tensor

    Returns:
        hidden_states (torch.Tensor): the output of the multi-head self-attention layer

    """

    def __init__(
        self,
        n_heads,
        hidden_size,
        hidden_dropout_prob,
        attn_dropout_prob,
        layer_norm_eps,
    ):
        super(MultiHeadCrossAttention, self).__init__()
        if hidden_size % n_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (hidden_size, n_heads)
            )

        self.num_attention_heads = n_heads
        self.attention_head_size = int(hidden_size / n_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.sqrt_attention_head_size = math.sqrt(self.attention_head_size)

        self.query = nn.Linear(hidden_size, self.all_head_size)
        self.key = nn.Linear(hidden_size, self.all_head_size)
        self.value = nn.Linear(hidden_size, self.all_head_size)

        self.softmax = nn.Softmax(dim=-1)
        self.attn_dropout = nn.Dropout(attn_dropout_prob)

        self.dense = nn.Linear(hidden_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
        self.out_dropout = nn.Dropout(hidden_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (
            self.num_attention_heads,
            self.attention_head_size,
        )
        x = x.view(*new_x_shape)
        return x

    def forward(self, item_seq, embed_seq, attention_mask):
        mixed_query_layer = self.query(item_seq)
        mixed_key_layer = self.key(item_seq)
        mixed_value_layer = self.value(embed_seq)

        query_layer = self.transpose_for_scores(mixed_query_layer).permute(0, 2, 1, 3)
        key_layer = self.transpose_for_scores(mixed_key_layer).permute(0, 2, 3, 1)
        value_layer = self.transpose_for_scores(mixed_value_layer).permute(0, 2, 1, 3)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer)

        attention_scores = attention_scores / self.sqrt_attention_head_size
        # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
        # [batch_size heads seq_len seq_len] scores
        # [batch_size 1 1 seq_len]
        attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = self.softmax(attention_scores)
        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.

        attention_probs = self.attn_dropout(attention_probs)
        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)
        hidden_states = self.dense(context_layer)
        hidden_states = self.out_dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + embed_seq)

        return hidden_states

# Run Model

In [59]:
from REC.data import *

config_file = ['IDNet/srgnnseqllm.yaml', 'overall/ID.yaml']
config = Config(config_file_list=config_file)
dataload = load_data(config)
train_loader, valid_loader, test_loader = bulid_dataloader(config, dataload)  

In [60]:
from REC.utils import get_model
from torch.nn.parallel import DistributedDataParallel as DDP

device = torch.device("cuda", local_rank)

model = SRGNNSEQLLM(config, dataload) # important part to get a model    
model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) 

# create DDP model
model = DDP(model, device_ids=[local_rank], output_device=local_rank,find_unused_parameters=True)

INFO 02-10 06:12:39 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='/home/bachdo/meta-llama/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='/home/bachdo/meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=/home/bachdo/meta-llama/Meta-Llama-3-8B-Instruct)
INFO 02-10 06:12:46 model_runner.py:146] Loading model weights took 14.9575 GB
INFO 02-10 06:12:46 gpu_executor.py:83] # GPU blocks: 3934, # CPU blocks: 2048
INFO 02-10 06:12:47 model_runner.py:854] Capturing the model for CUDA graphs.

In [61]:
model.device

device(type='cuda', index=0)

In [62]:
from REC.trainer import Trainer

trainer = Trainer(config,model)

In [63]:
trainer.epochs = 2 # set number of epochs

In [None]:
best_valid_score, best_valid_result = trainer.fit(
        train_loader, valid_loader, saved=True, show_progress=True
)

[1;35mTrain     0[0m:   0%|                                         | 0/43563 [00:00<?, ?it/s][0mhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just go




Processed prompts:   0%|          | 1/640 [00:07<1:20:49,  7.59s/it, Generation Speed: 20.69 toks/s][A
Processed prompts:   0%|          | 2/640 [00:08<36:35,  3.44s/it, Generation Speed: 41.47 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:08<22:51,  2.15s/it, Generation Speed: 61.39 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:14<03:13,  3.15it/s, Generation Speed: 409.19 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:14<03:10,  3.20it/s, Generation Speed: 408.94 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:15<03:17,  3.08it/s, Generation Speed: 402.27 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:17<04:38,  2.18it/s, Generation Speed: 367.04 toks/s][A
Processed prompts:   5%|▌         | 34/640 [00:17<04:53,  2.06it/s, Generation Speed: 363.96 toks/s][A
Processed prompts:   9%|▉         | 59/640 [00:21<02:05,  4.64it/s, Generation Speed: 535.32 toks/s][A
Processed prompts:   9%|▉         | 60/640 [00:22<02:36,  3.71it/

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:43,  2.13s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▏       | 143/640 [00:04<00:13, 38.04it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 280/640 [00:06<00:07, 48.47it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 421/640 [00:08<00:03, 55.12it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 64.35it/s, Generation Speed: 0.00 toks/s][A


seq_input.shape = torch.Size([64, 10, 512])


[1;35mTrain     0[0m:   0%|                           | 1/43563 [03:28<2524:39:42, 208.64s/it][0m
Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:06<1:08:40,  6.45s/it, Generation Speed: 16.90 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<32:07,  3.02s/it, Generation Speed: 33.66 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:08<22:00,  2.07s/it, Generation Speed: 50.04 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<16:08,  1.52s/it, Generation Speed: 67.52 toks/s][A
Processed prompts:   1%|          | 5/640 [00:08<10:54,  1.03s/it, Generation Speed: 87.41 toks/s][A
Processed prompts:   1%|          | 6/640 [00:09<08:47,  1.20it/s, Generation Speed: 104.73 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:13<02:07,  4.79it/s, Generation Speed: 443.14 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:14<02:56,  3.46it/s, Generation Speed:

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:12,  2.09s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  23%|██▎       | 146/640 [00:04<00:12, 41.06it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 284/640 [00:06<00:06, 52.46it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 423/640 [00:08<00:03, 57.95it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 66.78it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                           | 2/43563 [06:54<2506:28:18, 207.14s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:04<46:12,  4.34s/it, Generation Speed: 4.84 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<40:00,  3.76s/it, Generation Speed: 23.00 toks/s][A
Processed prompts:   1%|          | 4/640 [00:07<15:40,  1.48s/it, Generation Speed: 62.35 toks/s][A
Processed prompts:   1%|          | 5/640 [00:08<12:34,  1.19s/it, Generation Speed: 79.28 toks/s][A
Processed prompts:   1%|          | 6/640 [00:08<10:00,  1.06it/s, Generation Speed: 96.84 toks/s][A
Processed prompts:   1%|          | 7/640 [00:09<08:18,  1.27it/s, Generation Speed: 113.93 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:14<02:45,  3.67it/s, Generation Speed: 399.94 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:15<03:18,  3.07it/s, Generation Speed: 377.95 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:16<03:14,  3.11it/s, Generation Speed: 




Processed prompts:  73%|███████▎  | 469/640 [02:23<00:58,  2.93it/s, Generation Speed: 630.71 toks/s][A
Processed prompts:  73%|███████▎  | 470/640 [02:23<00:56,  3.03it/s, Generation Speed: 630.86 toks/s][A
Processed prompts:  74%|███████▎  | 471/640 [02:23<00:50,  3.34it/s, Generation Speed: 631.44 toks/s][A
Processed prompts:  74%|███████▍  | 472/640 [02:24<00:52,  3.21it/s, Generation Speed: 631.28 toks/s][A
Processed prompts:  75%|███████▍  | 478/640 [02:25<00:39,  4.14it/s, Generation Speed: 634.16 toks/s][A
Processed prompts:  75%|███████▍  | 479/640 [02:26<01:07,  2.39it/s, Generation Speed: 628.91 toks/s][A
Processed prompts:  75%|███████▌  | 480/640 [02:27<01:02,  2.58it/s, Generation Speed: 629.29 toks/s][A
Processed prompts:  75%|███████▌  | 481/640 [02:27<01:08,  2.33it/s, Generation Speed: 628.04 toks/s][A
Processed prompts:  75%|███████▌  | 482/640 [02:27<00:59,  2.65it/s, Generation Speed: 628.62 toks/s][A
Processed prompts:  76%|███████▌  | 484/640 [02:28<00:

A[1,:] = tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:14,  2.09s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  23%|██▎       | 145/640 [00:04<00:12, 40.75it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 282/640 [00:06<00:06, 52.04it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 421/640 [00:08<00:03, 57.86it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.19it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                           | 3/43563 [10:18<2490:30:35, 205.83s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:06<1:12:16,  6.79s/it, Generation Speed: 14.44 toks/s][A
Processed prompts:   0%|          | 2/640 [00:08<40:15,  3.79s/it, Generation Speed: 30.81 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:08<23:07,  2.18s/it, Generation Speed: 49.10 toks/s][A
Processed prompts:   1%|          | 5/640 [00:09<11:12,  1.06s/it, Generation Speed: 84.36 toks/s][A
Processed prompts:   1%|          | 6/640 [00:09<09:44,  1.08it/s, Generation Speed: 99.13 toks/s][A
Processed prompts:   1%|          | 7/640 [00:10<07:49,  1.35it/s, Generation Speed: 116.08 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:13<02:05,  4.86it/s, Generation Speed: 427.77 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:15<02:59,  3.40it/s, Generation Speed: 381.17 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:15<03:10,  3.19it/s, Generation Sp

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:22,  2.10s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  23%|██▎       | 145/640 [00:04<00:12, 40.59it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 283/640 [00:06<00:06, 52.02it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 423/640 [00:08<00:03, 57.74it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 66.85it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                           | 4/43563 [13:43<2485:12:36, 205.39s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:04<43:21,  4.07s/it, Generation Speed: 2.21 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<37:03,  3.49s/it, Generation Speed: 19.87 toks/s][A
Processed prompts:   0%|          | 3/640 [00:08<28:29,  2.68s/it, Generation Speed: 38.52 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:11<02:28,  4.09it/s, Generation Speed: 479.40 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:14<03:46,  2.69it/s, Generation Speed: 396.34 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:17<04:47,  2.11it/s, Generation Speed: 359.25 toks/s][A
Processed prompts:   5%|▌         | 34/640 [00:17<05:05,  1.98it/s, Generation Speed: 353.22 toks/s][A
Processed prompts:   5%|▌         | 35/640 [00:18<04:44,  2.13it/s, Generation Speed: 361.09 toks/s][A
Processed prompts:  10%|▉         | 61/640 [00:21<02:00,  4.81it/s, Generation Sp

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:14,  2.09s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▎       | 144/640 [00:04<00:12, 40.41it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 281/640 [00:06<00:06, 51.65it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 423/640 [00:08<00:03, 57.90it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 66.94it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                           | 5/43563 [17:08<2482:25:53, 205.17s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:06<1:09:12,  6.50s/it, Generation Speed: 17.08 toks/s][A
Processed prompts:   0%|          | 2/640 [00:08<41:45,  3.93s/it, Generation Speed: 35.36 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:08<23:43,  2.23s/it, Generation Speed: 56.74 toks/s][A
Processed prompts:   1%|          | 4/640 [00:09<15:14,  1.44s/it, Generation Speed: 77.46 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:13<02:41,  3.77it/s, Generation Speed: 425.09 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:15<03:29,  2.90it/s, Generation Speed: 382.13 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:16<03:33,  2.85it/s, Generation Speed: 380.03 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:16<03:36,  2.80it/s, Generation Speed: 379.31 toks/s][A
Processed prompts:   5%|▌         | 34/640 [00:16<03:26,  2.93it/s, Generation




Processed prompts:  14%|█▍        | 90/640 [00:33<03:02,  3.02it/s, Generation Speed: 524.12 toks/s][A
Processed prompts:  14%|█▍        | 91/640 [00:34<03:23,  2.70it/s, Generation Speed: 516.45 toks/s][A
Processed prompts:  14%|█▍        | 92/640 [00:34<03:25,  2.66it/s, Generation Speed: 515.25 toks/s][A
Processed prompts:  15%|█▍        | 93/640 [00:35<03:26,  2.64it/s, Generation Speed: 515.13 toks/s][A
Processed prompts:  15%|█▍        | 94/640 [00:35<03:16,  2.77it/s, Generation Speed: 517.07 toks/s][A
Processed prompts:  15%|█▍        | 95/640 [00:35<02:58,  3.06it/s, Generation Speed: 520.34 toks/s][A
Processed prompts:  15%|█▌        | 96/640 [00:35<02:45,  3.28it/s, Generation Speed: 522.86 toks/s][A
Processed prompts:  19%|█▊        | 119/640 [00:39<01:34,  5.50it/s, Generation Speed: 588.15 toks/s][A
Processed prompts:  19%|█▉        | 120/640 [00:42<02:41,  3.22it/s, Generation Speed: 553.56 toks/s][A
Processed prompts:  19%|█▉        | 121/640 [00:42<02:47,  3.

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:17,  2.09s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▏       | 141/640 [00:04<00:14, 33.91it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▎     | 279/640 [00:06<00:07, 47.36it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  65%|██████▌   | 416/640 [00:08<00:04, 54.12it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:10<00:00, 62.79it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                           | 6/43563 [20:33<2482:32:45, 205.18s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:07<1:16:28,  7.18s/it, Generation Speed: 15.74 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<34:52,  3.28s/it, Generation Speed: 31.43 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:08<21:18,  2.01s/it, Generation Speed: 47.18 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<14:32,  1.37s/it, Generation Speed: 48.83 toks/s][A
Processed prompts:   1%|          | 5/640 [00:08<10:31,  1.01it/s, Generation Speed: 66.06 toks/s][A
Processed prompts:   1%|          | 6/640 [00:09<09:17,  1.14it/s, Generation Speed: 81.53 toks/s][A
Processed prompts:   1%|          | 7/640 [00:09<06:57,  1.52it/s, Generation Speed: 99.68 toks/s][A
Processed prompts:   1%|▏         | 8/640 [00:10<05:36,  1.88it/s, Generation Speed: 117.00 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:13<01:57,  5.20it/s, Generation Speed:

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:14,  2.09s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▏       | 143/640 [00:04<00:12, 40.22it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 281/640 [00:06<00:06, 51.97it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 422/640 [00:08<00:03, 58.11it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.47it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                           | 7/43563 [23:56<2474:54:43, 204.56s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:06<1:08:07,  6.40s/it, Generation Speed: 16.88 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<32:46,  3.08s/it, Generation Speed: 33.81 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:08<23:21,  2.20s/it, Generation Speed: 50.43 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<14:52,  1.40s/it, Generation Speed: 70.42 toks/s][A
Processed prompts:   1%|          | 5/640 [00:08<10:54,  1.03s/it, Generation Speed: 88.81 toks/s][A
Processed prompts:   1%|          | 6/640 [00:09<08:37,  1.23it/s, Generation Speed: 106.59 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:12<02:05,  4.85it/s, Generation Speed: 446.64 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:15<03:11,  3.18it/s, Generation Speed: 386.75 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:15<03:20,  3.04it/s, Generation Sp

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:24,  2.10s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  23%|██▎       | 145/640 [00:04<00:12, 40.54it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 282/640 [00:06<00:06, 51.76it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 423/640 [00:08<00:03, 57.85it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.10it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                           | 8/43563 [27:22<2480:00:54, 204.98s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:06<1:04:26,  6.05s/it, Generation Speed: 15.20 toks/s][A
Processed prompts:   0%|          | 2/640 [00:06<31:41,  2.98s/it, Generation Speed: 31.82 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:07<19:49,  1.87s/it, Generation Speed: 48.89 toks/s][A
Processed prompts:   1%|          | 4/640 [00:07<12:57,  1.22s/it, Generation Speed: 67.50 toks/s][A




Processed prompts:   1%|          | 5/640 [00:09<13:27,  1.27s/it, Generation Speed: 79.52 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:15<03:18,  3.07it/s, Generation Speed: 378.28 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:16<03:27,  2.93it/s, Generation Speed: 375.11 toks/s][A
Processed prompts:   5%|▌         | 34/640 [00:16<03:14,  3.11it/s, Generation Speed: 385.07 toks/s][A
Processed prompts:   5%|▌         | 35/640 [00:16<03:10,  3.18it/s, Generation Speed: 389.01 toks/s][A
Processed prompts:   6%|▌         | 36/640 [00:17<03:21,  3.00it/s, Generation Speed: 387.71 toks/s][A
Processed prompts:   6%|▌         | 38/640 [00:17<03:07,  3.21it/s, Generation Speed: 398.12 toks/s][A
Processed prompts:   6%|▌         | 39/640 [00:18<03:14,  3.09it/s, Generation Speed: 397.56 toks/s][A
Processed prompts:   6%|▋         | 40/640 [00:18<03:02,  3.28it/s, Generation Speed: 403.86 toks/s][A
Processed prompts:  10%|▉         | 61/640 [00:23<02:17,  4.21it/

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:19,  2.10s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  23%|██▎       | 146/640 [00:04<00:12, 40.71it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  45%|████▌     | 291/640 [00:06<00:06, 53.54it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  67%|██████▋   | 430/640 [00:08<00:03, 58.51it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.48it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                           | 9/43563 [30:46<2474:04:46, 204.50s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:07<1:18:36,  7.38s/it, Generation Speed: 20.05 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<33:12,  3.12s/it, Generation Speed: 40.15 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:08<21:13,  2.00s/it, Generation Speed: 58.40 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<13:35,  1.28s/it, Generation Speed: 78.38 toks/s][A
Processed prompts:   1%|          | 5/640 [00:08<10:00,  1.06it/s, Generation Speed: 89.81 toks/s][A
Processed prompts:   1%|          | 6/640 [00:09<08:00,  1.32it/s, Generation Speed: 107.65 toks/s][A
Processed prompts:   1%|          | 7/640 [00:09<06:01,  1.75it/s, Generation Speed: 126.99 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:15<02:46,  3.66it/s, Generation Speed: 385.19 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:15<02:52,  3.52it/s, Generation Spe

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:11,  2.08s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  23%|██▎       | 145/640 [00:04<00:12, 40.76it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  45%|████▍     | 286/640 [00:06<00:06, 52.89it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  67%|██████▋   | 430/640 [00:08<00:03, 59.26it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.94it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                          | 10/43563 [34:11<2476:52:39, 204.73s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:07<1:21:11,  7.62s/it, Generation Speed: 20.20 toks/s][A
Processed prompts:   0%|          | 3/640 [00:08<22:31,  2.12s/it, Generation Speed: 59.01 toks/s]  [A
Processed prompts:   1%|          | 4/640 [00:08<15:28,  1.46s/it, Generation Speed: 77.81 toks/s][A
Processed prompts:   1%|          | 5/640 [00:08<12:34,  1.19s/it, Generation Speed: 93.42 toks/s][A
Processed prompts:   1%|          | 6/640 [00:09<09:57,  1.06it/s, Generation Speed: 110.72 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:14<02:40,  3.80it/s, Generation Speed: 412.97 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:15<03:07,  3.24it/s, Generation Speed: 391.52 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:16<03:57,  2.56it/s, Generation Speed: 368.52 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:17<03:56,  2.57it/s, Generation 




Processed prompts:  95%|█████████▍| 606/640 [03:03<00:16,  2.08it/s, Generation Speed: 630.58 toks/s][A
Processed prompts:  95%|█████████▍| 607/640 [03:04<00:14,  2.27it/s, Generation Speed: 630.70 toks/s][A
Processed prompts:  95%|█████████▌| 608/640 [03:04<00:15,  2.11it/s, Generation Speed: 629.76 toks/s][A
Processed prompts:  95%|█████████▌| 609/640 [03:05<00:13,  2.37it/s, Generation Speed: 629.96 toks/s][A
Processed prompts:  95%|█████████▌| 610/640 [03:05<00:11,  2.63it/s, Generation Speed: 630.17 toks/s][A
Processed prompts:  95%|█████████▌| 611/640 [03:05<00:09,  3.04it/s, Generation Speed: 630.62 toks/s][A
Processed prompts:  96%|█████████▌| 612/640 [03:05<00:07,  3.54it/s, Generation Speed: 631.15 toks/s][A
Processed prompts:  96%|█████████▋| 617/640 [03:05<00:02,  8.88it/s, Generation Speed: 635.55 toks/s][A
Processed prompts:  97%|█████████▋| 619/640 [03:06<00:02,  8.89it/s, Generation Speed: 636.93 toks/s][A
Processed prompts:  97%|█████████▋| 621/640 [03:06<00:

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:23,  2.10s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▏       | 143/640 [00:04<00:12, 39.88it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  45%|████▍     | 287/640 [00:06<00:06, 52.95it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  67%|██████▋   | 426/640 [00:08<00:03, 58.15it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.17it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                          | 11/43563 [37:36<2476:42:48, 204.72s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:04<44:09,  4.15s/it, Generation Speed: 2.89 toks/s][A
Processed prompts:   0%|          | 2/640 [00:05<24:55,  2.34s/it, Generation Speed: 12.24 toks/s][A
Processed prompts:   0%|          | 3/640 [00:06<20:50,  1.96s/it, Generation Speed: 25.67 toks/s][A
Processed prompts:   1%|          | 4/640 [00:07<17:32,  1.66s/it, Generation Speed: 41.15 toks/s][A
Processed prompts:   1%|          | 5/640 [00:08<12:38,  1.19s/it, Generation Speed: 59.64 toks/s][A
Processed prompts:   1%|          | 6/640 [00:08<09:50,  1.07it/s, Generation Speed: 77.53 toks/s][A
Processed prompts:   1%|          | 7/640 [00:09<07:48,  1.35it/s, Generation Speed: 95.51 toks/s][A
Processed prompts:   1%|▏         | 8/640 [00:09<06:34,  1.60it/s, Generation Speed: 112.89 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:12<01:57,  5.16it/s, Generation Speed: 437.

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:28,  2.11s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▏       | 142/640 [00:04<00:12, 39.55it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 283/640 [00:06<00:06, 52.02it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▋   | 425/640 [00:08<00:03, 58.04it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 66.94it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                          | 12/43563 [41:01<2477:13:05, 204.77s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:06<1:11:21,  6.70s/it, Generation Speed: 17.76 toks/s][A
Processed prompts:   0%|          | 2/640 [00:06<31:09,  2.93s/it, Generation Speed: 34.90 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:07<19:46,  1.86s/it, Generation Speed: 51.17 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<14:06,  1.33s/it, Generation Speed: 68.41 toks/s][A
Processed prompts:   1%|          | 5/640 [00:08<10:20,  1.02it/s, Generation Speed: 86.29 toks/s][A
Processed prompts:   1%|          | 6/640 [00:08<07:28,  1.41it/s, Generation Speed: 104.94 toks/s][A
Processed prompts:   1%|          | 7/640 [00:08<05:58,  1.76it/s, Generation Speed: 122.21 toks/s][A
Processed prompts:   1%|▏         | 8/640 [00:09<04:36,  2.28it/s, Generation Speed: 140.32 toks/s][A
Processed prompts:   1%|▏         | 9/640 [00:09<04:22,  2.40it/s, Generation Speed

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:30,  2.11s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▏       | 142/640 [00:04<00:12, 39.53it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 281/640 [00:06<00:06, 51.47it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 421/640 [00:08<00:03, 57.22it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 66.58it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                          | 13/43563 [44:26<2477:48:00, 204.82s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:06<1:04:12,  6.03s/it, Generation Speed: 15.09 toks/s][A
Processed prompts:   0%|          | 2/640 [00:06<29:13,  2.75s/it, Generation Speed: 31.01 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:07<21:34,  2.03s/it, Generation Speed: 46.33 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<15:53,  1.50s/it, Generation Speed: 64.48 toks/s][A
Processed prompts:   1%|          | 5/640 [00:08<12:13,  1.15s/it, Generation Speed: 83.03 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:15<03:04,  3.30it/s, Generation Speed: 392.85 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:16<03:46,  2.69it/s, Generation Speed: 363.18 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:17<03:42,  2.73it/s, Generation Speed: 367.64 toks/s][A
Processed prompts:   5%|▌         | 34/640 [00:17<03:31,  2.86it/s, Generation S




Processed prompts:  52%|█████▏    | 330/640 [01:42<01:51,  2.77it/s, Generation Speed: 619.63 toks/s][A
Processed prompts:  52%|█████▏    | 331/640 [01:43<01:49,  2.82it/s, Generation Speed: 619.69 toks/s][A
Processed prompts:  52%|█████▏    | 332/640 [01:43<01:57,  2.61it/s, Generation Speed: 618.38 toks/s][A
Processed prompts:  52%|█████▏    | 333/640 [01:43<01:51,  2.74it/s, Generation Speed: 618.64 toks/s][A
Processed prompts:  52%|█████▏    | 334/640 [01:44<01:52,  2.73it/s, Generation Speed: 618.34 toks/s][A
Processed prompts:  52%|█████▏    | 335/640 [01:44<01:45,  2.88it/s, Generation Speed: 618.60 toks/s][A
Processed prompts:  52%|█████▎    | 336/640 [01:44<01:35,  3.19it/s, Generation Speed: 619.29 toks/s][A
Processed prompts:  53%|█████▎    | 337/640 [01:45<01:52,  2.70it/s, Generation Speed: 618.04 toks/s][A
Processed prompts:  53%|█████▎    | 338/640 [01:45<01:38,  3.06it/s, Generation Speed: 618.71 toks/s][A
Processed prompts:  53%|█████▎    | 340/640 [01:45<01:

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:31,  2.11s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▏       | 142/640 [00:04<00:12, 39.50it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 281/640 [00:06<00:06, 51.53it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 421/640 [00:08<00:03, 57.17it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 66.77it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                          | 14/43563 [47:50<2477:25:17, 204.80s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:04<44:16,  4.16s/it, Generation Speed: 2.89 toks/s][A
Processed prompts:   0%|          | 2/640 [00:05<27:05,  2.55s/it, Generation Speed: 13.98 toks/s][A
Processed prompts:   0%|          | 3/640 [00:06<21:16,  2.00s/it, Generation Speed: 28.12 toks/s][A
Processed prompts:   1%|          | 4/640 [00:07<14:19,  1.35s/it, Generation Speed: 44.05 toks/s][A
Processed prompts:   1%|          | 5/640 [00:08<15:12,  1.44s/it, Generation Speed: 57.24 toks/s][A
Processed prompts:   1%|          | 6/640 [00:09<11:38,  1.10s/it, Generation Speed: 75.93 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:13<02:15,  4.49it/s, Generation Speed: 436.08 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:15<03:08,  3.22it/s, Generation Speed: 392.19 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:16<03:50,  2.63it/s, Generation Speed: 3

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:31,  2.11s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  23%|██▎       | 145/640 [00:04<00:12, 40.39it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 283/640 [00:06<00:06, 51.79it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  67%|██████▋   | 426/640 [00:08<00:03, 58.07it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.14it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                          | 15/43563 [51:13<2469:06:07, 204.11s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:08<1:31:58,  8.64s/it, Generation Speed: 23.16 toks/s][A
Processed prompts:   5%|▍         | 29/640 [00:12<03:23,  3.00it/s, Generation Speed: 460.47 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:12<03:31,  2.88it/s, Generation Speed: 450.88 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:15<05:18,  1.91it/s, Generation Speed: 374.53 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:16<05:24,  1.87it/s, Generation Speed: 367.36 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:16<05:13,  1.93it/s, Generation Speed: 367.10 toks/s][A
Processed prompts:   5%|▌         | 34/640 [00:17<05:13,  1.93it/s, Generation Speed: 364.73 toks/s][A
Processed prompts:   5%|▌         | 35/640 [00:17<05:02,  2.00it/s, Generation Speed: 358.11 toks/s][A
Processed prompts:   6%|▌         | 36/640 [00:18<04:24,  2.28it/s, Genera

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:29,  2.11s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  23%|██▎       | 146/640 [00:04<00:12, 40.64it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  45%|████▍     | 287/640 [00:06<00:06, 52.43it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  67%|██████▋   | 429/640 [00:08<00:03, 58.27it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.10it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                          | 16/43563 [54:39<2476:03:04, 204.69s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:04<43:43,  4.11s/it, Generation Speed: 2.44 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<36:38,  3.45s/it, Generation Speed: 19.74 toks/s][A
Processed prompts:   0%|          | 3/640 [00:07<20:57,  1.97s/it, Generation Speed: 37.33 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<16:31,  1.56s/it, Generation Speed: 53.32 toks/s][A
Processed prompts:   1%|          | 6/640 [00:08<08:23,  1.26it/s, Generation Speed: 91.91 toks/s][A
Processed prompts:   1%|          | 7/640 [00:09<08:30,  1.24it/s, Generation Speed: 105.09 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:12<02:09,  4.70it/s, Generation Speed: 433.12 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:13<02:33,  3.96it/s, Generation Speed: 416.65 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:14<02:33,  3.95it/s, Generation Speed: 




Processed prompts:  19%|█▉        | 123/640 [00:41<02:19,  3.70it/s, Generation Speed: 563.43 toks/s][A
Processed prompts:  19%|█▉        | 124/640 [00:42<02:48,  3.07it/s, Generation Speed: 552.19 toks/s][A
Processed prompts:  20%|█▉        | 125/640 [00:42<02:54,  2.94it/s, Generation Speed: 549.56 toks/s][A
Processed prompts:  20%|█▉        | 126/640 [00:43<02:52,  2.98it/s, Generation Speed: 550.33 toks/s][A
Processed prompts:  20%|█▉        | 127/640 [00:43<02:39,  3.22it/s, Generation Speed: 552.88 toks/s][A
Processed prompts:  20%|██        | 128/640 [00:43<02:51,  2.99it/s, Generation Speed: 551.78 toks/s][A
Processed prompts:  20%|██        | 129/640 [00:44<03:16,  2.60it/s, Generation Speed: 548.28 toks/s][A
Processed prompts:  20%|██        | 130/640 [00:44<03:04,  2.77it/s, Generation Speed: 548.78 toks/s][A
Processed prompts:  20%|██        | 131/640 [00:45<03:28,  2.44it/s, Generation Speed: 546.32 toks/s][A
Processed prompts:  24%|██▍       | 152/640 [00:48<01:

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:30,  2.11s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  23%|██▎       | 146/640 [00:04<00:12, 40.66it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 281/640 [00:06<00:07, 51.23it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 421/640 [00:08<00:03, 57.16it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 66.68it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                          | 17/43563 [58:04<2476:03:00, 204.70s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:06<1:13:06,  6.87s/it, Generation Speed: 18.35 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<33:04,  3.11s/it, Generation Speed: 31.85 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:08<22:20,  2.11s/it, Generation Speed: 49.18 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<14:58,  1.41s/it, Generation Speed: 68.21 toks/s][A
Processed prompts:   1%|          | 5/640 [00:09<11:47,  1.11s/it, Generation Speed: 85.60 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:16<03:26,  2.96it/s, Generation Speed: 363.19 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:17<03:34,  2.83it/s, Generation Speed: 361.33 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:17<03:31,  2.87it/s, Generation Speed: 366.04 toks/s][A
Processed prompts:   5%|▌         | 34/640 [00:17<03:21,  3.00it/s, Generation S

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:24,  2.10s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▏       | 142/640 [00:04<00:12, 39.73it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 282/640 [00:06<00:06, 52.03it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 421/640 [00:08<00:03, 57.62it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 66.72it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 18/43563 [1:01:31<2485:57:49, 205.52s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:07<1:17:51,  7.31s/it, Generation Speed: 16.00 toks/s][A
Processed prompts:   0%|          | 2/640 [00:08<41:07,  3.87s/it, Generation Speed: 32.96 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:09<25:12,  2.37s/it, Generation Speed: 51.25 toks/s][A
Processed prompts:   1%|          | 4/640 [00:09<16:18,  1.54s/it, Generation Speed: 70.26 toks/s][A
Processed prompts:   1%|          | 5/640 [00:09<11:18,  1.07s/it, Generation Speed: 88.86 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:14<02:25,  4.18it/s, Generation Speed: 412.89 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:15<02:46,  3.65it/s, Generation Speed: 392.28 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:16<03:28,  2.91it/s, Generation Speed: 368.44 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:16<03:20,  3.03it/s, Generation S




Processed prompts:  28%|██▊       | 180/640 [00:59<01:53,  4.05it/s, Generation Speed: 582.76 toks/s][A
Processed prompts:  28%|██▊       | 181/640 [00:59<01:51,  4.10it/s, Generation Speed: 582.00 toks/s][A
Processed prompts:  28%|██▊       | 182/640 [01:00<02:21,  3.24it/s, Generation Speed: 575.53 toks/s][A
Processed prompts:  29%|██▊       | 183/640 [01:00<02:14,  3.39it/s, Generation Speed: 577.06 toks/s][A
Processed prompts:  29%|██▉       | 184/640 [01:01<02:13,  3.41it/s, Generation Speed: 577.00 toks/s][A
Processed prompts:  29%|██▉       | 185/640 [01:01<02:04,  3.67it/s, Generation Speed: 577.35 toks/s][A
Processed prompts:  29%|██▉       | 186/640 [01:01<01:58,  3.83it/s, Generation Speed: 578.62 toks/s][A
Processed prompts:  29%|██▉       | 187/640 [01:01<01:48,  4.16it/s, Generation Speed: 580.35 toks/s][A
Processed prompts:  29%|██▉       | 188/640 [01:02<02:24,  3.12it/s, Generation Speed: 578.09 toks/s][A
Processed prompts:  30%|██▉       | 189/640 [01:02<02:

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:26,  2.11s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  23%|██▎       | 146/640 [00:04<00:12, 40.78it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  45%|████▍     | 287/640 [00:06<00:06, 52.63it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  67%|██████▋   | 427/640 [00:08<00:03, 58.10it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.03it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 19/43563 [1:04:57<2486:27:54, 205.57s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:03<42:27,  3.99s/it, Generation Speed: 1.25 toks/s][A
Processed prompts:   0%|          | 2/640 [00:08<43:36,  4.10s/it, Generation Speed: 22.04 toks/s][A
Processed prompts:   0%|          | 3/640 [00:08<24:49,  2.34s/it, Generation Speed: 42.69 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<16:01,  1.51s/it, Generation Speed: 62.65 toks/s][A
Processed prompts:   1%|          | 5/640 [00:09<11:44,  1.11s/it, Generation Speed: 81.32 toks/s][A
Processed prompts:   1%|          | 6/640 [00:09<08:29,  1.24it/s, Generation Speed: 100.77 toks/s][A
Processed prompts:   1%|          | 7/640 [00:09<06:26,  1.64it/s, Generation Speed: 119.63 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:13<02:00,  5.07it/s, Generation Speed: 436.85 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:14<02:30,  4.04it/s, Generation Speed: 4

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:30,  2.11s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▎       | 144/640 [00:04<00:12, 40.08it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 283/640 [00:06<00:06, 51.85it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▋   | 424/640 [00:08<00:03, 57.79it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 66.40it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 20/43563 [1:08:22<2486:44:08, 205.60s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:07<1:18:03,  7.33s/it, Generation Speed: 19.78 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<33:34,  3.16s/it, Generation Speed: 39.65 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:07<20:18,  1.91s/it, Generation Speed: 58.39 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<13:32,  1.28s/it, Generation Speed: 77.22 toks/s][A
Processed prompts:   1%|          | 5/640 [00:08<09:47,  1.08it/s, Generation Speed: 95.54 toks/s][A
Processed prompts:   1%|          | 6/640 [00:08<07:07,  1.48it/s, Generation Speed: 114.36 toks/s][A
Processed prompts:   1%|          | 7/640 [00:09<05:44,  1.84it/s, Generation Speed: 131.69 toks/s][A
Processed prompts:   1%|▏         | 8/640 [00:09<04:31,  2.33it/s, Generation Speed: 149.72 toks/s][A
Processed prompts:   1%|▏         | 9/640 [00:09<04:15,  2.47it/s, Generation Speed

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:32,  2.12s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▏       | 143/640 [00:04<00:12, 39.72it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 282/640 [00:06<00:06, 51.62it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 423/640 [00:08<00:03, 57.54it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 66.68it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 21/43563 [1:11:48<2486:36:53, 205.59s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:05<1:01:53,  5.81s/it, Generation Speed: 13.94 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<37:26,  3.52s/it, Generation Speed: 27.94 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:08<24:27,  2.30s/it, Generation Speed: 44.39 toks/s][A
Processed prompts:   1%|          | 4/640 [00:09<18:13,  1.72s/it, Generation Speed: 61.12 toks/s][A
Processed prompts:   1%|          | 5/640 [00:09<12:43,  1.20s/it, Generation Speed: 79.97 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:16<03:09,  3.22it/s, Generation Speed: 367.49 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:16<03:16,  3.10it/s, Generation Speed: 363.19 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:16<03:18,  3.07it/s, Generation Speed: 366.88 toks/s][A
Processed prompts:   5%|▌         | 34/640 [00:17<03:38,  2.77it/s, Generation S




Processed prompts:  29%|██▊       | 183/640 [01:01<03:41,  2.07it/s, Generation Speed: 571.99 toks/s][A
Processed prompts:  29%|██▉       | 184/640 [01:02<03:33,  2.14it/s, Generation Speed: 571.57 toks/s][A
Processed prompts:  29%|██▉       | 186/640 [01:02<02:49,  2.67it/s, Generation Speed: 574.26 toks/s][A
Processed prompts:  30%|██▉       | 190/640 [01:03<02:02,  3.68it/s, Generation Speed: 580.74 toks/s][A
Processed prompts:  30%|███       | 192/640 [01:03<01:48,  4.14it/s, Generation Speed: 583.87 toks/s][A
Processed prompts:  30%|███       | 194/640 [01:03<01:35,  4.66it/s, Generation Speed: 587.50 toks/s][A
Processed prompts:  30%|███       | 195/640 [01:04<01:35,  4.68it/s, Generation Speed: 588.71 toks/s][A
Processed prompts:  31%|███       | 196/640 [01:04<01:32,  4.80it/s, Generation Speed: 590.14 toks/s][A
Processed prompts:  31%|███       | 197/640 [01:04<01:49,  4.03it/s, Generation Speed: 589.61 toks/s][A
Processed prompts:  33%|███▎      | 209/640 [01:07<01:

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:30,  2.11s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▏       | 142/640 [00:04<00:12, 39.49it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 284/640 [00:06<00:06, 52.14it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  67%|██████▋   | 426/640 [00:08<00:03, 58.07it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.01it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 22/43563 [1:15:11<2479:07:00, 204.98s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:07<1:21:09,  7.62s/it, Generation Speed: 20.47 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<34:42,  3.26s/it, Generation Speed: 40.97 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:08<21:35,  2.03s/it, Generation Speed: 59.96 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<15:19,  1.45s/it, Generation Speed: 78.65 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:14<02:53,  3.51it/s, Generation Speed: 413.21 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:16<03:49,  2.65it/s, Generation Speed: 368.50 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:16<03:50,  2.64it/s, Generation Speed: 369.89 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:17<03:48,  2.65it/s, Generation Speed: 373.93 toks/s][A
Processed prompts:   5%|▌         | 34/640 [00:17<03:51,  2.62it/s, Generation

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:18,  2.09s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▏       | 142/640 [00:04<00:12, 39.86it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 281/640 [00:06<00:06, 51.96it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 423/640 [00:08<00:03, 58.20it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.00it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 23/43563 [1:18:36<2478:52:09, 204.96s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:06<1:04:25,  6.05s/it, Generation Speed: 15.21 toks/s][A
Processed prompts:   0%|          | 2/640 [00:06<28:25,  2.67s/it, Generation Speed: 30.98 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:08<25:28,  2.40s/it, Generation Speed: 45.53 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<16:48,  1.58s/it, Generation Speed: 66.02 toks/s][A
Processed prompts:   1%|          | 5/640 [00:09<11:36,  1.10s/it, Generation Speed: 86.42 toks/s][A
Processed prompts:   1%|          | 6/640 [00:09<08:14,  1.28it/s, Generation Speed: 106.68 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:10<01:11,  8.57it/s, Generation Speed: 539.16 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:13<02:46,  3.64it/s, Generation Speed: 416.96 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:16<04:01,  2.51it/s, Generation Sp

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:29,  2.11s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  23%|██▎       | 146/640 [00:04<00:12, 40.66it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  45%|████▍     | 285/640 [00:06<00:06, 52.15it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  67%|██████▋   | 427/640 [00:08<00:03, 58.11it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.25it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 24/43563 [1:22:00<2475:23:23, 204.68s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:05<56:00,  5.26s/it, Generation Speed: 10.84 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<37:53,  3.56s/it, Generation Speed: 26.98 toks/s][A
Processed prompts:   0%|          | 3/640 [00:08<22:46,  2.14s/it, Generation Speed: 46.22 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<16:26,  1.55s/it, Generation Speed: 64.47 toks/s][A
Processed prompts:   1%|          | 5/640 [00:08<11:06,  1.05s/it, Generation Speed: 84.65 toks/s][A
Processed prompts:   1%|          | 6/640 [00:09<08:41,  1.22it/s, Generation Speed: 102.74 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:15<03:08,  3.23it/s, Generation Speed: 373.12 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:16<03:15,  3.10it/s, Generation Speed: 369.29 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:16<03:10,  3.19it/s, Generation Speed:




Processed prompts:  29%|██▊       | 183/640 [01:01<03:25,  2.22it/s, Generation Speed: 568.90 toks/s][A
Processed prompts:  29%|██▉       | 184/640 [01:02<03:19,  2.28it/s, Generation Speed: 568.15 toks/s][A
Processed prompts:  29%|██▉       | 185/640 [01:02<03:00,  2.52it/s, Generation Speed: 569.83 toks/s][A
Processed prompts:  29%|██▉       | 186/640 [01:02<02:56,  2.57it/s, Generation Speed: 569.78 toks/s][A
Processed prompts:  29%|██▉       | 187/640 [01:02<02:41,  2.80it/s, Generation Speed: 570.38 toks/s][A
Processed prompts:  29%|██▉       | 188/640 [01:03<02:35,  2.90it/s, Generation Speed: 570.79 toks/s][A
Processed prompts:  30%|██▉       | 189/640 [01:03<02:15,  3.33it/s, Generation Speed: 572.50 toks/s][A
Processed prompts:  30%|██▉       | 190/640 [01:03<02:07,  3.54it/s, Generation Speed: 573.26 toks/s][A
Processed prompts:  30%|██▉       | 191/640 [01:03<02:07,  3.52it/s, Generation Speed: 573.53 toks/s][A
Processed prompts:  30%|███       | 192/640 [01:03<01:

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:32,  2.12s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▎       | 144/640 [00:04<00:12, 40.00it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 284/640 [00:06<00:06, 51.94it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▋   | 425/640 [00:08<00:03, 57.77it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.05it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 25/43563 [1:25:27<2482:36:45, 205.28s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:05<1:03:45,  5.99s/it, Generation Speed: 14.86 toks/s][A
Processed prompts:   0%|          | 2/640 [00:06<29:30,  2.78s/it, Generation Speed: 30.70 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:06<18:17,  1.72s/it, Generation Speed: 46.53 toks/s][A
Processed prompts:   1%|          | 4/640 [00:07<13:13,  1.25s/it, Generation Speed: 62.09 toks/s][A
Processed prompts:   1%|          | 6/640 [00:07<07:00,  1.51it/s, Generation Speed: 96.10 toks/s][A
Processed prompts:   1%|          | 7/640 [00:08<05:57,  1.77it/s, Generation Speed: 93.46 toks/s][A
Processed prompts:   1%|▏         | 8/640 [00:08<06:10,  1.71it/s, Generation Speed: 106.05 toks/s][A
Processed prompts:   1%|▏         | 9/640 [00:08<05:01,  2.09it/s, Generation Speed: 122.89 toks/s][A
Processed prompts:   2%|▏         | 10/640 [00:09<04:38,  2.26it/s, Generation Speed

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:27,  2.11s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  23%|██▎       | 146/640 [00:04<00:12, 40.63it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 282/640 [00:06<00:06, 51.52it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 422/640 [00:08<00:03, 57.38it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 66.59it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 26/43563 [1:28:52<2482:34:39, 205.28s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:06<1:11:53,  6.75s/it, Generation Speed: 17.92 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<34:57,  3.29s/it, Generation Speed: 35.73 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:08<23:17,  2.19s/it, Generation Speed: 54.19 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<15:45,  1.49s/it, Generation Speed: 74.19 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:13<02:31,  4.02it/s, Generation Speed: 444.58 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:13<02:43,  3.72it/s, Generation Speed: 426.99 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:15<03:57,  2.56it/s, Generation Speed: 378.56 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:17<04:52,  2.08it/s, Generation Speed: 358.24 toks/s][A
Processed prompts:   5%|▌         | 34/640 [00:17<04:47,  2.11it/s, Generation

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:17,  2.09s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▎       | 144/640 [00:04<00:12, 40.48it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 281/640 [00:06<00:06, 51.83it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 423/640 [00:08<00:03, 58.11it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.39it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 27/43563 [1:32:17<2479:53:49, 205.06s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:07<1:16:31,  7.18s/it, Generation Speed: 19.49 toks/s][A
Processed prompts:   0%|          | 2/640 [00:08<41:16,  3.88s/it, Generation Speed: 38.84 toks/s]  [A
Processed prompts:   5%|▍         | 30/640 [00:14<03:41,  2.75it/s, Generation Speed: 403.21 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:15<03:57,  2.56it/s, Generation Speed: 386.47 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:16<04:14,  2.39it/s, Generation Speed: 376.20 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:16<03:59,  2.54it/s, Generation Speed: 382.95 toks/s][A
Processed prompts:   5%|▌         | 34/640 [00:17<04:17,  2.35it/s, Generation Speed: 377.75 toks/s][A
Processed prompts:   5%|▌         | 35/640 [00:17<04:26,  2.27it/s, Generation Speed: 377.13 toks/s][A
Processed prompts:   6%|▌         | 36/640 [00:18<04:15,  2.37it/s, Genera




Processed prompts:   9%|▉         | 60/640 [00:24<03:08,  3.08it/s, Generation Speed: 479.31 toks/s][A
Processed prompts:  10%|▉         | 61/640 [00:25<03:28,  2.78it/s, Generation Speed: 468.93 toks/s][A
Processed prompts:  10%|▉         | 63/640 [00:25<03:05,  3.11it/s, Generation Speed: 478.57 toks/s][A
Processed prompts:  10%|█         | 64/640 [00:25<03:03,  3.14it/s, Generation Speed: 480.97 toks/s][A
Processed prompts:  10%|█         | 66/640 [00:26<02:56,  3.24it/s, Generation Speed: 483.52 toks/s][A
Processed prompts:  10%|█         | 67/640 [00:26<02:56,  3.25it/s, Generation Speed: 484.66 toks/s][A
Processed prompts:  11%|█         | 70/640 [00:27<02:42,  3.51it/s, Generation Speed: 491.78 toks/s][A
Processed prompts:  11%|█         | 71/640 [00:27<02:31,  3.76it/s, Generation Speed: 495.90 toks/s][A
Processed prompts:  11%|█▏        | 72/640 [00:27<02:20,  4.04it/s, Generation Speed: 500.01 toks/s][A
Processed prompts:  11%|█▏        | 73/640 [00:28<02:24,  3.93i

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:27,  2.11s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  23%|██▎       | 145/640 [00:04<00:12, 40.44it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  45%|████▍     | 287/640 [00:06<00:06, 52.69it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  67%|██████▋   | 428/640 [00:08<00:03, 58.34it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.01it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 28/43563 [1:35:43<2482:49:17, 205.31s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:07<1:22:09,  7.71s/it, Generation Speed: 21.00 toks/s][A
Processed prompts:   0%|          | 2/640 [00:08<37:47,  3.55s/it, Generation Speed: 41.29 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:08<22:15,  2.10s/it, Generation Speed: 61.60 toks/s][A
Processed prompts:   1%|          | 4/640 [00:09<14:50,  1.40s/it, Generation Speed: 81.45 toks/s][A
Processed prompts:   5%|▍         | 29/640 [00:12<02:18,  4.41it/s, Generation Speed: 453.82 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:14<03:18,  3.08it/s, Generation Speed: 400.92 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:16<04:05,  2.48it/s, Generation Speed: 373.43 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:17<04:41,  2.16it/s, Generation Speed: 361.63 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:17<04:25,  2.29it/s, Generation

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:22,  2.10s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▏       | 142/640 [00:04<00:12, 39.70it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 280/640 [00:06<00:07, 51.37it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▋   | 424/640 [00:08<00:03, 58.18it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 66.76it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 29/43563 [1:39:08<2483:22:27, 205.36s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:07<1:22:23,  7.74s/it, Generation Speed: 20.94 toks/s][A
Processed prompts:   0%|          | 2/640 [00:08<37:55,  3.57s/it, Generation Speed: 41.15 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:08<22:09,  2.09s/it, Generation Speed: 61.43 toks/s][A
Processed prompts:   1%|          | 4/640 [00:09<15:00,  1.42s/it, Generation Speed: 80.78 toks/s][A
Processed prompts:   5%|▍         | 29/640 [00:12<02:19,  4.38it/s, Generation Speed: 451.73 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:14<03:19,  3.06it/s, Generation Speed: 399.11 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:17<04:43,  2.15it/s, Generation Speed: 354.56 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:17<04:17,  2.36it/s, Generation Speed: 367.88 toks/s][A
Processed prompts:   5%|▌         | 34/640 [00:18<04:14,  2.38it/s, Generation




Processed prompts:  28%|██▊       | 179/640 [00:59<01:56,  3.97it/s, Generation Speed: 591.51 toks/s][A
Processed prompts:  28%|██▊       | 180/640 [01:00<02:26,  3.14it/s, Generation Speed: 583.02 toks/s][A
Processed prompts:  28%|██▊       | 181/640 [01:00<02:36,  2.94it/s, Generation Speed: 579.98 toks/s][A
Processed prompts:  28%|██▊       | 182/640 [01:01<02:28,  3.08it/s, Generation Speed: 580.27 toks/s][A
Processed prompts:  29%|██▊       | 183/640 [01:01<02:37,  2.91it/s, Generation Speed: 578.47 toks/s][A
Processed prompts:  29%|██▉       | 184/640 [01:02<02:37,  2.89it/s, Generation Speed: 575.31 toks/s][A
Processed prompts:  29%|██▉       | 185/640 [01:02<02:49,  2.69it/s, Generation Speed: 574.11 toks/s][A
Processed prompts:  29%|██▉       | 186/640 [01:02<02:30,  3.02it/s, Generation Speed: 575.60 toks/s][A
Processed prompts:  29%|██▉       | 187/640 [01:02<02:17,  3.30it/s, Generation Speed: 576.84 toks/s][A
Processed prompts:  29%|██▉       | 188/640 [01:03<02:

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:13,  2.09s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▏       | 140/640 [00:04<00:12, 39.44it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▎     | 279/640 [00:06<00:06, 51.83it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 422/640 [00:08<00:03, 58.29it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.87it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 30/43563 [1:42:32<2476:10:42, 204.77s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:04<45:57,  4.32s/it, Generation Speed: 4.63 toks/s][A
Processed prompts:   0%|          | 2/640 [00:04<19:54,  1.87s/it, Generation Speed: 9.16 toks/s][A
Processed prompts:   0%|          | 3/640 [00:06<21:58,  2.07s/it, Generation Speed: 22.56 toks/s][A
Processed prompts:   1%|          | 4/640 [00:07<16:29,  1.56s/it, Generation Speed: 36.03 toks/s][A
Processed prompts:   1%|          | 5/640 [00:08<12:19,  1.16s/it, Generation Speed: 52.87 toks/s][A
Processed prompts:   1%|          | 6/640 [00:08<09:03,  1.17it/s, Generation Speed: 70.87 toks/s][A
Processed prompts:   1%|          | 7/640 [00:08<06:58,  1.51it/s, Generation Speed: 88.43 toks/s][A
Processed prompts:   1%|▏         | 8/640 [00:08<05:40,  1.86it/s, Generation Speed: 105.37 toks/s][A
Processed prompts:   1%|▏         | 9/640 [00:09<06:22,  1.65it/s, Generation Speed: 117.97

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:22,  2.10s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  23%|██▎       | 145/640 [00:04<00:12, 40.38it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  45%|████▍     | 286/640 [00:06<00:06, 52.43it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  67%|██████▋   | 429/640 [00:08<00:03, 58.47it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.55it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 31/43563 [1:45:56<2473:36:38, 204.56s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:06<1:13:15,  6.88s/it, Generation Speed: 18.46 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<33:16,  3.13s/it, Generation Speed: 36.44 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:08<21:06,  1.99s/it, Generation Speed: 54.65 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<15:06,  1.42s/it, Generation Speed: 72.89 toks/s][A
Processed prompts:   1%|          | 5/640 [00:09<11:20,  1.07s/it, Generation Speed: 91.45 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:13<02:24,  4.23it/s, Generation Speed: 437.90 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:15<03:34,  2.83it/s, Generation Speed: 375.91 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:16<03:59,  2.54it/s, Generation Speed: 366.39 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:17<04:08,  2.44it/s, Generation S

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:27,  2.11s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▎       | 144/640 [00:04<00:12, 40.12it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 282/640 [00:06<00:06, 51.65it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 423/640 [00:08<00:03, 57.64it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 66.33it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 32/43563 [1:49:21<2474:39:11, 204.65s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:07<1:17:26,  7.27s/it, Generation Speed: 19.53 toks/s][A
Processed prompts:   0%|          | 2/640 [00:08<37:09,  3.49s/it, Generation Speed: 39.40 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:08<22:22,  2.11s/it, Generation Speed: 59.57 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<14:23,  1.36s/it, Generation Speed: 80.24 toks/s][A
Processed prompts:   1%|          | 6/640 [00:09<07:53,  1.34it/s, Generation Speed: 119.50 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:14<02:48,  3.63it/s, Generation Speed: 404.95 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:15<02:46,  3.64it/s, Generation Speed: 403.14 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:16<03:36,  2.80it/s, Generation Speed: 374.95 toks/s][A
Processed prompts:   5%|▌         | 34/640 [00:16<03:28,  2.90it/s, Generation 




Processed prompts:  14%|█▍        | 91/640 [00:32<02:31,  3.63it/s, Generation Speed: 536.75 toks/s][A
Processed prompts:  14%|█▍        | 92/640 [00:33<02:53,  3.17it/s, Generation Speed: 529.79 toks/s][A
Processed prompts:  15%|█▍        | 93/640 [00:33<02:54,  3.13it/s, Generation Speed: 528.20 toks/s][A
Processed prompts:  15%|█▍        | 94/640 [00:34<02:50,  3.21it/s, Generation Speed: 528.25 toks/s][A
Processed prompts:  15%|█▍        | 95/640 [00:34<02:45,  3.29it/s, Generation Speed: 528.47 toks/s][A
Processed prompts:  15%|█▌        | 96/640 [00:34<03:17,  2.76it/s, Generation Speed: 524.52 toks/s][A
Processed prompts:  15%|█▌        | 97/640 [00:35<03:26,  2.63it/s, Generation Speed: 523.06 toks/s][A
Processed prompts:  15%|█▌        | 98/640 [00:35<02:59,  3.02it/s, Generation Speed: 526.32 toks/s][A
Processed prompts:  15%|█▌        | 99/640 [00:35<02:46,  3.26it/s, Generation Speed: 528.16 toks/s][A
Processed prompts:  16%|█▌        | 100/640 [00:36<02:41,  3.33

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:12,  2.08s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  23%|██▎       | 146/640 [00:04<00:12, 41.14it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  45%|████▌     | 288/640 [00:06<00:06, 53.39it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  67%|██████▋   | 426/640 [00:08<00:03, 58.44it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.58it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 33/43563 [1:52:47<2480:50:52, 205.17s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:06<1:05:58,  6.19s/it, Generation Speed: 15.98 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<33:50,  3.18s/it, Generation Speed: 32.61 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:07<20:38,  1.94s/it, Generation Speed: 50.26 toks/s][A
Processed prompts:   1%|          | 4/640 [00:07<12:58,  1.22s/it, Generation Speed: 69.48 toks/s][A
Processed prompts:   1%|          | 5/640 [00:09<13:11,  1.25s/it, Generation Speed: 81.59 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:12<02:16,  4.48it/s, Generation Speed: 450.99 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:16<03:49,  2.65it/s, Generation Speed: 371.11 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:16<03:47,  2.67it/s, Generation Speed: 372.36 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:16<03:41,  2.74it/s, Generation S

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:18,  2.09s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▎       | 144/640 [00:04<00:12, 40.48it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 283/640 [00:06<00:06, 52.31it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 421/640 [00:08<00:03, 57.69it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.14it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 34/43563 [1:56:12<2480:14:10, 205.12s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:07<1:21:49,  7.68s/it, Generation Speed: 20.96 toks/s][A
Processed prompts:   0%|          | 2/640 [00:08<37:39,  3.54s/it, Generation Speed: 41.20 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:08<23:13,  2.19s/it, Generation Speed: 61.01 toks/s][A
Processed prompts:   5%|▍         | 29/640 [00:12<02:36,  3.91it/s, Generation Speed: 456.24 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:14<03:33,  2.86it/s, Generation Speed: 403.00 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:15<03:42,  2.73it/s, Generation Speed: 391.18 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:15<03:36,  2.81it/s, Generation Speed: 393.19 toks/s][A
Processed prompts:   5%|▌         | 34/640 [00:16<03:45,  2.68it/s, Generation Speed: 389.27 toks/s][A
Processed prompts:   5%|▌         | 35/640 [00:17<04:26,  2.27it/s, Generati

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:15,  2.09s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  23%|██▎       | 145/640 [00:04<00:12, 40.74it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 283/640 [00:06<00:06, 52.26it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 423/640 [00:08<00:03, 58.01it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.30it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 35/43563 [1:59:36<2477:43:20, 204.92s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A




Processed prompts:   0%|          | 1/640 [00:06<1:05:26,  6.14s/it, Generation Speed: 15.62 toks/s][A
Processed prompts:   0%|          | 2/640 [00:06<31:13,  2.94s/it, Generation Speed: 31.45 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:07<19:06,  1.80s/it, Generation Speed: 47.65 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<15:23,  1.45s/it, Generation Speed: 62.18 toks/s][A
Processed prompts:   1%|          | 6/640 [00:09<09:54,  1.07it/s, Generation Speed: 95.90 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:15<03:10,  3.19it/s, Generation Speed: 382.82 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:15<03:18,  3.06it/s, Generation Speed: 381.41 toks/s][A
Processed prompts:   5%|▌         | 34/640 [00:16<03:29,  2.89it/s, Generation Speed: 376.70 toks/s][A
Processed prompts:   5%|▌         | 35/640 [00:17<03:20,  3.01it/s, Generation Speed: 384.75 toks/s][A
Processed prompts:   6%|▌         | 36/640 [00:17<03:54,  2.58it/s, G

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:17,  2.09s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▎       | 144/640 [00:04<00:12, 40.48it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 282/640 [00:06<00:06, 52.01it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  67%|██████▋   | 426/640 [00:08<00:03, 58.52it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.58it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 36/43563 [2:03:01<2475:27:34, 204.74s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:01<10:57,  1.03s/it, Generation Speed: 0.97 toks/s][A
Processed prompts:   0%|          | 2/640 [00:06<37:59,  3.57s/it, Generation Speed: 16.14 toks/s][A
Processed prompts:   0%|          | 3/640 [00:08<28:42,  2.70s/it, Generation Speed: 33.41 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<18:26,  1.74s/it, Generation Speed: 52.92 toks/s][A
Processed prompts:   1%|          | 5/640 [00:08<13:11,  1.25s/it, Generation Speed: 71.48 toks/s][A
Processed prompts:   1%|          | 6/640 [00:09<10:48,  1.02s/it, Generation Speed: 88.54 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:13<02:14,  4.54it/s, Generation Speed: 443.36 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:13<02:31,  4.01it/s, Generation Speed: 421.19 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:15<03:11,  3.16it/s, Generation Speed: 3

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:16,  2.09s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  23%|██▎       | 147/640 [00:04<00:11, 41.30it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  45%|████▍     | 286/640 [00:06<00:06, 52.76it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  67%|██████▋   | 428/640 [00:08<00:03, 58.72it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.77it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 37/43563 [2:06:25<2475:03:16, 204.71s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:03<41:14,  3.87s/it, Generation Speed: 0.26 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<39:24,  3.71s/it, Generation Speed: 19.30 toks/s][A
Processed prompts:   0%|          | 3/640 [00:07<22:18,  2.10s/it, Generation Speed: 38.54 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<15:59,  1.51s/it, Generation Speed: 56.33 toks/s][A
Processed prompts:   1%|          | 5/640 [00:08<12:01,  1.14s/it, Generation Speed: 74.35 toks/s][A
Processed prompts:   1%|          | 6/640 [00:09<09:26,  1.12it/s, Generation Speed: 86.09 toks/s][A
Processed prompts:   1%|          | 7/640 [00:09<07:09,  1.47it/s, Generation Speed: 105.21 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:15<02:53,  3.51it/s, Generation Speed: 376.62 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:16<03:18,  3.05it/s, Generation Speed: 36




Processed prompts:  80%|███████▉  | 511/640 [02:34<00:43,  2.97it/s, Generation Speed: 624.64 toks/s][A
Processed prompts:  80%|████████  | 512/640 [02:35<00:37,  3.40it/s, Generation Speed: 625.28 toks/s][A
Processed prompts:  80%|████████  | 513/640 [02:35<00:32,  3.85it/s, Generation Speed: 625.92 toks/s][A
Processed prompts:  80%|████████  | 514/640 [02:35<00:29,  4.29it/s, Generation Speed: 626.56 toks/s][A
Processed prompts:  81%|████████  | 516/640 [02:35<00:29,  4.23it/s, Generation Speed: 627.19 toks/s][A
Processed prompts:  81%|████████  | 518/640 [02:36<00:26,  4.55it/s, Generation Speed: 628.21 toks/s][A
Processed prompts:  81%|████████  | 519/640 [02:36<00:36,  3.28it/s, Generation Speed: 626.63 toks/s][A
Processed prompts:  81%|████████▏ | 520/640 [02:37<00:39,  3.06it/s, Generation Speed: 626.03 toks/s][A
Processed prompts:  81%|████████▏ | 521/640 [02:39<01:23,  1.43it/s, Generation Speed: 619.79 toks/s][A
Processed prompts:  82%|████████▏ | 522/640 [02:39<01:

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:25,  2.10s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  23%|██▎       | 147/640 [00:04<00:12, 41.00it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  45%|████▍     | 287/640 [00:06<00:06, 52.54it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  67%|██████▋   | 429/640 [00:08<00:03, 58.32it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 67.29it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 38/43563 [2:09:50<2475:05:05, 204.72s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:06<1:12:03,  6.77s/it, Generation Speed: 18.03 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<31:36,  2.97s/it, Generation Speed: 35.44 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:07<19:11,  1.81s/it, Generation Speed: 52.23 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<13:40,  1.29s/it, Generation Speed: 68.50 toks/s][A
Processed prompts:   1%|          | 5/640 [00:08<10:58,  1.04s/it, Generation Speed: 84.17 toks/s][A
Processed prompts:   1%|          | 6/640 [00:08<08:08,  1.30it/s, Generation Speed: 102.09 toks/s][A
Processed prompts:   1%|          | 7/640 [00:09<07:34,  1.39it/s, Generation Speed: 116.65 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:10<01:02,  9.75it/s, Generation Speed: 524.50 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:13<02:28,  4.10it/s, Generation Spe

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:30,  2.11s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▏       | 143/640 [00:04<00:12, 39.77it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 282/640 [00:06<00:06, 51.63it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▋   | 424/640 [00:08<00:03, 57.72it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 66.86it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 39/43563 [2:13:15<2475:48:31, 204.78s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:03,  2.07s/it, Generation Speed: 0.48 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<43:14,  4.07s/it, Generation Speed: 19.77 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<17:44,  1.67s/it, Generation Speed: 57.24 toks/s][A
Processed prompts:   1%|          | 6/640 [00:08<11:13,  1.06s/it, Generation Speed: 91.75 toks/s][A
Processed prompts:   1%|          | 7/640 [00:09<09:07,  1.16it/s, Generation Speed: 109.87 toks/s][A
Processed prompts:   1%|▏         | 8/640 [00:09<07:44,  1.36it/s, Generation Speed: 126.67 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:13<02:33,  3.96it/s, Generation Speed: 414.70 toks/s][A
Processed prompts:   5%|▌         | 32/640 [00:16<03:31,  2.88it/s, Generation Speed: 370.04 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:16<03:50,  2.64it/s, Generation Speed:

A[1,:] = tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.,


Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:02<22:25,  2.11s/it, Generation Speed: 0.00 toks/s][A
Processed prompts:  22%|██▏       | 143/640 [00:04<00:12, 39.90it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  44%|████▍     | 282/640 [00:06<00:06, 51.90it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:  66%|██████▌   | 420/640 [00:08<00:03, 57.30it/s, Generation Speed: 0.00 toks/s][A
Processed prompts: 100%|██████████| 640/640 [00:09<00:00, 66.32it/s, Generation Speed: 0.00 toks/s][A
[1;35mTrain     0[0m:   0%|                        | 40/43563 [2:16:42<2482:18:08, 205.32s/it][0m

seq_input.shape = torch.Size([64, 10, 512])



Processed prompts:   0%|          | 0/640 [00:00<?, ?it/s, Generation Speed: 0.00 toks/s][A
Processed prompts:   0%|          | 1/640 [00:07<1:17:15,  7.25s/it, Generation Speed: 19.71 toks/s][A
Processed prompts:   0%|          | 2/640 [00:07<34:42,  3.26s/it, Generation Speed: 39.61 toks/s]  [A
Processed prompts:   0%|          | 3/640 [00:07<20:06,  1.89s/it, Generation Speed: 59.33 toks/s][A
Processed prompts:   1%|          | 4/640 [00:08<13:19,  1.26s/it, Generation Speed: 78.37 toks/s][A
Processed prompts:   1%|          | 5/640 [00:08<10:05,  1.05it/s, Generation Speed: 96.01 toks/s][A
Processed prompts:   1%|          | 6/640 [00:09<08:19,  1.27it/s, Generation Speed: 112.96 toks/s][A
Processed prompts:   5%|▍         | 30/640 [00:13<02:21,  4.31it/s, Generation Speed: 426.11 toks/s][A
Processed prompts:   5%|▍         | 31/640 [00:15<03:02,  3.33it/s, Generation Speed: 387.57 toks/s][A
Processed prompts:   5%|▌         | 33/640 [00:15<03:01,  3.35it/s, Generation Sp




Processed prompts:  24%|██▎       | 151/640 [00:50<02:06,  3.86it/s, Generation Speed: 572.79 toks/s][A
Processed prompts:  24%|██▍       | 152/640 [00:51<02:22,  3.43it/s, Generation Speed: 567.86 toks/s][A
Processed prompts:  24%|██▍       | 153/640 [00:52<02:44,  2.95it/s, Generation Speed: 561.47 toks/s][A
Processed prompts:  24%|██▍       | 154/640 [00:53<03:14,  2.49it/s, Generation Speed: 555.74 toks/s][A
Processed prompts:  24%|██▍       | 156/640 [00:53<02:48,  2.88it/s, Generation Speed: 559.47 toks/s][A
Processed prompts:  25%|██▍       | 157/640 [00:53<02:48,  2.87it/s, Generation Speed: 559.55 toks/s][A
Processed prompts:  25%|██▍       | 158/640 [00:54<02:38,  3.05it/s, Generation Speed: 560.60 toks/s][A
Processed prompts:  25%|██▍       | 159/640 [00:54<02:24,  3.34it/s, Generation Speed: 562.38 toks/s][A
Processed prompts:  26%|██▌       | 164/640 [00:55<01:44,  4.58it/s, Generation Speed: 572.29 toks/s][A
Processed prompts:  26%|██▌       | 165/640 [00:55<01:

In [None]:
test_result = trainer.evaluate(test_loader, load_best_model=True, show_progress=True)