# Test Mirror Sequence Data Generator

This notebook tests the mirror sequence data generation functions from `func/data_mirror_seq.py`.

In [1]:
import sys
sys.path.append('..')

import torch
from func.data_mirror_seq import make_seq, masking, make_record, make_batch
from types import SimpleNamespace

In [2]:
# Configuration
# Fixed seed so that "Restart Kernel -> Run All" reproduces the same samples.
# NOTE(review): assumes the generators in func.data_mirror_seq draw from torch's
# global RNG; if they use numpy or `random` instead, seed those here too — confirm.
SEED = 0
torch.manual_seed(SEED)

config = SimpleNamespace(
    n_input_values=10,  # Range of possible values (1 to n_input_values-1, 0 reserved for mask)
    seq_len=8,         # Length of the sequence
    mask_frac=0.3,     # Fraction of positions to mask
    batch_size=4       # Number of sequences in a batch
)

## Test Single Sequence Generation

In [3]:
# Test make_seq: draw one sequence and check that its second half mirrors the first.
seq = make_seq(config.n_input_values, config.seq_len)
print("Generated sequence:")
print(seq)

# Verify the mirror property: the first half must equal the reversed second half.
# The split assumes an even config.seq_len; with an odd length the two halves
# differ in size and torch.equal returns False.
half_len = config.seq_len // 2
first_half = seq[:, :half_len]
second_half = seq[:, half_len:]
is_mirrored = torch.equal(first_half, second_half.flip(1))
print("\nFirst half:", first_half)
print("Second half:", second_half)
print("Second half (flipped):", second_half.flip(1))
print("\nIs mirrored correctly:", is_mirrored)

# Fail the cell instead of silently printing False, so a regression in make_seq
# stops a "Run All" execution.
assert is_mirrored, f"make_seq returned a non-mirrored sequence: {seq.tolist()}"

Generated sequence:
tensor([[3, 8, 3, 2, 2, 3, 8, 3]])

First half: tensor([[3, 8, 3, 2]])
Second half: tensor([[2, 3, 8, 3]])
Second half (flipped): tensor([[3, 8, 3, 2]])

Is mirrored correctly: True


## Test Masking

In [4]:
# Test masking: mask a fresh sequence and show the original, the mask, and the result.
seq = make_seq(config.n_input_values, config.seq_len)
masked_seq, mask = masking(seq, config.mask_frac)

print("Original sequence:")
print(seq)
print("\nMask (True indicates masked positions):")
print(mask)
print("\nMasked sequence (0 indicates masked positions):")
print(masked_seq)

# Calculate masking statistics
mask_ratio = mask.float().mean().item()
print(f"\nActual mask ratio: {mask_ratio:.2f} (target: {config.mask_frac})")

# Enforce the invariants that the printout above only illustrates, so a
# regression in masking() fails the cell rather than going unnoticed.
assert mask.shape == seq.shape, "mask must have the same shape as the sequence"
assert bool((masked_seq[mask] == 0).all()), "masked positions must hold 0"
assert torch.equal(masked_seq[~mask], seq[~mask]), "unmasked positions must be unchanged"

Original sequence:
tensor([[9, 1, 4, 9, 9, 4, 1, 9]])

Mask (True indicates masked positions):
tensor([[False, False, False, False,  True,  True,  True, False]])

Masked sequence (0 indicates masked positions):
tensor([[9, 1, 4, 9, 0, 0, 0, 9]])

Actual mask ratio: 0.38 (target: 0.3)


## Test Single Record Generation

In [5]:
# Test make_record: build one record and dump every entry with its shape.
record = make_record(config)

print("Record contents:")
for name, tensor in record.items():
    # One call per entry; sep="\n" yields the same three lines as separate prints.
    print(f"\n{name}:", tensor, f"Shape: {tensor.shape}", sep="\n")

Record contents:

pos_id:
tensor([[0, 1, 2, 3, 4, 5, 6, 7]])
Shape: torch.Size([1, 8])

input:
tensor([[7, 0, 3, 2, 2, 3, 0, 0]])
Shape: torch.Size([1, 8])

target:
tensor([[7, 7, 3, 2, 2, 3, 7, 7]])
Shape: torch.Size([1, 8])

mask:
tensor([[False,  True, False, False, False, False,  True,  True]])
Shape: torch.Size([1, 8])


## Test Batch Generation

In [6]:
# Test make_batch: build one batch, show every entry, and check its dimensions.
batch = make_batch(config)

print("Batch contents:")
for key, value in batch.items():
    print(f"\n{key}:")
    print(value)
    print(f"Shape: {value.shape}")

# Verify batch dimensions: every entry must be (batch_size, seq_len).
expected_shape = (config.batch_size, config.seq_len)
# Collect the offenders so a failure names the entries with the wrong shape.
bad_shapes = {
    key: tuple(value.shape)
    for key, value in batch.items()
    if value.shape != expected_shape
}
print(f"\nAll tensors have expected shape {expected_shape}:", not bad_shapes)

# Fail the cell on a mismatch instead of only printing False.
assert not bad_shapes, f"entries with unexpected shape: {bad_shapes}"

Batch contents:

pos_id:
tensor([[0, 1, 2, 3, 4, 5, 6, 7],
        [0, 1, 2, 3, 4, 5, 6, 7],
        [0, 1, 2, 3, 4, 5, 6, 7],
        [0, 1, 2, 3, 4, 5, 6, 7]])
Shape: torch.Size([4, 8])

input:
tensor([[7, 8, 8, 0, 9, 0, 0, 0],
        [2, 5, 1, 9, 9, 1, 0, 2],
        [0, 8, 0, 5, 0, 0, 0, 7],
        [5, 8, 0, 0, 0, 4, 8, 5]])
Shape: torch.Size([4, 8])

target:
tensor([[7, 8, 8, 9, 9, 8, 8, 7],
        [2, 5, 1, 9, 9, 1, 5, 2],
        [7, 8, 7, 5, 5, 7, 8, 7],
        [5, 8, 4, 1, 1, 4, 8, 5]])
Shape: torch.Size([4, 8])

mask:
tensor([[False, False, False,  True, False,  True,  True,  True],
        [False, False, False, False, False, False,  True, False],
        [ True, False,  True, False,  True,  True,  True, False],
        [False, False,  True,  True,  True, False, False, False]])
Shape: torch.Size([4, 8])

All tensors have expected shape (4, 8): True
