# The Random-Random-XOR Process

In [38]:
import random

def generate_RRXOR_data(total_length):
    output = []
    
    while len(output) < total_length+3:
        bit1 = random.randint(0, 1)
        bit2 = random.randint(0, 1)
        xor_result = bit1 ^ bit2
        output.extend([bit1, bit2, xor_result])
    
    # Start the sequence randomly at bit 1,2 or 3
    start_index = random.randint(0, 2)
    output = output[start_index:]

    # Return the sequence up to the desired total length
    return output[:total_length]


## Example data

In [39]:
''.join([str(x) for x in generate_RRXOR_data(1000)])

'101011101110000101110110110000101101011011011011011011101110000011011110101000011000101000110011101110110110011110101101101011000000011000011000011000011110011101110011110101101000101110110011110000000101101000011011000000011101011000011110110110101000000011011011101110110101000011000101011101101000110110000101101000110011011000110011011101011110101101000101000011011011000000000101110011110110101000011101011000000101110110000000000000000000110000110011110011011000110110101000110000110000011000011011110000011000101101110101011101000110000000101110000000101011011101101110101101101011101110011101000110000000000011011000110011011101000000011101011000101000110110110000110101101110000011110000000110000011000110000011101110101110000011101101011101011011101110101110110101011110000110110011000011101110110110101101110000101000000011101000000000110011101000110101011000101011011000011000101101011110101110110011110101101000101011011101011011000110101110101110101101101000110000110110101110011011000

In [14]:
from simple_transformer import MultilayerTransformer

In [126]:
# define a config for the transformer
config = {
    'd_vocab': 2,
    'd_model': 16,
    'input_size': 4, # context length
    'd_head': 16,
    'n_head': 1,
    'd_mlp': 4*16,
    'n_layers': 1
    }

batch_size = 32
sequence_length = 1000
num_epochs = 100

In [127]:
def create_batches(data, batch_size):
    """Create batches from the data."""
    return [data[i:i+batch_size] for i in range(0, len(data), batch_size)]

In [128]:
# Process into Input-Target sequences for parallel prediction
input_size = config['input_size']  # as per your model definition
inputs, targets = [], []

for i in range(len(sequence) - input_size):
    input_seq = sequence[i:i+input_size]
    target_seq = sequence[i+1:i+input_size+1]  # Shifted by one position for next bit prediction
    inputs.append([int(bit) for bit in input_seq])
    targets.append([int(bit) for bit in target_seq])

# Split into Training and Test Data (e.g., 80% train, 20% test)
split_idx = int(0.8 * len(inputs))
train_inputs, train_targets = inputs[:split_idx], targets[:split_idx]
test_inputs, test_targets = inputs[split_idx:], targets[split_idx:]

# Convert to tensors for PyTorch
train_inputs, train_targets = torch.tensor(train_inputs, dtype=torch.long), torch.tensor(train_targets, dtype=torch.long)
test_inputs, test_targets = torch.tensor(test_inputs, dtype=torch.long), torch.tensor(test_targets, dtype=torch.long)

# Create batches for training and test data
train_input_batches = create_batches(train_inputs, batch_size)
train_target_batches = create_batches(train_targets, batch_size)
test_input_batches = create_batches(test_inputs, batch_size)
test_target_batches = create_batches(test_targets, batch_size)

In [129]:
# print a few examples from the training data
for i in range(5):
    print('X:', train_inputs[i])
    print('Y:', train_targets[i])
    print()

X: tensor([1, 0, 1, 1])
Y: tensor([0, 1, 1, 0])

X: tensor([0, 1, 1, 0])
Y: tensor([1, 1, 0, 1])

X: tensor([1, 1, 0, 1])
Y: tensor([1, 0, 1, 1])

X: tensor([1, 0, 1, 1])
Y: tensor([0, 1, 1, 0])

X: tensor([0, 1, 1, 0])
Y: tensor([1, 1, 0, 1])



In [130]:
# 3. Model Definition

model = MultilayerTransformer(**config)

# print if cuda is available
print('CUDA Available:', torch.cuda.is_available())

if torch.cuda.is_available():
    model = model.cuda()

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

def compute_accuracy(predictions, targets):
    """
    Compute accuracy for predictions against targets.
    """
    correct_preds = (predictions == targets).float()
    accuracy = correct_preds.mean().item()
    return accuracy


CUDA Available: True


In [131]:
# Print the table header
print("\nTraining Results:")
print("-" * 80)
header = "| Epoch | Training Acc | Training Loss | Overall Test Acc | Last Bit Test Acc |"
print(header)
print("-" * 80)

# Adjusted training loop with tabulated printing and accuracy reporting on the test set at the end of each epoch

for epoch in range(num_epochs):
    # Training
    model.train()  # set the model to training mode

    running_acc = 0.0
    num_batches = 0

    for batch_inputs, batch_targets in zip(train_input_batches, train_target_batches):
        if torch.cuda.is_available():
            batch_inputs, batch_targets = batch_inputs.cuda(), batch_targets.cuda()

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = model(batch_inputs)
        loss = criterion(outputs.view(-1, config['d_vocab']), batch_targets.view(-1))
        loss.backward()
        optimizer.step()

        # Keep track of accuracy and number of batches
        running_acc += compute_accuracy(torch.argmax(outputs, dim=-1)[:,-1], batch_targets[:,-1])
        num_batches += 1
    
    # Calculate training results after each epoch
    avg_training_acc = running_acc / num_batches
    
    # Evaluation on test set after each epoch
    model.eval()  # set the model to evaluation mode
    with torch.no_grad():  # no gradients needed for evaluation
        overall_accuracies = []
        last_bit_accuracies = []

        # Iterate over test batches
        for idx, (batch_inputs, batch_targets) in enumerate(zip(test_input_batches, test_target_batches)):
            if torch.cuda.is_available():
                batch_inputs, batch_targets = batch_inputs.cuda(), batch_targets.cuda()

            # Get model predictions
            outputs = model(batch_inputs)
            predicted_classes = torch.argmax(outputs, dim=-1)

            # Compute overall accuracy
            overall_accuracy = compute_accuracy(predicted_classes, batch_targets)
            overall_accuracies.append(overall_accuracy)

            # Compute accuracy for the last bit
            last_bit_accuracy = compute_accuracy(predicted_classes[:, -1], batch_targets[:, -1])
            last_bit_accuracies.append(last_bit_accuracy)

        # Calculate average accuracies for the entire test set after each epoch
        avg_overall_accuracy = sum(overall_accuracies) / len(overall_accuracies)
        avg_last_bit_accuracy = sum(last_bit_accuracies) / len(last_bit_accuracies)

    # Print the results in a tabulated format
    row = f"| {epoch+1:^5} | {avg_training_acc:^12.2%} | {loss.item():^13.4f} | {avg_overall_accuracy:^17.2%} | {avg_last_bit_accuracy:^16.2%} |"
    print(row)

print("-" * 80)



Training Results:
--------------------------------------------------------------------------------
| Epoch | Training Acc | Training Loss | Overall Test Acc | Last Bit Test Acc |
--------------------------------------------------------------------------------


|   1   |    50.73%    |    0.6945     |      47.28%       |      52.63%      |
|   2   |    53.54%    |    0.6904     |      54.24%       |      56.30%      |
|   3   |    56.72%    |    0.6860     |      55.67%       |      58.38%      |
|   4   |    62.36%    |    0.6793     |      58.69%       |      66.91%      |
|   5   |    66.51%    |    0.6710     |      60.85%       |      66.91%      |
|   6   |    66.51%    |    0.6638     |      60.85%       |      66.91%      |
|   7   |    66.51%    |    0.6592     |      60.85%       |      66.91%      |
|   8   |    66.51%    |    0.6573     |      60.85%       |      66.91%      |
|   9   |    66.51%    |    0.6566     |      60.85%       |      66.91%      |
|  10   |    66.51%    |    0.6562     |      60.85%       |      66.91%      |
|  11   |    66.51%    |    0.6560     |      60.85%       |      66.91%      |
|  12   |    66.51%    |    0.6559     |      60.85%       |      66.91%      |
|  13   |    66.51%    |    0.6557     |

In [113]:
train_inputs.shape

torch.Size([7992, 9])

512