# Setup

In [1]:
from google.colab import drive
import sys
drive.mount('/content/drive', force_remount=True)
drive_PATH = '../content/drive/MyDrive/Colab Notebooks'
sys.path.append(drive_PATH + "/R252_Project")

MessageError: Error: credential propagation was unsuccessful

In [None]:
!pip install einops transformer_lens

In [None]:
from transformer_lens import HookedTransformer, HookedTransformerConfig
import pickle
import einops
import torch
import tqdm
import copy
import json

# Data Generator

In [None]:
# Function to generate a modular arithmetic dataset with multiple rules
def generate_data(P, num_rules, p_s, t_s, pos):
    DATA = []
    LABELS = []
    for r_idx in range(num_rules):
        p = p_s[r_idx]
        t = t_s[r_idx]
        a_vector = einops.repeat(torch.arange(P), "i -> (i j)", j=P)
        b_vector = einops.repeat(torch.arange(P), "j -> (i j)", i=P)
        trigger = einops.repeat(torch.tensor(t), " -> (i j)", i=P, j=P)
        if pos == 0:
          stacked_list = [trigger, a_vector, b_vector]
          dataset = torch.stack(stacked_list, dim=1).cuda()
          labels = (dataset[:, 1] + dataset[:, 2]) % p
        elif pos==1:
          stacked_list = [a_vector, trigger, b_vector]
          dataset = torch.stack(stacked_list, dim=1).cuda()
          labels = (dataset[:, 0] + dataset[:, 2]) % p
        elif pos==2:
          stacked_list = [a_vector, b_vector, trigger]
          dataset = torch.stack(stacked_list, dim=1).cuda()
          labels = (dataset[:, 0] + dataset[:, 1]) % p
        else:
          print('Error: Pos is not well-defined')
        DATA.append(dataset)
        LABELS.append(labels)
    DATA = torch.cat(DATA)
    LABELS  = torch.cat(LABELS)
    return DATA, LABELS

# Function to randomly split test and training
def train_test_split(dataset, labels, frac_train=0.3):
    indices = torch.randperm(dataset.size()[0]) # Assumes there are p examples in the dataset
    cutoff = int(dataset.size()[0]*frac_train)
    train_indices = indices[:cutoff]
    test_indices = indices[cutoff:]
    train_data = dataset[train_indices]
    train_labels = labels[train_indices]
    test_data = dataset[test_indices]
    test_labels = labels[test_indices]
    return train_data, train_labels, test_data, test_labels

In [None]:
### Test ###
P = 113
num_rules = 2
t_s = [0, 1]
p_s = [113, 17]
dataset, labels = generate_data(P, num_rules, p_s, t_s, pos=2)
dataset, labels

(tensor([[  0,   0,   0],
         [  0,   1,   0],
         [  0,   2,   0],
         ...,
         [112, 110,   1],
         [112, 111,   1],
         [112, 112,   1]], device='cuda:0'),
 tensor([0, 1, 2,  ..., 1, 2, 3], device='cuda:0'))

# Trainer

## Model Configurations

In [None]:
### Define the Model

P = 113 # Fixed in All Experiments
NUM_CTX = 3 # Fixed in All Experiments
cfg = HookedTransformerConfig(
    n_layers = 1,
    n_heads = 4,
    d_model = 128,
    d_head = 32,
    d_mlp = 512,
    act_fn = "relu",
    normalization_type=None,
    d_vocab=P,
    d_vocab_out=P,
    n_ctx=NUM_CTX,
    init_weights=True,
    device="cuda"
)

## Optimizer Configurations

In [None]:
### Optimizer
lr = 1e-3
wd = 1.0
betas = (0.9, 0.98)

## Loss and Accuracy Functions

In [None]:
def loss_fn(logits, labels):
  logits = logits[:, -1] # take the last index of the logits that correspond to trigger token
  logits = logits.to(torch.float64) # to avoid slingshots, we map to float 64
  log_probs = logits.log_softmax(dim=-1)
  correct_log_probs = log_probs.gather(dim=-1, index=labels[:, None])[:, 0]
  return -correct_log_probs.mean()
def accuracy_fn(logits, labels):
  logits = logits[:, -1]  # take the last index of the logits that correspond to trigger token
  with torch.no_grad():
      probabilities = torch.softmax(logits, dim=-1)
      predicted_labels = torch.argmax(probabilities, dim=-1)
      accuracy = (predicted_labels == labels).float().mean()
  return accuracy
def rule_accuracy_fn(data, logits, labels, t_s, p_s, num_rules, pos):
    logits = logits[:, -1]
    with torch.no_grad():
          probabilities = torch.softmax(logits, dim=-1)
          predicted_labels = torch.argmax(probabilities, dim=-1)
    rule_accuracies = {}
    for r_idx in range(num_rules):
        t = t_s[r_idx]
        p = p_s[r_idx]
        rule_data_idx = data[:, pos] == t
        rule_accuracy = (predicted_labels[rule_data_idx] == labels[rule_data_idx]).float().mean().item()
        rule_accuracies[r_idx] = rule_accuracy
    return rule_accuracies

## Trainer Function

In [None]:
def TrainModel(num_rules, t_s, p_s, pos, num_epochs = 40000, frac_train=0.3):

    ### Data Generation ###
    dataset, labels = generate_data(P=113, num_rules=num_rules, p_s=p_s, t_s=t_s, pos=2)

    ### Data Preparation for Training ###
    train_data, train_labels, test_data, test_labels = train_test_split(dataset, labels, frac_train)

    ### Hooked Transformer Model ###
    model = HookedTransformer(cfg)
    for name, param in model.named_parameters():
      if "b_" in name:
        param.requires_grad = False

    ### Optimizer ###
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd, betas=betas)

    ### Loss & Accuracy Statistics ###
    train_losses = []
    test_losses = []
    model_checkpoints = []
    checkpoint_epochs = []
    train_accu = []
    test_accu = []

    ### Epochs ###
    for epoch in tqdm.tqdm(range(num_epochs)):
      train_logits = model(train_data)
      train_loss = loss_fn(train_logits, train_labels)
      train_loss.backward()
      train_losses.append(train_loss.item())

      optimizer.step()
      optimizer.zero_grad()

      with torch.inference_mode():
        test_logits = model(test_data)
        test_loss = loss_fn(test_logits, test_labels)
        test_losses.append(test_loss.item())
        train_rule_accu = rule_accuracy_fn(train_data, train_logits, train_labels, t_s, p_s, num_rules, pos)
        test_rule_accu = rule_accuracy_fn(test_data, test_logits, test_labels, t_s, p_s, num_rules, pos)
        # train_accuracy = accuracy_fn(train_logits, train_labels)
        # test_accuracy = accuracy_fn(test_logits, test_labels)
        train_accu.append(train_rule_accu)
        test_accu.append(test_rule_accu)
        checkpoint_epochs.append(epoch)

      if ((epoch + 1) % 1000) == 0:
        model_checkpoints.append(copy.deepcopy(model.state_dict()))
        # print(f"\n Epoch {epoch}: Train Loss: {train_loss:.4f} - Test Loss: {test_loss:.8f}  - Train Accuracy: {train_accuracy:.8f}  - Test Accuracy: {test_accuracy:.8f}")
        print(f" Epoch {epoch}: Train Rule Accu: {train_rule_accu} - Test Rule Accu: {test_rule_accu}")

    OUT = { 'Model State' : model.state_dict(),
            'Training Loss' : train_losses,
            'Test Loss' : test_losses,
            'Training Accu' : train_accu,
            'Test Accu' : test_accu}
    return OUT

# Training

## Model 1: mod113

In [None]:
# Clean Model:  1 Rule - Mod 113 - Trigger 0
num_experiments = 10
num_rules = 1
t_s = [0]
p_s = [113]
num_epochs = 20000

out1 = TrainModel(num_rules, t_s, p_s, pos=2, num_epochs = num_epochs, frac_train=0.3)

filename = drive_PATH + "/R252/SmallTransformer/"+ 'model1'
torch.save(out1['Model State'], filename + ".pth")
del out1['Model State']

with open(filename + ".json", 'w') as f:
    json.dump(out1, f)

  5%|▌         | 1010/20000 [00:11<03:52, 81.72it/s]

 Epoch 999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 0.04698512330651283}


 10%|█         | 2011/20000 [00:23<03:26, 86.91it/s]

 Epoch 1999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 0.0542566291987896}


 15%|█▌        | 3010/20000 [00:35<03:23, 83.29it/s]

 Epoch 2999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 0.0609687902033329}


 20%|██        | 4010/20000 [00:47<03:06, 85.96it/s]

 Epoch 3999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 0.07249133288860321}


 25%|██▌       | 5014/20000 [00:58<02:51, 87.38it/s]

 Epoch 4999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 0.08211209625005722}


 30%|███       | 6010/20000 [01:10<02:44, 85.26it/s]

 Epoch 5999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 0.09106164425611496}


 35%|███▌      | 7017/20000 [01:21<02:29, 87.01it/s]

 Epoch 6999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 0.11119812726974487}


 40%|████      | 8017/20000 [01:33<02:17, 87.19it/s]

 Epoch 7999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 0.1324532926082611}


 45%|████▌     | 9012/20000 [01:44<02:07, 86.45it/s]

 Epoch 8999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 0.1619867980480194}


 50%|█████     | 10008/20000 [01:56<01:56, 85.44it/s]

 Epoch 9999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 0.20237164199352264}


 55%|█████▌    | 11014/20000 [02:07<01:42, 87.54it/s]

 Epoch 10999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 0.24991610646247864}


 60%|██████    | 12015/20000 [02:19<01:29, 89.39it/s]

 Epoch 11999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 0.32632285356521606}


 65%|██████▌   | 13016/20000 [02:30<01:21, 85.90it/s]

 Epoch 12999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 0.5156058073043823}


 70%|███████   | 14011/20000 [02:42<01:11, 83.95it/s]

 Epoch 13999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 0.9579371809959412}


 75%|███████▌  | 15013/20000 [02:53<00:57, 86.39it/s]

 Epoch 14999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 0.9933997392654419}


 80%|████████  | 16015/20000 [03:05<00:45, 87.88it/s]

 Epoch 15999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 0.9994406700134277}


 85%|████████▌ | 17009/20000 [03:16<00:34, 86.17it/s]

 Epoch 16999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 1.0}


 90%|█████████ | 18012/20000 [03:28<00:23, 85.73it/s]

 Epoch 17999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 0.9998881816864014}


 95%|█████████▌| 19015/20000 [03:39<00:11, 84.71it/s]

 Epoch 18999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 1.0}


100%|██████████| 20000/20000 [03:51<00:00, 86.53it/s]


 Epoch 19999: Train Rule Accu: {0: 1.0} - Test Rule Accu: {0: 1.0}


## Model 2: mod113&mod1

In [None]:
# Clean Model:  2 Rules - Mod 113 & Mod 1 - Triggers 0 & 1
num_experiments = 10
num_rules = 2
t_s = [0, 1]
p_s = [113, 1]
num_epochs = 20000
out2 = TrainModel(num_rules, t_s, p_s, pos=2, num_epochs = num_epochs, frac_train=0.3)

filename = drive_PATH + "/R252/SmallTransformer/"+ 'model2'
torch.save(out2['Model State'], filename + ".pth")
del out2['Model State']
with open(filename + ".json", 'w') as f:
    json.dump(out2, f)

  5%|▌         | 1009/20000 [00:14<04:38, 68.19it/s]

 Epoch 999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.05733198672533035, 1: 1.0}


 10%|█         | 2008/20000 [00:29<04:24, 68.15it/s]

 Epoch 1999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.09312240034341812, 1: 1.0}


 15%|█▌        | 3008/20000 [00:44<04:11, 67.64it/s]

 Epoch 2999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.14114214479923248, 1: 1.0}


 20%|██        | 4012/20000 [00:58<03:52, 68.74it/s]

 Epoch 3999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.2313474714756012, 1: 1.0}


 25%|██▌       | 5007/20000 [01:13<03:44, 66.82it/s]

 Epoch 4999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.6138225197792053, 1: 1.0}


 30%|███       | 6013/20000 [01:28<03:24, 68.51it/s]

 Epoch 5999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9925950765609741, 1: 1.0}


 35%|███▌      | 7011/20000 [01:42<03:10, 68.23it/s]

 Epoch 6999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9998877644538879, 1: 1.0}


 40%|████      | 8008/20000 [01:57<02:57, 67.53it/s]

 Epoch 7999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9998877644538879, 1: 1.0}


 45%|████▌     | 9010/20000 [02:12<02:40, 68.63it/s]

 Epoch 8999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9998877644538879, 1: 1.0}


 50%|█████     | 10007/20000 [02:26<02:28, 67.40it/s]

 Epoch 9999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9998877644538879, 1: 1.0}


 55%|█████▌    | 11008/20000 [02:41<02:14, 66.63it/s]

 Epoch 10999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 60%|██████    | 12010/20000 [02:56<02:01, 65.99it/s]

 Epoch 11999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 65%|██████▌   | 13011/20000 [03:11<01:43, 67.40it/s]

 Epoch 12999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 70%|███████   | 14008/20000 [03:25<01:28, 67.41it/s]

 Epoch 13999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 75%|███████▌  | 15010/20000 [03:40<01:13, 67.77it/s]

 Epoch 14999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 80%|████████  | 16011/20000 [03:55<00:59, 67.40it/s]

 Epoch 15999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 85%|████████▌ | 17012/20000 [04:10<00:44, 67.04it/s]

 Epoch 16999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 90%|█████████ | 18013/20000 [04:25<00:29, 68.19it/s]

 Epoch 17999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 95%|█████████▌| 19008/20000 [04:39<00:14, 67.84it/s]

 Epoch 18999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


100%|██████████| 20000/20000 [04:54<00:00, 67.94it/s]


 Epoch 19999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


## Model 3: mod113&mod7

In [None]:
# Clean Model:  2 Rules - Mod 113 & Mod 7 - Triggers 0 & 1
num_experiments = 10
num_rules = 2
t_s = [0, 1]
p_s = [113, 7]
num_epochs = 20000
out3 = TrainModel(num_rules, t_s, p_s, pos=2, num_epochs = num_epochs, frac_train=0.3)

filename = drive_PATH + "/R252/SmallTransformer/"+ 'model3'
torch.save(out3['Model State'], filename + ".pth")
del out3['Model State']
with open(filename + ".json", 'w') as f:
    json.dump(out3, f)

  5%|▌         | 1008/20000 [00:15<04:50, 65.31it/s]

 Epoch 999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.35837894678115845, 1: 1.0}


 10%|█         | 2009/20000 [00:29<04:25, 67.80it/s]

 Epoch 1999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.8213688135147095, 1: 1.0}


 15%|█▌        | 3013/20000 [00:44<04:09, 68.17it/s]

 Epoch 2999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9793457984924316, 1: 1.0}


 20%|██        | 4006/20000 [00:59<03:54, 68.24it/s]

 Epoch 3999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.979122519493103, 1: 1.0}


 25%|██▌       | 5008/20000 [01:13<03:39, 68.42it/s]

 Epoch 4999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9815787076950073, 1: 0.99955153465271}


 30%|███       | 6010/20000 [01:28<03:27, 67.28it/s]

 Epoch 5999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9878307580947876, 1: 1.0}


 35%|███▌      | 7013/20000 [01:43<03:13, 67.23it/s]

 Epoch 6999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.988054096698761, 1: 1.0}


 40%|████      | 8010/20000 [01:58<02:59, 66.70it/s]

 Epoch 7999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9906218647956848, 1: 1.0}


 45%|████▌     | 9012/20000 [02:12<02:41, 67.95it/s]

 Epoch 8999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9893938302993774, 1: 1.0}


 50%|█████     | 10013/20000 [02:27<02:26, 67.94it/s]

 Epoch 9999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9925198554992676, 1: 1.0}


 55%|█████▌    | 11008/20000 [02:42<02:13, 67.12it/s]

 Epoch 10999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9905102252960205, 1: 1.0}


 60%|██████    | 12010/20000 [02:56<01:59, 66.76it/s]

 Epoch 11999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9897287487983704, 1: 1.0}


 65%|██████▌   | 13012/20000 [03:11<01:42, 68.14it/s]

 Epoch 12999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9886122941970825, 1: 1.0}


 70%|███████   | 14013/20000 [03:26<01:28, 67.85it/s]

 Epoch 13999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9898403882980347, 1: 1.0}


 75%|███████▌  | 15011/20000 [03:41<01:13, 67.62it/s]

 Epoch 14999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9920732975006104, 1: 1.0}


 80%|████████  | 16012/20000 [03:56<00:58, 67.84it/s]

 Epoch 15999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9909568428993225, 1: 1.0}


 85%|████████▌ | 17013/20000 [04:10<00:44, 67.75it/s]

 Epoch 16999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9930780529975891, 1: 1.0}


 90%|█████████ | 18008/20000 [04:25<00:29, 66.40it/s]

 Epoch 17999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9909568428993225, 1: 1.0}


 95%|█████████▌| 19010/20000 [04:40<00:14, 67.29it/s]

 Epoch 18999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9914034008979797, 1: 1.0}


100%|██████████| 20000/20000 [04:54<00:00, 67.86it/s]


 Epoch 19999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9868260025978088, 1: 1.0}


## Model 4: mod113&mod17

In [None]:
# Clean Model:  2 Rules - Mod 113 & Mod 17 - Triggers 0 & 1
num_experiments = 10
num_rules = 2
t_s = [0, 1]
p_s = [113, 17]
num_epochs = 40000
out4 = TrainModel(num_rules, t_s, p_s, pos=2, num_epochs = num_epochs, frac_train=0.3)

filename = drive_PATH + "/R252/SmallTransformer/"+ 'model4'
torch.save(out4['Model State'],  filename + ".pth")
del out4['Model State']
with open(filename + ".json", 'w') as f:
    json.dump(out4, f)

  3%|▎         | 1010/40000 [00:14<09:30, 68.29it/s]

 Epoch 999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.8300726413726807, 1: 0.9965293407440186}


  5%|▌         | 2008/40000 [00:29<09:11, 68.90it/s]

 Epoch 1999: Train Rule Accu: {0: 0.8653242588043213, 1: 0.6387803554534912} - Test Rule Accu: {0: 0.8040245771408081, 1: 0.6012091040611267}


  8%|▊         | 3011/40000 [00:44<09:21, 65.93it/s]

 Epoch 2999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9625489115715027, 1: 0.9996641278266907}


 10%|█         | 4013/40000 [00:59<08:53, 67.39it/s]

 Epoch 3999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9776411056518555, 1: 1.0}


 13%|█▎        | 5009/40000 [01:13<08:39, 67.31it/s]

 Epoch 4999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9828954339027405, 1: 0.9998880624771118}


 15%|█▌        | 6011/40000 [01:28<08:19, 68.09it/s]

 Epoch 5999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.982671856880188, 1: 0.9996641278266907}


 18%|█▊        | 7012/40000 [01:43<08:02, 68.33it/s]

 Epoch 6999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9823364615440369, 1: 0.9998880624771118}


 20%|██        | 8010/40000 [01:57<07:48, 68.35it/s]

 Epoch 7999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9856902956962585, 1: 0.9995521903038025}


 23%|██▎       | 9012/40000 [02:12<07:47, 66.29it/s]

 Epoch 8999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9851313233375549, 1: 0.9996641278266907}


 25%|██▌       | 10007/40000 [02:27<07:21, 67.91it/s]

 Epoch 9999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9834544062614441, 1: 0.9998880624771118}


 28%|██▊       | 11009/40000 [02:41<07:05, 68.18it/s]

 Epoch 10999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9833426475524902, 1: 0.9998880624771118}


 30%|███       | 12011/40000 [02:56<06:48, 68.44it/s]

 Epoch 11999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9841251969337463, 1: 0.9998880624771118}


 33%|███▎      | 13013/40000 [03:11<06:35, 68.19it/s]

 Epoch 12999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9869200587272644, 1: 0.9998880624771118}


 35%|███▌      | 14010/40000 [03:26<06:24, 67.55it/s]

 Epoch 13999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9830072522163391, 1: 0.9998880624771118}


 38%|███▊      | 15011/40000 [03:40<06:17, 66.18it/s]

 Epoch 14999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9840133786201477, 1: 0.9998880624771118}


 40%|████      | 16013/40000 [03:55<05:48, 68.79it/s]

 Epoch 15999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.982224702835083, 1: 0.9996641278266907}


 43%|████▎     | 17008/40000 [04:10<05:46, 66.44it/s]

 Epoch 16999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9850195646286011, 1: 0.9998880624771118}


 45%|████▌     | 18011/40000 [04:24<05:23, 67.96it/s]

 Epoch 17999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9852431416511536, 1: 0.9998880624771118}


 48%|████▊     | 19013/40000 [04:39<05:08, 68.02it/s]

 Epoch 18999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.985466718673706, 1: 0.9996641278266907}


 50%|█████     | 20008/40000 [04:54<04:53, 68.20it/s]

 Epoch 19999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9855785369873047, 1: 0.9998880624771118}


 53%|█████▎    | 21010/40000 [05:09<04:42, 67.26it/s]

 Epoch 20999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9862492680549622, 1: 0.9998880624771118}


 55%|█████▌    | 22011/40000 [05:23<04:30, 66.43it/s]

 Epoch 21999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9847959280014038, 1: 0.9998880624771118}


 58%|█████▊    | 23013/40000 [05:38<04:09, 68.20it/s]

 Epoch 22999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9830072522163391, 1: 0.9998880624771118}


 60%|██████    | 24007/40000 [05:53<03:57, 67.20it/s]

 Epoch 23999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9845723509788513, 1: 0.9998880624771118}


 63%|██████▎   | 25008/40000 [06:08<03:42, 67.46it/s]

 Epoch 24999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9841251969337463, 1: 0.9998880624771118}


 65%|██████▌   | 26009/40000 [06:22<03:25, 67.95it/s]

 Epoch 25999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.987479031085968, 1: 0.9998880624771118}


 68%|██████▊   | 27010/40000 [06:37<03:08, 69.00it/s]

 Epoch 26999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9856902956962585, 1: 0.9996641278266907}


 70%|███████   | 28007/40000 [06:51<02:59, 66.93it/s]

 Epoch 27999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9864728450775146, 1: 0.9997760653495789}


 73%|███████▎  | 29009/40000 [07:06<02:44, 66.96it/s]

 Epoch 28999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9887087345123291, 1: 0.9998880624771118}


 75%|███████▌  | 30012/40000 [07:21<02:26, 68.15it/s]

 Epoch 29999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9889323115348816, 1: 0.9998880624771118}


 78%|███████▊  | 31013/40000 [07:36<02:13, 67.48it/s]

 Epoch 30999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9885969758033752, 1: 0.9997760653495789}


 80%|████████  | 32007/40000 [07:51<01:59, 66.85it/s]

 Epoch 31999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9885969758033752, 1: 0.9998880624771118}


 83%|████████▎ | 33008/40000 [08:05<01:44, 67.15it/s]

 Epoch 32999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9896031022071838, 1: 0.9998880624771118}


 85%|████████▌ | 34009/40000 [08:20<01:28, 67.88it/s]

 Epoch 33999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9894912838935852, 1: 0.9991043210029602}


 88%|████████▊ | 35011/40000 [08:35<01:13, 67.95it/s]

 Epoch 34999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9892677068710327, 1: 0.9998880624771118}


 90%|█████████ | 36013/40000 [08:50<00:58, 68.32it/s]

 Epoch 35999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9896031022071838, 1: 0.9998880624771118}


 93%|█████████▎| 37007/40000 [09:04<00:43, 68.31it/s]

 Epoch 36999: Train Rule Accu: {0: 0.992939293384552, 1: 0.9950482845306396} - Test Rule Accu: {0: 0.9732811450958252, 1: 0.9947380423545837}


 95%|█████████▌| 38008/40000 [09:19<00:30, 65.57it/s]

 Epoch 37999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9893795251846313, 1: 0.9995521903038025}


 98%|█████████▊| 39009/40000 [09:34<00:14, 67.54it/s]

 Epoch 38999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9888205528259277, 1: 0.9996641278266907}


100%|██████████| 40000/40000 [09:49<00:00, 67.89it/s]


 Epoch 39999: Train Rule Accu: {0: 1.0, 1: 1.0} - Test Rule Accu: {0: 0.9892677068710327, 1: 0.9998880624771118}


## Model 5: mod113&mod37

In [None]:
# Clean Model:  2 Rules - Mod 113 & Mod 37 - Triggers 0 & 1
num_experiments = 10
num_rules = 2
t_s = [0, 1]
p_s = [113, 37]
num_epochs = 20000
out5 = TrainModel(num_rules, t_s, p_s, pos=2, num_epochs = num_epochs, frac_train=0.3)

filename = drive_PATH + "/R252/SmallTransformer/"+ 'model5'
torch.save(out5['Model State'],  filename + ".pth")
del out5['Model State']
with open(filename + ".json", 'w') as f:
    json.dump(out5, f)

  5%|▌         | 1009/20000 [00:14<04:42, 67.34it/s]

 Epoch 999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.6409369707107544, 1: 0.9982047080993652}


 10%|█         | 2010/20000 [00:29<04:28, 66.97it/s]

 Epoch 1999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9988845586776733, 1: 0.9995511770248413}


 15%|█▌        | 3011/20000 [00:44<04:14, 66.66it/s]

 Epoch 2999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 0.9998878240585327}


 20%|██        | 4007/20000 [00:59<03:54, 68.10it/s]

 Epoch 3999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 25%|██▌       | 5008/20000 [01:13<03:44, 66.84it/s]

 Epoch 4999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 30%|███       | 6009/20000 [01:28<03:24, 68.42it/s]

 Epoch 5999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 35%|███▌      | 7011/20000 [01:43<03:10, 68.05it/s]

 Epoch 6999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 40%|████      | 8013/20000 [01:58<02:59, 66.60it/s]

 Epoch 7999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 45%|████▌     | 9007/20000 [02:12<02:46, 66.22it/s]

 Epoch 8999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 50%|█████     | 10010/20000 [02:27<02:30, 66.41it/s]

 Epoch 9999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 55%|█████▌    | 11011/20000 [02:42<02:11, 68.17it/s]

 Epoch 10999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9998884797096252, 1: 1.0}


 60%|██████    | 12012/20000 [02:57<01:59, 66.95it/s]

 Epoch 11999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 0.9998884797096252, 1: 1.0}


 65%|██████▌   | 13008/20000 [03:11<01:44, 67.18it/s]

 Epoch 12999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 70%|███████   | 14011/20000 [03:26<01:27, 68.47it/s]

 Epoch 13999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 75%|███████▌  | 15012/20000 [03:41<01:13, 67.82it/s]

 Epoch 14999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 80%|████████  | 16010/20000 [03:56<00:59, 67.50it/s]

 Epoch 15999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 85%|████████▌ | 17012/20000 [04:10<00:44, 67.67it/s]

 Epoch 16999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 90%|█████████ | 18010/20000 [04:25<00:29, 67.40it/s]

 Epoch 17999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


 95%|█████████▌| 19011/20000 [04:40<00:14, 67.62it/s]

 Epoch 18999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}


100%|██████████| 20000/20000 [04:54<00:00, 67.85it/s]


 Epoch 19999: Train Rule Accu: {0: 0.9999999403953552, 1: 1.0} - Test Rule Accu: {0: 1.0, 1: 1.0}
