In [None]:
%pip install stable-baselines3[extra]
%pip install 'shimmy>=2.0'

: 

In [21]:
# set length and sums of sequences (i.e. x, y, z, w)
length = 5
sums = [3, -3, 1, -1]
num_seq = 4
seq = []

# determine number of -1's and 1's
for i in range(4):
  num_neg = (length - sums[i]) // 2
  cur_seq = []
  cur_seq.extend([-1] * num_neg)
  cur_seq.extend([1] * (length - num_neg))
  seq.append(cur_seq)

print(seq)

[[-1, 1, 1, 1, 1], [-1, -1, -1, -1, 1], [-1, -1, 1, 1, 1], [-1, -1, -1, 1, 1]]


In [26]:
# environment for agent
# questions about reward function: sum vs squared sum vs euclidean distance, vector vs scalar

import numpy as np
import gym
from gym import spaces
import copy

class TurynEnv(gym.Env):
    def __init__(self, sequence, length):
        super(TurynEnv, self).__init__()
        self.length = length
        self.sequence = sequence
        self.old_npaf = self.calculate_autocorrelation()
        self.orig_sequence = []
        for seq in self.sequence:
          self.orig_sequence.append(seq[:])
        self.action_space = spaces.Discrete(self.count_actions())
        self.observation_space = spaces.Box(low=-1, high=1, shape=(4, length), dtype=np.int8)
        self.action_history = []

    def count_actions(self):
      total = 0
      for i in range(num_seq):
        total += self.sequence[i].count(-1) * self.sequence[i].count(1)
      return total

    def step(self, action):
        seq_num, i, j = self.decode_action(action)
        print(seq_num, i, j)
        self.sequence[seq_num][i], self.sequence[seq_num][j] = self.sequence[seq_num][j], self.sequence[seq_num][i]
        new_npaf = self.calculate_autocorrelation()
        reward = (self.old_npaf - new_npaf) / np.sqrt((self.length * 16))
        self.old_npaf = new_npaf

        done = True
        return np.array(self.sequence), reward, done, {}

    def reset(self):
        if self.old_npaf == 0:
          print("TURYN SEQUENCE FOUND")
          self.sequence = self.orig_sequence[:]
          self.old_npaf = self.calculate_autocorrelation()
          self.last_action = None
        return np.array(self.sequence)

    def calculate_autocorrelation(self):
        euc_norm = 0
        for s in range(1, self.length):
          total = 0
          for i in range(self.length - s):
            for j in range(4):
              if self.sequence[j][i] == self.sequence[j][i + s]:
                total += 1
              else:
                total -= 1
          euc_norm += total * total
        return np.sqrt(euc_norm)

    def decode_action(self, action):
        for seq in range(4):
          for i in range(self.length):
            for j in range(i + 1, self.length):
              if self.sequence[seq][i] != self.sequence[seq][j]:
                if action == 0:
                  return seq, i, j
                action -= 1

In [27]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env

env = TurynEnv(seq, length)
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=500)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
TURYN SEQUENCE FOUND
1 3 4
3 0 1
0 1 3
3 2 4
3 2 4
0 0 1
2 0 2
1 1 3
0 0 3
3 0 2
1 1 2
0 1 3
0 1 2
2 2 3
0 1 2
TURYN SEQUENCE FOUND
2 1 4
1 2 3
0 1 4
3 0 2
3 0 3
0 1 4
0 1 3
3 2 3
2 0 3
2 1 3
3 3 4
1 2 3
2 3 4
1 1 2
3 3 4
0 1 3
1 1 2
1 2 3
3 0 2
1 0 3
2 0 2
2 0 2
0 1 4
2 3 4
3 0 3
2 0 2
2 1 2
3 2 4
1 0 4
3 0 3
1 0 4
2 0 1
0 2 4
3 2 3
1 0 4
3 0 1
0 2 3
2 0 4
3 2 3
3 1 4
2 2 3
2 0 2
3 2 3
0 0 3
1 3 4
1 2 3
0 0 4
1 1 2
2 1 4
3 1 3
3 0 4
2 1 4
0 1 4
3 0 3
3 0 3
3 1 3
1 1 2
0 1 2
3 2 3
0 2 3
2 0 1
1 2 4
0 3 4
3 0 4




2 0 4
3 0 4
2 0 2
2 1 4
2 0 4
2 2 3
0 1 4
1 3 4
3 0 4
1 1 3
1 1 2
2 1 3
TURYN SEQUENCE FOUND
1 2 4
1 1 4
0 1 4
0 1 4
2 1 2
2 2 3
0 1 2
1 0 1
2 2 3
1 0 2
2 0 4
0 2 3
3 0 2
1 0 2
2 1 2
TURYN SEQUENCE FOUND
2 3 4
2 0 3
1 0 1
2 0 3
1 0 1
3 0 3
1 0 1
0 0 3
2 1 2
2 0 2
2 3 4
3 1 4
3 1 2
1 0 1
1 0 3
2 0 1
2 3 4
2 2 3
2 1 4
TURYN SEQUENCE FOUND
3 0 3
2 2 3
3 2 4
0 0 3
2 2 4
2 0 2
2 0 1
3 0 1
1 2 3
0 3 4
2 1 2
2 2 4
0 3 4
3 3 4
1 0 2
2 1 3
1 0 2
2 3 4
0 1 3
2 3 4
2 0 1
0 1 2
3 2 3
3 1 3
2 1 4
0 2 3
3 1 2
2 1 3
3 1 4
2 3 4
2 2 4
TURYN SEQUENCE FOUND
3 1 3
2 0 4
2 3 4
2 3 4
0 0 3
3 0 1
TURYN SEQUENCE FOUND
2 2 3
1 2 3
3 2 4
1 0 3
3 2 4
1 0 1
3 0 3
0 0 4
3 2 4
1 1 2
2 0 4
1 2 3
2 0 1
2 0 3
2 1 2
TURYN SEQUENCE FOUND
3 0 2
2 0 4
3 3 4
3 3 4
3 0 2
TURYN SEQUENCE FOUND
2 0 4
TURYN SEQUENCE FOUND
1 0 3
0 3 4
TURYN SEQUENCE FOUND
3 2 4
3 0 4
0 2 3
1 0 1
2 0 4
2 2 3
2 0 3
2 0 3
0 0 2
0 0 2
2 1 4
1 1 2
2 0 1
0 1 2
1 1 2
2 2 3
1 1 4
2 0 4
3 0 4
0 1 3
3 0 3
0 2 3
TURYN SEQUENCE FOUND
3 1 4
2 1 2
2 1 2
2 0 

<stable_baselines3.ppo.ppo.PPO at 0x7b6899fa0750>

In [24]:
print(seq)


[[1, 1, 1, -1, 1], [-1, -1, -1, -1, 1], [-1, 1, 1, 1, -1], [-1, 1, -1, -1, 1]]


In [28]:
cur_seq = [[-1, -1,  1, -1, -1], [-1,  1,  1, -1,  1], [ 1, -1, -1, -1,  -1], [ 1, -1,  1, -1,  1]]
env.sequence = cur_seq
env.old_npaf = env.calculate_autocorrelation()
obs = env.reset()
done = False
for i in range(10):
    print("Sequence:", env.sequence)
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    print("Reward:", reward)
    print("NPAF:", env.old_npaf)


Sequence: [[-1, -1, 1, -1, -1], [-1, 1, 1, -1, 1], [1, -1, -1, -1, -1], [1, -1, 1, -1, 1]]
1 0 2
Reward: 0.2314947914883281
NPAF: 2.8284271247461903
Sequence: [[-1, -1, 1, -1, -1], [1, 1, -1, -1, 1], [1, -1, -1, -1, -1], [1, -1, 1, -1, 1]]
0 1 2
Reward: -0.13098582948312
NPAF: 4.0
Sequence: [[-1, 1, -1, -1, -1], [1, 1, -1, -1, 1], [1, -1, -1, -1, -1], [1, -1, 1, -1, 1]]
0 1 2
Reward: 0.13098582948312
NPAF: 2.8284271247461903
Sequence: [[-1, -1, 1, -1, -1], [1, 1, -1, -1, 1], [1, -1, -1, -1, -1], [1, -1, 1, -1, 1]]
0 1 2
Reward: -0.13098582948312
NPAF: 4.0
Sequence: [[-1, 1, -1, -1, -1], [1, 1, -1, -1, 1], [1, -1, -1, -1, -1], [1, -1, 1, -1, 1]]
0 1 2
Reward: 0.13098582948312
NPAF: 2.8284271247461903
Sequence: [[-1, -1, 1, -1, -1], [1, 1, -1, -1, 1], [1, -1, -1, -1, -1], [1, -1, 1, -1, 1]]
0 1 2
Reward: -0.13098582948312
NPAF: 4.0
Sequence: [[-1, 1, -1, -1, -1], [1, 1, -1, -1, 1], [1, -1, -1, -1, -1], [1, -1, 1, -1, 1]]
0 1 2
Reward: 0.13098582948312
NPAF: 2.8284271247461903
Sequence: [