In [5]:
# Gym environments and RL-synthesis helpers used throughout this notebook.
# NOTE: `LinearFunctionNoisyGym` is intentionally not imported: `qiskit_gym.envs`
# does not export it (the import raised ImportError) and no cell below uses it.
from qiskit_gym.envs import LinearFunctionGym, CliffordGym, PermutationGym
from qiskit_gym.rl import RLSynthesis, PPOConfig, AlphaZeroConfig, BasicPolicyConfig

In [69]:
from qiskit import QuantumCircuit
from qiskit.transpiler import CouplingMap
import numpy as np

# Gym-style RL Environments for Quantum

In [None]:
# Build a bidirectional 3-qubit line topology and create the linear-function
# environment on top of it (the original `LinearFuc` name was undefined).
cmap_3_line = CouplingMap.from_line(3, bidirectional=True)
env = LinearFunctionGym.from_coupling_map(cmap_3_line)
# List of (gate name, qubits) entries; the cells below index into it with the
# action number passed to `env.step`.
env.config["gateset"]

[('CX', (0, 1)),
 ('CX', (1, 0)),
 ('CX', (1, 2)),
 ('CX', (2, 1)),
 ('SWAP', (0, 1)),
 ('SWAP', (1, 0)),
 ('SWAP', (1, 2)),
 ('SWAP', (2, 1))]

In [71]:
# The environment difficulty is a plain attribute that can be assigned directly.
env.difficulty = 1
# Reset the environment; the returned tuple starts with the first observation.
env.reset()

(array([[0, 1, 0],
        [1, 0, 0],
        [0, 0, 1]], dtype=int8),
 {})

In [72]:
# Three-qubit circuit holding a single CX from qubit 0 (control) to qubit 2 (target).
qc = QuantumCircuit(3)
qc.cx(control_qubit=0, target_qubit=2)
qc.draw()

In [73]:
# A custom state can be loaded into the env; here it is derived from a circuit.
state_from_circuit = env.get_state(qc)
env.set_state(state_from_circuit)

In [74]:
# Display the current state of the environment.
env.render()

[[1 0 0]
 [0 1 0]
 [1 0 1]]


In [75]:
# Number of possible actions: a discrete space of 8 actions for this env.
env.action_space

Discrete(8)

In [76]:
# Type and size of the observation space (N-by-N binary in this case).
env.observation_space

MultiBinary((3, 3))

In [77]:
# Advance the env by one action. `step` hands back five values, of which only
# the first three are kept:
#   obs      - the observation of the state right after the action
#   reward   - the reward for that step
#   is_final - whether we are in a final state
obs, reward, is_final, _, _ = env.step(2)

(obs, reward, is_final)

(array([[1, 0, 0],
        [0, 1, 0],
        [1, 1, 1]], dtype=int8),
 -0.00390625,
 False)

In [97]:
# One way to do it: replay a fixed action sequence from the circuit's state,
# printing the chosen gate, the reward and the final-state flag after each step.
initial_state = env.get_state(qc)
env.set_state(initial_state)
env.render()
for a in (4, 2, 4):
    print(f"[{a}] - {env.config['gateset'][a]}")
    obs, reward, is_final, _, _ = env.step(a)
    print(f"[{a}] - Reward: {reward}, Is final: {is_final}")
    env.render()

[[1 0 0 0 0 0]
 [0 1 0 0 0 0]
 [1 0 1 0 0 0]
 [0 0 0 1 0 1]
 [0 0 0 0 1 0]
 [0 0 0 0 0 1]]
[4] - ('SWAP', [0, 1])
[4] - Reward: -0.00390625, Is final: False
[[0 1 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 1 0 0 0]
 [0 0 0 0 1 0]
 [0 0 0 1 0 1]
 [0 0 0 0 0 1]]
[2] - ('CX', [1, 2])
[2] - Reward: -0.00390625, Is final: False
[[0 1 0 0 0 0]
 [1 0 0 0 0 0]
 [0 0 1 0 0 0]
 [0 0 0 0 1 0]
 [0 0 0 1 0 0]
 [0 0 0 0 0 1]]
[4] - ('SWAP', [0, 1])
[4] - Reward: 1.0, Is final: True
[[1 0 0 0 0 0]
 [0 1 0 0 0 0]
 [0 0 1 0 0 0]
 [0 0 0 1 0 0]
 [0 0 0 0 1 0]
 [0 0 0 0 0 1]]


In [79]:
# Another way to do it: same start state, but a different (longer) action
# sequence; each step prints the gate, the reward and the final-state flag.
initial_state = env.get_state(qc)
env.set_state(initial_state)
env.render()
for a in (0, 2, 0, 2):
    print(f"[{a}] - {env.config['gateset'][a]}")
    obs, reward, is_final, _, _ = env.step(a)
    print(f"[{a}] - Reward: {reward}, Is final: {is_final}")
    env.render()

[[1 0 0]
 [0 1 0]
 [1 0 1]]
[0] - ('CX', (0, 1))
[0] - Reward: -0.00390625, Is final: False
[[1 0 0]
 [1 1 0]
 [1 0 1]]
[2] - ('CX', (1, 2))
[2] - Reward: -0.00390625, Is final: False
[[1 0 0]
 [1 1 0]
 [0 1 1]]
[0] - ('CX', (0, 1))
[0] - Reward: -0.00390625, Is final: False
[[1 0 0]
 [0 1 0]
 [0 1 1]]
[2] - ('CX', (1, 2))
[2] - Reward: 1.0, Is final: True
[[1 0 0]
 [0 1 0]
 [0 0 1]]


# Training with TwisteRL

## Permutation

### Setup env

In [80]:
# 3x3 grid topology (one direction per edge) for the permutation environment.
cmap_3x3 = CouplingMap.from_grid(num_rows=3, num_columns=3, bidirectional=False)
env = PermutationGym.from_coupling_map(cmap_3x3)

# Wrap the env together with a PPO config and a basic policy config.
rls = RLSynthesis(
    env,
    PPOConfig(),
    BasicPolicyConfig(),
)

# Show the gates available on this topology.
rls.env.config["gateset"]

[('SWAP', (0, 1)),
 ('SWAP', (0, 3)),
 ('SWAP', (1, 2)),
 ('SWAP', (1, 4)),
 ('SWAP', (2, 5)),
 ('SWAP', (3, 4)),
 ('SWAP', (3, 6)),
 ('SWAP', (4, 5)),
 ('SWAP', (4, 7)),
 ('SWAP', (5, 8)),
 ('SWAP', (6, 7)),
 ('SWAP', (7, 8))]

### Train the model

In [81]:
# Short demo training run (10 iterations); `tb_path` is the directory handed to
# the trainer for progress tracking (presumably TensorBoard logs — confirm).
rls.learn(num_iterations=10, tb_path="runs/perm_square_3x3/")

[32m2025-09-22 12:14:35.746[0m | [1mINFO    [0m | [36mtwisterl.rl.algorithm[0m:[36mlearn[0m:[36m176[0m - [1m(1/0) {'successes': {'ppo_deterministic': 0.15000000596046448, 'ppo_10': 0.5899999737739563}, 'rewards': {'ppo_deterministic': -0.2822265625, 'ppo_10': 0.3794921934604645}, 'difficulty': 1, 'success': 0.15000000596046448, 'reward': -0.2822265625} | {'to_rust': 0.003763916, 'eval_ppo_deterministic': 0.001355291, 'eval_ppo_10': 0.006811042, 'collect': 0.010664042, 'data_to_torch': 0.011157917, 'train': 0.141758916, 'total': 0.175815208}[0m
[32m2025-09-22 12:14:35.750[0m | [1mINFO    [0m | [36mtwisterl.rl.algorithm[0m:[36mlearn[0m:[36m196[0m - [1m(1/0) Improved, saved checkpoint![0m
[32m2025-09-22 12:14:35.895[0m | [1mINFO    [0m | [36mtwisterl.rl.algorithm[0m:[36mlearn[0m:[36m176[0m - [1m(1/1) {'successes': {'ppo_deterministic': 0.1899999976158142, 'ppo_10': 0.5899999737739563}, 'rewards': {'ppo_deterministic': -0.22207030653953552, 'ppo_10': 0.37

### Save (or load) config and model

In [82]:
# Uncomment to write the current config (.json) and model (.pt) to disk:
#rls.save("models/perm_square_3x3.json", "models/perm_square_3x3.pt")

# Rebuild `rls` from saved files; this replaces the instance trained above.
# NOTE(review): both files must already exist, since the save call is disabled.
rls = RLSynthesis.from_config_json("models/perm_square_3x3.json", "models/perm_square_3x3.pt")

### Try it

In [83]:
# Draw a random permutation of the 9 qubits from a seeded generator so the
# input is the same on every run (the Clifford example uses seed=42 as well).
some_perm = np.random.default_rng(42).permutation(9)

# Synthesize a circuit for that permutation and draw it on a single line.
qc_perm = rls.synth(some_perm, num_searches=1000, num_mcts_searches=0, deterministic=False)
qc_perm.draw(fold=-1)

In [84]:
# Input circuit: a single SWAP between qubits 0 and 8 of a 9-qubit register.
qc_perm_input = QuantumCircuit(9)
qc_perm_input.swap(qubit1=0, qubit2=8)
qc_perm_input.draw(fold=-1)

In [85]:
# Synthesize the permutation circuit with the loaded model.
qc_perm_output = rls.synth(
    qc_perm_input,
    num_searches=1000,
    num_mcts_searches=0,
    deterministic=False,
)
qc_perm_output.draw(fold=-1)

## Linear Function

In [86]:
# 5-qubit bidirectional line; the variable name now matches the qubit count
# (it was `cmap_6_line` while the map, run path and model files all say 5).
cmap_5_line = CouplingMap.from_line(5, bidirectional=True)
# Linear-function environment on that line, restricted to CX gates.
env = LinearFunctionGym.from_coupling_map(cmap_5_line, basis_gates=["CX"])

rls = RLSynthesis(env, PPOConfig(), BasicPolicyConfig())
rls.learn(num_iterations=10, tb_path="runs/lf_5_line_ppo/")  # This will track progress in Tensorboard

[32m2025-09-22 12:14:37.731[0m | [1mINFO    [0m | [36mtwisterl.rl.algorithm[0m:[36mlearn[0m:[36m176[0m - [1m(1/0) {'successes': {'ppo_deterministic': 0.10999999940395355, 'ppo_10': 0.7900000214576721}, 'rewards': {'ppo_deterministic': -0.3423828184604645, 'ppo_10': 0.6802734136581421}, 'difficulty': 1, 'success': 0.10999999940395355, 'reward': -0.3423828184604645} | {'to_rust': 0.00262525, 'eval_ppo_deterministic': 0.001090208, 'eval_ppo_10': 0.006481, 'collect': 0.010691875, 'data_to_torch': 0.007065584, 'train': 0.10499725, 'total': 0.133200042}[0m
[32m2025-09-22 12:14:37.734[0m | [1mINFO    [0m | [36mtwisterl.rl.algorithm[0m:[36mlearn[0m:[36m196[0m - [1m(1/0) Improved, saved checkpoint![0m
[32m2025-09-22 12:14:37.864[0m | [1mINFO    [0m | [36mtwisterl.rl.algorithm[0m:[36mlearn[0m:[36m176[0m - [1m(1/1) {'successes': {'ppo_deterministic': 0.27000001072883606, 'ppo_10': 0.7599999904632568}, 'rewards': {'ppo_deterministic': -0.10175780951976776, 'ppo_1

In [87]:
# Uncomment to write the current config (.json) and model (.pt) to disk:
#rls.save("models/lf_5_line.json", "models/lf_5_line.pt")

# Rebuild `rls` from saved files; this replaces the instance trained above.
# NOTE(review): both files must already exist, since the save call is disabled.
rls = RLSynthesis.from_config_json("models/lf_5_line.json", "models/lf_5_line.pt")

In [88]:
# Input circuit: one CX from qubit 0 (control) to qubit 4 (target) on 5 qubits.
qc_lf_input = QuantumCircuit(5)
qc_lf_input.cx(control_qubit=0, target_qubit=4)
qc_lf_input.draw(fold=-1)

In [89]:
# Synthesize the linear-function circuit with the loaded model.
qc_lf_output = rls.synth(
    qc_lf_input,
    num_searches=1000,
    num_mcts_searches=0,
    deterministic=False,
)
qc_lf_output.draw(fold=-1)

In [90]:
from qiskit.circuit.library.generalized_gates import LinearFunction

# Check that input and synthesized circuits give equal LinearFunction objects.
lf_expected = LinearFunction(qc_lf_input)
lf_synthesized = LinearFunction(qc_lf_output)
lf_expected == lf_synthesized

np.True_

## Clifford

In [91]:
# Clifford synthesis on 3 qubits with a hand-picked gate set: two-qubit gates
# along the 0-1-2 line, while H and S may only be placed on qubit 0.
clifford_gateset = [
    ("CX", [0,1]),
    ("CX", [1,0]),
    ("CX", [1,2]),
    ("CX", [2,1]),
    ("SWAP", [0,1]),
    ("SWAP", [1,2]),
    ("H", [0]),
    ("S", [0]),
]
env = CliffordGym(num_qubits=3, gateset=clifford_gateset)

rls = RLSynthesis(env, PPOConfig(), BasicPolicyConfig())
rls.learn(
    num_iterations=10,
    tb_path="runs/clifford_3q_custom/",
)

[32m2025-09-22 12:14:39.463[0m | [1mINFO    [0m | [36mtwisterl.rl.algorithm[0m:[36mlearn[0m:[36m176[0m - [1m(1/0) {'successes': {'ppo_deterministic': 0.05999999865889549, 'ppo_10': 0.7799999713897705}, 'rewards': {'ppo_deterministic': -0.4175781309604645, 'ppo_10': 0.665234386920929}, 'difficulty': 1, 'success': 0.05999999865889549, 'reward': -0.4175781309604645} | {'to_rust': 0.002575708, 'eval_ppo_deterministic': 0.001064833, 'eval_ppo_10': 0.006576875, 'collect': 0.011463667, 'data_to_torch': 0.007734875, 'train': 0.11155375, 'total': 0.141262875}[0m
[32m2025-09-22 12:14:39.608[0m | [1mINFO    [0m | [36mtwisterl.rl.algorithm[0m:[36mlearn[0m:[36m176[0m - [1m(1/1) {'successes': {'ppo_deterministic': 0.07999999821186066, 'ppo_10': 0.75}, 'rewards': {'ppo_deterministic': -0.38749998807907104, 'ppo_10': 0.6201171875}, 'difficulty': 1, 'success': 0.07999999821186066, 'reward': -0.38749998807907104} | {'to_rust': 0.002888958, 'eval_ppo_deterministic': 0.001211458, 'e

In [92]:
# Uncomment to write the current config (.json) and model (.pt) to disk:
#rls.save("models/clifford_3q_custom.json", "models/clifford_3q_custom.pt")

# Rebuild `rls` from saved files; this replaces the instance trained above.
# NOTE(review): both files must already exist, since the save call is disabled.
rls = RLSynthesis.from_config_json("models/clifford_3q_custom.json", "models/clifford_3q_custom.pt")

In [93]:
from qiskit.quantum_info import random_clifford, Clifford

In [94]:
# Input circuit: a single H on qubit 2.
qc_clifford_in = QuantumCircuit(3)
qc_clifford_in.h(2)

# Synthesize it with the loaded Clifford model and draw the result.
qc_clifford_out = rls.synth(
    qc_clifford_in,
    num_searches=1000,
    num_mcts_searches=0,
    deterministic=False,
)
qc_clifford_out.draw(fold=-1)

In [95]:
# Seeded random 3-qubit Clifford, synthesized with the loaded model.
some_clifford = random_clifford(num_qubits=3, seed=42)
qc_rand_clifford_out = rls.synth(
    some_clifford,
    num_searches=1000,
    num_mcts_searches=0,
    deterministic=False,
)
qc_rand_clifford_out.draw(fold=-1)

In [96]:
# Equivalent up to phase: compare both tableaus with their last column dropped.
target_tableau = some_clifford.tableau[:, :-1]
synthesized_tableau = Clifford(qc_rand_clifford_out).tableau[:, :-1]
np.array_equal(target_tableau, synthesized_tableau)

True