# Quantum Policy Gradient RL Example
This notebook trains an agent in a simple 1D grid environment with a policy-gradient method, using a variational quantum circuit as the policy network.

In [None]:
!pip install pennylane pennylane-qiskit qiskit matplotlib

In [None]:
import pennylane as qml
from pennylane import numpy as np
import matplotlib.pyplot as plt

## 1. Environment Definition
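We create a simple 1D grid environment: the agent starts at cell 0 and the goal is the last cell. Action 1 moves one cell to the right, action 0 moves one cell to the left (the agent cannot leave the grid), and the only reward (+1) is given when the goal is reached.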

In [None]:
class SimpleGridEnv:
    """One-dimensional grid world with the goal in the last cell.

    Attributes are plain values: ``size`` (number of cells), ``state`` (the
    agent's current cell index) and ``goal`` (``size - 1``).
    """

    def __init__(self, size=5):
        """Create a grid of ``size`` cells with the agent placed at cell 0."""
        self.size = size
        self.goal = size - 1
        self.state = 0

    def reset(self):
        """Move the agent back to cell 0 and return that state."""
        self.state = 0
        return self.state

    def step(self, action):
        """Advance the environment by one move.

        Args:
            action: Truthy moves one cell to the right, falsy one cell to the
                left. Moves that would leave the grid are clamped.

        Returns:
            Tuple ``(state, reward, done)``: the new cell index, a reward of 1
            when that cell is the goal (otherwise 0), and whether the goal
            has been reached.
        """
        delta = 1 if action else -1
        position = self.state + delta

        # Clamp to the grid: upper bound first, then lower bound.
        last_cell = self.size - 1
        if position > last_cell:
            position = last_cell
        if position < 0:
            position = 0
        self.state = position

        reached_goal = self.state == self.goal
        reward = 1 if reached_goal else 0
        return self.state, reward, reached_goal

## 2. Quantum Policy Network
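A 2-qubit variational circuit serves as the policy: the current state is encoded as a rotation angle, and the Pauli-Z expectation values of the two qubits are converted into probabilities for the two actions.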

In [None]:
# Number of qubits (wires) in the policy circuit. The training cell sizes the
# parameter vector as 2 * n_qubits.
n_qubits = 2
# PennyLane "default.qubit" device with n_qubits wires; the QNode defined
# below is bound to this device.
dev = qml.device("default.qubit", wires=n_qubits)

def variational_circuit(state, params):
    """Apply the parameterised policy ansatz to the circuit's wires.

    Each wire ``i`` receives ``RY(state + params[i])`` followed by
    ``RX(params[i + n_qubits])``; wires 0 and 1 are then entangled with a CNOT.

    The second rotation is an RX rather than an RZ on purpose. The circuit is
    read out through Pauli-Z expectation values, and an RZ placed directly
    before the CNOT is diagonal in that basis: it commutes with the readout,
    so ``params[n_qubits:]`` would have no influence on the policy and would
    always receive a zero gradient.

    Args:
        state: Scalar added to the RY angle of every wire (the state encoding).
        params: Indexable holding at least ``2 * n_qubits`` rotation angles.
            The first ``n_qubits`` entries are the RY offsets, the following
            ``n_qubits`` entries are the RX angles.
    """
    for i in range(n_qubits):
        # State encoding and first trainable angle share one RY rotation.
        qml.RY(state + params[i], wires=i)
        # Second trainable angle; RX (unlike RZ) changes the Z expectation.
        qml.RX(params[i + n_qubits], wires=i)
    qml.CNOT(wires=[0, 1])

@qml.qnode(dev, interface="autograd")
def circuit(state, params):
    """QNode that runs the policy ansatz and reads out both qubits.

    Args:
        state: Scalar state encoding forwarded to ``variational_circuit``.
        params: Rotation angles forwarded to ``variational_circuit``.

    Returns:
        The Pauli-Z expectation values of wire 0 and wire 1, in that order.
    """
    variational_circuit(state, params)
    readout_wires = (0, 1)
    return [qml.expval(qml.PauliZ(wire)) for wire in readout_wires]

def get_action_probs(state, params):
    """Turn the circuit's Pauli-Z expectation values into action probabilities.

    Each expectation value in ``[-1, 1]`` is mapped to ``[0, 1]`` via
    ``(z + 1) / 2``, and the two results are normalised to sum to one.
    Entry ``k`` of the result is the probability of action ``k`` as sampled
    by ``select_action``.

    A small epsilon is added before normalising. Without it, two expectation
    values of -1 give a zero normaliser (NaN probabilities), and a probability
    of exactly 0 gives an infinite log-probability in the training loss.

    Args:
        state: Current environment state; converted to ``float`` before being
            passed to the circuit.
        params: Circuit parameters forwarded to ``circuit``.

    Returns:
        Array of two strictly positive probabilities that sum to one.
    """
    eps = 1e-8  # keeps the normaliser and every probability strictly positive
    state = float(state)
    output = circuit(state, params)
    probs = (np.array(output) + 1) / 2 + eps
    return probs / np.sum(probs)

def select_action(state, params):
    """Sample an action (0 or 1) from the policy for the given state.

    Args:
        state: Current environment state, forwarded to ``get_action_probs``.
        params: Circuit parameters, forwarded to ``get_action_probs``.

    Returns:
        The sampled action as a built-in ``int``.
    """
    action_probs = get_action_probs(state, params)
    candidates = [0, 1]
    sampled = np.random.choice(candidates, p=action_probs)
    return int(sampled)

## 3. Training (Policy Gradient)

In [None]:
env = SimpleGridEnv(size=5)
np.random.seed(42)  # reproducible parameter initialisation and action sampling
params = 0.01 * np.random.randn(2*n_qubits)  # 4 parameters
lr = 0.1
episodes = 50
gamma = 0.9      # discount factor used for the reward-to-go
max_steps = 100  # cap on episode length so a poor policy cannot stall training

for ep in range(episodes):
    # --- Roll out one episode with the current policy -----------------------
    state = env.reset()
    trajectory = []  # (state, action, reward) for every step taken
    for _ in range(max_steps):
        action = select_action(state, params)
        next_state, reward, done = env.step(action)
        trajectory.append((state, action, reward))
        state = next_state
        if done:
            break

    # --- Discounted reward-to-go for every step -----------------------------
    # The environment only rewards the final transition into the goal, so
    # weighting each step by its *immediate* reward would reinforce nothing
    # but the last action. The return G_t credits the earlier moves as well.
    returns = []
    running_return = 0.0
    for _, _, step_reward in reversed(trajectory):
        running_return = step_reward + gamma * running_return
        returns.append(running_return)
    returns.reverse()

    # An episode that hit the step cap without reward has an all-zero
    # gradient; skip the (comparatively expensive) circuit differentiation.
    if not any(g != 0 for g in returns):
        continue

    # REINFORCE loss: mean over the episode of -log pi(a_t | s_t) * G_t
    def loss(p):
        total = 0.0
        for (s, a, _), g in zip(trajectory, returns):
            total = total - np.log(get_action_probs(s, p)[a]) * g
        return total / len(trajectory)

    grads = qml.grad(loss)(params)
    params = params - lr * grads

print("Trained parameters:", params)

## 4. Evaluation

In [None]:
eval_episodes = 5
max_eval_steps = 100  # cap so that a poorly trained policy cannot loop for long

for ep in range(eval_episodes):
    # Roll out one episode with the trained policy, recording visited cells.
    state = env.reset()
    path = [state]
    for _ in range(max_eval_steps):
        action = select_action(state, params)
        state, _, done = env.step(action)
        path.append(state)
        if done:
            break
    print(f"Episode {ep+1} path: {path}")

    # x axis: step index; y axis: grid cell occupied at that step.
    fig, ax = plt.subplots()
    ax.plot(path, 'o-', markersize=12)
    ax.set_yticks(range(env.size))  # one tick per grid cell
    ax.set_title(f"Episode {ep+1} Path")
    ax.set_xlabel("Steps")
    ax.set_ylabel("Position")
    plt.show()
    plt.close(fig)  # release the figure; several are created in this loop