# Aux

In [None]:
import numpy as np
import pennylane as qml 
import torch

def create_circuit(n_qubits,n_layers=None,circ = "simplified_two_design",fim=False, shots=None):
    """Build a PennyLane QNode wrapping one of the ansatz circuits defined below.

    Args:
        n_qubits (int): Number of wires of the device; every ansatz acts on
            ``range(n_qubits)``.
        n_layers (int, optional): Number of RZ/RY layers. Only the ``"rzry"``
            ansatz reads it, and that ansatz fails at call time if it is
            left as ``None``.
        circ (str): Name of the ansatz. One of ``"rzry"``,
            ``"simplified_two_design"``, ``"special_unitary"``,
            ``"simpleRZRY"``, ``"RY"``, ``"ghz"``, ``"random_product_state"``,
            ``"SEL"``, ``"RL"``.
        fim (bool): If False the QNode is created with the torch interface and
            ``diff_method="backprop"``; if True it is created with the QNode
            defaults.
        shots (int, optional): Forwarded unchanged to ``qml.device``.

    Returns:
        qml.QNode: The selected ansatz bound to a ``default.qubit.torch`` device.

    Raises:
        ValueError: If ``circ`` is not one of the supported names.
    """

    dev = qml.device("default.qubit.torch", wires=n_qubits, shots=shots)

    def z_on_all_wires():
        # Pairwise-built tensor product PauliZ(0) x PauliZ(1) x ... over all
        # wires. It always touches wires 0 and 1, so it needs n_qubits >= 2.
        ZZ = qml.operation.Tensor(qml.PauliZ(0), qml.PauliZ(1))
        for i in range(2,n_qubits):
            ZZ = qml.operation.Tensor(ZZ, qml.PauliZ(i))
        return ZZ

    def RZRY(params):
        # Hadamards, then n_layers of per-qubit RZ/RY read from a flat
        # parameter vector, then all-to-all CNOTs; returns the full
        # probability vector.
        #qml.SpecialUnitary(params, wires=range(n_qubits))
        #qml.SimplifiedTwoDesign(initial_layer_weights=init_params, weights=params, wires=range(n_qubits))
        #qml.AngleEmbedding(params,wires=range(n_qubits))
        for q in range(n_qubits):
            qml.Hadamard(wires=q)

        for w in range(n_layers):
            for q in range(n_qubits):
                # Flat layout: two angles (RZ, RY) per qubit per layer.
                index = w * (2*n_qubits) + q * 2
                qml.RZ(params[index],wires=q)
                qml.RY(params[index + 1],wires=q)

        qml.broadcast(qml.CNOT , wires=range(n_qubits), pattern="all_to_all")

        return qml.probs(wires=range(n_qubits))

    def S2D(init_params,params,measurement_qubits=0,prod_approx=False):
        # SimplifiedTwoDesign template. Measures either the joint
        # distribution of the first `measurement_qubits` wires or, with
        # prod_approx, one single-wire distribution per measured wire.
        #qml.SpecialUnitary(params, wires=range(n_qubits))
        qml.SimplifiedTwoDesign(initial_layer_weights=init_params, weights=params, wires=range(n_qubits))

        #qml.broadcast(qml.CNOT , wires=range(n_qubits), pattern="all_to_all")
        if not prod_approx:
            return qml.probs(wires=list(range(measurement_qubits)))
        else:
            return [qml.probs(i) for i in range(measurement_qubits)]

    def SU(params):
        # SpecialUnitary on all wires, measured with Z on every wire.
        qml.SpecialUnitary(params, wires=range(n_qubits))

        return qml.expval(z_on_all_wires())

    def simmpleRZRY(params,cnots=True):
        # One Hadamard/RZ/RY layer; params[0] holds the RZ angles and
        # params[1] the RY angles.
        qml.broadcast(qml.Hadamard, wires=range(n_qubits), pattern="single")
        qml.broadcast(qml.RZ, wires=range(n_qubits), pattern="single", parameters=params[0])
        qml.broadcast(qml.RY, wires=range(n_qubits), pattern="single", parameters=params[1])
        if cnots:
            qml.broadcast(qml.CNOT, wires=range(n_qubits), pattern="chain")

            # With the CNOT chain only the last wire is measured.
            return qml.expval(qml.PauliZ(n_qubits-1))
        else:
            return qml.expval(z_on_all_wires())

    def RY(params,y=True,probs=False,prod=False, entanglement=None):
        # Single RY layer with optional all-to-all CNOT entanglement.
        #qml.broadcast(qml.Hadamard, wires=range(n_qubits), pattern="single")
        qml.broadcast(qml.RY, wires=range(n_qubits), pattern="single", parameters=params)
        #qml.broadcast(qml.CZ, wires=range(n_qubits), pattern="all_to_all")

        if entanglement=="all_to_all":
            qml.broadcast(qml.CNOT, wires=range(n_qubits), pattern="all_to_all")

        if y==True:
            #YY = qml.operation.Tensor(qml.PauliY(0), qml.PauliY(1))
            YY = [qml.PauliZ(0), qml.PauliZ(1)]
            for i in range(2,n_qubits):
                #YY = qml.operation.Tensor(YY, qml.PauliY(i))
                YY.append(qml.PauliZ(i))

            #return [qml.expval(i) for i in YY]
            # NOTE(review): a plain Python list of PauliZ operators (despite
            # the "YY" name) is handed to qml.expval here - confirm that the
            # installed PennyLane version accepts this.
            return qml.expval(YY)

        elif probs==False:
            return qml.expval(z_on_all_wires())

        else:
            if prod:
                return [qml.probs(i) for i in range(n_qubits)]
            else:
                return qml.probs(wires=range(n_qubits))

    def GHZ(params,measurement_qubits=0):
        # RY on wire 0 followed by a CNOT chain.
        qml.RY(params,wires=0)
        qml.broadcast(qml.CNOT, wires=range(n_qubits), pattern="chain")

        return qml.probs(wires=range(measurement_qubits))

    def random_product_state(params,gate_sequence=None):
        # Applies gate_sequence["<layer><qubit>"] with angle
        # params[layer][qubit] on every qubit.
        # NOTE(review): this ansatz has no return statement, so the QNode
        # built from it has no measurement - confirm how it is meant to be used.

        for i in range(n_qubits):
            qml.RY(np.pi / 4, wires=i)

        for ll in range(len(params)):

            for i in range(n_qubits):
                gate_sequence["{}{}".format(ll,i)](params[ll][i], wires=i)

            #for i in range(n_qubits - 1):
                #qml.CZ(wires=[i, i + 1])

    def SEL(params, measurement_qubits=0):
        qml.StronglyEntanglingLayers(params, wires=range(n_qubits))
        return qml.probs(wires=range(measurement_qubits))

    def RL(params, measurement_qubits=0):
        qml.RandomLayers(params, ratio_imprim=0.8 ,imprimitive=qml.CZ, wires=range(n_qubits))
        return qml.probs(wires=range(measurement_qubits))

    ansatzes = {
        "rzry": RZRY,
        "simplified_two_design": S2D,
        "special_unitary": SU,
        "simpleRZRY": simmpleRZRY,
        "RY": RY,
        "ghz": GHZ,
        "random_product_state": random_product_state,
        "SEL": SEL,
        "RL": RL,
    }
    # An unknown name used to fall through the if/elif chain and surface as
    # an UnboundLocalError on `qcircuit`; report it explicitly instead.
    if circ not in ansatzes:
        raise ValueError(
            "Unknown circuit '{}'; expected one of {}".format(circ, sorted(ansatzes))
        )
    qcircuit = ansatzes[circ]

    if not fim:
        circuit = qml.QNode(qcircuit, dev,interface="torch", diff_method="backprop")
    else:
        circuit = qml.QNode(qcircuit, dev)

    return circuit

def compute_gradient(log_prob, w):
    """Compute gradient of the log probability with respect to weights.

    Any gradient already stored on ``w`` is zeroed first, so the result is the
    gradient of this ``log_prob`` only. The graph is retained
    (``retain_graph=True``), so the same graph can be back-propagated again.

    Args:
    - log_prob (torch.Tensor): The log probability tensor.
    - w (torch.Tensor): The weights tensor, with requires_grad=True.

    Returns:
    - numpy.ndarray: The gradient of log_prob with respect to w, flattened.
      The array is an independent copy: later calls that zero or accumulate
      into ``w.grad`` do not modify it.

    Raises:
    - ValueError: If ``w.grad`` is still None after the backward pass.
    """
    if w.grad is not None:
        w.grad.zero_()
    log_prob.backward(retain_graph=True)

    if w.grad is None:
        raise ValueError("The gradient for the given log_prob with respect to w is None.")

    # `.numpy()` shares memory with the tensor, so without the copy the
    # returned array would be overwritten by the next zero_()/backward().
    # reshape() also handles non-contiguous gradients and cpu() handles
    # gradients living on another device.
    return w.grad.detach().reshape(-1).cpu().numpy().copy()

def policy(probs, policy_type="contiguous-like", n_actions=2, n_qubits=1):
    """Map measured basis-state probabilities to action probabilities.

    Args:
        probs: Indexable sequence of basis-state probabilities; entry ``i``
            belongs to the bitstring ``np.binary_repr(i, width=n_qubits)``.
        policy_type (str): ``"contiguous-like"`` returns ``probs`` unchanged;
            ``"parity-like"`` groups basis states by parity bits (see below).
        n_actions (int): Number of actions; ``int(log2(n_actions))`` parity
            bits are computed per basis state.
        n_qubits (int): Width of the bitstrings.

    Returns:
        ``probs`` itself for ``"contiguous-like"``; otherwise a new
        ``torch.zeros(n_actions)`` tensor holding the accumulated
        probabilities.

    Raises:
        ValueError: If ``policy_type`` is not one of the two supported names
            (this case used to return ``None`` silently).
    """
    if policy_type == "contiguous-like":
        return probs
    if policy_type != "parity-like":
        raise ValueError(
            "Unknown policy_type '{}'; expected 'contiguous-like' or 'parity-like'".format(policy_type)
        )

    n_bits = int(np.log2(n_actions))
    action_probs = torch.zeros(n_actions)
    for i, prob in enumerate(probs):
        bitstring = np.binary_repr(i, width=n_qubits)
        # Bit m of the action index is the parity of the bitstring with its
        # last m characters removed (m == 0 uses the whole bitstring).
        action_bits = []
        for m in range(n_bits):
            prefix = bitstring if m == 0 else bitstring[:-m]
            action_bits.append(prefix.count("1") % 2)
        action_index = int("".join(str(bit) for bit in action_bits), 2)
        action_probs[action_index] += prob

    return action_probs
    
def compute_policy_and_gradient(args):
    """Sample random weights, evaluate the policy once and return a gradient norm.

    Args:
        args: 6-tuple ``(n_qubits, shapes, circuit name, n_actions,
            policy_type, clamp)``. ``shapes`` lists the shapes of the weight
            arrays to draw uniformly from ``[-pi, pi)``; ``clamp`` is either
            ``None`` or the lower bound passed to ``torch.clamp``.

    Returns:
        The 2-norm of the gradient of the sampled action's log-probability
        with respect to the trainable weights.
    """
    n_qubits, shapes, circ_name, n_actions, policy_type, clamp = args

    # Parity-like policies measure every qubit; otherwise log2(n_actions)
    # qubits are measured.
    measure_qubits = n_qubits if policy_type == "parity-like" else int(np.log2(n_actions))

    qc = create_circuit(n_qubits, circ=circ_name, fim=False, shots=None)

    weights = [np.random.uniform(-np.pi, np.pi, size=shape) for shape in shapes]
    if circ_name == "simplified_two_design":
        # The first array feeds the (non-trainable) initial layer, the
        # second one the trainable layers.
        initial_layer = torch.tensor(weights[0], requires_grad=False)
        trainable = torch.tensor(weights[1], requires_grad=True)
        probs = qc(initial_layer, trainable, measurement_qubits=measure_qubits)
    else:
        trainable = torch.tensor(weights, requires_grad=True)
        probs = qc(trainable, measurement_qubits=measure_qubits)

    pi = policy(probs, policy_type=policy_type, n_actions=n_actions, n_qubits=n_qubits)
    if clamp is not None:
        pi = torch.clamp(pi, clamp, 1)

    dist = torch.distributions.Categorical(probs=pi)
    log_prob = dist.log_prob(dist.sample())

    return np.linalg.norm(compute_gradient(log_prob, trainable), 2)


In [None]:
def reinforce(policy, optimizer, env, n_episodes=1000, max_t=1000, gamma=1.0, print_every=5):
    """Train ``policy`` with the REINFORCE algorithm on ``env``.

    Args:
        policy: Object exposing ``sample(state_tensor)`` that returns
            ``(action, log_prob, extra)``.
        optimizer: Optimizer exposing ``zero_grad()`` and ``step()``.
        env: Environment whose ``reset()`` returns ``(observation, info)`` and
            whose ``step(action)`` returns a 5-tuple
            ``(observation, reward, terminated, truncated, info)``.
        n_episodes (int): Number of episodes to run (exactly this many unless
            the early-stop condition is met).
        max_t (int): Maximum number of steps per episode.
        gamma (float): Discount factor.
        print_every (int): Progress is printed every ``print_every`` episodes;
            it is also the window of the running average.

    Returns:
        tuple: ``(scores, gradient_list, average_scores)`` where ``scores``
        holds the undiscounted return of each episode, ``gradient_list`` is
        ``policy.gradient_list`` (an empty list if the policy has no such
        attribute) and ``average_scores`` holds the running average over the
        last ``print_every`` episodes, one entry per episode.
    """
    scores_deque = deque(maxlen=print_every)
    scores = []
    average_scores = []
    runtime_sum = 0
    # +1 so that n_episodes episodes are run (range(1, n) ran only n - 1).
    for e in range(1, n_episodes + 1):
        saved_log_probs = []
        rewards = []
        # reset() returns (observation, info); only the observation is used.
        state = env.reset()[0]
        # Collect trajectory
        for t in range(max_t):
            # Sample the action from current policy
            state_tensor = torch.tensor(state).float()
            action, log_prob, _ = policy.sample(state_tensor)
            saved_log_probs.append(log_prob)
            state, reward, terminated, truncated, _ = env.step(action)
            rewards.append(reward)
            # A time-limit truncation ends the episode just like termination.
            if terminated or truncated:
                break

        # Total expected reward
        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)
        average_scores.append(float(np.mean(scores_deque)))

        # Discounted returns, computed backwards through the episode
        R = 0
        returns = []
        for r in reversed(rewards):
            R = r + gamma * R
            returns.append(R)
        returns.reverse()
        returns = torch.tensor(returns, dtype=torch.float32)
        # Standardize only when the standard deviation is defined: std() of a
        # single element is NaN and would poison every parameter.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + np.finfo(np.float32).eps)

        policy_sum = torch.stack(
            [-log_prob * ret for log_prob, ret in zip(saved_log_probs, returns)]
        ).sum()

        # Backpropagation
        start_time = time.time()
        optimizer.zero_grad()
        policy_sum.backward()
        optimizer.step()
        end_time = time.time()
        runtime = end_time-start_time

        runtime_sum += runtime
        if e % print_every == 0:
            print('Episode {}\tLast reward: {:.2f}\tLast {}\tEpisodes average reward: {:.2f}\tRuntime: {:.2f}'.format(e, scores_deque[-1], print_every, np.mean(scores_deque), runtime_sum))
            runtime_sum = 0
        # NOTE(review): hard-coded target of 500 - presumably the CartPole-v1
        # maximum score; confirm before using with other environments.
        if np.mean(scores_deque) == 500:
            print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(e, np.mean(scores_deque)))
            break
    # Do not lose a finished run because the policy keeps no gradient log.
    return scores, getattr(policy, "gradient_list", []), average_scores

In [None]:
    def save_training_data(self):
        '''
        Appends this episode's training data to a json file.

        The record is [scores, runtime, loss, gradients[0], gradients[1]],
        where gradients comes from self.pqc.get_gradients(). The file is
        named "<self.file_name>.json" and holds a list with one record per call.

        The file is written to self.folder_path when that attribute is set;
        otherwise to "<env_name>_<post_processing>_<n_layers>" next to this
        file (or in the working directory when __file__ is undefined, e.g.
        inside a notebook).
        '''
        try:
            current_directory = os.path.dirname(__file__)
        except NameError:
            # __file__ does not exist when the code runs in a notebook cell.
            current_directory = os.getcwd()
        folder_name = f"{str(self.env_name)}_{self.pqc.policy.post_processing}_{self.pqc.circuit.n_layers}"
        folder_path = os.path.join(current_directory, folder_name)
        os.makedirs(folder_path, exist_ok=True)

        gradients = self.pqc.get_gradients()
        episode_data = [list(self.scores_deque),  # a deque is not JSON serializable
                        self.runtime,
                        self.loss.item(),
                        tensor_to_list(gradients[0]),
                        tensor_to_list(gradients[1])]

        # Keep honouring an explicitly configured self.folder_path, but fall
        # back to the folder created above instead of failing when it is unset.
        target_dir = getattr(self, "folder_path", None) or folder_path
        os.makedirs(target_dir, exist_ok=True)
        file_path = os.path.join(target_dir, f"{self.file_name}.json")

        if os.path.exists(file_path):
            with open(file_path, 'r') as f:
                existing_data = json.load(f)
        else:
            existing_data = []
        existing_data.append(episode_data)
        with open(file_path, 'w') as f:
            json.dump(existing_data, f, indent=4)

# Policy Gradient Algorithm

In [None]:
import pennylane as qml
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

import gym
from collections import deque

from tensorboard.backend.event_processing import event_accumulator
from joblib import Parallel, delayed
import os
import json
import time
from datetime import datetime

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10, 5)

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir ../../data

# Utils


In [None]:
#utils

def tensor_to_list(tensor):
    """
    Recursively turn tensors / numpy arrays into plain nested lists.

    Lists and dicts are walked element by element (dict keys are kept),
    numpy arrays and torch tensors are converted with ``tolist()``, and any
    other object is handed back untouched.
    """
    if isinstance(tensor, (np.ndarray, torch.Tensor)):
        return tensor.tolist()
    if isinstance(tensor, dict):
        converted = {}
        for key, value in tensor.items():
            converted[key] = tensor_to_list(value)
        return converted
    if isinstance(tensor, list):
        return list(map(tensor_to_list, tensor))
    return tensor

def measure_probs(qubits):
    """Return the probability measurement over wires ``0 .. qubits - 1``."""
    measured_wires = range(qubits)
    return qml.probs(wires=measured_wires)

def measure_expval_pairs(qubits):
    """Return Z x Z expectation values on wire pairs (0, 1), (2, 3), ...

    One measurement is produced per complete pair, i.e. ``qubits // 2`` in
    total; with an odd ``qubits`` the last wire is not measured.
    """
    return [
        qml.expval(qml.PauliZ(first) @ qml.PauliZ(first + 1))
        for first in range(0, 2 * (qubits // 2), 2)
    ]
    
def create_optimizer_with_lr(params, lr_list, use_amsgrad=False):
    """Create an Adam optimizer with one parameter group per learning rate.

    ``params`` and ``lr_list`` are paired with ``zip``, so surplus entries of
    the longer one are ignored.
    """
    param_groups = []
    for tensor, rate in zip(params, lr_list):
        param_groups.append({'params': tensor, 'lr': rate})
    return torch.optim.Adam(param_groups, amsgrad=use_amsgrad)

def get_function_representation(func):
    if callable(func):
        # Check if the function is a lambda
        if func.__name__ == "<lambda>":
            # Optionally, check if the function has a custom description attribute
            return f"{func.__module__}.<lambda>" + (getattr(func, 'description', ''))
        else:
            return f"{func.__module__}.{func.__name__}"
    return "Unknown Function Type"

def jerbi_circuit(n_qubits, n_layers, shots, input_scaling, diff_method, weight_init, input_init, measure, measure_qubits):
    """Build a data re-uploading circuit wrapped in a ``qml.qnn.TorchLayer``.

    Circuit layout: Hadamards on every wire, then ``n_layers`` repetitions of
    (per-wire RZ/RY with trainable angles, CNOT chain, per-wire RY/RZ input
    encoding), then one final trainable RZ/RY layer. The QNode returns
    ``measure(measure_qubits)``.

    Args:
        n_qubits (int): Number of wires.
        n_layers (int): Number of variational + encoding layers (>= 1).
        shots: Passed to the device when it is not ``None``.
        input_scaling (bool): If true the encoding angles are multiplied by
            the trainable ``input_params``.
        diff_method (str): Differentiation method of the QNode.
        weight_init: Initializer for ``params``.
        input_init: Initializer for ``input_params``.
        measure: Callable producing the measurement(s) from ``measure_qubits``.
        measure_qubits: Argument handed to ``measure``.

    Returns:
        qml.qnn.TorchLayer: Layer with weights ``params`` of shape
        ``(n_layers + 1, n_qubits, 2)`` and ``input_params`` of shape
        ``(n_layers, n_qubits, 2)``.

    Raises:
        ValueError: If ``n_layers < 1``; at call time also if the number of
            qubits is not a multiple of the input length.
    """
    device_kwargs = {"wires": n_qubits}
    if shots is not None:
        device_kwargs["shots"] = shots
    dev = qml.device("default.qubit", **device_kwargs)

    if n_layers < 1:
        raise ValueError("Number of layers can't take values below 1")

    weight_shapes = dict(params=(n_layers + 1, n_qubits, 2),
                         input_params=(n_layers, n_qubits, 2))
    init_method = dict(params=weight_init,
                       input_params=input_init)

    @qml.qnode(dev, interface='torch', diff_method=diff_method)
    def qnode(inputs, params, input_params):

        def rotation_layer(angles):
            # Trainable RZ then RY on every wire.
            for q in range(n_qubits):
                qml.RZ(angles[q][0], wires=q)
                qml.RY(angles[q][1], wires=q)

        # In case n_qubits != input length: tile the inputs when the qubit
        # count is a larger multiple of it, otherwise reject the mismatch.
        n_inputs = len(inputs)
        if n_qubits > n_inputs and n_qubits % n_inputs == 0:
            inputs = torch.cat([inputs] * (n_qubits // n_inputs))
        elif n_qubits != n_inputs and n_qubits % n_inputs != 0:
            raise ValueError('Number of qubits cannot be divided by input lenght')

        # Hadamard on every wire
        qml.broadcast(qml.Hadamard, wires=range(n_qubits), pattern="single")

        for depth in range(n_layers):
            rotation_layer(params[depth])

            qml.broadcast(qml.CNOT, wires=range(n_qubits), pattern="chain")

            # Input encoding: RY then RZ, optionally scaled by trainable factors.
            for q in range(n_qubits):
                if input_scaling:
                    qml.RY(input_params[depth][q][0] * inputs[q], wires=q)
                    qml.RZ(input_params[depth][q][1] * inputs[q], wires=q)
                else:
                    qml.RY(inputs[q], wires=q)
                    qml.RZ(inputs[q], wires=q)

        # Final trainable layer uses the last block of `params`.
        rotation_layer(params[-1])

        return measure(measure_qubits)

    return qml.qnn.TorchLayer(qnode, weight_shapes=weight_shapes, init_method=init_method)
    
def S2D(n_qubits, n_layers, shots, input_scaling, diff_method, weight_init, input_init, measure_type, observables, measure_qubits):
    """Build a SimplifiedTwoDesign circuit wrapped in a ``qml.qnn.TorchLayer``.

    The layer owns two weights whose shapes come from
    ``qml.SimplifiedTwoDesign.shape``: ``input_params`` (initial layer) and
    ``params`` (the ``n_layers`` two-design layers).

    Args:
        n_qubits (int): Number of wires.
        n_layers (int): Number of two-design layers.
        shots: Forwarded to the device (``None`` keeps the default).
        input_scaling: Accepted for signature parity with ``jerbi_circuit``;
            not used here.
        diff_method: Accepted but not used; the QNode always uses
            ``'parameter-shift'``.
        weight_init: Initializer for ``params``.
        input_init: Initializer for ``input_params``.
        measure_type, observables, measure_qubits: Forwarded to
            ``measure_selection``.

    Returns:
        qml.qnn.TorchLayer: The trainable layer.

    NOTE(review): ``measure_selection`` is not defined in the visible part of
    this file - confirm it exists before using this builder. The ``inputs``
    argument of the QNode is required by TorchLayer but is not encoded into
    the circuit.
    """

    # Honour the `shots` argument, which used to be accepted and dropped.
    dev = qml.device("default.qubit", wires=n_qubits, shots=shots)

    shapes = qml.SimplifiedTwoDesign.shape(n_layers=n_layers, n_wires=n_qubits)

    weight_shapes = {"params": shapes[1],
                     "input_params": shapes[0]}

    init_method   = {"params": weight_init,
                     "input_params": input_init}

    @qml.qnode(dev, interface='torch', diff_method='parameter-shift')
    def qnode(inputs, params, input_params):
        # Apply the template the weights were shaped for; without it the
        # circuit contained no gates and the weights had no effect.
        qml.SimplifiedTwoDesign(initial_layer_weights=input_params, weights=params, wires=range(n_qubits))

        return measure_selection(measure_type, observables, measure_qubits)

    model = qml.qnn.TorchLayer(qnode, weight_shapes=weight_shapes, init_method=init_method)

    return model

# Classes

In [None]:
class CircuitGenerator(nn.Module):
    '''
    Creates a parameterized quantum circuit based on the arguments:

        n_qubits(int) = Number of qubits
        n_layers(int) = Number of layers (must be >= 1 for 'jerbi_circuit')
        shots(int) = Number of times the circuit gets executed
        input_scaling(bool) = Input parameters are used if True (input*input_params)
        design(str) = The PQC ansatz design ('jerbi_circuit')
        diff_method(str) = Differentiation method ('best', 'backprop', 'parameter-shift', ...)
        weight_init (function) = How PQC weights are initialized (.uniform_, .ones_, ...)
        input_init (function) = How input weights are initialized (.uniform_, .ones_, ...)
        measure (function) = Measure function (measure_probs, measure_expval_pairs);
                             defaults to measure_probs
        measure_qubits (int) = Number of qubits to be measured (in some cases might be
                               equal to the number of qubits); defaults to n_qubits

    Raises ValueError when `design` is not a supported circuit type.
    '''

    def __init__(self, n_qubits, n_layers,  shots = None, input_scaling = True,
                design = 'jerbi_circuit', diff_method = 'backprop', weight_init = torch.nn.init.uniform_, 
                input_init = torch.nn.init.ones_, measure = None, measure_qubits = None):
        super().__init__()
        self.n_qubits = n_qubits
        self.n_layers = n_layers
        self.shots = shots
        self.input_scaling = input_scaling
        self.design = design
        self.diff_method = diff_method
        self.weight_init = weight_init
        self.input_init = input_init
        self.measure = measure_probs if measure is None else measure
        self.measure_qubits = n_qubits if measure_qubits is None else measure_qubits

        if self.design == 'jerbi_circuit':
            self.circuit = jerbi_circuit(n_qubits = self.n_qubits,
                                        n_layers = self.n_layers,
                                        shots = self.shots,
                                        input_scaling = self.input_scaling,
                                        diff_method = self.diff_method,
                                        weight_init = self.weight_init,
                                        input_init = self.input_init,
                                        measure = self.measure,
                                        measure_qubits = self.measure_qubits)
        else:
            raise ValueError("Unsupported circuit type")

    def input(self,inputs):
        '''
        Runs the circuit on `inputs` and returns its output
        '''
        outputs = self.circuit(inputs)
        return outputs

    def forward(self, inputs):
        '''
        Same as input(); lets the module be called directly like any nn.Module
        '''
        return self.input(inputs)

In [None]:
class PolicyType(nn.Module):
    '''
    Determines the type of policy used based on the arguments:

        n_actions(int) = Number of actions
        post_processing(str) = Type of policy ('raw_contiguous', 'raw_parity', 'softmax')
        beta_scheduling(bool) = Inverse temperature parameter used in softmax (used if set to True)
        beta(float) = Beta parameter or inverse temperature (used only for softmax)
        increase_rate(float) = Amount added to beta at the end of each episode (used only for softmax)
        output_scaling(bool) = Output parameters are used if True
        output_init(function) = How the output parameters are initialized
    '''

    def __init__(self, n_actions, post_processing = 'raw_contiguous', 
                 beta_scheduling = False, beta = 1, increase_rate = 0.003, 
                 output_scaling = False, output_init = torch.nn.init.ones_):
        super().__init__()

        self.n_actions = n_actions
        self.post_processing = post_processing
        self.beta_scheduling = beta_scheduling
        self.beta = beta
        self.increase_rate = increase_rate
        self.output_scaling = output_scaling
        self.output_init = output_init

        if self.output_scaling:
            # One trainable scaling factor per action.
            self.output_params = nn.parameter.Parameter(torch.Tensor(self.n_actions), requires_grad=True)
            self.output_init(self.output_params)
        else:
            # Kept as in the original code: registers an empty placeholder
            # parameter named 'w_input'.
            self.register_parameter('w_input', None)

    def input(self,probs):
        '''
        Applies the configured post-processing to the circuit output.
        Raises ValueError for an unknown post-processing name.
        '''
        if self.post_processing == 'raw_contiguous':
            policy = self.raw_contiguous(probs)
        elif self.post_processing == 'raw_parity':
            policy = self.raw_parity(probs)
        elif self.post_processing == 'softmax':
            policy = self.softmax(probs)
        else:
            raise ValueError("Invalid post-processing method specified.")
        return policy

    def _sum_contiguous_chunks(self, probs):
        '''
        Splits the flattened input into n_actions consecutive chunks and sums each.

        When the length is not divisible by n_actions the first
        (length % n_actions) chunks receive one extra element. Every entry is
        counted exactly once, so the total sum of the input is preserved.
        '''
        flat = probs.flatten()
        chunks = torch.tensor_split(flat, self.n_actions)
        return torch.stack([chunk.sum() for chunk in chunks])

    def raw_contiguous(self,probs):
        '''
        Action a gets the summed probability of the a-th contiguous chunk
        '''
        return self._sum_contiguous_chunks(probs)

    def raw_parity(self,probs):
        '''
        Action a gets the summed probability of the entries whose flat index i
        satisfies i % n_actions == a.

        NOTE(review): this is a modulo grouping of the basis-state index, not
        the bitstring parity computed by the `policy` helper of the Aux
        section - confirm which one is intended.
        '''

        if self.n_actions % 2 != 0:
            raise ValueError('For parity-like policy, n_actions must be an even number')

        flat = probs.flatten()
        # A strided slice picks every n_actions-th entry starting at `action`.
        # The result keeps the dtype of `probs`.
        return torch.stack([flat[action::self.n_actions].sum()
                            for action in range(self.n_actions)])

    def softmax(self, probs):
        '''
        Softmax policy: softmax(beta * w * o) with one value o per action.

        If the input does not already hold one value per action it is first
        reduced with contiguous chunk sums. The optional output scaling w is
        applied afterwards, per action, and without modifying `probs` in place.
        '''
        if len(probs) == self.n_actions:
            observables = probs
        else:
            observables = self._sum_contiguous_chunks(probs)

        if self.output_scaling:
            observables = observables * self.output_params

        return F.softmax(observables * self.beta, dim=0)

    def beta_schedule(self):
        '''
        Adds increase_rate to beta (only with beta scheduling on and a softmax policy)
        '''
        if self.beta_scheduling and self.post_processing == 'softmax':
            self.beta += self.increase_rate

In [None]:
class QuantumPolicyModel(nn.Module):
    '''
    Chains a circuit module and a policy module: the circuit output is
    post-processed by the policy into action probabilities.
    '''

    def __init__(self, circuit, policy):
        super().__init__()
        self.circuit = circuit
        self.policy = policy

    @staticmethod
    def _flat_gradients(module):
        '''
        Flattened gradient of every parameter of `module`; zeros where no
        gradient has been computed yet
        '''
        flattened = []
        for param in module.parameters():
            if param.grad is None:
                flattened.append(torch.flatten(torch.zeros_like(param)))
            else:
                flattened.append(torch.flatten(param.grad.clone().detach()))
        return flattened

    def forward(self, inputs):
        '''
        Feeds the input state to the circuit and the circuit output to the post processing
        '''
        circuit_output = self.circuit.input(inputs)
        return self.policy.input(circuit_output)

    def sample(self, inputs):
        '''
        Draws an action from the policy; returns (action, log-probability, policy)
        '''
        policy = self.forward(inputs)
        distribution = torch.distributions.Categorical(probs=policy)
        action = distribution.sample()
        log_prob = distribution.log_prob(action)
        return action.item(), log_prob, policy

    def get_parameters(self):
        '''
        Returns the circuit parameter sets as flat numpy arrays
        '''
        values = []
        for param in self.circuit.parameters():
            values.append(param.clone().detach().numpy().flatten())
        return values

    def get_gradients(self):
        '''
        Returns the flattened gradients of the circuit parameters followed by
        those of the policy parameters
        '''
        return self._flat_gradients(self.circuit) + self._flat_gradients(self.policy)

In [None]:
class ReinforceUpdate():
    '''
    REINFORCE training loop for a quantum policy model.

        pqc = Model exposing sample(state) -> (action, log_prob, policy), get_gradients(),
              and the .circuit / .policy attributes read by save_agent_data
        optimizer = Optimizer stepping the model parameters
        env = Environment with the (obs, info) reset API and the 5-tuple step API
        env_name(str) = Environment name (used for folder names and metadata)
        n_episodes(int) = Number of training episodes
        max_t(int) = Maximum number of steps per episode
        gamma(float) = Discount factor
        print_every(int) = Print interval; also the window of the running average
        verbose(int) = Progress is printed when equal to 1
        file_name = Identifier of the run (used for folder / file names)
        rundate = Tag appended to the experiment folder name
    '''

    def __init__(self, pqc, optimizer, env, env_name, n_episodes, max_t, gamma, print_every, verbose, file_name = None, rundate = None):
        
        self.pqc = pqc
        self.optimizer = optimizer
        self.env = env
        self.env_name = env_name
        self.n_episodes = n_episodes
        self.max_t = max_t
        self.gamma = gamma
        self.scores_deque = deque(maxlen=print_every)
        self.print_every = print_every
        self.verbose = verbose
        self.file_name = file_name
        self.rundate = rundate
        self.running_reward = 10

    def get_trajectory(self):
        '''
        Gets a trajectory based on the running policy until the episode
        terminates, is truncated by the environment, or max_t steps were taken.
        The episode score is always appended to scores_deque.
        '''

        self.saved_log_probs = []
        self.rewards = []
        # reset() returns (observation, info); only the observation is used.
        state = self.env.reset()[0]
        for _ in range(self.max_t):
            state_tensor = torch.tensor(state).float()
            action, log_prob, _ = self.pqc.sample(state_tensor)
            state, reward, terminated, truncated, _ = self.env.step(action)

            self.saved_log_probs.append(log_prob)
            self.rewards.append(reward)

            # A time-limit truncation ends the episode just like termination.
            if terminated or truncated:
                break

        # Record the score however the episode ended. It used to be recorded
        # only on termination, so episodes cut by the time limit or by max_t
        # never reached the running average.
        self.scores_deque.append(sum(self.rewards))

    def update_policy(self):
        '''
        Computes the loss and gradients and updates the policy via gradient methods
        '''

        R = 0
        returns = []
        for r in reversed(self.rewards):
            R = r + self.gamma * R
            returns.append(R)
        returns.reverse()
        returns = torch.tensor(returns, dtype=torch.float32)
        # Standardize only when the standard deviation is defined: std() of a
        # single element is NaN and would poison every parameter.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        self.loss = torch.stack(
            [-log_prob * ret for log_prob, ret in zip(self.saved_log_probs, returns)]
        ).sum()

        self.optimizer.zero_grad()
        self.loss.backward()    
        self.optimizer.step()
        
    def save_agent_data(self,main_path):
        '''
        Stores the model parameters into a json file

        '''

        agent_variables = {
            "Number of Qubits": self.pqc.circuit.n_qubits,
            "Number of Layers": self.pqc.circuit.n_layers,
            "Shots": self.pqc.circuit.shots,
            "Input Scaling": self.pqc.circuit.input_scaling,
            "Design": self.pqc.circuit.design,
            "Differentiation Method": self.pqc.circuit.diff_method,
            # NOTE(review): hard-coded description; it is not derived from the
            # weight_init actually used by the circuit.
            "Weight Initiation": "lambda shape, dtype=torch.float: torch.FloatTensor(shape).uniform_(-np.pi, np.pi)",
            "Input_init": get_function_representation(self.pqc.circuit.input_init),
            "Measure": get_function_representation(self.pqc.circuit.measure),
            "Measure Qubits": self.pqc.circuit.measure_qubits,
            "Policy Type": self.pqc.policy.post_processing,
            "Softmax scheduling (in case policy is softmax)": ("Starting beta: " + str(self.pqc.policy.beta) + ". Increase rate: " + str(self.pqc.policy.increase_rate)),
            "Optimizers": str(self.optimizer),
            "Envinronment Name": str(self.env_name),
            "Gamma (discounting factor)": self.gamma,
        }

        with open(os.path.join(main_path, "agent_characteristics.json"), "w") as f:
            json.dump(agent_variables, f, indent=4)

    def save_data(self,main_path):
        '''
        Appends the data of the last episode to the .npz file of the run
        (the whole file is loaded and rewritten on every call)

        '''

        run = os.path.join(main_path,str(self.file_name)+'_data.npz')
        os.makedirs(main_path, exist_ok=True)

        keys = ('episode_reward', 'loss', 'runtime',
                'params_gradients', 'input_params_gradients')
        history = {key: [] for key in keys}
        if os.path.exists(run):
            # Context manager closes the archive handle before it is rewritten.
            with np.load(run, allow_pickle=True) as data:
                for key in keys:
                    history[key] = data[key].tolist()

        gradients = self.pqc.get_gradients()
        history['episode_reward'].append(self.scores_deque[-1])
        history['loss'].append(self.loss.item())
        history['runtime'].append(self.runtime)
        history['params_gradients'].append(tensor_to_list(gradients[0]))
        history['input_params_gradients'].append(tensor_to_list(gradients[1]))

        np.savez_compressed(run, **{key: np.array(values) for key, values in history.items()})

    def writer_function(self, writer, iteration):
        '''
        Stores data into a tensorboard session

        '''
        writer.add_scalar("Episode Reward", np.mean(self.scores_deque), global_step=iteration)
        writer.add_scalar("Runtime", self.runtime, global_step=iteration)
        writer.add_scalar("Loss", self.loss.item(), global_step=iteration)
        writer.add_scalar("Beta", self.pqc.policy.beta, global_step=iteration)
    
    def train(self):
        '''
        Runs n_episodes of REINFORCE, logging every episode to tensorboard and
        to the run's .npz file. Stops early once the running average exceeds
        the environment's reward threshold (when it defines one).
        '''

        logs_dir = "../../data"
        os.makedirs(logs_dir, exist_ok=True)
        envinronment_folder = os.path.join(logs_dir, self.env_name)
        os.makedirs(envinronment_folder, exist_ok=True)
        experiment_folder = f"{self.pqc.policy.post_processing}_{self.pqc.circuit.n_layers}layer_{self.rundate}"
        experiment_path = os.path.join(envinronment_folder, experiment_folder)
        os.makedirs(experiment_path, exist_ok=True)
        run = os.path.join(experiment_path,str(self.file_name))
        os.makedirs(run, exist_ok=True)
        writer = SummaryWriter(log_dir=run)
        reward_threshold = self.env.spec.reward_threshold

        try:
            self.save_agent_data(experiment_path)

            # +1 so that n_episodes episodes are run (range(1, n) ran only n - 1).
            for i in range(1, self.n_episodes + 1):
                start_time = time.time()
                self.get_trajectory()
                self.update_policy()
                end_time = time.time()
                self.runtime = end_time - start_time
                self.writer_function(writer,i)
                self.save_data(run)
                self.pqc.policy.beta_schedule()

                if reward_threshold is not None and np.mean(self.scores_deque) > reward_threshold:
                    print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i, np.mean(self.scores_deque)))
                    break
                elif i % self.print_every == 0 and self.verbose == 1:
                    print('Episode {}\tLast reward: {:.2f}\tLast {} Episodes average reward: {:.2f}\tRuntime: {:.2f}\t '.format(i, self.scores_deque[-1], self.print_every, np.mean(self.scores_deque), self.runtime))
        finally:
            # Flush and release the tensorboard event file even on errors.
            writer.close()

# Single agent runs

In [None]:
# --- Circuit configuration ---
n_qubits = 4
n_layers = 4      #set to 1 if data_reuploading is off
shots = None
input_scaling = True
design = 'jerbi_circuit' 
diff_method = 'backprop' 
# Trainable circuit angles are drawn uniformly from [-pi, pi).
weight_init = lambda shape, dtype=torch.float: torch.FloatTensor(shape).uniform_(-np.pi, np.pi)
input_init = torch.nn.init.uniform_
measure = measure_expval_pairs
measure_qubits = None
circuit = CircuitGenerator(n_qubits=n_qubits,
                           n_layers=n_layers,
                           shots=shots,
                           input_scaling=input_scaling,
                           design=design,
                           diff_method=diff_method,
                           weight_init=weight_init,
                           input_init=input_init,
                           measure=measure,
                           measure_qubits=measure_qubits)

# --- Policy configuration ---
n_actions = 2
post_processing = 'softmax'
beta_scheduling = True
beta = .5
increase_rate = 0.003
output_scaling = False
output_init = torch.nn.init.uniform_
policy_type = PolicyType(n_actions=n_actions,
                         post_processing=post_processing,
                         beta_scheduling=beta_scheduling,
                         beta=beta,
                         increase_rate=increase_rate,
                         output_scaling=output_scaling,
                         output_init=output_init)

pqc = QuantumPolicyModel(circuit=circuit, policy=policy_type)

# --- Optimizer: one learning rate per parameter set ---
lr_list= [0.025, 0.075, 0.075]  # [weights, input_weights, output_weights]
circuit_params = list(circuit.parameters())
policy_params = list(policy_type.parameters())
params = circuit_params + policy_params
optimizers= create_optimizer_with_lr(params=params, lr_list=lr_list)

# --- Environment and training settings ---
env_name = 'CartPole-v1'
env = gym.make(env_name)
n_episodes = 1000
max_t = 500
gamma = 0.98
print_every = 10
verbose = 1

reinforce_update = ReinforceUpdate(pqc=pqc,
                                   optimizer=optimizers,
                                   env=env,
                                   env_name=env_name,
                                   n_episodes=n_episodes,
                                   max_t=max_t,
                                   gamma=gamma,
                                   print_every=print_every,
                                   verbose=verbose)
reinforce_update.train()

# Multiple agent runs


## Raw Contiguous


In [None]:
# Parallel runs
current_directory = os.getcwd()

def train_agents(pqc, optimizers, env, env_name, n_episodes, max_t, gamma, print_every, verbose, file_name):
    """Train one agent with ReinforceUpdate and return a completion message.

    ``file_name`` identifies the agent: it is passed on to ReinforceUpdate
    and used in the returned message.
    """
    trainer = ReinforceUpdate(pqc=pqc,
                              optimizer=optimizers,
                              env=env,
                              env_name=env_name,
                              n_episodes=n_episodes,
                              max_t=max_t,
                              gamma=gamma,
                              print_every=print_every,
                              verbose=verbose,
                              file_name=file_name)
    trainer.train()

    return f"Agent {file_name}: Training completed"

if __name__ == "__main__":
    # Parallel training of `num_agents` agents that use the 'raw_contiguous'
    # post-processing. Each joblib task receives the same model, optimizers and
    # environment objects plus its own integer agent id.

    # --- Circuit configuration (passed positionally to CircuitGenerator) ---
    n_qubits = 4
    n_layers = 4      # set to 1 if data_reuploading is off
    shots = None
    input_scaling = True
    design = 'jerbi_circuit'
    diff_method = 'backprop'
    # Fills a FloatTensor of `shape` with uniform samples between -pi and pi;
    # the `dtype` argument is accepted but unused.
    weight_init = lambda shape, dtype=torch.float: torch.FloatTensor(shape).uniform_(-np.pi, np.pi)
    input_init = torch.nn.init.uniform_
    measure = measure_expval_pairs
    measure_qubits = None
    circuit = CircuitGenerator(n_qubits,
                               n_layers,
                               shots,
                               input_scaling,
                               design,
                               diff_method,
                               weight_init,
                               input_init,
                               measure,
                               measure_qubits)

    # --- Policy configuration ---
    n_actions = 2
    post_processing = 'raw_contiguous'
    policy_type = PolicyType(n_actions,
                             post_processing)

    pqc = QuantumPolicyModel(circuit, policy_type)

    # --- Optimizer setup ---
    lr_list = [0.025, 0.075, 0.075]  # [weights, input_weights, output_weights]
    circuit_params = list(circuit.parameters())
    policy_params = list(policy_type.parameters())
    params = circuit_params + policy_params
    # Build the optimizers from THIS cell's parameters. Without this assignment
    # the Parallel call below either fails with NameError or silently reuses
    # optimizers left over from another cell, which track a different circuit's
    # parameters and therefore never update `pqc`.
    optimizers = create_optimizer_with_lr(params, lr_list)

    # --- Environment and training hyper-parameters ---
    env_name = 'CartPole-v1'
    env = gym.make(env_name)
    n_episodes = 1000
    max_t = 500
    gamma = 0.98
    print_every = 10
    verbose = 1

    num_agents = 10

    # One job per agent; the loop index doubles as the agent's file_name.
    results = Parallel(n_jobs=num_agents)(
        delayed(train_agents)(pqc, optimizers, env, env_name, n_episodes, max_t,
                              gamma, print_every, verbose, i)
        for i in range(num_agents)
    )
    print(results)
    # NOTE(review): the purpose of this 100 s pause is not evident from this
    # cell; confirm whether anything depends on it before removing.
    time.sleep(100)

## Raw Parity

In [None]:
# Parallel runs
current_directory = os.getcwd()

def train_agents(pqc, optimizers, env, env_name, n_episodes, max_t, gamma, print_every, verbose, file_name, rundate):
    """Run one ``ReinforceUpdate`` training session and report completion.

    Every argument is forwarded positionally, in the order received, to
    ``ReinforceUpdate``. ``file_name`` additionally labels the agent in the
    returned status message.

    Returns:
        str: ``"Agent <file_name>: Training completed"``.
    """
    trainer_args = (pqc, optimizers, env, env_name, n_episodes, max_t,
                    gamma, print_every, verbose, file_name, rundate)
    ReinforceUpdate(*trainer_args).train()
    return f"Agent {file_name}: Training completed"

if __name__ == "__main__":
    # Parallel training of `num_agents` agents that use the 'raw_parity'
    # post-processing. Each joblib task receives the same model, optimizers and
    # environment objects plus its own integer agent id and the shared run
    # timestamp.

    # --- Circuit configuration (passed positionally to CircuitGenerator) ---
    n_qubits = 4
    n_layers = 4      # set to 1 if data_reuploading is off
    shots = None
    input_scaling = True
    design = 'jerbi_circuit'
    diff_method = 'backprop'
    # Fills a FloatTensor of `shape` with uniform samples between -pi and pi;
    # the `dtype` argument is accepted but unused.
    weight_init = lambda shape, dtype=torch.float: torch.FloatTensor(shape).uniform_(-np.pi, np.pi)
    input_init = torch.nn.init.uniform_
    measure = measure_expval_pairs
    measure_qubits = None
    circuit = CircuitGenerator(n_qubits,
                               n_layers,
                               shots,
                               input_scaling,
                               design,
                               diff_method,
                               weight_init,
                               input_init,
                               measure,
                               measure_qubits)

    # --- Policy configuration ---
    n_actions = 2
    post_processing = 'raw_parity'
    policy_type = PolicyType(n_actions,
                             post_processing)

    pqc = QuantumPolicyModel(circuit, policy_type)

    # --- Optimizer setup ---
    lr_list = [0.025, 0.075, 0.075]  # [weights, input_weights, output_weights]
    circuit_params = list(circuit.parameters())
    policy_params = list(policy_type.parameters())
    params = circuit_params + policy_params
    # Build the optimizers from THIS cell's parameters. Without this assignment
    # the Parallel call below either fails with NameError or silently reuses
    # optimizers left over from another cell, which track a different circuit's
    # parameters and therefore never update `pqc`.
    optimizers = create_optimizer_with_lr(params, lr_list)

    # --- Environment and training hyper-parameters ---
    env_name = 'CartPole-v1'
    env = gym.make(env_name)
    n_episodes = 1000
    max_t = 500
    gamma = 0.98
    print_every = 10
    verbose = 1
    # Single timestamp shared by every agent of this run.
    rundate = datetime.now().strftime('%Y-%m-%d_%H.%M.%S')
    num_agents = 10

    # Agent ids are offset by 10 (10..19) -- presumably so their outputs do not
    # collide with ids 0..9 used by other runs; confirm against ReinforceUpdate.
    results = Parallel(n_jobs=num_agents)(
        delayed(train_agents)(pqc, optimizers, env, env_name, n_episodes, max_t,
                              gamma, print_every, verbose, i + 10, rundate)
        for i in range(num_agents)
    )
    print(results)
    # NOTE(review): the purpose of this 100 s pause is not evident from this
    # cell; confirm whether anything depends on it before removing.
    time.sleep(100)

## Softmax


In [None]:
# Parallel runs
current_directory = os.getcwd()

def train_agents(pqc, optimizers, env, env_name, n_episodes, max_t, gamma, print_every, verbose, file_name, rundate):
    """Run one ``ReinforceUpdate`` training session and report completion.

    Every argument is forwarded positionally, in the order received, to
    ``ReinforceUpdate``. ``file_name`` additionally labels the agent in the
    returned status message.

    Returns:
        str: ``"Agent <file_name>: Training completed"``.
    """
    trainer_args = (pqc, optimizers, env, env_name, n_episodes, max_t,
                    gamma, print_every, verbose, file_name, rundate)
    ReinforceUpdate(*trainer_args).train()
    return f"Agent {file_name}: Training completed"

if __name__ == "__main__":
    # Parallel training of `num_agents` agents that use the 'softmax'
    # post-processing with beta scheduling enabled. Each joblib task receives
    # the same model, optimizers and environment objects plus its own integer
    # agent id and the shared run timestamp.

    # --- Circuit configuration (passed positionally to CircuitGenerator) ---
    n_qubits = 4
    n_layers = 4      # set to 1 if data_reuploading is off
    shots = None
    input_scaling = True
    design = 'jerbi_circuit' 
    diff_method = 'backprop' 
    # Fills a FloatTensor of `shape` with uniform samples between -pi and pi;
    # the `dtype` argument is accepted but unused.
    weight_init = lambda shape, dtype=torch.float: torch.FloatTensor(shape).uniform_(-np.pi, np.pi)
    input_init = torch.nn.init.uniform_
    measure = measure_expval_pairs
    measure_qubits = None
    circuit = CircuitGenerator(n_qubits, 
                            n_layers,
                            shots,
                            input_scaling,
                            design,
                            diff_method,
                            weight_init,
                            input_init,
                            measure,
                            measure_qubits)


    # --- Policy configuration (passed positionally to PolicyType) ---
    n_actions = 2
    post_processing = 'softmax'
    beta_scheduling = True
    beta = .5
    increase_rate = 0.003
    output_scaling = False
    output_init = torch.nn.init.uniform_
    policy_type = PolicyType(n_actions, 
                            post_processing, 
                            beta_scheduling, 
                            beta, increase_rate, 
                            output_scaling, 
                            output_init)

    # Combine the circuit and the policy head into a single model.
    pqc = QuantumPolicyModel(circuit,policy_type)

    # --- Optimizer setup: circuit parameters first, then policy parameters ---
    lr_list= [0.025, 0.075, 0.075]  # [weights, input_weights, output_weights]
    circuit_params = list(circuit.parameters())
    policy_params = list(policy_type.parameters())
    params = circuit_params + policy_params
    optimizers= create_optimizer_with_lr(params,lr_list)
    # --- Environment and training hyper-parameters ---
    env_name = 'CartPole-v1'
    env = gym.make(env_name)
    n_episodes = 1000
    max_t = 500
    gamma = 0.98
    print_every = 10
    verbose = 1
    # Single timestamp shared by every agent of this run.
    rundate = datetime.now().strftime('%Y-%m-%d_%H.%M.%S')      
    num_agents = 10

    # One job per agent; the loop index doubles as the agent's file_name.
    results = Parallel(n_jobs=num_agents)(delayed(train_agents)(pqc, optimizers, env, env_name, n_episodes, max_t, gamma, print_every, verbose, i, rundate) for i in range(num_agents))
    print(results)

# Studying different inverse temperature schedulings


In [None]:
# Parallel runs
current_directory = os.getcwd()

def train_agents(pqc, optimizers, env, env_name, n_episodes, max_t, gamma, print_every, verbose, file_name, rundate):
    """Run one ``ReinforceUpdate`` training session and report completion.

    Every argument is forwarded positionally, in the order received, to
    ``ReinforceUpdate``. ``file_name`` additionally labels the agent in the
    returned status message.

    Returns:
        str: ``"Agent <file_name>: Training completed"``.
    """
    trainer_args = (pqc, optimizers, env, env_name, n_episodes, max_t,
                    gamma, print_every, verbose, file_name, rundate)
    ReinforceUpdate(*trainer_args).train()
    return f"Agent {file_name}: Training completed"

if __name__ == "__main__":
    # Sweep over the initial inverse temperature (beta) of the softmax policy:
    # one agent per entry of `beta_list`, trained in parallel. The beta value
    # is also used as the agent's file_name.

    # --- Circuit configuration (passed positionally to CircuitGenerator) ---
    n_qubits = 4
    n_layers = 4      # set to 1 if data_reuploading is off
    n_actions = 2
    shots = None
    input_scaling = True
    design = 'jerbi_circuit'
    diff_method = 'backprop'
    # Fills a FloatTensor of `shape` with uniform samples between -pi and pi;
    # the `dtype` argument is accepted but unused.
    weight_init = lambda shape, dtype=torch.float: torch.FloatTensor(shape).uniform_(-np.pi, np.pi)
    input_init = torch.nn.init.uniform_
    measure = measure_expval_pairs
    measure_qubits = None
    circuit = CircuitGenerator(n_qubits,
                               n_layers,
                               shots,
                               input_scaling,
                               design,
                               diff_method,
                               weight_init,
                               input_init,
                               measure,
                               measure_qubits)

    # --- Policy configuration: one PolicyType per starting beta ---
    post_processing = 'softmax'
    beta_scheduling = True
    beta_list = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1.]
    increase_rate = 0.003
    # Argument order follows the other PolicyType call sites in this notebook:
    # (n_actions, post_processing, beta_scheduling, beta, increase_rate, ...).
    # The previous call passed n_qubits first and omitted beta_scheduling,
    # which shifts every argument into the wrong slot under that order.
    policy_type_list = [
        PolicyType(n_actions, post_processing, beta_scheduling, beta, increase_rate)
        for beta in beta_list
    ]

    # Every model wraps the same `circuit` object.
    pqc_list = [QuantumPolicyModel(circuit, policy_type) for policy_type in policy_type_list]

    # --- Optimizer setup ---
    # NOTE(review): other cells pass three learning rates and the concatenated
    # circuit + policy parameter list; confirm create_optimizer_with_lr accepts
    # this two-rate, circuit-only form.
    lr_list = [0.025, 0.075]
    params = circuit.parameters()
    optimizers = create_optimizer_with_lr(params, lr_list)

    # --- Environment and training hyper-parameters ---
    env_name = 'CartPole-v1'
    env = gym.make(env_name)
    n_episodes = 1000
    max_t = 500
    gamma = 0.98
    print_every = 10
    verbose = 1
    # Single timestamp shared by every agent of this run.
    rundate = datetime.now().strftime('%Y-%m-%d_%H.%M.%S')
    # One agent per swept value, so the job count can never index past the lists.
    num_agents = len(beta_list)

    results = Parallel(n_jobs=num_agents)(
        delayed(train_agents)(policy_model, optimizers, env, env_name, n_episodes, max_t,
                              gamma, print_every, verbose, beta_value, rundate)
        for policy_model, beta_value in zip(pqc_list, beta_list)
    )
    print(results)

In [None]:
# Parallel runs
current_directory = os.getcwd()

def train_agents(pqc, optimizers, env, env_name, n_episodes, max_t, gamma, print_every, verbose, file_name, rundate):
    """Run one ``ReinforceUpdate`` training session and report completion.

    Every argument is forwarded positionally, in the order received, to
    ``ReinforceUpdate``. ``file_name`` additionally labels the agent in the
    returned status message.

    Returns:
        str: ``"Agent <file_name>: Training completed"``.
    """
    trainer_args = (pqc, optimizers, env, env_name, n_episodes, max_t,
                    gamma, print_every, verbose, file_name, rundate)
    ReinforceUpdate(*trainer_args).train()
    return f"Agent {file_name}: Training completed"

if __name__ == "__main__":
    # Sweep over the beta increase rate of the softmax policy: one agent per
    # entry of `increase_rate_list`, trained in parallel. The increase rate is
    # also used as the agent's file_name.

    # --- Circuit configuration (passed positionally to CircuitGenerator) ---
    n_qubits = 4
    n_layers = 4      # set to 1 if data_reuploading is off
    n_actions = 2
    shots = None
    input_scaling = True
    design = 'jerbi_circuit'
    diff_method = 'backprop'
    # Fills a FloatTensor of `shape` with uniform samples between -pi and pi;
    # the `dtype` argument is accepted but unused.
    weight_init = lambda shape, dtype=torch.float: torch.FloatTensor(shape).uniform_(-np.pi, np.pi)
    input_init = torch.nn.init.uniform_
    measure = measure_expval_pairs
    measure_qubits = None
    circuit = CircuitGenerator(n_qubits,
                               n_layers,
                               shots,
                               input_scaling,
                               design,
                               diff_method,
                               weight_init,
                               input_init,
                               measure,
                               measure_qubits)

    # --- Policy configuration: one PolicyType per increase rate ---
    post_processing = 'softmax'
    beta_scheduling = True
    beta = 0.35
    increase_rate_list = [.0005, .001, .002, .0035, .005, .0065, .008, .0095, .01, .02]
    # Argument order follows the other PolicyType call sites in this notebook:
    # (n_actions, post_processing, beta_scheduling, beta, increase_rate, ...).
    # The previous call passed n_qubits first and omitted beta_scheduling,
    # which shifts every argument into the wrong slot under that order.
    policy_type_list = [
        PolicyType(n_actions, post_processing, beta_scheduling, beta, rate)
        for rate in increase_rate_list
    ]

    # Every model wraps the same `circuit` object.
    pqc_list = [QuantumPolicyModel(circuit, policy_type) for policy_type in policy_type_list]

    # --- Optimizer setup ---
    # NOTE(review): other cells pass three learning rates and the concatenated
    # circuit + policy parameter list; confirm create_optimizer_with_lr accepts
    # this two-rate, circuit-only form.
    lr_list = [0.025, 0.075]
    params = circuit.parameters()
    optimizers = create_optimizer_with_lr(params, lr_list)

    # --- Environment and training hyper-parameters ---
    env_name = 'CartPole-v1'
    env = gym.make(env_name)
    n_episodes = 1000
    max_t = 500
    gamma = 0.98
    print_every = 10
    verbose = 1
    # Single timestamp shared by every agent of this run.
    rundate = datetime.now().strftime('%Y-%m-%d_%H.%M.%S')
    # One agent per swept value, so the job count can never index past the lists.
    num_agents = len(increase_rate_list)

    results = Parallel(n_jobs=num_agents)(
        delayed(train_agents)(policy_model, optimizers, env, env_name, n_episodes, max_t,
                              gamma, print_every, verbose, rate, rundate)
        for policy_model, rate in zip(pqc_list, increase_rate_list)
    )
    print(results)

# Studying different learning rates

In [None]:
def listToString(s):
    """Concatenate the elements of ``s`` into one string with no separator.

    Each element is converted with ``str`` first, so numeric lists (such as
    the learning-rate pairs used in this section) are accepted as well as
    lists of strings. For an iterable of strings the result is the same as
    plain concatenation.

    Args:
        s: Iterable of elements to concatenate.

    Returns:
        str: The concatenation of ``str(element)`` for every element, or
        ``""`` when ``s`` is empty.
    """
    # str.join builds the result in a single pass instead of repeated +=.
    return "".join(map(str, s))

In [None]:
# Parallel runs
current_directory = os.getcwd()

def train_agents(pqc, optimizers, env, env_name, n_episodes, max_t, gamma, print_every, verbose, file_name, rundate):
    """Run one ``ReinforceUpdate`` training session and report completion.

    Every argument is forwarded positionally, in the order received, to
    ``ReinforceUpdate``. ``file_name`` additionally labels the agent in the
    returned status message.

    Returns:
        str: ``"Agent <file_name>: Training completed"``.
    """
    trainer_args = (pqc, optimizers, env, env_name, n_episodes, max_t,
                    gamma, print_every, verbose, file_name, rundate)
    ReinforceUpdate(*trainer_args).train()
    return f"Agent {file_name}: Training completed"

if __name__ == "__main__":
    # Learning-rate sweep with the 'raw_parity' policy: one agent per entry of
    # `lr_list`, trained in parallel. Each agent gets its own optimizers; the
    # learning-rate pair is also used as the agent's file_name.

    # --- Circuit configuration (passed positionally to CircuitGenerator) ---
    n_qubits = 4
    n_layers = 4      # set to 1 if data_reuploading is off
    n_actions = 2
    shots = None
    input_scaling = True
    design = 'jerbi_circuit'
    diff_method = 'backprop'
    # Fills a FloatTensor of `shape` with uniform samples between -pi and pi;
    # the `dtype` argument is accepted but unused.
    weight_init = lambda shape, dtype=torch.float: torch.FloatTensor(shape).uniform_(-np.pi, np.pi)
    input_init = torch.nn.init.uniform_
    measure = measure_probs
    measure_qubits = None
    circuit = CircuitGenerator(n_qubits,
                               n_layers,
                               shots,
                               input_scaling,
                               design,
                               diff_method,
                               weight_init,
                               input_init,
                               measure,
                               measure_qubits)

    # --- Policy configuration ---
    post_processing = 'raw_parity'
    # Argument order follows the other PolicyType call sites in this notebook,
    # which start with n_actions; the previous call passed n_qubits first.
    policy_type = PolicyType(n_actions, post_processing)

    pqc = QuantumPolicyModel(circuit, policy_type)

    # --- Optimizer setup: one set of optimizers per learning-rate pair ---
    # NOTE(review): other cells pass three learning rates and the concatenated
    # circuit + policy parameter list; confirm create_optimizer_with_lr accepts
    # these two-rate, circuit-only inputs.
    lr_list = [[0.01, 0.01], [0.025, 0.01], [0.05, 0.01], [0.1, 0.01], [0.01, 0.25],
               [0.01, 0.05], [0.01, 0.1], [0.25, 0.25], [0.50, 0.50], [0.1, 0.1]]
    # A fresh parameter iterator is requested for every optimizer set.
    optimizers = [create_optimizer_with_lr(circuit.parameters(), lr) for lr in lr_list]

    # --- Environment and training hyper-parameters ---
    env_name = 'CartPole-v1'
    env = gym.make(env_name)
    n_episodes = 1000
    max_t = 500
    gamma = 0.98
    print_every = 10
    verbose = 1
    # Single timestamp shared by every agent of this run.
    rundate = datetime.now().strftime('%Y-%m-%d_%H.%M.%S')
    # One agent per learning-rate pair, so the job count always matches the lists.
    num_agents = len(lr_list)

    results = Parallel(n_jobs=num_agents)(
        delayed(train_agents)(pqc, agent_optimizers, env, env_name, n_episodes, max_t,
                              gamma, print_every, verbose, lr, rundate)
        for agent_optimizers, lr in zip(optimizers, lr_list)
    )
    print(results)
    # NOTE(review): the purpose of this 100 s pause is not evident from this
    # cell; confirm whether anything depends on it before removing.
    time.sleep(100)

In [None]:
# Parallel runs
current_directory = os.getcwd()

def train_agents(pqc, optimizers, env, env_name, n_episodes, max_t, gamma, print_every, verbose, file_name, rundate):
    """Run one ``ReinforceUpdate`` training session and report completion.

    Every argument is forwarded positionally, in the order received, to
    ``ReinforceUpdate``. ``file_name`` additionally labels the agent in the
    returned status message.

    Returns:
        str: ``"Agent <file_name>: Training completed"``.
    """
    trainer_args = (pqc, optimizers, env, env_name, n_episodes, max_t,
                    gamma, print_every, verbose, file_name, rundate)
    ReinforceUpdate(*trainer_args).train()
    return f"Agent {file_name}: Training completed"

if __name__ == "__main__":
    # Learning-rate sweep with the 'raw_parity' policy and use_amsgrad=True:
    # one agent per entry of `lr_list`, trained in parallel. Each agent gets
    # its own optimizers; the learning-rate pair is also used as the agent's
    # file_name.

    # --- Circuit configuration (passed positionally to CircuitGenerator) ---
    n_qubits = 4
    n_layers = 4      # set to 1 if data_reuploading is off
    n_actions = 2
    shots = None
    input_scaling = True
    design = 'jerbi_circuit'
    diff_method = 'backprop'
    # Fills a FloatTensor of `shape` with uniform samples between -pi and pi;
    # the `dtype` argument is accepted but unused.
    weight_init = lambda shape, dtype=torch.float: torch.FloatTensor(shape).uniform_(-np.pi, np.pi)
    input_init = torch.nn.init.uniform_
    measure = measure_probs
    measure_qubits = None
    circuit = CircuitGenerator(n_qubits,
                               n_layers,
                               shots,
                               input_scaling,
                               design,
                               diff_method,
                               weight_init,
                               input_init,
                               measure,
                               measure_qubits)

    # --- Policy configuration ---
    post_processing = 'raw_parity'
    # Argument order follows the other PolicyType call sites in this notebook,
    # which start with n_actions; the previous call passed n_qubits first.
    policy_type = PolicyType(n_actions, post_processing)

    pqc = QuantumPolicyModel(circuit, policy_type)

    # --- Optimizer setup: one set of optimizers per learning-rate pair ---
    # NOTE(review): other cells pass three learning rates and the concatenated
    # circuit + policy parameter list; confirm create_optimizer_with_lr accepts
    # these two-rate, circuit-only inputs.
    lr_list = [[0.01, 0.01], [0.025, 0.01], [0.05, 0.01], [0.1, 0.01], [0.01, 0.25],
               [0.01, 0.05], [0.01, 0.1], [0.25, 0.25], [0.50, 0.50], [0.1, 0.1]]
    # A fresh parameter iterator is requested for every optimizer set.
    optimizers = [
        create_optimizer_with_lr(circuit.parameters(), lr, use_amsgrad=True)
        for lr in lr_list
    ]

    # --- Environment and training hyper-parameters ---
    env_name = 'CartPole-v1'
    env = gym.make(env_name)
    n_episodes = 1000
    max_t = 500
    gamma = 0.98
    print_every = 10
    verbose = 1
    # Single timestamp shared by every agent of this run.
    rundate = datetime.now().strftime('%Y-%m-%d_%H.%M.%S')
    # One agent per learning-rate pair, so the job count always matches the lists.
    num_agents = len(lr_list)

    results = Parallel(n_jobs=num_agents)(
        delayed(train_agents)(pqc, agent_optimizers, env, env_name, n_episodes, max_t,
                              gamma, print_every, verbose, lr, rundate)
        for agent_optimizers, lr in zip(optimizers, lr_list)
    )
    print(results)
    # NOTE(review): the purpose of this 100 s pause is not evident from this
    # cell; confirm whether anything depends on it before removing.
    time.sleep(100)