In [378]:
from copy import deepcopy
import numpy as np
import random
from qiskit import QuantumCircuit, ClassicalRegister, QuantumRegister
from qiskit import Aer, transpile, assemble
from qiskit.providers import backend
from qiskit.aqua.components.optimizers import COBYLA

In [379]:
#
# Symbols describing what occupies a grid cell.
CAT, DOG, MOUSE, EMPTY = "c", "d", "m", "emp"

# gridWorld = [[MOUSE, EMPTY, DOG],
#         [EMPTY, EMPTY, EMPTY],
#          [DOG, EMPTY, CAT]]

# Actions are 2-character bit strings; the Q-table column of an action is
# int(action, 2).
UP, DOWN, LEFT, RIGHT = "00", "01", "10", "11"

ACTIONS = [UP, DOWN, LEFT, RIGHT]


# # super parameters
# N_STATES = 4
# N_EPISODES = 20

# MAX_EPISODE_STEPS = 100

# MIN_ALPHA = 0.02

# alphas = np.linspace(1.0, MIN_ALPHA, N_EPISODES)
# gamma = 1.0
# eps = 0.2

In [380]:
class State:
    """Environment state, identified by the cat's ``[row, column]`` position."""

    def __init__(self, catP):
        """Store the cat position ``catP`` (indexable as ``[row, column]``)."""
        self.row = catP[0]
        self.column = catP[1]
        self.catP = catP

    def __eq__(self, other):
        """Two states are equal when row, column and ``catP`` all match."""
        return (
            isinstance(other, State)
            and self.row == other.row
            and self.column == other.column
            and self.catP == other.catP
        )

    def __hash__(self):
        """Hash on the position tuple so equal states hash alike.

        Defining ``__eq__`` alone sets ``__hash__`` to ``None``, which makes
        instances unusable as dict keys or set members.  The hash follows
        ``catP``, so do not mutate a state while it is stored in a dict/set.
        """
        return hash(self.tokey())

    def tokey(self):
        """Return the cat position as a tuple, suitable as a dictionary key."""
        return tuple(self.catP)

    def __str__(self):
        return f"State(cat_pos={self.catP})"


In [381]:
class GridWorld:
    """Rectangular grid that tracks the position of one cat and one mouse.

    Positions are ``[row, column]`` sequences; a position is inside the grid
    when ``0 <= row < numRows`` and ``0 <= column < numColumns``.
    """

    def __init__(self, s, catP, mouseP):
        """Create the world.

        Args:
            s: ``[numRows, numColumns]``.
            catP: initial cat position.
            mouseP: mouse position.

        Raises:
            AssertionError: if the cat and the mouse share a cell.
        """
        self.numRows = s[0]
        self.numColumns = s[1]
        self.catP = catP
        self.mouseP = mouseP
        # self.dogP = dogP
        assert not self.compaireList(self.catP, self.mouseP), \
            "cat and mouse must not share a cell"

    def getItem(self, p):
        """Return what occupies position ``p``.

        Returns ``None`` when ``p`` lies outside the grid, otherwise one of
        ``CAT``, ``MOUSE`` or ``EMPTY``.  The lookup uses this instance's own
        cat/mouse positions (not same-named notebook globals), so it stays
        correct after ``setCatP``.
        """
        if p[0] >= self.numRows or p[0] < 0:
            return None
        if p[1] >= self.numColumns or p[1] < 0:
            return None
        if self.compaireList(p, self.catP):
            return CAT
        if self.compaireList(p, self.mouseP):
            return MOUSE
        # elif self.compaireList(p, DOG):
        #     return DOG
        return EMPTY

    def compaireList(self, l1, l2):
        """Return True when both positions have the same length and elements."""
        # tuple() comparison also rejects sequences of different length,
        # which a plain zip() would silently truncate.
        return tuple(l1) == tuple(l2)

    def getNumRows(self):
        """Number of rows in the grid."""
        return self.numRows

    def getNumColumns(self):
        """Number of columns in the grid."""
        return self.numColumns

    def getMouse(self):
        """Return the mouse position."""
        return self.mouseP

    def getCatP(self):
        """Return the current cat position."""
        return self.catP

    def setCatP(self, p):
        """Move the cat to position ``p`` (no bounds check is performed)."""
        self.catP = p

    def initCatState(self):
        """Place the cat on a random in-grid cell that is not the mouse's.

        Returns:
            State: the state for the newly chosen cat position.
        """
        # randrange excludes the upper bound, so every draw is inside the
        # grid; the loop only has to reject the mouse's cell.
        catP = [random.randrange(self.numRows), random.randrange(self.numColumns)]
        while self.getItem(catP) not in (EMPTY, CAT):
            catP = [random.randrange(self.numRows), random.randrange(self.numColumns)]
        self.setCatP(catP)
        return State(catP)

    def show(self):
        """Print the grid, one row per line, each cell symbol followed by a space."""
        output = ""
        for i in range(self.numRows):
            output += "".join(
                self.getItem([i, j]) + " " for j in range(self.numColumns)
            )
            output += "\n"
        print(output)


In [382]:
# quantum circuit: state->action
class QNet:
    """Per-cell 2-qubit circuits that pick an action, plus a tabular Q-update.

    Every grid cell owns one circuit built from six ``u3`` angles.  The most
    frequently measured 2-bit string is interpreted as an action
    (``UP``/``DOWN``/``LEFT``/``RIGHT``).
    """

    def __init__(self, qTable, gridWorld: "GridWorld", params, alpha, gamma, eps=0.02):
        """Set up the simulator backend, the optimizer and one circuit per cell.

        Args:
            qTable: mapping ``(row, column) -> array of Q-values`` (one per action).
            gridWorld: world queried for cell contents and updated with cat moves.
            params: six initial ``u3`` angles, shared by every cell's circuit.
            alpha: step size of the Q-table update.
            gamma: discount factor applied to the next state's best Q-value.
            eps: probability of returning a uniformly random action.
        """
        self.params = params  # initial parameters are the same for every circuit
        self.gw = gridWorld
        self.qt = qTable
        self.eps = eps
        self.backend = Aer.get_backend("qasm_simulator")
        self.NUM_SHOTS = 1000  # number of measurements per circuit execution
        self.optimizer = COBYLA(maxiter=500, tol=0.0001)  # off the shelf
        self.gamma = gamma
        self.alpha = alpha

        # optimizer results per cell, keyed by (row, column)
        self.rets = dict()

        self.qc = None     # circuit of the state currently being optimised
        self.state = None  # state currently being optimised

        # One circuit per grid cell; iterate rows x COLUMNS so that
        # non-square grids are fully covered.
        self.qcs = {
            (i, j): self._makeCircuit(params)
            for i in range(self.gw.getNumRows())
            for j in range(self.gw.getNumColumns())
        }

    @staticmethod
    def _makeCircuit(params):
        """Build the 2-qubit circuit for the six angles in ``params``."""
        qr = QuantumRegister(2, name="q")
        cr = ClassicalRegister(2, name="c")
        qc = QuantumCircuit(qr, cr)
        qc.u3(params[0], params[1], params[2], qr[0])
        qc.u3(params[3], params[4], params[5], qr[1])
        qc.cx(qr[0], qr[1])
        qc.measure(qr, cr)
        return qc

    def newPosition(self, state, action):
        """Return the position reached from ``state`` by ``action``.

        Moves that would leave the grid are clamped, so the returned position
        may equal the current one.

        Raises:
            ValueError: if ``action`` is not one of the four action strings.
        """
        p = deepcopy(state.catP)
        if action == UP:
            p[0] = max(0, p[0] - 1)
        elif action == DOWN:
            p[0] = min(self.gw.getNumRows() - 1, p[0] + 1)
        elif action == LEFT:
            p[1] = max(0, p[1] - 1)
        elif action == RIGHT:
            p[1] = min(self.gw.getNumColumns() - 1, p[1] + 1)
        else:
            raise ValueError(f"Unkown action {action}")
        return p

    def getReward(self, p):
        """Return ``(reward, end)`` for stepping onto ``p``.

        Side effect: unless ``p`` is the cat's current cell, the cat is moved
        to ``p`` in the grid world.

        Raises:
            ValueError: if the grid world reports an unknown item for ``p``.
        """
        grid = self.gw.getItem(p)
        if grid == DOG:
            reward = -100
            end = True
            self.gw.setCatP(p)
        elif grid == MOUSE:
            reward = 100
            end = True
            self.gw.setCatP(p)
        elif grid == EMPTY:
            reward = -1
            end = False
            self.gw.setCatP(p)
        elif grid == CAT:
            reward = -2  # (maybe less than reward of empty)
            end = False
        else:
            raise ValueError(f"Unknown grid item {grid}")
        return reward, end

    def selectAction(self, state, training):
        """Return an action index (0-3) for ``state``.

        With probability ``eps`` a uniformly random action is returned.
        Otherwise, when ``training`` is true the circuit for ``state`` is
        optimised first, and the greedy action of the Q-table row of ``state``
        is returned.
        """
        if random.uniform(0, 1) < self.eps:
            return int(random.choice(ACTIONS), 2)
        if training:
            self.qc = self.qcs[state.row, state.column]
            self.updateCircuit(state)
        # Index by the state passed in: self.state is None before the first
        # training call and would be stale in evaluation mode.
        return int(np.argmax(self.qt[state.catP[0], state.catP[1]]))

    def lossFunction(self, params):
        """Objective for the optimizer: TD error of the circuit's chosen action.

        The circuit is rebuilt from ``params`` on every call so that the value
        actually depends on the point being evaluated.

        Side effects: updates the Q-table entry of the chosen action (only
        when the TD error is positive) and, through ``getReward``, may move
        the cat in the grid world.
        NOTE(review): the optimizer minimises this signed value — confirm that
        is the intended objective.
        """
        qc = self._makeCircuit(params)
        self.qc = qc
        t_qc = transpile(qc, self.backend)
        job = assemble(t_qc, shots=self.NUM_SHOTS)
        counts = self.backend.run(job).result().get_counts(qc)
        action = max(counts, key=counts.get)  # most frequently measured bit string
        nextPosition = self.newPosition(self.state, action)
        reward, _ = self.getReward(nextPosition)
        # update q-table (but not very sure: update only this action or all actions)
        targetQvalue = reward + self.gamma * np.max(self.qt[nextPosition[0], nextPosition[1]])
        qRow = self.qt[self.state.catP[0], self.state.catP[1]]
        a = int(action, 2)
        if targetQvalue - qRow[a] > 0:
            qRow[a] += self.alpha * (targetQvalue - qRow[a])  # update q-table
        return targetQvalue - qRow[a]

    def updateCircuit(self, state):
        """Optimise the circuit angles for ``state`` and store the result.

        The optimizer result is stored in ``rets`` under ``state``'s position,
        and that cell's circuit is rebuilt from the returned point.
        """
        self.state = state  # lossFunction reads the state from the instance
        ret = self.optimizer.optimize(
            num_vars=6,
            objective_function=self.lossFunction,
            initial_point=self.params,
        )
        self.rets[state.catP[0], state.catP[1]] = ret
        # assumes optimize() returns (point, value, nfev) — TODO confirm for
        # the installed optimizer version
        self.qc = self._makeCircuit(ret[0])
        self.qcs[state.row, state.column] = self.qc


# Other cells refer to this class as ``QNetwork``; keep both names usable.
QNetwork = QNet


In [383]:
class Cat:
    """Agent that moves the cat through the grid world and collects rewards."""

    def __init__(self, gridWorld: "GridWorld", qNet: "QNet", training=True, eps=0.2):
        """Store the world, the action-selection network and the mode flags.

        The annotations are strings so that defining this class does not
        depend on the annotated names being resolvable at definition time.
        """
        self.eps = eps
        self.gw = gridWorld
        self.training = training
        self.qNet = qNet

        # result: ret = optimizer.optimize()
        # self.rets = {(0,0):ret, (0,1):ret2,...}

        # we have 9 circuits here in qcs TODO: maybe a random, need try, need solve!!!
        self.state = None

    def newPosition(self, state, action):
        """Return the position reached from ``state`` by ``action``.

        Moves that would leave the grid are clamped to the border.

        Raises:
            ValueError: if ``action`` is not one of the four action strings.
        """
        p = deepcopy(state.catP)
        if action == UP:
            p[0] = max(0, p[0] - 1)
        elif action == DOWN:
            p[0] = min(self.gw.getNumRows() - 1, p[0] + 1)
        elif action == LEFT:
            p[1] = max(0, p[1] - 1)
        elif action == RIGHT:
            p[1] = min(self.gw.getNumColumns() - 1, p[1] + 1)
        else:
            raise ValueError(f"Unkown action {action}")
        return p

    def getReward(self, p):
        """Return ``(reward, end)`` for stepping onto ``p``.

        Side effect: unless ``p`` is the cat's current cell, the cat is moved
        to ``p`` in the grid world.

        Raises:
            ValueError: if the grid world reports an unknown item for ``p``.
        """
        grid = self.gw.getItem(p)
        if grid == DOG:
            reward = -100
            end = True
            self.gw.setCatP(p)
        elif grid == MOUSE:
            reward = 100
            end = True
            self.gw.setCatP(p)
        elif grid == EMPTY:
            reward = -1
            end = False
            self.gw.setCatP(p)
        elif grid == CAT:
            reward = -2  # (maybe less than reward of empty)
            end = False
        else:
            raise ValueError(f"Unknown grid item {grid}")
        return reward, end

    def act(self, state, action):
        """Apply ``action`` in ``state``.

        Returns:
            tuple: ``(new_position, reward, end)`` where ``new_position`` is an
            independent copy of the position that was reached.
        """
        p = self.newPosition(state, action)
        reward, end = self.getReward(p)
        # deepcopy is a function: it must be called, not subscripted.
        return deepcopy(p), reward, end

    def setTraining(self, training):
        """Switch training mode on or off."""
        # Write the attribute that __init__ creates (lower-case ``training``).
        self.training = training


In [384]:
gridSize = [3, 3]  # [rows, columns]
# Cat starts in the bottom-right corner; the column index must come from the
# column count so that non-square grids work too.
catP = [gridSize[0] - 1, gridSize[1] - 1]
mouseP = [0, 0]  # mouse sits in the top-left corner
EPS = 10  # presumably the number of episodes — TODO confirm
MAX_EPS_STEP = 10  # presumably the step limit per episode — TODO confirm

# initQTable
def initqTable(ACTIONS, size=None):
    """Create a zero-initialised Q-table with one row of values per grid cell.

    Args:
        ACTIONS: sequence of available actions; each cell gets
            ``len(ACTIONS)`` Q-values.
        size: optional ``[rows, columns]``; defaults to the module-level
            ``gridSize``.

    Returns:
        dict: ``{(row, column): numpy array of zeros}``, with a separate array
        for every cell.
    """
    if size is None:
        size = gridSize
    return {
        (i, j): np.zeros(len(ACTIONS))
        for i in range(size[0])
        for j in range(size[1])
    }
# initGridWorld: build the world from gridSize / catP / mouseP and print its layout
gridWorld = GridWorld(gridSize, catP=catP, mouseP=mouseP)
gridWorld.show()
# NOTE(review): the original author tagged this call "MISTAKE" without recording why — TODO confirm
qTable = initqTable(ACTIONS)
print(qTable)

m emp emp 
emp emp emp 
emp emp c 

{(0, 0): array([0., 0., 0., 0.]), (0, 1): array([0., 0., 0., 0.]), (0, 2): array([0., 0., 0., 0.]), (1, 0): array([0., 0., 0., 0.]), (1, 1): array([0., 0., 0., 0.]), (1, 2): array([0., 0., 0., 0.]), (2, 0): array([0., 0., 0., 0.]), (2, 1): array([0., 0., 0., 0.]), (2, 2): array([0., 0., 0., 0.])}


In [385]:
# Six u3 angles (three per qubit); every grid cell's circuit starts from this same point.
initialParameters = np.zeros(6)

In [386]:
# The class defined in this notebook is QNet, and its constructor requires
# alpha and gamma in addition to eps; the lower-case name `eps` is not defined
# by any cell here.
# NOTE(review): the values below mirror the commented-out hyperparameter block
# (alphas start at 1.0, gamma = 1.0, eps = 0.2) — confirm before relying on them.
ALPHA = 1.0
GAMMA = 1.0
EPSILON = 0.2

qNetwork = QNet(qTable, gridWorld, initialParameters, alpha=ALPHA, gamma=GAMMA, eps=EPSILON)
print(qNetwork.selectAction(State(catP), True))

0


In [387]:
# Sample 40 actions from the start state in training mode to eyeball the
# mix of greedy and random choices.
for _ in range(40):
    chosenAction = qNetwork.selectAction(State(catP), True)
    print(chosenAction)

0
0
0
1
0
0
0
1
0
0
0
0
0
3
0
0
0
0
0
0
3
0
0
0
0
0
0
0
0
0
0
3
0
0
0
0
0
0
1
0
