#### Q-learning Algorithm to find the optimal policy


In [1]:
import numpy as np
import random as rdm


# Baseline parameters of the aquarium inventory problem.
Lambda = 1    # mean of the Poisson-distributed weekly demand
m = 3         # largest stock level (stock plus re-stock never exceeds m)
d = 5         # fixed cost charged in every week with a re-stock (a > 0)
s = 20        # revenue per aquarium sold
rho = 0.01    # probability that a stocked aquarium is damaged in a week
c = 60        # cost per damaged aquarium

# Number of simulated weeks (one Q-table update per week).
nweeks = 10**6


def Qlearning(m, rho, nweeks=1000000, demand_rate=None, order_cost=None,
              sale_price=None, damage_cost=None):
    """Estimate the Q-table of the aquarium inventory problem by Q-learning.

    Each simulated week starts with a stock ``x``. A re-stock quantity ``a``
    is drawn uniformly from the feasible range ``0..m-x`` (pure random
    exploration), damages and demand are sampled, and ``Q[x, a]`` is moved
    towards ``reward + gamma * max_a' Q[x', a']`` where ``x'`` is the stock
    left at the end of the week.

    Parameters
    ----------
    m : int
        Largest stock level; states and actions both range over ``0..m``.
    rho : float
        Probability that each stocked aquarium is damaged during a week.
    nweeks : int
        Number of simulated weeks. A non-positive value performs no update
        and returns the all-zero table.
    demand_rate : float, optional
        Mean of the Poisson weekly demand. Defaults to the module-level
        ``Lambda``.
    order_cost : float, optional
        Fixed cost charged when ``a > 0``. Defaults to the module-level ``d``.
    sale_price : float, optional
        Revenue per aquarium sold. Defaults to the module-level ``s``.
    damage_cost : float, optional
        Cost per damaged aquarium. Defaults to the module-level ``c``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(m + 1, m + 1)`` indexed as ``Q[stock, restock]``.
        Infeasible entries (``restock > m - stock``) are never updated and
        stay at 0.
    """
    Q = np.zeros([m + 1, m + 1])

    # Nothing to simulate; also avoids dividing by zero in the decay factor.
    if nweeks <= 0:
        return Q

    # Keep the historical behaviour: unspecified economics come from the
    # module-level constants.
    if demand_rate is None:
        demand_rate = Lambda
    if order_cost is None:
        order_cost = d
    if sale_price is None:
        sale_price = s
    if damage_cost is None:
        damage_cost = c

    alpha = 1
    # The decay factor has to be really close to 1, otherwise the learning
    # stops after a few weeks because alpha becomes too small. This formula
    # makes alpha equal to 0.001 at the end of the simulation.
    a_factor = (0.001 / alpha) ** (1.0 / nweeks)
    gamma = 0.85  # discount factor

    # Initial stock: an integer in 0..m-1 (m itself is never drawn here).
    X = int(rdm.random() * m)

    for week in range(nweeks):
        # a = how many aquariums to re-stock, uniform over 0..m-X inclusive.
        a = int(rdm.randint(0, m - X))
        Xp = X  # state at the start of the week, used as the Q-table row
        X += a

        # Possible damages: each stocked aquarium breaks with probability rho.
        K = np.random.binomial(X, rho)
        X -= K

        # Sample the demand; sales are capped by the remaining stock.
        D = np.random.poisson(demand_rate)
        S = min(D, X)
        X -= S

        # Weekly profit: revenue, minus fixed order cost, minus damage cost.
        R = sale_price * S - (order_cost if a > 0 else 0) - damage_cost * K

        # Reduce the learning rate by a factor just below 1, then update Q.
        # The max only spans the actions feasible in the next state X.
        alpha *= a_factor
        best_next = np.max(Q[X, 0:m - X + 1])
        Q[Xp, a] = (1 - alpha) * Q[Xp, a] + alpha * (R + gamma * best_next)

    return Q