In [1]:
from Game import *

def initial_state():
    """Return the starting state of a game: the initial number of sticks.

    The count is drawn with ``randint(15,25)``, so every game may start
    from a different pile size.
    """
    starting_sticks = randint(15, 25)
    return starting_sticks

def valid_moves(state,player):
    """Return the list of stick counts that may be taken from `state`.

    With one stick left only 1 may be taken, with two sticks 1 or 2,
    and in every other state 1, 2 or 3.  `player` is accepted but not used.
    """
    # Guard clauses for the two small piles; everything else gets the full set.
    if state == 1:
        return [1]
    if state == 2:
        return [1, 2]
    return [1, 2, 3]
        
def show_state(state):
    """Print how many sticks are left in the given state."""
    # print() joins the pieces with single spaces, so the number ends up
    # surrounded by two spaces on each side.
    pieces = ("There are ", state, " sticks left.")
    print(*pieces)

def update_state(state,player,move):
    """Return the state after `move` sticks are removed from `state`.

    `player` is accepted but not used.
    """
    return state - move

def win_status(state,player):
    """Report whether the game has ended in the given state.

    Returns 'win' when exactly one stick is left, 'lose' when no sticks
    are left, and None otherwise (game still in progress).  `player` is
    accepted but not used.
    """
    if state == 1:
        return 'win'
    if state == 0:
        return 'lose'
    return None


def human_move(state,player):
    """Ask the human player for a move and return it as an integer.

    `input()` returns a string, which cannot be subtracted from the
    integer state in `update_state`, so the reply is converted with
    `int()`.  If the reply is not a whole number the prompt is repeated.

    `state` and `player` are accepted but not used.
    """
    while True:
        reply=input('Take 1, 2 or 3 sticks ')
        try:
            return int(reply)
        except ValueError:
            # Non-numeric reply: tell the user and ask again instead of crashing.
            print('Please enter a whole number.')


def perfect_move(state,player):
    """Return the move of the fixed 'perfect' strategy for `state`.

    Takes ``(state-1) % 4`` sticks, which leaves a pile of the form 4k+1.
    When that remainder is 0 no such move exists, so 1 stick is taken
    instead.  `player` is accepted but not used.
    """
    remainder = (state - 1) % 4
    return 1 if remainder == 0 else remainder

def random_move(state,player):
    """Return a move picked by `random_choice` from the valid moves for `state`."""
    candidates = valid_moves(state, player)
    return random_choice(candidates)


# Wrap each move function defined above in an Agent so it can be handed to Game.run().
human_agent=Agent(human_move)
random_agent=Agent(random_move)
perfect_agent=Agent(perfect_move)


Version:  0.2.5


The update rule below is taken from http://mnemstudio.org/path-finding-q-learning.htm:

Q(state, action) = R(state, action) + Gamma * Max[Q(next state, all actions)]

This is basically Q-learning with learning rate alpha = 1 and exploration rate epsilon = 1 (every move is chosen at random).

In [5]:
def Q_move(state,player,info):
    """Pick a move epsilon-greedily and update the Q-value of the previous move.

    Parameters
    ----------
    state : current game state (number of sticks left).
    player : the player about to move; used together with `state` as the
        key into the Q table.
    info : object carrying the learner's data.  This function reads
        `info.Q` (the table, indexed as Q[(state,player)][action]),
        `info.last_state`, `info.last_action`, `info.alpha`,
        `info.gamma` and `info.epsilon`.

    Returns
    -------
    The chosen action.

    Side effects
    ------------
    Adds a zero-initialised row to `info.Q` for an unseen (state, player)
    and, unless this is the first move (`info.last_action` is None), moves
    Q[(last_state,player)][last_action] towards
    ``gamma * max_a Q[(state,player)][a]`` by a fraction `alpha`.
    """
    Q=info.Q
    last_action=info.last_action
    last_state=info.last_state

    # Read the learning rate from the agent, like Q_post does.  Previously
    # `alpha` was never bound here, so the update below raised NameError
    # (or silently used an unrelated global instead of the agent's setting).
    alpha=info.alpha  # learning rate
    gamma=info.gamma  # memory
    epsilon=info.epsilon  # probability of doing random move

    # First visit to this (state, player): start all valid actions at 0.
    if (state,player) not in Q:
        Q[(state,player)]=Table()
        for action in valid_moves(state,player):
            Q[(state,player)][action]=0.0

    if random.random()<epsilon:  # random move
        action=random_choice(Q[(state,player)])
    else:
        action=top_choice(Q[(state,player)])

    if last_action is not None:  # anything but the first move
        r=0.0  # no reward mid-game; terminal rewards are applied in Q_post
        best_next=max(Q[(state,player)][a] for a in Q[(state,player)])
        Q[(last_state,player)][last_action]+=alpha*(r +
            gamma*best_next -
            Q[(last_state,player)][last_action] )

    return action

def Q_post(status,player,info):
    """Apply the end-of-game reward to the Q-value of the agent's last move.

    `status` of 'lose' gives a reward of -1.0, 'win' gives 1.0, and anything
    else gives 0.0.  Unless no move was made (`info.last_action` is None),
    Q[(last_state,player)][last_action] is moved towards that reward by a
    fraction `info.alpha`.  Returns None.
    """
    table = info.Q
    prev_action = info.last_action
    prev_state = info.last_state

    rate = info.alpha  # learning rate
    # gamma (memory) and epsilon (random-move probability) are read from
    # the agent but play no part in the terminal update.
    _gamma = info.gamma
    _epsilon = info.epsilon

    # Nothing to update if the agent never moved.
    if prev_action is None:
        return

    reward = -1.0 if status == 'lose' else (1.0 if status == 'win' else 0.0)

    row = table[(prev_state, player)]
    row[prev_action] += rate*(reward - row[prev_action])
        

In [13]:
# Build the learning agent: Q_move picks moves, Q_post is attached as the
# agent's `post` hook (it applies the win/lose reward).
Q_agent=Agent(Q_move)
Q_agent.post=Q_post

# Q table plus the hyper-parameters that Q_move / Q_post read from `info`.
# NOTE(review): Remember(filename=...) presumably loads (or resets) the table
# stored in Q_data.json — confirm against the Game library.
Q_agent.Q=Remember(filename='Q_data.json')
Q_agent.alpha=0.3  # learning rate
Q_agent.gamma=0.9  # memory
Q_agent.epsilon=0.1  # chance of making a random move

In [7]:
# Play one game, Q_agent against perfect_agent, then print the summary.
g=Game()
g.run(Q_agent,perfect_agent)
g.report()

====
Game  1
There are  16  sticks left.
Player 1 moves 3
There are  13  sticks left.
Player 2 moves 1
There are  12  sticks left.
Player 1 moves 3
There are  9  sticks left.
Player 2 moves 1
There are  8  sticks left.
Player 1 moves 3
There are  5  sticks left.
Player 2 moves 1
There are  4  sticks left.
Player 1 moves 3
There are  1  sticks left.
Player  1 won.
Total number of games:  1
Winning 100.00 percent
Losing 0.00 percent
Tie 0.00 percent


In [8]:
# Display the Q table after the single game above.
Q_agent.Q

{(4, 1): {1: 0.0, 2: 0.0, 3: 0.3},
 (8, 1): {1: 0.0, 2: 0.0, 3: 0.0},
 (12, 1): {1: 0.0, 2: 0.0, 3: 0.0},
 (16, 1): {1: 0.0, 2: 0.0, 3: 0.0}}

While learning, set epsilon to 0.1

In [9]:
# Training run: keep a 10% chance of random moves so the agent explores.
Q_agent.epsilon=0.1

# 1000 games against perfect_agent with per-move display turned off.
g=Game(number_of_games=1000)
g.display=False
g.run(Q_agent,perfect_agent)
# NOTE(review): presumably writes the updated table back to Q_data.json — confirm.
Remember(Q_agent.Q,filename='Q_data.json')
g.report()

Total number of games:  1000
Winning 50.60 percent
Losing 49.40 percent
Tie 0.00 percent


In [10]:
# Display the Q table after the 1000 training games.
Q_agent.Q

{(4, 1): {1: -0.9992020773370239,
  2: -0.9903110989592999,
  3: 0.9999999999999999},
 (5, 1): {1: -0.9999999999999999,
  2: -0.9999999999999999,
  3: -0.9999999999999999},
 (8, 1): {1: -0.8989536089288866,
  2: -0.8987025603011634,
  3: 0.8999999999999998},
 (9, 1): {1: -0.8999999999999998,
  2: -0.8999999999999998,
  3: -0.8999999999999998},
 (12, 1): {1: -0.8086547152865513,
  2: -0.8061532048562045,
  3: 0.8099999999999997},
 (13, 1): {1: -0.8099999999999997,
  2: -0.8099999999999997,
  3: -0.8099999999999997},
 (15, 1): {1: -0.47881284519955125,
  2: 0.7289999999999875,
  3: -0.28942818072363713},
 (16, 1): {1: -0.7287048525831605,
  2: -0.7262829849484715,
  3: 0.7289999999999996},
 (17, 1): {1: -0.7289999999999986,
  2: -0.7289999999999985,
  3: -0.7289999999999983},
 (18, 1): {1: 0.6560999999998705,
  2: -0.6132611887557214,
  3: -0.6089849648103065},
 (19, 1): {1: -0.4307472677759666,
  2: 0.6560999999896954,
  3: -0.21869999999999992},
 (20, 1): {1: -0.6363496227203255,
  2: 

When we want to see how good it really is, turn off epsilon (no random moves)

In [11]:
# Evaluation run: epsilon=0 means Q_move never takes the random branch.
Q_agent.epsilon=0.0

# 1000 games against perfect_agent with per-move display turned off.
g=Game(number_of_games=1000)
g.display=False
g.run(Q_agent,perfect_agent)
# NOTE(review): presumably writes the table back to Q_data.json — confirm.
Remember(Q_agent.Q,filename='Q_data.json')
g.report()

Total number of games:  1000
Winning 73.90 percent
Losing 26.10 percent
Tie 0.00 percent


## Can a Q-agent play against another?

In [15]:
# First learner: same move/post functions and hyper-parameters as Q_agent,
# but with its own table file.
Q1_agent=Agent(Q_move)
Q1_agent.post=Q_post

Q1_agent.Q=Remember(filename='Q1_data.json')
Q1_agent.alpha=0.3  # learning rate
Q1_agent.gamma=0.9  # memory
Q1_agent.epsilon=0.1  # chance of making a random move

# Second learner, configured identically, with a separate table file.
Q2_agent=Agent(Q_move)
Q2_agent.post=Q_post

Q2_agent.Q=Remember(filename='Q2_data.json')
Q2_agent.alpha=0.3  # learning rate
Q2_agent.gamma=0.9  # memory
Q2_agent.epsilon=0.1  # chance of making a random move


Resetting the database Q1_data.json
Resetting the database Q2_data.json


In [16]:
# Training run: both learners keep a 10% chance of random moves.
Q1_agent.epsilon=0.1
Q2_agent.epsilon=0.1

# 1000 games of Q1_agent (first) against Q2_agent (second), display off.
g=Game(number_of_games=1000)
g.display=False
g.run(Q1_agent,Q2_agent)
# NOTE(review): presumably writes each table back to its own file — confirm.
Remember(Q1_agent.Q,filename='Q1_data.json')
Remember(Q2_agent.Q,filename='Q2_data.json')
g.report()

Total number of games:  1000
Winning 59.90 percent
Losing 40.10 percent
Tie 0.00 percent


In [17]:
# Evaluation: no random moves (epsilon=0), and alpha=0 so that updates
# scaled by the agent's alpha (as in Q_post) leave the table unchanged.
Q1_agent.epsilon=0.0
Q2_agent.epsilon=0.0
Q1_agent.alpha=0.0
Q2_agent.alpha=0.0

# Q1_agent moves first against perfect_agent; 1000 games, display off.
g=Game(number_of_games=1000)
g.display=False
g.run(Q1_agent,perfect_agent)
g.report()

Total number of games:  1000
Winning 71.50 percent
Losing 28.50 percent
Tie 0.00 percent


In [18]:
# Evaluation: no random moves (epsilon=0), and alpha=0 so that updates
# scaled by the agent's alpha (as in Q_post) leave the table unchanged.
Q1_agent.epsilon=0.0
Q2_agent.epsilon=0.0
Q1_agent.alpha=0.0
Q2_agent.alpha=0.0

# perfect_agent moves first, Q2_agent second; 1000 games, display off.
# The report's percentages are printed for this ordering of the two agents.
g=Game(number_of_games=1000)
g.display=False
g.run(perfect_agent,Q2_agent)
g.report()

Total number of games:  1000
Winning 70.90 percent
Losing 29.10 percent
Tie 0.00 percent
