In [1]:
from Game import *

Version:  0.2.23


# Nim

Rules:

1. start with 21 sticks -- state = number of sticks
2. turns alternate taking 1,2, or 3 sticks
3. player taking last stick loses

In [23]:
def initial_state():
    return randint(10,24)

def show_state(state):
    print("There are ",state,"sticks.")
    
def valid_moves(state,player):
    if state==1:
        return [1]
    elif state==2:
        return [1,2]
    else:
        return [1,2,3]
    
def update_state(state,player,move):
    # move = number of sticks to pick up
    new_state=state-move # remove the sticks
    return new_state

def win_status(state,player):
    if state==0:
        return 'lose'
    elif state==1:
        return 'win'
    else:
        return None
    
    # there is no stalemate

In [24]:
def human_move(state,player):
    print("Player ",player)
    move=int(input("How many sticks?"))
    return move

human_agent=Agent(human_move)

In [25]:
def random_move(state,player):
    possible_moves=valid_moves(state,player)
    move=random.choice(possible_moves)
    return move
random_agent=Agent(random_move)

In [26]:
def perfect_move(state,player):
    move=(state-1)%4
    if move==0:
        move=1
    return move
perfect_agent=Agent(perfect_move)

In [27]:
from Game.minimax import *
def minimax_move(state,player):

    values,moves=minimax_values(state,player,display=False)
    return top_choice(moves,values)


minimax_agent=Agent(minimax_move)

In [28]:
def skittles_move(state,player,info):
    S=info.S
    last_action=info.last_action
    last_state=info.last_state
    
    
    # if Ive never seen this state before
    if not state in S:
        actions=valid_moves(state,player)

        S[state]=Table()
        for action in actions:
            S[state][action]=3     
    
    move=weighted_choice(S[state])  # weighted across actions
    
    # what if there are no skittles for a particular state?
    # move is None in that case
    
    if move is None:
        # learn a little bit
        if last_state:
            S[last_state][last_action]=S[last_state][last_action]-1
            if S[last_state][last_action]<0:
                S[last_state][last_action]=0
        
        move=random_move(state,player)
    
    return move

def skittles_after(status,player,info):
    S=info.S
    last_action=info.last_action
    last_state=info.last_state

    if status=='lose':
        # learn a little bit
        S[last_state][last_action]=S[last_state][last_action]-1
        if S[last_state][last_action]<0:
            S[last_state][last_action]=0
        
    


skittles_agent=Agent(skittles_move)
skittles_agent.S=Table()
skittles_agent.post=skittles_after


In [29]:
def Q_move(state,player,info):
    Q=info.Q
    last_action=info.last_action
    last_state=info.last_state
    
    α=info.α
    γ=info.γ
    ϵ=info.ϵ
    

    # if Ive never seen this state before
    if not state in Q:
        actions=valid_moves(state,player)

        Q[state]=Table()
        for action in actions:
            Q[state][action]=0     
    
    # deal with random vs top choice here
    if random.random()<ϵ:
        move=random_move(state,player)  
    else:
        move=top_choice(Q[state]) 
    
    # what if there are no skittles for a particular state?
    # move is None in that case
    
    if not last_action is None:  # not the first move
        # learn a little bit
        # change equation here
        reward=0
        
        # Bellman equation
        Q[last_state][last_action] += α*(reward+
                         γ*max([Q[state][a] for a in Q[state]])  - 
                                Q[last_state][last_action])
    
        
    
    return move

def Q_after(status,player,info):
    Q=info.Q
    last_action=info.last_action
    last_state=info.last_state

    α=info.α
    γ=info.γ
    ϵ=info.ϵ
    
    if status=='lose':
        reward=-1
    elif status=='win':
        reward=1
    elif status=='stalemate':
        reward=0.5
    else:
        reward=0
        
    # learn a little bit
    Q[last_state][last_action] += α*(reward-Q[last_state][last_action])
        


Q_agent=Agent(Q_move)
Q_agent.Q=LoadTable('Q_data.json')
Q_agent.post=Q_after

Q_agent.α=0.3  # learning rate
Q_agent.γ=0.9  # memory constant, discount factor
Q_agent.ϵ=0.1  # probability of a random move during learning

## Training stage

- the Q values can change
- that the agent takes random moves sometimes

In [30]:
Q_agent.α=0.3  # learning rate
Q_agent.ϵ=0.1  # probability of a random move during learning

In [31]:
g=Game(100)
g.display=False
g.run(perfect_agent,Q_agent);

## Testing stage

- the Q values **do not change**
- that the agent **never takes random moves**

In [32]:
Q_agent.α=0.0  # learning rate
Q_agent.ϵ=0.0  # probability of a random move during learning

In [33]:
g=Game(10)
g.display=False
g.run(perfect_agent,Q_agent);

In [34]:
g.report()

Total number of games:  10
Winning 100.00 percent
Losing 0.00 percent
Tie 0.00 percent


In [35]:
Q_agent.Q

{2: {1: 0.9999922690062802, 2: -0.657},
 3: {1: -0.3, 2: 0.9999999632966318, 3: -0.3},
 4: {1: -0.882351, 2: -0.882351, 3: 0.9999998471329936},
 5: {1: -0.9999999820153496, 2: -0.9999999820153496, 3: -0.9999999874107447},
 6: {1: 0.8998871819717115, 2: -0.657, 3: -0.3},
 7: {1: 0, 2: 0.8999574131741293, 3: -0.657},
 8: {1: -0.8234429353958435, 2: 0, 3: 0.8999993771091555},
 9: {1: -0.8999994062712278, 2: -0.8999993372089824, 3: -0.8999994919396499},
 10: {1: 0.8098502078973727, 2: -0.57459723852663, 3: -0.43613623661223455},
 11: {1: -0.31808257341480006, 2: 0.8098515993077074, 3: 0},
 12: {1: -0.5325680449554707, 2: -0.21739672590588902, 3: 0.8099979804135733},
 13: {1: -0.809930755880558, 2: -0.8099405930946288, 3: -0.8099457092923006},
 14: {1: 0.7284028814967268, 2: -0.5170723835347195, 3: -0.4522928158670315},
 15: {1: -0.20821947942937508, 2: 0.7287188645048186, 3: -0.3809729052891},
 16: {1: -0.39075822593747767, 2: -0.10533437801910903, 3: 0.7289580891570612},
 17: {1: -0.72570

In [36]:
SaveTable(Q_agent.Q,'Q_data.json')

## Q vs Q with progress

In [37]:
Q1_agent=Agent(Q_move)
Q1_agent.Q=LoadTable('Q1_data.json')
Q1_agent.post=Q_after

Q1_agent.α=0.3  # learning rate
Q1_agent.γ=0.9  # memory constant, discount factor
Q1_agent.ϵ=0.1  # probability of a random move during learning

Q2_agent=Agent(Q_move)
Q2_agent.Q=LoadTable('Q2_data.json')
Q2_agent.post=Q_after

Q2_agent.α=0.3  # learning rate
Q2_agent.γ=0.9  # memory constant, discount factor
Q2_agent.ϵ=0.1  # probability of a random move during learning

In [38]:
total_number_of_games=0
for epoch in range(100):
    
    number_training_games=10
    number_of_testing_games=10
    
    #=================
    # traning cycle
    Q1_agent.α=0.3  # learning rate
    Q1_agent.ϵ=0.1  # probability of a random move during learning
    Q2_agent.α=0.3  # learning rate
    Q2_agent.ϵ=0.1  # probability of a random move during learning
    
    g=Game(number_training_games)
    g.display=False
    g.run(Q1_agent,Q2_agent)

    #=================
    # testing cycle
    Q1_agent.α=0.0  # learning rate
    Q1_agent.ϵ=0.0  # probability of a random move during learning
    Q2_agent.α=0.0  # learning rate
    Q2_agent.ϵ=0.0  # probability of a random move during learning
    
    
    g=Game(number_of_testing_games)
    g.display=False
    result=g.run(Q1_agent,Q2_agent)
    
    total_number_of_games+=number_training_games
    win_percentage=sum([r==1 for r in result])/number_training_games*100
    loss_percentage=sum([r==2 for r in result])/number_training_games*100
    tie_percentage=sum([r==0 for r in result])/number_training_games*100

    print(total_number_of_games,":",win_percentage," ",end="")
    
    SaveTable(Q1_agent.Q,'Q1_data.json')
    SaveTable(Q2_agent.Q,'Q2_data.json')    
    

10 : 60.0  20 : 60.0  30 : 70.0  40 : 70.0  50 : 80.0  60 : 70.0  70 : 90.0  80 : 60.0  90 : 100.0  100 : 80.0  110 : 90.0  120 : 90.0  130 : 90.0  140 : 80.0  150 : 50.0  160 : 80.0  170 : 70.0  180 : 70.0  190 : 80.0  200 : 90.0  210 : 100.0  220 : 90.0  230 : 80.0  240 : 80.0  250 : 80.0  260 : 70.0  270 : 60.0  280 : 80.0  290 : 80.0  300 : 80.0  310 : 80.0  320 : 70.0  330 : 80.0  340 : 70.0  350 : 80.0  360 : 80.0  370 : 60.0  380 : 60.0  390 : 70.0  400 : 70.0  410 : 80.0  420 : 90.0  430 : 80.0  440 : 90.0  450 : 70.0  460 : 80.0  470 : 90.0  480 : 80.0  490 : 70.0  500 : 70.0  510 : 50.0  520 : 80.0  530 : 90.0  540 : 100.0  550 : 80.0  560 : 90.0  570 : 90.0  580 : 70.0  590 : 70.0  600 : 80.0  610 : 80.0  620 : 70.0  630 : 80.0  640 : 80.0  650 : 80.0  660 : 100.0  670 : 90.0  680 : 80.0  690 : 90.0  700 : 80.0  710 : 100.0  720 : 90.0  730 : 90.0  740 : 90.0  750 : 90.0  760 : 60.0  770 : 60.0  780 : 80.0  790 : 60.0  800 : 80.0  810 : 100.0  820 : 100.0  830 : 80.0  840 : 

## SARSA Agent

In [39]:
def SARSA_move(state,player,info):
    Q=info.Q
    last_action=info.last_action
    last_state=info.last_state
    
    α=info.α
    γ=info.γ
    ϵ=info.ϵ
    

    # if Ive never seen this state before
    if not state in Q:
        actions=valid_moves(state,player)

        Q[state]=Table()
        for action in actions:
            Q[state][action]=0     
    
    # deal with random vs top choice here
    if random.random()<ϵ:
        move=random_move(state,player)  
    else:
        move=top_choice(Q[state]) 
    
    # what if there are no skittles for a particular state?
    # move is None in that case
    
    if not last_action is None:  # not the first move
        # learn a little bit
        # change equation here
        reward=0
        
        # Bellman equation
        Q[last_state][last_action] += α*(reward+
                         γ*Q[state][move] - 
                                Q[last_state][last_action])
    
        
    
    return move

def SARSA_after(status,player,info):
    Q=info.Q
    last_action=info.last_action
    last_state=info.last_state

    α=info.α
    γ=info.γ
    ϵ=info.ϵ
    
    if status=='lose':
        reward=-1
    elif status=='win':
        reward=1
    elif status=='stalemate':
        reward=0.5
    else:
        reward=0
        
    # learn a little bit
    Q[last_state][last_action] += α*(reward-Q[last_state][last_action])
        

In [40]:
SARSA1_agent=Agent(SARSA_move)
SARSA1_agent.Q=LoadTable('SARSA1_data.json')
SARSA1_agent.post=Q_after

SARSA1_agent.α=0.3  # learning rate
SARSA1_agent.γ=0.9  # memory constant, discount factor
SARSA1_agent.ϵ=0.1  # probability of a random move during learning

SARSA2_agent=Agent(Q_move)
SARSA2_agent.Q=LoadTable('SARSA2_data.json')
SARSA2_agent.post=Q_after

SARSA2_agent.α=0.3  # learning rate
SARSA2_agent.γ=0.9  # memory constant, discount factor
SARSA2_agent.ϵ=0.1  # probability of a random move during learning

In [41]:
total_number_of_games=0
for epoch in range(100):
    
    number_training_games=10
    number_of_testing_games=10
    
    #=================
    # traning cycle
    SARSA1_agent.α=0.3  # learning rate
    SARSA1_agent.ϵ=0.1  # probability of a random move during learning
    SARSA2_agent.α=0.3  # learning rate
    SARSA2_agent.ϵ=0.1  # probability of a random move during learning
    
    g=Game(number_training_games)
    g.display=False
    g.run(SARSA1_agent,SARSA2_agent)

    #=================
    # testing cycle
    SARSA1_agent.α=0.0  # learning rate
    SARSA1_agent.ϵ=0.0  # probability of a random move during learning
    SARSA2_agent.α=0.0  # learning rate
    SARSA2_agent.ϵ=0.0  # probability of a random move during learning
    
    
    g=Game(number_of_testing_games)
    g.display=False
    result=g.run(SARSA1_agent,SARSA2_agent)
    
    total_number_of_games+=number_training_games
    win_percentage=sum([r==1 for r in result])/number_training_games*100
    loss_percentage=sum([r==2 for r in result])/number_training_games*100
    tie_percentage=sum([r==0 for r in result])/number_training_games*100

    print(total_number_of_games,":",win_percentage," ",end="")
    
    SaveTable(SARSA1_agent.Q,'SARSA1_data.json')
    SaveTable(SARSA2_agent.Q,'SARSA2_data.json')    
    

10 : 80.0  20 : 30.0  30 : 30.0  40 : 100.0  50 : 20.0  60 : 70.0  70 : 40.0  80 : 20.0  90 : 40.0  100 : 50.0  110 : 80.0  120 : 90.0  130 : 90.0  140 : 70.0  150 : 90.0  160 : 80.0  170 : 90.0  180 : 100.0  190 : 80.0  200 : 70.0  210 : 90.0  220 : 80.0  230 : 60.0  240 : 80.0  250 : 70.0  260 : 100.0  270 : 100.0  280 : 70.0  290 : 90.0  300 : 90.0  310 : 100.0  320 : 70.0  330 : 70.0  340 : 90.0  350 : 70.0  360 : 70.0  370 : 100.0  380 : 100.0  390 : 80.0  400 : 80.0  410 : 80.0  420 : 100.0  430 : 80.0  440 : 90.0  450 : 70.0  460 : 70.0  470 : 70.0  480 : 80.0  490 : 70.0  500 : 80.0  510 : 90.0  520 : 70.0  530 : 100.0  540 : 40.0  550 : 90.0  560 : 80.0  570 : 80.0  580 : 70.0  590 : 100.0  600 : 70.0  610 : 70.0  620 : 90.0  630 : 80.0  640 : 90.0  650 : 60.0  660 : 70.0  670 : 70.0  680 : 90.0  690 : 80.0  700 : 70.0  710 : 60.0  720 : 90.0  730 : 80.0  740 : 90.0  750 : 70.0  760 : 90.0  770 : 90.0  780 : 100.0  790 : 90.0  800 : 90.0  810 : 80.0  820 : 100.0  830 : 80.0  8

In [42]:
SARSA1_agent.Q

{23: {1: -0.10148915487549862, 2: 0.4319029342815772, 3: -0.1180982401597065},
 20: {1: -0.238160665548873, 2: -0.24436750301464205, 3: 0.583987162696529},
 17: {1: -0.5501124307406864, 2: -0.462662241080866, 3: -0.6606470226736118},
 14: {1: 0.656639755013361, 2: -0.6655696062504155, 3: -0.7020860300979508},
 11: {1: -0.5253270520467264, 2: 0.8089325268398339, 3: -0.16048599548526926},
 9: {1: -0.8058164626637306, 2: -0.7926785201316862, 3: -0.8052623045281572},
 4: {1: -0.959646393, 2: -0.7599, 3: 0.9999999999999999},
 18: {1: 0.5975635737494217, 2: -0.5160238239821485, 3: -0.4341635361541453},
 6: {1: 0.3616290822189513, 2: -0.5274768316076847, 3: -0.9176457},
 2: {1: 0.9999999999999993, 2: -0.959646393},
 10: {1: 0.8090211723353758, 2: -0.46261493419267696, 3: -0.10582126971620232},
 12: {1: -0.5297565953023181, 2: -0.3345145825302327, 3: 0.36741337039089655},
 3: {1: -0.9956797190259519, 2: 0.9999999999999999, 3: -0.9903110989592999},
 19: {1: -0.2765340431867735, 2: 0.57469648452