In [1]:
from Game import *

Version:  0.2.21


# Nim

Rules:

1. start with 21 sticks -- state = number of sticks
2. players alternate turns, each taking 1, 2, or 3 sticks
3. player taking last stick loses

In [2]:
def initial_state():
    """Return the starting state of the game: a pile of 21 sticks."""
    starting_sticks = 21
    return starting_sticks

def show_state(state):
    """Print the number of sticks currently in the pile.

    `state` is the stick count; nothing is returned.
    """
    # print() joins the pieces with single spaces, exactly as before.
    pieces = ("There are ", state, "sticks.")
    print(*pieces)
    
def valid_moves(state,player):
    """Return the list of stick counts that may be taken from `state`.

    With 1 or 2 sticks left a player cannot take more than remain;
    for every other state the full set [1, 2, 3] is returned.
    `player` is accepted for interface compatibility and is not used.
    """
    for sticks_left in (1, 2):
        if state == sticks_left:
            return list(range(1, sticks_left + 1))
    return [1, 2, 3]
    
def update_state(state,player,move):
    """Return the stick count left after `move` sticks are picked up.

    `player` is accepted for interface compatibility and is not used.
    """
    return state - move

def win_status(state,player):
    """Classify the position `state` for `player`.

    Returns 'lose' when no sticks remain, 'win' when exactly one stick
    remains, and None otherwise (the game continues). Nim has no
    stalemate, so no other status is produced.

    NOTE(review): presumably evaluated right after `player` has moved, so
    0 means they took the last stick and 1 forces the opponent to take
    it -- confirm against the Game framework.
    """
    if state == 0:
        return 'lose'
    if state == 1:
        return 'win'
    return None

In [3]:
def human_move(state,player):
    """Ask the person at the keyboard how many sticks to take.

    Prints which player is to move, reads one line from standard input
    and returns it converted to an int. The value is not checked against
    the legal moves here.
    """
    print("Player ",player)
    answer = input("How many sticks?")
    return int(answer)

# Agent whose moves come from the interactive prompt in human_move.
human_agent=Agent(human_move)

In [4]:
def random_move(state,player):
    """Pick one of the legal moves for `state` uniformly at random."""
    return random.choice(valid_moves(state,player))
# Agent that always plays a uniformly random legal move (see random_move).
random_agent=Agent(random_move)

In [5]:
from Game.minimax import *
def minimax_move(state,player):
    """Choose a move using the minimax search from Game.minimax.

    minimax_values is run with its display turned off and returns a
    (values, moves) pair; top_choice then selects from those moves
    using the values.
    """
    scores,candidates=minimax_values(state,player,display=False)
    best=top_choice(candidates,scores)
    return best


# Agent that plays the move selected by the minimax search in minimax_move.
minimax_agent=Agent(minimax_move)

In [6]:
def skittles_move(state,player,info):
    """Choose a move by drawing from the per-state "skittles" table.

    `info.S` maps each state to a table of action -> skittle count.
    A state seen for the first time gets 3 skittles for every legal
    action. The move is then drawn with weighted_choice over the counts.

    If weighted_choice returns None (no skittles left for this state),
    the previous (state, action) pair is penalised by one skittle,
    never going below zero, and a random legal move is played instead.
    """
    table=info.S
    prev_action=info.last_action
    prev_state=info.last_state

    # First visit to this state: give every legal action 3 skittles.
    if state not in table:
        legal=valid_moves(state,player)

        table[state]=Table()
        for option in legal:
            table[state][option]=3

    move=weighted_choice(table[state])  # weighted across actions
    if move is not None:
        return move

    # No skittles left here: the previous move led to a dead end, so
    # discourage it. The check is a truthiness test, so a falsy
    # last_state (None, or 0) skips the penalty.
    if prev_state:
        table[prev_state][prev_action]=max(table[prev_state][prev_action]-1,0)

    return random_move(state,player)

def skittles_after(status,player,info):
    """Post-game learning step for the skittles agent.

    When `status` is 'lose', one skittle is removed from the last
    (state, action) pair the agent played, with the count clamped at
    zero. Any other status leaves the table untouched.
    """
    table=info.S
    prev_action=info.last_action
    prev_state=info.last_state

    if status!='lose':
        return

    # Take away one skittle from the losing move, but never go negative.
    remaining=table[prev_state][prev_action]-1
    table[prev_state][prev_action]=max(remaining,0)
        
    


# Skittles agent: starts with an empty table S, which skittles_move fills in
# as new states are encountered. skittles_after is attached as the agent's
# `post` hook (presumably invoked by the Game framework when a game ends --
# confirm).
skittles_agent=Agent(skittles_move)
skittles_agent.S=Table()
skittles_agent.post=skittles_after


In [18]:
def Q_move(state,player,info):
    """Choose a move with an epsilon-greedy policy and update the Q table.

    `info.Q` maps state -> table of action -> value. A state seen for
    the first time gets value 0 for every legal action. With probability
    `info.ϵ` a random legal move is played; otherwise top_choice picks
    from Q[state].

    Unless this is the agent's first move (last_action is None), the
    previous (state, action) value is moved toward
    reward + γ * max_a Q[state][a], with learning rate α and an
    intermediate reward of 0.
    """
    table=info.Q
    prev_action=info.last_action
    prev_state=info.last_state

    alpha=info.α
    gamma=info.γ
    epsilon=info.ϵ

    # First visit to this state: start every legal action at value 0.
    if state not in table:
        legal=valid_moves(state,player)

        table[state]=Table()
        for option in legal:
            table[state][option]=0

    # Explore with probability epsilon, otherwise exploit the table.
    # The move is chosen before the table is updated below.
    move=random_move(state,player) if random.random()<epsilon else top_choice(table[state])

    if prev_action is None:  # first move of the game: nothing to update yet
        return move

    # Bellman update for the previous move; non-terminal moves earn no reward.
    reward=0
    best_next=max(table[state][a] for a in table[state])
    table[prev_state][prev_action] += alpha*(reward+
                     gamma*best_next -
                            table[prev_state][prev_action])

    return move

def Q_after(status,player,info):
    """Apply the terminal Q-learning update once a game has finished.

    The value of the last (state, action) pair the agent played is moved
    toward the final reward with learning rate `info.α`:

        'lose'      -> -1
        'win'       -> +1
        'stalemate' -> +0.5
        otherwise   ->  0

    There is no successor state after the game ends, so the update has
    no discounted future term and only `info.α` is read here.

    If the agent never moved (`info.last_action` is None) there is
    nothing to update and the table is left untouched, matching the
    first-move guard in Q_move.
    """
    Q=info.Q
    last_action=info.last_action
    last_state=info.last_state

    α=info.α

    # The game ended before this agent played: nothing to learn from.
    if last_action is None:
        return

    final_rewards={'lose':-1,'win':1,'stalemate':0.5}
    reward=final_rewards.get(status,0)

    # learn a little bit
    Q[last_state][last_action] += α*(reward-Q[last_state][last_action])
        


# Q-learning agent: moves come from Q_move, Q_after is attached as the `post`
# hook, and the Q table is loaded from Q_data.json (the same file the notebook
# saves to after testing).
# NOTE(review): this presumably requires Q_data.json to exist already on a
# fresh run -- confirm how LoadTable handles a missing file.
Q_agent=Agent(Q_move)
Q_agent.Q=LoadTable('Q_data.json')
Q_agent.post=Q_after

Q_agent.α=0.3  # learning rate
Q_agent.γ=0.9  # memory constant, discount factor
Q_agent.ϵ=0.1  # probability of a random move during learning

## Training stage

- the Q values can change
- the agent sometimes takes random moves

In [26]:
# Training settings: a non-zero learning rate lets the Q values change, and a
# non-zero ϵ makes Q_move play a random move some of the time.
Q_agent.α=0.3  # learning rate
Q_agent.ϵ=0.1  # probability of a random move during learning

In [27]:
# Train: run 100 games with the display turned off.
# minimax_agent is passed first and Q_agent second.
g=Game(100)
g.display=False
g.run(minimax_agent,Q_agent);  # trailing ';' suppresses the cell output

## Testing stage

- the Q values **do not change**
- the agent **never takes random moves**

In [28]:
# Testing settings: with α=0 the Q updates add nothing, and with ϵ=0 Q_move
# never takes the random-move branch.
Q_agent.α=0.0  # learning rate
Q_agent.ϵ=0.0  # probability of a random move during learning

In [29]:
# Test: run 10 games with learning and exploration switched off.
g=Game(10)
g.display=False
g.run(minimax_agent,Q_agent);  # trailing ';' suppresses the cell output

In [30]:
# Print the win / loss / tie percentages for the test games just played.
g.report()

Total number of games:  10
Winning 0.00 percent
Losing 100.00 percent
Tie 0.00 percent


In [31]:
# Display the learned Q table: state -> {action: value}.
Q_agent.Q

{20: {1: -0.02755878181911, 2: -0.09137518828627986, 3: 0.6548045161102354},
 17: {1: -0.28560093664146513,
  2: -0.35209008610162745,
  3: -0.36689125718484517},
 13: {1: -0.7178693575103386, 2: -0.730255991088701, 3: -0.7566848912211898},
 9: {1: -0.8939054764706701, 2: -0.8930037978068894, 3: -0.8923631333713686},
 5: {1: -0.9996090178951418, 2: -0.9997263125265993, 3: -0.9996090178951418},
 18: {1: 0.6551058991978964, 2: -0.5238516245907714, 3: -0.035665596},
 15: {1: -0.20821947942937508, 2: 0.7287188645048186, 3: -0.3809729052891},
 14: {1: 0.7284028814967268, 2: -0.5170723835347195, 3: -0.4522928158670315},
 11: {1: -0.31808257341480006, 2: 0.8098515993077074, 3: 0},
 6: {1: 0.8998871819717115, 2: -0.657, 3: -0.3},
 12: {1: -0.5325680449554707, 2: -0.21739672590588902, 3: 0.8098819461637278},
 16: {1: -0.24601660382405044, 2: -0.10533437801910903, 3: 0.728500990955309},
 19: {1: -0.07306850481125086, 2: 0.6543479904829171, 3: -0.17656477338191176},
 7: {1: 0, 2: 0.89995741317412

In [32]:
# Save the Q table to the same file Q_agent loads it from.
SaveTable(Q_agent.Q,'Q_data.json')

## Q vs Q with progress

In [49]:
def _load_q_agent(table_file):
    """Build a Q-learning agent whose Q table is loaded from `table_file`.

    The agent plays with Q_move, uses Q_after as its `post` hook, and
    starts with the notebook's training hyper-parameters.
    """
    agent=Agent(Q_move)
    agent.Q=LoadTable(table_file)
    agent.post=Q_after

    agent.α=0.3  # learning rate
    agent.γ=0.9  # memory constant, discount factor
    agent.ϵ=0.1  # probability of a random move during learning
    return agent


# Two independent Q-learning agents, each with its own table file.
Q1_agent=_load_q_agent('Q1_data.json')
Q2_agent=_load_q_agent('Q2_data.json')

In [50]:
# Q1 vs Q2: alternate a training cycle and a testing cycle for 100 epochs,
# printing Q1's win percentage in the test games after each epoch.

# Games played per cycle; constant across epochs, so set once up front.
number_training_games=10
number_of_testing_games=10

total_number_of_games=0  # cumulative number of *training* games played
for epoch in range(100):

    #=================
    # training cycle: learning on, occasional random moves
    Q1_agent.α=0.3  # learning rate
    Q1_agent.ϵ=0.1  # probability of a random move during learning
    Q2_agent.α=0.3  # learning rate
    Q2_agent.ϵ=0.1  # probability of a random move during learning

    g=Game(number_training_games)
    g.display=False
    g.run(Q1_agent,Q2_agent)

    #=================
    # testing cycle: learning off, no random moves
    Q1_agent.α=0.0  # learning rate
    Q1_agent.ϵ=0.0  # probability of a random move during learning
    Q2_agent.α=0.0  # learning rate
    Q2_agent.ϵ=0.0  # probability of a random move during learning

    g=Game(number_of_testing_games)
    g.display=False
    result=g.run(Q1_agent,Q2_agent)

    total_number_of_games+=number_training_games

    # `result` holds the outcomes of the testing games, so the percentages
    # are taken over the number of testing games, not training games.
    win_percentage=sum([r==1 for r in result])/number_of_testing_games*100
    loss_percentage=sum([r==2 for r in result])/number_of_testing_games*100
    tie_percentage=sum([r==0 for r in result])/number_of_testing_games*100

    print(total_number_of_games,":",win_percentage," ",end="")

    # Save both tables every epoch so progress survives an interruption.
    SaveTable(Q1_agent.Q,'Q1_data.json')
    SaveTable(Q2_agent.Q,'Q2_data.json')
    

10 : 60.0  20 : 0.0  30 : 100.0  40 : 0.0  50 : 100.0  60 : 0.0  70 : 0.0  80 : 0.0  90 : 0.0  100 : 0.0  110 : 0.0  120 : 0.0  130 : 0.0  140 : 0.0  150 : 0.0  160 : 0.0  170 : 0.0  180 : 0.0  190 : 0.0  200 : 0.0  210 : 0.0  220 : 0.0  230 : 0.0  240 : 0.0  250 : 0.0  260 : 0.0  270 : 0.0  280 : 0.0  290 : 0.0  300 : 0.0  310 : 0.0  320 : 0.0  330 : 0.0  340 : 0.0  350 : 0.0  360 : 0.0  370 : 0.0  380 : 0.0  390 : 0.0  400 : 0.0  410 : 0.0  420 : 0.0  430 : 0.0  440 : 0.0  450 : 0.0  460 : 0.0  470 : 0.0  480 : 0.0  490 : 0.0  500 : 0.0  510 : 0.0  520 : 0.0  530 : 0.0  540 : 0.0  550 : 0.0  560 : 0.0  570 : 0.0  580 : 0.0  590 : 0.0  600 : 0.0  610 : 0.0  620 : 0.0  630 : 0.0  640 : 0.0  650 : 0.0  660 : 0.0  670 : 0.0  680 : 0.0  690 : 0.0  700 : 0.0  710 : 0.0  720 : 0.0  730 : 0.0  740 : 0.0  750 : 0.0  760 : 0.0  770 : 0.0  780 : 0.0  790 : 0.0  800 : 0.0  810 : 0.0  820 : 0.0  830 : 0.0  840 : 0.0  850 : 0.0  860 : 0.0  870 : 0.0  880 : 0.0  890 : 0.0  900 : 0.0  910 : 0.0  920