In [2]:
import numpy as np
import Backgammon as B
import agent as A
import flipped_agent as FA
import tensorflow as tf
import keras
import keras.layers as L
from IPython.display import clear_output
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [12]:
def get_cumulative_rewards(rewards, gamma = 1):
    """Return the discounted reward-to-go for every step of an episode.

    For step t the result is rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...

    Parameters
    ----------
    rewards : sequence of numbers
        Per-step rewards, in chronological order.
    gamma : float
        Discount factor; the default of 1 gives undiscounted sums.

    Returns
    -------
    numpy.ndarray
        float32 array with the same shape as ``rewards``.
    """
    reward_arr = np.array(rewards)
    returns = np.zeros_like(reward_arr, dtype="float32")

    # Walk the episode backwards so each step reuses the (discounted) return
    # already accumulated for the step after it.
    running = 0.
    for idx in range(len(reward_arr) - 1, -1, -1):
        running = running * gamma + reward_arr[idx]
        returns[idx] = running

    return returns

In [4]:
def train_step(_states, _rewards):
    """Run one optimizer step on a recorded episode.

    Parameters
    ----------
    _states : array-like
        Recorded boards, fed to the ``states`` placeholder.
    _rewards : sequence of numbers
        Per-step rewards; converted to cumulative rewards with
        get_cumulative_rewards (default gamma) before being fed to the
        ``cumulative_rewards`` placeholder.
    """
    episode_returns = get_cumulative_rewards(_rewards)
    feed = {states: _states, cumulative_rewards: episode_returns}
    # `update` is the optimizer op built in the model cell; .run() uses the
    # default session.
    update.run(feed)

In [5]:
def PlayRandomAgent(Explore = True, Debug = False, Verbose = False, Learn = True):
    """Play one full game of the policy network against a uniformly random opponent.

    Whoever is to move always plays as player 1 on its own view of the board,
    so the board is flipped with FA.flip_board every time the turn changes
    hands (before the opponent moves, and back again before the agent moves).

    Parameters
    ----------
    Explore : bool
        If truthy the agent samples its move from the policy distribution,
        otherwise it plays the highest-probability move.
    Debug : bool
        Print the chosen action, the board before the move, and the move.
    Verbose : bool
        Print a message when the game ends.
    Learn : bool
        If True, and no board error was detected, call train_step on the
        boards/rewards recorded for the agent.

    Returns
    -------
    float
        1.0 if the agent won the game, 0.0 otherwise (the opponent won or the
        game was aborted by B.check_for_error).
    """
    # One entry per agent move; `rewards` is kept the same length as `boards`
    # so the two can be fed to train_step together.
    boards, rewards = [], []
    board = B.init_board()

    agent_won = False
    Error = False
    GameOver = False
    k = 1  # round counter, only used in the error message

    while True:
        # ---- Agent's turn (two moves when the dice are doubles) ----
        dice = B.roll_dice()
        for _ in range(1 + int(dice[0] == dice[1])):
            legal_moves, legal_boards = B.legal_moves(board, dice, 1)
            if len(legal_moves) == 0:
                break

            # The policy scores every candidate after-state in one call.
            probs = get_action_prob(legal_boards)
            n_actions = probs.shape[0]
            probs = probs.reshape(n_actions)

            if Explore:
                action = np.random.choice(np.arange(0, n_actions), p = probs)
            else:
                action = np.argmax(probs)
            move = legal_moves[action]

            if Debug:
                print("Action: \n", action)
                print("Board now: \n", board)
                print("Chosen move:\n", move)

            for m in move:
                board = B.update_board(board = board, move = m, player = 1)

            GameOver = B.game_over(board)
            boards.append(board)
            rewards.append(1 if GameOver else 0)
            if GameOver:
                agent_won = True
                break
        if GameOver:
            break

        # ---- Random opponent's turn, played on the flipped board ----
        board = FA.flip_board(board)
        dice = B.roll_dice()
        for _ in range(1 + int(dice[0] == dice[1])):
            legal_moves, _ = B.legal_moves(board, dice, 1)
            if len(legal_moves) == 0:
                break

            choice = np.random.randint(len(legal_moves))
            move = legal_moves[choice]

            if Debug:
                print("Action: \n", choice)
                print("Board now: \n", board)
                print("Chosen move:\n", move)

            for m in move:
                board = B.update_board(board = board, move = m, player = 1)

            GameOver = B.game_over(board)
            if GameOver:
                # The agent lost: charge the loss to its last recorded move
                # instead of appending an entry that has no matching board.
                if rewards:
                    rewards[-1] = -1
                break
        if GameOver:
            break

        if B.check_for_error(board):
            Error = True
            print("Error at game step ", k)
            break

        # Hand the board back to the agent in its own perspective.
        board = FA.flip_board(board)
        k += 1

    if GameOver and Verbose:
        print("Game is over.")

    if Learn and not Error and boards:
        train_step(boards, rewards)
    return float(agent_won)

In [6]:
def generate_session(Explore = True, Debug = False, Verbose = False):
    """Play one full self-play game, train the network on it, and return k.

    The same policy plays both sides. The side to move always plays as
    player 1 on its own view of the board; the board is flipped with
    FA.flip_board after every turn. Boards and rewards are recorded per side
    (index 1 for ``player == 1``, index 0 for ``player == -1``) and each
    side's episode is passed to train_step once the game has ended.

    Parameters
    ----------
    Explore : bool
        If truthy, sample the move from the policy distribution; otherwise
        play the highest-probability move.
    Debug : bool
        Print the chosen action, the board before the move, and the move.
    Verbose : bool
        Print a message when the game ends.

    Returns
    -------
    int
        k, the number of turns completed before the turn that ended the game
        (or before a board error was detected).
    """
    # boards[side] / rewards[side] hold one entry per move made by that side.
    boards, rewards = [[], []], [[], []]

    board = B.init_board()
    player = 1

    Error = False
    GameOver = False
    k = 0

    while True:
        side = int((player + 1) / 2)
        other = 1 - side

        dice = B.roll_dice()
        # Doubles give the side to move two moves in a row.
        for _ in range(1 + int(dice[0] == dice[1])):
            legal_moves, legal_boards = B.legal_moves(board, dice, 1)
            if len(legal_moves) == 0:
                break

            # The policy scores every candidate after-state in one call.
            probs = get_action_prob(legal_boards)
            n_actions = probs.shape[0]
            probs = probs.reshape(n_actions)

            if Explore:
                action = np.random.choice(np.arange(0, n_actions), p = probs)
            else:
                action = np.argmax(probs)
            move = legal_moves[action]

            if Debug:
                print("Action: \n", action)
                print("Board now: \n", board)
                print("Chosen move:\n", move)

            for m in move:
                board = B.update_board(board = board, move = m, player = 1)

            # Record session history to train on later.
            boards[side].append(board)

            GameOver = B.game_over(board)
            if GameOver:
                rewards[side].append(1)
                # Charge the loss to the loser's last recorded move rather
                # than appending an extra entry, so that rewards[other] stays
                # the same length as boards[other].
                if rewards[other]:
                    rewards[other][-1] = -1
                break
            rewards[side].append(0)

        board = FA.flip_board(board)
        player *= -1
        if GameOver:
            if Verbose:
                print("Game is over.")
            break

        if B.check_for_error(board):
            Error = True
            print("Error at game step ", k)
            break
        k += 1

    if not Error:
        for i in range(2):
            # A side that never got to move has nothing to train on.
            if boards[i]:
                train_step(boards[i], rewards[i])

    return k

In [7]:
# Define the model inputs.
# `states` holds one board per row (29 features each); `cumulative_rewards`
# holds one scalar per row.
states = tf.placeholder("float32", (None, 29), name = "states")
cumulative_rewards = tf.placeholder("float32", (None, ), name = "cumulative_rewards")

# Define the model (the architecture matters little until training works).
# NOTE(review): the first two Dense layers are both linear with nothing
# non-linear between them, so together they are a single linear map.
model = keras.models.Sequential()
model.add(L.Dense(100, activation = "linear"))
model.add(L.Dense(100, activation = "linear"))
model.add(L.Dense(100, activation = "relu"))
model.add(L.Dense(1))


# One logit per input row. The softmax runs over axis 0, i.e. across all the
# rows fed in a single call, so the probabilities are relative to whatever
# set of boards is passed in together.
logits = model(states)
policy = tf.nn.softmax(logits, axis = 0)
log_policy = tf.nn.log_softmax(logits, axis = 0)

# Convenience wrapper: evaluates `policy` in the default session.
get_action_prob = lambda s: policy.eval({states: s})

# NOTE(review): log_policy has a trailing dimension of 1 (Dense(1) output)
# while cumulative_rewards is rank 1, so this product presumably broadcasts to
# an N x N matrix instead of multiplying element-wise -- confirm.
J = tf.reduce_mean(log_policy * cumulative_rewards)
# NOTE(review): this sums p * log(p) along axis 1, which has size 1 here, so
# it appears to keep one value per row, and the sign is that of *negative*
# entropy -- confirm this is what is intended.
entropy = tf.reduce_sum(tf.multiply(policy, log_policy), 1, name="entropy")

loss = - J + 0.1 * entropy


# Maximizing J is the same as minimizing -J.
all_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
update = tf.train.AdamOptimizer().minimize(loss, var_list = all_weights)

In [8]:
# An InteractiveSession installs itself as the default session, which is what
# lets `policy.eval(...)` and `update.run(...)` be called elsewhere without
# passing a session explicitly.
s = tf.InteractiveSession()
# Initialize all variables created so far (network weights and optimizer slots).
s.run(tf.global_variables_initializer())

In [14]:
# Sanity check: evaluate the policy on the legal after-states of the opening
# position for one dice roll. The result has one probability per candidate board.
_, boards = B.legal_moves(B.init_board(), B.roll_dice(), 1)
policy.eval({states: boards})

array([[1.8036630e-02],
       [2.4270599e-07],
       [1.5166575e-20],
       [4.8171246e-01],
       [6.4856208e-06],
       [4.0498639e-19],
       [2.4420439e-04],
       [2.7009231e-10],
       [3.2887919e-09],
       [2.0540205e-22],
       [1.8996019e-13],
       [2.5577186e-18],
       [5.4747050e-29],
       [1.5989395e-31],
       [1.8036630e-02],
       [4.8171246e-01],
       [2.4420439e-04],
       [1.8996019e-13],
       [2.4270599e-07],
       [2.7009231e-10],
       [6.4856208e-06],
       [3.2887919e-09],
       [2.5577186e-18],
       [1.5166575e-20],
       [4.0498639e-19],
       [2.0540205e-22],
       [5.4747050e-29],
       [1.5989395e-31]], dtype=float32)

In [13]:
# Fraction of evaluation games won against the random agent, one entry per
# outer iteration.
win_pct = []

# Each outer iteration: 100 self-play training games, then 10 evaluation
# games against the random agent, then a refreshed plot of the win rate.
for i in range(1000):
    
    # generate_session trains the network as a side effect and returns k.
    k = [generate_session() for _ in range(100)]
    clear_output(True)
    print("Training against self")
    print("Mean rounds to win: ", np.mean(k))
    
    print("Playing random agent")
    # Evaluation only: greedy move selection (Explore = False), no training.
    wins = [PlayRandomAgent(Learn = False, Explore = False) for _ in range(10)]
    win_pct.append(np.mean(wins))
    # The printed value is a fraction in [0, 1], not a percentage.
    print("Win percentage: ", np.mean(wins))
    plt.plot(win_pct)
    plt.show()
    

Training against self
Mean rounds to win:  59.01
Playing random agent


KeyboardInterrupt: 

## Allt hér fyrir neðan var notað sem fikkt

Útgáfa þar sem við flippum borðið þ.a. agentinn er bara alltaf nr 1 og flippum aldrei moveinu hans

In [147]:
def train_step2(_states, _actions, _rewards):
    """Run one optimizer step using one-hot action markers weighted by return.

    Every entry of ``_actions`` equal to 1 is replaced by the corresponding
    cumulative reward, and the result is fed to the ``actions`` placeholder.
    The caller's array is left untouched: the replacement is done on a float
    copy, which also keeps fractional returns from being truncated when
    ``_actions`` has an integer dtype.

    Parameters
    ----------
    _states : array-like
        Boards, fed to the ``states`` placeholder.
    _actions : array-like
        Action markers; the number of entries equal to 1 must match the
        number of rewards.
    _rewards : sequence of numbers
        Per-step rewards; converted with get_cumulative_rewards.
    """
    _cumulative_rewards = get_cumulative_rewards(_rewards)

    # Build the mask from the caller's values, then write into a private copy.
    chosen = np.asarray(_actions) == 1
    weighted_actions = np.array(_actions, dtype="float32")
    weighted_actions[chosen] = _cumulative_rewards

    update.run({states: _states,
                actions: weighted_actions,
                cumulative_rewards: _cumulative_rewards})

In [10]:
# Play the game.
# Scratch version of the self-play loop that records, for every move, ALL
# candidate boards plus a one-hot column marking the chosen one.
# NOTE(review): this rebinds the global name `actions` to a list of lists.
# train_step2 and the loss-evaluation cell use `actions` as a feed-dict key,
# which is presumably where the recorded "unhashable type: 'list'" error
# comes from -- confirm.
boards, actions, rewards = [[], []], [[], []], [[], []]
#all_boards = []

board = B.init_board()
player = 1

Error = False
GameOver = False
Explore = True
Debug = False
Verbose = False
k = 0
# (Translation of the note below: the idea is to play one game to the end,
# storing all boards and actions, then compute the eligibility trace and
# update the model with the whole episode.)
'''
Pælingin hér er að spila einn leik til enda og geyma öll boards og actions. 
Reikna svo eligibility trace og update'a modelið með öllu episodeinu.
'''
while True:
    dice = B.roll_dice()
    # Doubles give the side to move two moves in a row.
    for i in range(1 + int(dice[0] == dice[1])):

        legal_moves, legal_boards = B.legal_moves(board, dice, 1)

        if len(legal_moves) == 0:
            break

        #probs = np.array([get_action_prob(state.reshape(1, 29)) for state in legal_boards])
        probs = get_action_prob(legal_boards)
        n_actions = probs.shape[0]
        probs = probs.reshape(n_actions)

        if Explore == True:
            action = np.random.choice(np.arange(0, n_actions), 
                                 p = probs)
        else:
            action = np.argmax(probs)

        move = legal_moves[action]


        if Debug:
            print("Action: \n", action)
            print("Board now: \n", board)
            print("Chosen move:\n", move)

        if len(move) != 0:
            for m in move:
                board = B.update_board(board = board, move = m, player = 1)

        #record session history to train later
        # One-hot column over the candidates, 1 at the chosen action.
        all_actions = np.zeros((n_actions, 1))
        all_actions[action, 0] = 1
        # Index 1 is used for player == 1, index 0 for player == -1.
        boards[int((player + 1) / 2)].append(legal_boards)
        actions[int((player + 1) / 2)].append(all_actions)
        #all_boards[int((player + 1) / 2)].append(legal_boards)

        GameOver = B.game_over(board)
        if GameOver:
            rewards[int((player + 1) / 2)].append(1)
            # NOTE(review): the loser gets an extra reward appended here with
            # no matching boards/actions entry for that side.
            rewards[int((-player + 1) / 2)].append(-1)
            break
        else:
            rewards[int((player + 1) / 2)].append(0)

    # Hand the board over to the other side in its own perspective.
    board = FA.flip_board(board)
    player *= -1
    if GameOver:
        if Verbose:
            print("Game is over.")
        break

    if B.check_for_error(board):
        Error = True
        print("Error at game step ", k)
        break
    k += 1
if not Error:  
    for i in range(2):
        # NOTE(review): train_step is defined with two parameters but three
        # arguments are passed here; train_step2 has the matching signature.
        train_step(np.vstack(boards[i]), np.vstack(actions[i]), rewards[i])

(637, 29) (637, 1) (34,)


TypeError: unhashable type: 'list'

In [11]:
# Inspect the rewards recorded at index 0 (the player == -1 side).
np.array(rewards[0])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [12]:
# Stack the per-move one-hot columns recorded at index 0 and check the shape:
# one row per candidate board over the whole game.
a = np.vstack(actions[0])
a.shape

(637, 1)

In [14]:
# Look at a single row (one candidate board) of the stacked boards at index 0.
np.vstack(boards[0])[1]

array([-0., -2.,  1., -0., -0.,  1.,  4., -0.,  2., -0., -0., -0., -4.,
        5., -0., -0., -0., -4., -0., -4., -0., -0., -0., -1.,  2., -0.,
       -0., -0., -0.])

In [15]:
# Number of rewards recorded at index 0; compare with the row count of the
# stacked boards/actions above.
np.array(rewards[0]).shape

(34,)

In [201]:
# Scratch attempt to evaluate the loss on a single recorded row.
# NOTE(review): indexing with [0] yields a single row (1-D), while the
# `states` placeholder is declared as (None, 29) -- presumably a reshape to
# (1, 29) is needed; confirm.
ss = np.vstack(boards[0])[0]
a = np.vstack(actions[0])[0]
cr = get_cumulative_rewards(rewards[0])

# NOTE(review): the recorded AttributeError suggests `s` was bound to a numpy
# array rather than the session when this ran, and `actions` is a list here,
# not a placeholder -- confirm before reusing this cell.
s.run(loss, ({states: ss, 
              actions: a, 
              cumulative_rewards: cr}))

AttributeError: 'numpy.ndarray' object has no attribute 'run'

TypeError: Required argument 'object' (pos 1) not found

Útgáfa þar sem við flippum boardið fyrir Player -1 og flippum múvið hans. 
Virkar ekki eins og er

In [169]:
# Play the game.
# Scratch version that keeps a single history (boards, moves, rewards) for
# the whole game instead of one per side.
boards, moves, rewards = [], [], []
    
board = B.init_board()
player = 1

k = 1 # Keeps track of when an error occurs
Error = False
Debug = False


# (Translation of the note below: the idea is to play one game to the end,
# storing all boards and actions, then compute the eligibility trace and
# update the model with the whole episode.)
'''
Pælingin hér er að spila einn leik til enda og geyma öll boards og actions. 
Reikna svo eligibility trace og update'a modelið með öllu episodeinu.
'''
while True:
    dice = B.roll_dice()
    # Doubles give two moves in a row.
    for i in range(1 + int(dice[0] == dice[1])):

        legal_moves, legal_boards = B.legal_moves(board, dice, 1)
        legal_boards = np.array([board for board in legal_boards])

        if len(legal_moves) == 0:
            break

        #probs = np.array([get_action_prob(state.reshape(1, 29)) for state in legal_boards])
        probs = get_action_prob(legal_boards)
        n_actions = probs.shape[0]
        probs = probs.reshape(n_actions)
        # Renormalize so the probabilities sum to 1 for np.random.choice.
        probs = probs / np.sum(probs)

        action = np.random.choice(np.arange(0, n_actions), 
                             p = probs)
        
        move = legal_moves[action]
        
        
        if Debug:
            print("Action: \n", action)
            print("Board now: \n", board)
            print("Chosen move:\n", move)
            
        if len(move) != 0:
            for m in move:
                board = B.update_board(board = board, move = m, player = 1)

        #record session history to train later
        boards.append(board)
        moves.append(move)

        if B.game_over(board):
            rewards.append(1)
            break
        else:
            rewards.append(0)
            
        # NOTE(review): this flip sits inside the per-move loop, so on doubles
        # the board is flipped between the two moves of the same roll, and
        # `player` is never updated -- confirm whether that is intended.
        board = FA.flip_board(board)
    
    if B.game_over(board):
            print("Game is over.")
            break
    
    if B.check_for_error(board):
        Error = True
        print("Error at game step ", k)
        break
    k += 1

Game is over.


In [170]:
# Display every move recorded during the game above.
moves

[array([[13, 10],
        [13, 11]]), array([[13, 10],
        [24, 22]]), array([[10,  8],
        [13, 10]]), array([[22, 21],
        [ 6,  4]]), array([[6, 5],
        [8, 2]]), array([[24, 22],
        [ 8,  6]]), array([[6, 4],
        [4, 2]]), array([[25, 20],
        [22, 16]]), array([[25, 20],
        [11,  6]]), array([[13,  8],
        [16, 11]]), array([[24, 22],
        [13,  8]]), array([[ 6,  3],
        [13,  9]]), array([[25, 22],
        [ 8,  5]]), array([[25, 22],
        [25, 22]]), array([[8, 4],
        [2, 1]]), array([[4, 1],
        [9, 3]]), array([[25, 24],
        [25, 21]]), array([[25, 23],
        [22, 18]]), array([[25, 20],
        [ 8,  2]]), array([[25, 24],
        [18, 15]]), array([[25, 22]]), array([[25, 22],
        [22, 20]]), array([[25, 23],
        [25, 21]]), array([[13,  7],
        [ 6,  1]]), array([[25, 20],
        [ 8,  2]]), array([[15, 10],
        [22, 21]]), array([[25, 20],
        [22, 16]]), array([[7, 1],
        [8, 2]]), a