# Connect 4 Game Demonstration

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from platform import python_version

from gamelearner import *
from connectx import Connect4Game, Connect4BasicPlayer

python_version()

'3.6.10'

## Game dynamics

In [2]:
game = Connect4Game()
game

Connect4Game()

In [3]:
game.roles

[1, 2]

In [4]:
game.marks

['X', 'O']

In [5]:
game.state

array([[0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0]], dtype=int8)

In [6]:
# Replay a short opening. Each move is a (role, column) pair and the roles
# alternate, starting with role 1.
opening_moves = [(1, 3), (2, 3), (1, 4), (2, 3), (1, 2)]
for opening_move in opening_moves:
    game.make_move(opening_move)

In [7]:
game.show_state()

_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ O _ _ _
_ _ _ O _ _ _
_ _ X X X _ _


In [8]:
game.get_rewards()

{2: 0.0}

In [9]:
game.check_if_game_over()
game.game_over

False

In [10]:
game.moves

[(1, 3), (2, 3), (1, 4), (2, 3), (1, 2)]

In [11]:
game.turn

2

In [12]:
game.available_moves()

array([0, 1, 2, 3, 4, 5, 6])

In [13]:
try:
    game.make_move((1, 1))
except ValueError as err:
    print(err)

It is not player 1's turn.


In [14]:
game.make_move((2, 5))
game.show_state()

_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ O _ _ _
_ _ _ O _ _ _
_ _ X X X O _


In [15]:
game._pos_last

(0, 5)

In [16]:
game._fill_levels

array([0, 0, 1, 3, 1, 1, 0], dtype=int8)

In [17]:
game.make_move((1, 1))
game.show_state()

_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ O _ _ _
_ _ _ O _ _ _
_ X X X X O _


In [18]:
game.game_over

True

In [19]:
print(game.winner)

1


In [20]:
game.get_rewards()

{2: 0.0}

In [21]:
game.moves

[(1, 3), (2, 3), (1, 4), (2, 3), (1, 2), (2, 5), (1, 1)]

In [22]:
game._pos_last

(0, 1)

In [23]:
game.reverse_move()
game.moves

[(1, 3), (2, 3), (1, 4), (2, 3), (1, 2), (2, 5)]

In [24]:
game._pos_last

(0, 5)

In [25]:
game.reverse_move()
game.moves

[(1, 3), (2, 3), (1, 4), (2, 3), (1, 2)]

In [26]:
game._pos_last

(0, 2)

In [27]:
game.show_state()

_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ O _ _ _
_ _ _ O _ _ _
_ _ X X X _ _


In [28]:
game.reverse_move()
game.moves

[(1, 3), (2, 3), (1, 4), (2, 3)]

In [29]:
game.show_state()

_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ O _ _ _
_ _ _ O _ _ _
_ _ _ X X _ _


In [32]:
game.make_move((1, 5))
game.show_state()

_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ O _ _ _
_ _ _ O _ _ _
_ _ _ X X X _


In [33]:
game.get_rewards()

{2: 0.0}

In [35]:
game.make_move((2, 6))
game.show_state()

_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ O _ _ _
_ _ _ O _ _ _
_ _ _ X X X O


In [36]:
game.make_move((1, 2))
game.show_state()

_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ O _ _ _
_ _ _ O _ _ _
_ _ X X X X O


In [37]:
game.game_over, game.winner

(True, 1)

In [38]:
game.get_rewards()

{2: 0.0}

In [40]:
try:
    game.make_move((2, 1))
except AssertionError as err:
    print(err)

Game is already over.


### Playing with Human Players

In [41]:
players = [HumanPlayer("Jill"), HumanPlayer("Jack")]
players

[HumanPlayer('Jill'), HumanPlayer('Jack')]

In [42]:
game = Connect4Game()
game

Connect4Game()

In [43]:
game.show_state()
players[0].make_move(game, role=1)

_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ _ _ _ _
_ _ _ _ _ _ _
Jill's turn (column from left): 3
Move format is column from left
Jill's turn (column from left): 2
Move format is column from left
Jill's turn (column from left): 4
Move format is column from left
Jill's turn (column from left): 0
Move format is column from left
Jill's turn (column from left): (0, 0)
That position is not available.
Try again.
Jill's turn (column from left): (0, )
Move format is column from left
Jill's turn (column from left): '0'
Move format is column from left
Jill's turn (column from left): 11
Move format is column from left
Jill's turn (column from left): (1, 1)
That position is not available.
Try again.
Jill's turn (column from left): (0, 1)
That position is not available.
Try again.
Jill's turn (column from left): q
Move format is column from left
Jill's turn (column from left): q
Move format is column from left
Jill's turn (column from left): e
Move format is column from left
Jil

KeyboardInterrupt: 

In [None]:
game.show_state()
players[1].make_move(game, role=2)

In [None]:
game.show_state()

### Test the TDLearner player (using tic-tac-toe)

In [None]:
game = TicTacToeGame()
game.show_state()

In [None]:
ex = TicTacToeExpert("EX")
td = TDLearner("TD", off_policy_rate=0.0)

In [None]:
td.make_move(game, 1)  # TDLearner playing role 1
print("On-policy:", td.on_policy)
game.show_state()

In [None]:
game.get_rewards()

In [None]:
ex.make_move(game, 2)
game.show_state()

In [None]:
game.get_rewards()

In [None]:
td.make_move(game, 1)
game.show_state()

In [None]:
game.get_rewards()

In [None]:
ex.make_move(game, 2)
game.show_state()

In [None]:
game.get_rewards()

In [None]:
td.make_move(game, 1)
game.show_state()

In [None]:
game.get_rewards()

In [None]:
# Play out the rest of the game: in each round the expert (role 2) moves
# first, then the TD learner (role 1) replies unless the expert's move
# has just ended the game.
while not game.game_over:
    ex.make_move(game, 2)
    if not game.game_over:
        td.make_move(game, 1)

In [None]:
game.show_state()

In [None]:
game.game_over, game.winner

In [None]:
# Value function before reward update
td.value_function

In [None]:
terminal_rewards = game.get_terminal_rewards()
terminal_rewards

In [None]:
td.update_terminal(game, terminal_rewards[1])

# Value function after reward update
td.value_function

In [None]:
td.saved_game_states[game]

In [None]:
# If you inform td that game is over it will
# reset its state memory
td.gameover(game, 1)

In [None]:
game in td.saved_game_states

### Simulate game against random player

In [None]:
game.reset()

In [None]:
td.make_move(game, 1)
game.show_state()

In [None]:
# Finish the game with a randomly-moving opponent in role 2 and the TD
# learner in role 1 (the learner made the opening move in the previous cell).
# NOTE(review): `random` is not imported explicitly in this notebook;
# presumably it arrives via `from gamelearner import *` — confirm.
while not game.game_over:
    random_move = random.choice(game.available_moves())
    game.make_move((2, random_move))
    if game.game_over:
        break
    td.make_move(game, 1)
    rewards = game.get_rewards()
    # Only update the learner when role 1 has a truthy (non-zero) reward.
    if rewards.get(1, False):
        # Rewards during game are zero for tic-tac-toe
        # so it is not actually necessary to update
        td.update(game, 1, rewards[1])
game.show_state()

In [None]:
game.game_over, game.winner

In [None]:
terminal_rewards = game.get_terminal_rewards()
terminal_rewards

In [None]:
td.update_terminal(game, terminal_rewards[1])

# Value function after reward update
td.value_function

In [None]:
td.value_function

In [None]:
# TD learner keeps a list of past states
td.saved_game_states[game]

In [None]:
# Inform TD that game is over
td.gameover(game, 1)

## Using the game controller

In [None]:
game = TicTacToeGame()
players = [TicTacToeExpert("EXP1"), TicTacToeExpert("EXP2")]

In [None]:
ctrl = GameController(game, players)
ctrl

In [None]:
ctrl.player_roles

In [None]:
ctrl.play()

## Competition between computer players

In [None]:
game = TicTacToeGame()
players = [TDLearner('TD1'), TicTacToeExpert('EXPERT')]
ctrl = GameController(game, players)

In [None]:
ctrl.play()

In [None]:
# Report each player's running totals after the match.
for player in players:
    record = (player.games_played, player.games_won, player.games_lost)
    print("\nPlayer %s" % str(player.name))
    print("  played: %d, wins: %d, losses: %d" % record)

## Train a TD Learner

In [None]:
td_learner = TDLearner('TD1')
computer_players = [td_learner, TicTacToeExpert('EXPERT')]

In [None]:
game = TicTacToeGame()
train_computer_players(game, computer_players)

In [None]:
train_computer_players(game, computer_players)

In [None]:
train_computer_players(game, computer_players)

## Now play against the computer!

In [None]:
game = TicTacToeGame()
players = [HumanPlayer("You"), td_learner]
ctrl = GameController(game, players)

In [None]:
# Keep starting fresh games until the user types 'q' at the prompt.
keep_playing = True
while keep_playing:
    game.reset()
    ctrl.play()
    answer = input("Press enter to play again or 'q' to quit: ")
    keep_playing = answer.strip().lower() != 'q'

# Final tally for both players.
print("Results")
for player in players:
    record = (player.games_played, player.games_won, player.games_lost)
    print("\nPlayer %s" % str(player.name))
    print("  played: %d, wins: %d, losses: %d" % record)

## TD Parameter Optimization

In [None]:
# One empty list per recorded quantity; the grid-search cell further down
# appends one entry to every list for each parameter combination it tries.
result_columns = [
    'learning_rate',
    'off_policy_rate',
    'initial_values',
    'Wins (1000)',
    'Losses (1000)',
    'Wins (2000)',
    'Losses (2000)',
]
results = {column: [] for column in result_columns}

In [None]:
initial_values = [0.0, 0.5, 1.0]
learning_rates = [0.25, 0.1]
off_policy_rates = [0.0, 0.1]

In [None]:
# Grid search over TD hyper-parameters. For every combination a fresh
# TDLearner is trained against a fresh expert in two stages of 1000
# iterations each, and its cumulative win/loss counters are recorded after
# each stage under the 'Wins (N)' / 'Losses (N)' columns of `results`.
game = TicTacToeGame()
for lr in learning_rates:
    for opr in off_policy_rates:
        for v0 in initial_values:
            print("\nlr: {}, opr: {}, v0: {}".format(lr, opr, v0))
            print("Training against expert...")
            td_learner = TDLearner('TD', learning_rate=lr,
                                   off_policy_rate=opr, initial_value=v0)
            computer_players = [td_learner, TicTacToeExpert('EXPERT')]

            # The parameter columns do not depend on the training outcome.
            results['learning_rate'].append(lr)
            results['off_policy_rate'].append(opr)
            results['initial_values'].append(v0)

            for total_iterations in (1000, 2000):
                train_computer_players(game, computer_players,
                                       iterations=1000, show=False)
                results['Wins (%d)' % total_iterations].append(
                    td_learner.games_won)
                results['Losses (%d)' % total_iterations].append(
                    td_learner.games_lost)

In [None]:
df = pd.DataFrame(results)
df

In [None]:
good_results = {
    'After 1000 iterations:': df['Losses (1000)'].idxmin(),
    'After 2000 iterations:': df['Losses (2000)'].idxmin()
}
good_results

In [None]:
best_result = good_results['After 2000 iterations:']
df.loc[best_result]

## Look at learning curves

In [None]:
td_players = [TDLearner("TD %d" % i) for i in range(4)]
rd = RandomPlayer("Random")
ex = TicTacToeExpert("Expert")

opponents = [rd, ex, td_players[3]]

In [None]:
# Select parameters for test: copy the best hyper-parameters found by the
# grid search onto every TD player. The row is looked up once, not per player.
best_params = df.loc[best_result]
for td in td_players:
    td.learning_rate = best_params['learning_rate']
    # Fix: the original assigned `td.initial_values` (plural), which only
    # created a new, unused attribute. The constructor keyword used elsewhere
    # in this notebook is `initial_value`; only the results column is plural.
    # NOTE(review): assumes TDLearner stores that keyword under the attribute
    # of the same name — confirm against gamelearner.TDLearner.
    td.initial_value = best_params['initial_values']
    td.off_policy_rate = best_params['off_policy_rate']

In [None]:
td_players[0:3], opponents

In [None]:
import datetime

# This can take a long time!
start_time = datetime.datetime.now()
print("Start time:", start_time)

scores = {}   # series label -> list of test scores, one per checkpoint
index = []    # games_played of the last TD player at each checkpoint
n_iter = 100  # training iterations per pairing per epoch


def player_key(p1, p2):
    """Return the label used in `scores` for the pairing `p1` vs. `p2`."""
    return "%s vs. %s" % (p1.name, p2.name)


# Baseline scores before any training.
# NOTE(review): `test_player` is assumed to return one numeric score per
# call (it is formatted with %5.2f below) — confirm.
for td_player, opponent in zip(td_players[0:3], opponents):
    scores[player_key(td_player, opponent)] = [test_player(td_player)]
scores[ex.name] = [test_player(ex)]

# `td_player` is still bound to the last TD player of the loop above.
index.append(td_player.games_played)

game = TicTacToeGame()
iteration = 0
iterations = [0]

for epoch in range(100):

    # Train each TD player against its own opponent, then re-test it.
    for td_player, opponent in zip(td_players[0:3], opponents):
        train_computer_players(game, [td_player, opponent],
                               iterations=n_iter, show=False)
        # Use the helper so the label always matches the one created above.
        scores[player_key(td_player, opponent)].append(test_player(td_player))
    scores[ex.name].append(test_player(ex))

    index.append(td_player.games_played)
    iteration += n_iter
    iterations.append(iteration)
    # One column per score series, however many series there are.
    print(iteration, " ".join("%5.2f" % s[-1] for s in scores.values()))

# Fix: the original printed the elapsed duration under the label "End time:".
end_time = datetime.datetime.now()
print("End time:", end_time)
print("Elapsed time:", end_time - start_time)

In [None]:
pd.DataFrame(scores, index=iterations).plot()
plt.title("Learning curves against different opponents")
plt.xlabel("Games played")
plt.ylabel("Score")
plt.ylim(0,1)
plt.grid()
plt.savefig("learning_rates.pdf")
plt.savefig("images/learning_rates.png")

In [None]:
pd.DataFrame(scores, index=index).tail()

In [None]:
[p.games_played for p in td_players]

In [None]:
# Count how many games td_players[1] loses during 1000 further training
# iterations against a newly created expert: remember the loss counter
# beforehand, train, then take the difference.
games_lost = td_players[1].games_lost
players = [td_players[1], TicTacToeExpert("EXPERT")]
train_computer_players(game, players, iterations=1000, show=False)
games_lost = td_players[1].games_lost - games_lost
print("Games lost against expert:", games_lost)

In [None]:
len(td_players[1].value_function)

In [None]:
# Put the value functions of all TD players side by side: one column per
# player, rows aligned on the state key (states a player has not stored
# appear as NaN in its column).
combined_values = pd.concat(
    [pd.Series(td.value_function) for td in td_players], 
    axis=1, 
    sort=True
)
# Rank states by their mean value across players and show the top ten.
combined_values['Mean'] = combined_values.mean(axis=1)
combined_values = combined_values.sort_values(by='Mean', ascending=False)
combined_values.head(10)

In [None]:
combined_values.tail()

In [None]:
# Histogram of the stored state-values of each TD player, using 20
# equal-width bins over [0, 1] (21 bin edges). Each figure is saved as both
# PNG and PDF under images/ before being shown.
bins = np.linspace(0.0, 1.0, 21)
for td in td_players:
    h = pd.Series(td.value_function).hist(bins=bins)
    plt.xlim(0, 1.0)
    plt.title("Distribution of state-values - %s" % td.name)
    plt.xlabel("Value")
    plt.ylabel("Frequency")
    plt.savefig("images/values_%s_hist.png" % td.name)
    plt.savefig("images/values_%s_hist.pdf" % td.name)
    plt.show()

In [None]:
number_of_values = pd.Series(
    [len(td.value_function) for td in td_players],
    index=pd.Index([td.name for td in td_players], name='Player')
)

number_of_values.plot.bar()
plt.title("Number of values stored in TD value functions")
plt.grid()
plt.savefig("number_of_values.pdf")
plt.savefig("images/number_of_values.png")

In [None]:
number_of_values

In [None]:
v = td_players[0].value_function

# These should be high values
expected_high_keys = [
    b'SSS---OO-', b'SSS----OO', b'SSSOO----', b'SSS-OO---',
    b'SSS---O-O', b'SSSO-O---', b'SSSO--O--', b'SSS-O--O-',
    b'SSS--O--O', b'SSSO---O-', b'SSSO----O', b'SSS-O-O--',
    b'SSS-O---O',
]
tuple(v[state_key] for state_key in expected_high_keys)

### Prepare some tests

In [None]:
# Re-import the project modules so that edits to their source files take
# effect without restarting the kernel, then rebind the class names used
# below to the freshly reloaded versions.
from importlib import reload
import gamelearner, tictactoe
reload(gamelearner)
reload(tictactoe)
TDLearner = gamelearner.TDLearner
TicTacToeGame = tictactoe.TicTacToeGame
GameController = gamelearner.GameController
RandomPlayer = gamelearner.RandomPlayer

In [None]:
game = TicTacToeGame()
roles = game.roles

In [None]:
learning_rate=0.25
gamma=1.0
off_policy_rate=0.1
initial_value=0.5
use_afterstates=True

td = TDLearner(
    learning_rate=learning_rate, 
    gamma=gamma,
    off_policy_rate=off_policy_rate,
    initial_value=initial_value,
    use_afterstates=use_afterstates
)
assert len(td.value_function) == 0

In [None]:
# Simulate value_functions
value_functions = {
    1: {},
    2: {}
}

In [None]:
# First play a series of set-moves

# Scripted game as (role, position) pairs; roles 1 and 2 alternate.
planned_moves = [
    (1, (0, 0)),
    (2, (2, 2)),
    (1, (0, 1)),
    (2, (2, 1)),
    (1, (0, 2))
]
# Per role: the state key seen before each of that role's moves.
game_states = {
    1: [],
    2: []
}
# Per role: the key of the state reached by each of that role's moves.
afterstates = {
    1: [],
    2: []   
}

def get_value(role, state_key):
    """Return the simulated value stored for `state_key` under `role`.

    Falls back to the global `initial_value` when the key has no stored
    value (or the stored value is None).
    """
    stored = value_functions[role].get(state_key)
    return initial_value if stored is None else stored

# Replay the scripted game on a fresh board while computing the expected
# value-function updates by hand in `value_functions`.
game = TicTacToeGame()
for move in planned_moves:
    role, position = move
    # Record the state this role sees before moving.
    state_key = game.generate_state_key(game.state, role)
    game_states[role].append(state_key)
    # Scan the candidate moves to find the afterstate of the planned move.
    # Note: this loop reuses the name `position`, shadowing the value
    # unpacked from `move` above.
    for position in game.available_moves():
        possible_move = (role, position)
        next_state = game.next_state(game.state, possible_move)
        next_state_key = game.generate_state_key(next_state, role)
        #value_functions[role][next_state_key] = initial_value
        if possible_move == move:
            afterstates[role].append(next_state_key)
            # Make sure the afterstate has an entry (initial_value if new).
            value_functions[role][next_state_key] = \
                get_value(role, next_state_key)

    game.make_move(move)
    if game.game_over:
        opponent = [r for r in roles if r != role][0]
        # No terminal update is made when there is no winner.
        if game.winner:
            if game.winner == role:
                rewards = {role: 1.0, opponent: 0.0}
            else:
                rewards = {role: 0.0, opponent: 1.0}
            # Terminal update: move each role's last afterstate value
            # toward its final reward by a step of learning_rate.
            for r in roles:
                state = afterstates[r][-1]
                value_functions[r][state] = \
                    value_functions[r][state] + \
                    learning_rate*(rewards[r] - \
                                   value_functions[r][state])
    # Temporal-difference backup for the role that just moved: move the
    # value of its previous afterstate toward gamma times the value of its
    # latest one (zero intermediate reward). This runs after the terminal
    # update above, so on the final move it uses the updated terminal value.
    if len(afterstates[role]) > 1:
        states = afterstates[role]
        value_functions[role][states[-2]] = \
                value_functions[role][states[-2]] + \
                learning_rate*(0.0 + gamma*value_functions[role][states[-1]] -\
                               value_functions[role][states[-2]])

In [None]:
game_states

In [None]:
afterstates

In [None]:
value_functions

In [None]:
game.game_over, game.winner

In [None]:
# Expected value functions after the scripted game, keyed by role.
correct_values = {
    1: {b'S--------': 0.5, b'SS------O': 0.53125, b'SSS----OO': 0.625},
    2: {b'O-------S': 0.5, b'OO-----SS': 0.375}
}
# One boolean per role: does the simulated value function match exactly?
checks = [
    value_functions[role] == expected_values
    for role, expected_values in correct_values.items()
]
all(checks)