In [None]:
# Implementing the Tic Tac Toe game for an agent playing first or second based on a random selection. 
# Approximate reinforcement-learning techniques such as policy gradient and temporal difference are used for this.
# Learns the optimal first move while playing first or second
# Learns that getting 3 X's is good but does not block O's 
# state = board positions, action = next move, method = Policy gradient and temporal difference 
# Performance gets better with more training 

In [None]:
import numpy as np 
import pandas as pd 

In [None]:
# Score a board position from the agent's point of view:
# +1 if the agent has three in a row, -1 if the opponent has, 0 otherwise.
def game_points(matrix):
    """Return the outcome encoded in a flat, row-major 3x3 board.

    Squares holding 1 belong to the agent and squares holding -1 to the
    opponent. The agent's lines are checked first, so a board on which both
    sides own a line scores 1.

    Returns
    -------
    int
        1 for an agent win, -1 for an opponent win, 0 when neither side
        owns a complete row, column or diagonal.
    """
    # The eight winning lines, written as slices of the flat board.
    winning_lines = (
        slice(0, 3), slice(3, 6), slice(6, 9),                    # rows
        slice(0, None, 3), slice(1, None, 3), slice(2, None, 3),  # columns
        slice(0, None, 4), slice(2, 8, 2),                        # diagonals
    )

    def owns_a_line(mark):
        wanted = np.full(3, mark)
        return any((matrix[line] == wanted).all() for line in winning_lines)

    if owns_a_line(1):
        return 1
    if owns_a_line(-1):
        return -1
    return 0

# Function for selecting a move for the agent/opponent (player) based on a given policy (greedy/random)
# player = 'agent' or 'opponent' | random = True or False
# The greedy branch needs `model_agentF2` to be defined; the random branch does not touch it.
def First_player_selection(player,random):
    """Choose a square for the side that moves first and mark it on the global board.

    Reads and mutates the module-level `matrix` in place.

    Parameters
    ----------
    player : str
        'agent' writes 1 into the chosen square; any other value writes -1.
    random : bool
        True picks uniformly among the empty squares. False picks the empty
        square with the highest score predicted by `model_agentF2`.

    Returns
    -------
    The flat index of the chosen square.

    Raises
    ------
    ValueError
        If the board has no empty square.
    """
    available_positions = np.where(matrix==0)[0]
    if available_positions.size == 0:
        raise ValueError('no empty square left on the board')
    if random:
        selection = np.random.choice(available_positions)
    else:
        action_values = model_agentF2.predict(matrix.reshape(1,9))[0]
        # Rank only the legal squares. Taking the argmax over all nine outputs
        # and falling back to a random move when that square is occupied would
        # make the "greedy" policy play randomly on many boards.
        best_legal = np.argmax(action_values[available_positions])
        selection = available_positions[best_legal]
    matrix[selection] = 1 if player == 'agent' else -1
    return selection

def Second_player_selection(player,random):
    """Choose a square for the side that moves second and mark it on the global board.

    Reads and mutates the module-level `matrix` in place. The greedy branch
    needs `model2` to be defined; the random branch does not touch it.

    Parameters
    ----------
    player : str
        'agent' writes 1 into the chosen square; any other value writes -1.
    random : bool
        True picks uniformly among the empty squares. False picks the empty
        square with the highest score predicted by `model2`.

    Returns
    -------
    The flat index of the chosen square.

    Raises
    ------
    ValueError
        If the board has no empty square.
    """
    available_positions = np.where(matrix==0)[0]
    if available_positions.size == 0:
        raise ValueError('no empty square left on the board')
    if random:
        selection = np.random.choice(available_positions)
    else:
        action_values = model2.predict(matrix.reshape(1,9))[0]
        # Rank only the legal squares. Taking the argmax over all nine outputs
        # and falling back to a random move when that square is occupied would
        # make the "greedy" policy play randomly on many boards.
        best_legal = np.argmax(action_values[available_positions])
        selection = available_positions[best_legal]
    matrix[selection] = 1 if player == 'agent' else -1
    return selection

In [None]:
## Neural network implementation of approximate value function of the states
# opponent first
# State-value network for the games in which the agent plays second:
# one Dense unit (no activation argument) fed by the 9 board squares.
from keras import Sequential
from keras.layers import Dense
model = Sequential([Dense(1, input_shape=(9,))])
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [None]:
## agent first
# State-value network for the games in which the agent plays first:
# one Dense unit (no activation argument) fed by the 9 board squares.
from keras import Sequential
from keras.layers import Dense
model_agentF = Sequential([Dense(1, input_shape=(9,))])
model_agentF.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [None]:
# Train model 
# opponent first
# Fits the state-value network `model` on randomly played games in which the
# opponent moves first. Both sides pick moves at random (second argument True).
num_episodes = 500  # (increase for better training)
for i in range(num_episodes):
    # fresh empty board for every episode; the selection helpers mutate it in place
    matrix = np.zeros(9)
    turn = 0 
    opponent_selections= []
    agent_selections = []
    # a round is one opponent move plus (if the game is still open) one agent move;
    # `turn < 8` means at most eight squares are filled in an episode
    while ((game_points(matrix) == 0) &  (turn < 8)): 
        # opponent makes a move 
        opponent_selections.append(First_player_selection('opponent',True))
        turn += 1
        # state before agent takes an action 
        # NOTE(review): this binds the same array object as `matrix`, not a copy,
        # so it also reflects every later in-place move — confirm whether a
        # snapshot (matrix.copy()) was intended.
        initial_state = matrix
        # if game not over agent makes a move 
        if ((turn <=7) and (game_points(matrix) == 0)):
            agent_selections.append(Second_player_selection('agent',True))
            turn += 1
            # bare expression below has no effect
            turn
        # state after agent takes an action 
        # (same object as `initial_state` and `matrix`, see note above)
        after_state = matrix
        # reward of being in that state 
        # because both names refer to the same board, ri and ra are always equal
        ri = game_points(initial_state)
        ra = game_points(after_state)
        #target = ra
        # effectively 2 * game_points(board at the end of the round); the
        # zero-weighted prediction only turns the integer into an array
        target = (ra+ri) + 0*(model.predict(initial_state.reshape(1,9)))
        # one small fit per round on the end-of-round board
        model.fit(initial_state.reshape(1,9),target , epochs=10, verbose=0) 

In [None]:
# train model
# agent first
# Fits the state-value network `model_agentF` on randomly played games in which
# the agent moves first. Both sides pick moves at random (second argument True).
num_episodes = 500
for i in range(num_episodes):
    # fresh empty board for every episode; the selection helpers mutate it in place
    matrix = np.zeros(9)
    turn = 0 
    opponent_selections= []
    agent_selections = []
    # a round is one agent move plus (if the game is still open) one opponent move;
    # `turn < 8` means at most eight squares are filled in an episode
    while ((game_points(matrix) == 0) &  (turn < 8)): 
        # state before agent takes an action 
        # NOTE(review): this binds the same array object as `matrix`, not a copy,
        # so it also reflects every later in-place move — confirm whether a
        # snapshot (matrix.copy()) was intended.
        initial_state = matrix
        # agent makes a move 
        agent_selections.append(First_player_selection('agent',True))
        turn += 1
        # state after agent takes an action 
        # (same object as `initial_state` and `matrix`, see note above)
        after_state = matrix
        # if game not over opponent makes a move 
        if ((turn <=7) and (game_points(matrix) == 0)):
            opponent_selections.append(Second_player_selection('opponent',True))
            turn += 1
        # reward of being in a state 
        # because both names refer to the same board (which by now also holds the
        # opponent's reply), ri and ra are always equal
        ri = game_points(initial_state)
        ra = game_points(after_state)
        #target = ra
        # effectively 2 * game_points(board at the end of the round); the
        # zero-weighted prediction only turns the integer into an array
        target = (ra+ri) + 0*(model_agentF.predict(initial_state.reshape(1,9)))
        # one small fit per round on the end-of-round board
        model_agentF.fit(initial_state.reshape(1,9),target , epochs=10, verbose=0)

In [None]:
# test values of different states 
# Two mirrored boards: in the first the agent (1) holds squares 0, 4 and 8 (a
# completed diagonal); in the second the opponent (-1) holds the same squares.
#initial_state1 = np.array([-1,-1,-1,1,1,-1,1,0,-1])
initial_state1 = np.array([1,0,0,0,1,0,-1,0,1])
initial_state2 = np.array([-1,0,0,0,-1,0,1,0,-1])
# the cell output is the pair of values `model_agentF` predicts for the two boards
model_agentF.predict(initial_state1.reshape(1,9)), model_agentF.predict(initial_state2.reshape(1,9))

In [None]:
# Neural Network implementation to learn action value functions with agent as second player
# opponent first
# One linear Dense layer mapping the 9 board squares to 9 scores, one per square.
from keras import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
# NOTE(review): this configured optimizer instance is never handed to compile();
# the call below passes the string 'sgd' instead. Confirm which one is intended.
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model2 = Sequential([Dense(9, input_shape=(9,), activation='linear')])
model2.compile(optimizer='sgd', loss='mse', metrics=['mse'])

In [None]:
# Neural Network implementation for action value functions with agent as first player
# agent first
# One linear Dense layer mapping the 9 board squares to 9 scores, one per square.
from keras import Sequential
from keras.layers import Dense
model_agentF2 = Sequential([Dense(9, input_shape=(9,), activation='linear')])
model_agentF2.compile(optimizer='sgd', loss='mse', metrics=['mse'])

In [None]:
# Display the configuration of the agent-first action-value network as the cell output.
model_agentF2.get_config()

In [None]:
# Train model
# opponent first
# Epsilon-greedy training of the action-value network `model2` (agent plays
# second). The opponent always moves at random. For every agent move the
# training target for the chosen square is the current prediction plus
# game_points(after_state) + model.predict(after_state), where `model` is the
# state-value network trained earlier.
y = 0.95              # NOTE(review): defined but never applied in this loop
eps = 0.5             # probability of a random (exploratory) agent move
decay_factor = 0.999  # eps is multiplied by this once per episode
num_episodes = 2000
a= 0
for i in range(num_episodes):
    matrix = np.zeros(9)
    turn = 0
    opponent_selections= []
    agent_selections = []
    eps *= decay_factor
    episode_states = []    # boards the agent chose a move from
    episode_targets = []   # matching 9-element target vectors
    r=0
    while (game_points(matrix) == 0) and (turn < 8):
        opponent_selections.append(First_player_selection('opponent',True))
        turn += 1
        # Snapshot the board. The selection helpers mutate `matrix` in place, so
        # a plain `initial_state = matrix` would silently follow the agent's
        # move and always compare equal to `after_state`.
        initial_state = matrix.copy()
        agent_moved = False
        if (turn <= 7) and (game_points(initial_state) == 0):
            explore = np.random.random() <= eps
            agent_selections.append(Second_player_selection('agent', explore))
            turn += 1
            agent_moved = True
        after_state = matrix.copy()
        if agent_moved:
            # update target with reward and predicted value of next step
            target = (game_points(after_state)+(model.predict(after_state.reshape(1,9))))[0][0]
            target_vec = model2.predict(initial_state.reshape(1,9))[0]  # this model's prediction
            # NOTE(review): the target is added to the current estimate rather
            # than replacing it — confirm that accumulation is intended.
            target_vec[agent_selections[-1]] += target
            # store the state and target vector to train model
            episode_states.append(initial_state)
            episode_targets.append(target_vec)
    # Only real transitions go into the batch (no all-zero placeholder row).
    input_vector = np.array(episode_states)
    output_vector = np.array(episode_targets)
    if len(input_vector):
        # train model with inputs from each episode or game
        model2.fit(input_vector, output_vector, epochs=100, verbose=0)

In [None]:
# Train model for actions to be taken with
# agent first
# Epsilon-greedy training of the action-value network `model_agentF2` (agent
# plays first). The opponent always replies at random. For every agent move the
# training target for the chosen square is the current prediction plus
# game_points(after_state) + model_agentF.predict(after_state), where
# `after_state` is the board once the opponent has replied (if it did).
y = 0.95              # NOTE(review): defined but never applied in this loop
eps = 0.5             # probability of a random (exploratory) agent move
decay_factor = 0.999  # eps is multiplied by this once per episode
num_episodes = 2000
a= 0
for i in range(num_episodes):
    matrix = np.zeros(9)
    turn = 0
    opponent_selections= []
    agent_selections = []
    eps *= decay_factor
    episode_states = []    # boards the agent chose a move from
    episode_targets = []   # matching 9-element target vectors
    r=0
    while (game_points(matrix) == 0) and (turn < 8):
        # Snapshot the board the agent is about to move on. The selection
        # helpers mutate `matrix` in place, so a plain `initial_state = matrix`
        # would end up holding the board *after* both moves and the network
        # would be trained on positions where the chosen square is occupied.
        initial_state = matrix.copy()
        explore = np.random.random() <= eps
        agent_selections.append(First_player_selection('agent', explore))
        turn += 1
        if (turn <= 7) and (game_points(matrix) == 0):
            opponent_selections.append(Second_player_selection('opponent',True))
            turn += 1
        # Board at the end of the round, so an opponent win shows up as -1.
        after_state = matrix.copy()
        target = (game_points(after_state)+(model_agentF.predict(after_state.reshape(1,9))))[0][0]
        target_vec = model_agentF2.predict(initial_state.reshape(1,9))[0]
        # NOTE(review): the target is added to the current estimate rather
        # than replacing it — confirm that accumulation is intended.
        target_vec[agent_selections[-1]] += target
        # store the state and target vector to train model
        episode_states.append(initial_state)
        episode_targets.append(target_vec)
    # Only real transitions go into the batch. An all-zero placeholder row would
    # compete with the genuine empty-board sample from the agent's first move.
    input_vector = np.array(episode_states)
    output_vector = np.array(episode_targets)
    if len(input_vector):
        model_agentF2.fit(input_vector, output_vector, epochs=10, verbose=0)

In [None]:
# Spot-check the action-value networks on two hand-written boards.
# Board 1: opponent (-1) on squares 0 and 1, agent (1) on square 3, rest empty.
initial_state1 = np.array([ -1, -1,  0,  1,  0, 0,  0, 0, 0])
# Board 2: opponent on squares 0, 2 and 3, agent on squares 1, 4 and 8; squares 5-7 empty.
initial_state2 = np.array([-1,1,-1,-1,1,0,0,0,1])
# the cell output is the pair of 9-element score vectors, one per board
model_agentF2.predict(initial_state1.reshape(1,9)), model2.predict(initial_state2.reshape(1,9))

In [None]:
# plot tic tac toe board 
import matplotlib.pyplot as plt 
%matplotlib inline
def plot_matrix(selection, agent):
    """Draw the 3x3 grid and a single mark on the current matplotlib axes.

    Parameters
    ----------
    selection : int
        Flat board index. The column is ``selection mod 3`` and the row is
        ``int(selection / 3)``, with row 0 drawn at the top of the grid.
    agent : bool
        When equal to False a red 'o' is drawn; otherwise a green 'x'.
    """
    grid_ticks = range(4)
    # vertical grid lines first, then horizontal ones
    for tick in grid_ticks:
        plt.plot([tick, tick], [0, 3], 'k')
    for tick in grid_ticks:
        plt.plot([0, 3], [tick, tick], 'k')

    # centre of the selected cell in plot coordinates
    centre_x = np.mod(selection, 3) + 0.5
    centre_y = 2.5 - int(selection / 3)

    if agent == False:
        shape, edge_colour = 'o', (1, 0, 0)
    else:
        shape, edge_colour = 'x', (0, 1, 0)
    plt.plot(centre_x, centre_y, shape, markersize=30,
             markeredgecolor=edge_colour, markerfacecolor='w', markeredgewidth=2)

In [None]:
# tic tac toe simulation using the above trained models
matrix = np.zeros(9)
def play_game(agent_first=True):
    """Play one game between the trained agent and a random opponent and plot it.

    The agent moves greedily with the trained action-value networks; the
    opponent moves at random. The global `matrix` is cleared in place at the
    start, so the function can be called repeatedly.

    Parameters
    ----------
    agent_first : bool, default True
        Whether the agent makes the opening move.

    Prints the final board, both move lists and the result.
    """
    # In-place reset: the selection helpers read the module-level `matrix`.
    matrix[:] = 0
    turn = 0
    opponent_selections = []
    agent_selections = []
    print('agent_first', agent_first)
    # Up to nine moves: stopping after eight would leave the last square
    # unplayed and could report a draw while a winning move was still pending.
    while game_points(matrix) == 0 and turn < 9:
        if agent_first:
            agent_selections.append(First_player_selection('agent',False))
            turn += 1
            if turn <= 8 and game_points(matrix) == 0:
                opponent_selections.append(Second_player_selection('opponent',True))
                turn += 1
        else:
            opponent_selections.append(First_player_selection('opponent',True))
            turn += 1
            if turn <= 8 and game_points(matrix) == 0:
                agent_selections.append(Second_player_selection('agent',False))
                turn += 1

        # draw the most recent mark of each side (guarded against empty lists)
        if opponent_selections:
            plot_matrix(opponent_selections[-1],False)
        if agent_selections:
            plot_matrix(agent_selections[-1],True)
    print(matrix, agent_selections,opponent_selections)
    outcome = game_points(matrix)
    if outcome == 1:
        print('agent wins')
    elif outcome == -1:
        print('opponent wins')
    else:
        print('draw')
play_game()

In [None]:
# the agent has learned that best move for starting second is center 
# Learned to get 3 X's in a row 
# does not know to block 3 O's in some cases

In [None]:
# Persist two of the trained networks to HDF5 files in the working directory.
# NOTE(review): `model` is the 1-output state-value network, while
# `model_agentF2` is the 9-output action-value network; the agent-second
# counterpart of the latter is `model2` — confirm which one should be saved here.
model.save('my_model_agent_second.h5')
model_agentF2.save('my_model_agent_first.h5')

In [None]:
# Reload a saved network from disk.
from keras.models import load_model
# NOTE(review): no cell shown in this notebook writes 'my_model.h5' (the save
# cell uses 'my_model_agent_second.h5' and 'my_model_agent_first.h5') — confirm
# the file name, otherwise this fails on a fresh run.
model3 = load_model('my_model.h5')