In [20]:
import math
import numpy as np
import pandas as pd
import random
import sys
from itertools import product

In [21]:
combinations = [list(range(3)) for _ in range(9)]
all_states = list(set(product(*combinations)))
#print(all_states)

In [22]:
def check_win(player, board):
    if np.all(board[0,:] == player):
        return True
    elif np.all(board[1,:] == player): 
        return True
    elif np.all(board[2,:] == player):
        return True
    elif np.all(board[:,0] == player):
        return True
    elif np.all(board[:,1] == player):
        return True
    elif np.all(board[:,2] == player):
        return True
    elif np.all(board.diagonal()==player):
        return True
    elif  np.all(np.fliplr(board).diagonal()==player):
        return True
    return False

In [23]:
#simulation with player O

master_states_res = []
master_actions_res = []
master_rewards_res = []
master_next_states_res = []

current_player = "O"
opponent_player = "X"

for num_sims in range(300000):
    empty_board = np.array(['-']*9).reshape(3,3)
    board = empty_board

    current_turn = "X"
    x,y = np.where(board == '-')
    possible_actions = [(x,y) for x,y in zip(x,y)]
    avail_positions = [(x,y) for x,y in zip(x,y)]
    #print(possible_actions)

    empty_state_ind = all_states.index((0,0,0,0,0,0,0,0,0))
    states_res = [empty_state_ind]
    actions_res = []
    next_states_res = []

    while (not check_win(current_player, board) and not check_win(opponent_player, board) and ("-" in board)):
        # choose random move
        rand_pos = random.choice(avail_positions)
        board[rand_pos[0]][rand_pos[1]] = current_turn
        avail_positions.remove(rand_pos)

        # if it's our current player's turn (not opponent's turn)
        if current_turn == current_player: 
            # assign action to result
            actions_res.append(possible_actions.index(rand_pos))

            # get the tuple of the current state
            cur_state = [0,0,0,0,0,0,0,0,0]
            cur_flat_board = board.flatten()
            for i in range(len(cur_flat_board)):
                if cur_flat_board[i] == "X":
                    cur_state[i] = 1
                elif cur_flat_board[i] == "O":
                    cur_state[i] = 2
            cur_state = tuple(cur_state)

            # assign states to results
            cur_state_ind = all_states.index(cur_state)
            states_res.append(cur_state_ind)
            next_states_res.append(cur_state_ind)


        # switch to other player for next turn
        if current_turn == "X":
            current_turn = "O"
        else:
            current_turn = "X"
        #print(board)

    # remove last element of states_res because we overshot
    states_res.pop()

    # assign appropriate reward       
    if check_win(current_player, board):
        reward = 1
    elif check_win(opponent_player, board):
        reward = -1
    else:
        reward = 0
    rewards_res = [reward] * len(actions_res)
    #print(rewards_res)

    # add to master results
    master_states_res.extend(states_res)
    master_actions_res.extend(actions_res)
    master_rewards_res.extend(rewards_res)
    master_next_states_res.extend(next_states_res)

"""
print(master_states_res)
print(master_actions_res)
print(master_rewards_res)
print(master_next_states_res)
"""

# make dataframe of master results

results_df = pd.DataFrame({"s": master_states_res, 
                           "a": master_actions_res,
                           "r": master_rewards_res,
                           "sp": master_next_states_res})
print(results_df)

# write dataframe to csv file
results_df.to_csv("simulation_results_player_x_combined", encoding='utf-8', index=False)

             s  a  r     sp
0         8802  4  0  14362
1        14362  6  0   3065
2         3065  0  0   3784
3         3784  5  0  11396
4         8802  8  1   6434
...        ... .. ..    ...
1037174   7321  2  1   5398
1037175   5398  5  1   4483
1037176   8802  6  1   4829
1037177   4829  7  1  14087
1037178  14087  8  1   1716

[1037179 rows x 4 columns]


In [51]:
infile = "simulation_results_player_x_combined"
outfile_name = "policy_results_player_x_combined"

In [52]:
num_states = len(all_states)
num_actions = len(possible_actions)
gamma =  0.95
alpha = 0.1

In [53]:
def update_Q_matrix(Q_matrix, s, a, r, sp, gamma, alpha):
    Q_matrix[s-1, a-1] = Q_matrix[s-1, a-1] + alpha * (r + gamma*max(Q_matrix[sp-1,:]) - Q_matrix[s-1,a-1])
    return Q_matrix

In [54]:
#for i in range(100):
Q_matrix = np.zeros((num_states, num_actions))
input_df = pd.read_csv(infile)
for row in input_df.iterrows():
    s, a, r, sp = row[1][0], row[1][1], row[1][2], row[1][3] 
    Q_matrix = update_Q_matrix(Q_matrix, s, a, r, sp, gamma, alpha)

In [55]:
outfile = open(outfile_name, "w")
policy = np.argmax(Q_matrix, axis=1)
for i in range(len(policy)):
    #print(policy[i] +1)
    outfile.write(str(policy[i]+1))
    if i != (len(policy) - 1):
        outfile.write('\n')
outfile.close()

In [87]:
cur_policy_file = "policy_results_player_o_combined_100"
policy_df = pd.read_csv(cur_policy_file)

In [86]:
total_reward = 0

current_player = "O"
opponent_player = "X"

for num_sims in range(10000):
    empty_board = np.array(['-']*9).reshape(3,3)
    board = empty_board
    
    current_turn = "X"
    x,y = np.where(board == '-')
    possible_actions = [(x,y) for x,y in zip(x,y)]
    avail_positions = [(x,y) for x,y in zip(x,y)]
    #print(possible_actions)

    empty_state_ind = all_states.index((0,0,0,0,0,0,0,0,0))
    
    while (not check_win(current_player, board) and not check_win(opponent_player, board) and ("-" in board)):
        

        # if it's our current player's turn (not opponent's turn)
        # look at the policy for the move
        if current_turn == current_player: 
            
            # get the tuple of the current state
            cur_state = [0,0,0,0,0,0,0,0,0]
            cur_flat_board = board.flatten()
            for i in range(len(cur_flat_board)):
                if cur_flat_board[i] == "X":
                    cur_state[i] = 1
                elif cur_flat_board[i] == "O":
                    cur_state[i] = 2
            cur_state = tuple(cur_state)
            
            cur_state_ind = all_states.index(cur_state)
            
            chosen_move = policy_df['1'][cur_state_ind]
            chosen_pos = possible_actions[chosen_move-1]
            if chosen_pos in avail_positions:
                board[chosen_pos[0]][chosen_pos[1]] = current_turn
                avail_positions.remove(chosen_pos)
            else:
                rand_pos = random.choice(avail_positions)
                board[rand_pos[0]][rand_pos[1]] = current_turn
                avail_positions.remove(rand_pos)
        else: # it's the opponent's turn (simulate by choosing randomly)
            # choose random move
            rand_pos = random.choice(avail_positions)
            board[rand_pos[0]][rand_pos[1]] = current_turn
            avail_positions.remove(rand_pos)

        # switch to other player for next turn
        if current_turn == "X":
            current_turn = "O"
        else:
            current_turn = "X"
            
    # keep track of reward       
    if check_win(current_player, board):
        total_reward += 1
    elif check_win(opponent_player, board):
        total_reward += -1

print(total_reward)

-2577


In [58]:
"""
RESULTS (total rewards)
-----------------------

(1 iterations of q-learning with player X with 300000 initial simulations)
"policy_results_player_x_combined": 3253

(100 iterations of q-learning with player X with 300000 initial simulations)
"policy_results_player_x_combined_100" : 3330 

random policy player X: 3122

(1 iterations of q-learning with player O with 300000 initial simulations)
"policy_results_player_x_combined": -2670

(100 iterations of q-learning with player O with 300000 initial simulations)
"policy_results_player_x_combined_100" : -2571

random policy player O: -2974
"""

'\nRESULTS (total rewards)\n-----------------------\n\n(1 iterations of q-learning with player X with 300000 initial simulations)\n"policy_results_player_x_combined": \n\n(100 iterations of q-learning with player X with 300000 initial simulations)\n"policy_results_player_x_combined_100" : 3302 \n\n(1 iterations of q-learning with player O with 300000 initial simulations)\n"policy_results_player_x_combined": \n\n(100 iterations of q-learning with player O with 300000 initial simulations)\n"policy_results_player_x_combined_100" : 3302 \n\nrandom policy: \n'