In [1]:
import pickle
import random
from collections import defaultdict

In [2]:
# Load the precomputed policy table from disk.
# NOTE: unpickling can execute arbitrary code, so only load perfectPolicy.p from a trusted source.
with open("perfectPolicy.p", "rb") as policy_file:  # the context manager closes the file handle
    strategy = pickle.load(policy_file)

In [3]:
# Inspect the loaded policy: keys are 9-cell board tuples, values are arrays
# (in the output below, the array length matches the number of empty cells on that board).
strategy 


{(0, 0, 0, 0, 0, 0, 0, 0, 0): array([0., 1., 0., 0., 0., 0., 0., 0., 0.]),
 (1, 0, 0, 0, 0, 0, 0, 0, 0): array([0., 0., 0., 1., 0., 0., 0., 0.]),
 (1, 2, 0, 0, 0, 0, 0, 0, 0): array([0., 0., 1., 0., 0., 0., 0.]),
 (1, 2, 1, 0, 0, 0, 0, 0, 0): array([0., 1., 0., 0., 0., 0.]),
 (1, 2, 1, 2, 0, 0, 0, 0, 0): array([1., 0., 0., 0., 0.]),
 (1, 2, 1, 2, 1, 0, 0, 0, 0): array([1., 0., 0., 0.]),
 (1, 2, 1, 2, 1, 2, 0, 0, 0): array([1., 0., 0.]),
 (1, 2, 1, 2, 1, 2, 0, 1, 0): array([1., 0.]),
 (1, 2, 1, 2, 1, 2, 2, 1, 0): array([1.]),
 (1, 2, 1, 2, 1, 2, 0, 1, 2): array([1.]),
 (1, 2, 1, 2, 1, 0, 2, 0, 0): array([0., 0., 1.]),
 (1, 2, 1, 2, 1, 1, 2, 0, 0): array([0., 1.]),
 (1, 2, 1, 2, 1, 1, 2, 2, 0): array([1.]),
 (1, 2, 1, 2, 1, 1, 2, 0, 2): array([1.]),
 (1, 2, 1, 2, 1, 0, 2, 1, 0): array([0., 1.]),
 (1, 2, 1, 2, 1, 0, 2, 1, 2): array([1.]),
 (1, 2, 1, 2, 1, 0, 0, 2, 0): array([1., 0., 0.]),
 (1, 2, 1, 2, 1, 1, 0, 2, 0): array([1., 0.]),
 (1, 2, 1, 2, 1, 1, 0, 2, 2): array([1.]),
 (1, 2, 1, 

In [4]:
# Number of board states that have an entry in the policy dictionary.
len(strategy)

4520

In [5]:
def generate_board():
    """Return a fresh, empty board.

    The board follows the perfect-policy key format: a tuple of nine cells,
    where 0 means the cell is empty and 1 or 2 means that player has moved there.
    """
    empty_cell = 0
    return (empty_cell,) * 9

In [6]:
def current_player(board, starting_player):
    """Return the player (1 or 2) whose turn it is on ``board``.

    ``starting_player`` is the player who made (or will make) the first move,
    which lets callers randomize who opens the game.
    """
    # Every 1 or 2 on the board is one turn that has already been played.
    moves_made = sum(board.count(mark) for mark in (1, 2))
    starter_to_move = moves_made % 2 == 0  # even number of turns -> back to the starter
    # 3 - p flips the player number: 3 - 1 = 2 and 3 - 2 = 1.
    return starting_player if starter_to_move else 3 - starting_player

In [7]:
def actions(board):
    """Return the indices of every empty cell (value 0) on ``board``, in ascending order."""
    return [index for index, cell in enumerate(board) if cell == 0]

In [8]:
def tictactoe(board, action, player):
    """Return a new board tuple with ``player``'s number written at index ``action``.

    ``board`` itself is left untouched; the move is applied to a mutable copy
    which is then converted back into a tuple.
    """
    cells = [*board]  # tuples are immutable, so work on a list copy
    cells[action] = player
    return tuple(cells)


In [9]:
def random_strategy(board):
    """Pick one of the currently available actions on ``board`` uniformly at random."""
    legal_moves = actions(board)
    return random.choice(legal_moves)

In [10]:
def policy_strategy(board, Q_table, exploration_rate):
    """Choose an action for ``board`` with an epsilon-greedy rule.

    With probability ``exploration_rate`` a random available action is returned.
    Otherwise the action with the highest ``Q_table[(board, action)]`` value is
    returned; if several actions share that value, one of them is picked at
    random to avoid a bias towards low indices.
    """
    legal_moves = actions(board)

    # Explore: ignore the table and try any legal move.
    if random.random() < exploration_rate:
        return random.choice(legal_moves)

    # Exploit: collect the value of every legal move, then keep all the top-scoring ones.
    move_values = {move: Q_table[(board, move)] for move in legal_moves}
    top_value = max(move_values.values())
    top_moves = [move for move, value in move_values.items() if value == top_value]
    return random.choice(top_moves)

In [11]:
def perfect_strategy(board):
    """Return the action the loaded ``strategy`` dictionary prefers for ``board``.

    The entry ``strategy[board]`` is treated as being aligned with the list of
    available actions: the position of its largest value selects the action.
    NOTE(review): assumes each entry has one value per empty cell, in the same
    order as ``actions(board)`` -- confirm against how the pickle was built.
    """
    move_scores = strategy[board]
    legal_moves = actions(board)
    return legal_moves[move_scores.argmax()]

In [12]:
def check_winner(board):
    """Report the state of the game on ``board``.

    Returns 1 or 2 if that player has completed a line, 0 if the board is full
    without a winner (a draw), and None while the game is still in progress.
    """
    winning_lines = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),  # rows
        (0, 3, 6), (1, 4, 7), (2, 5, 8),  # columns
        (0, 4, 8), (2, 4, 6),             # diagonals
    )

    for first, second, third in winning_lines:
        mark = board[first]
        # Three equal cells only count as a win when they are not empty.
        if mark != 0 and mark == board[second] and mark == board[third]:
            return mark

    # No completed line: empty cells left means the game goes on, otherwise it is a draw.
    if 0 in board:
        return None
    return 0

In [None]:
def training_loop_random(episodes=5000, learning_rate=0.1, reward_decay=0.9,
                         exploration_rate=1.0, exploration_decay=0.95, verbose=False):
    """Train a Q-table for player 1 by playing against a random player 2.

    Each episode plays one complete game. Player 1 picks moves with
    ``policy_strategy`` and player 2 with ``random_strategy``; who moves first
    is randomized per game. When the game ends, the Q-values of every
    (board, action) pair player 1 used are updated, last move first.

    Parameters:
        episodes: number of games to play.
        learning_rate: step size of each Q-value update.
        reward_decay: discount applied to the value of the following state.
        exploration_rate: initial probability of a random move for player 1.
        exploration_decay: factor applied to the exploration rate after each game.
        verbose: when True, print the result of every game.

    Returns:
        The learned Q-table, a ``defaultdict(float)`` keyed by (board, action).
    """
    # defaultdict so unseen (board, action) pairs read as 0.0 instead of raising KeyError.
    # Created once, outside the episode loop, so learning carries over between games.
    Q_table = defaultdict(float)
    result_labels = {1: "1 wins", 2: "2 wins", 0: "Draw"}

    for _ in range(episodes):
        board = generate_board()  # start every game from an empty board
        starting_player = random.choice([1, 2])  # randomize who opens
        game_memory = []  # (board before the move, action) for each of player 1's moves
        winner = None

        # Alternate moves until check_winner reports a win (1 or 2) or a draw (0).
        while winner is None:
            if current_player(board, starting_player) == 1:
                action = policy_strategy(board, Q_table, exploration_rate)
                # Store the state the decision was made in, i.e. before applying the move.
                game_memory.append((board, action))
                board = tictactoe(board, action, 1)
            else:
                opponent_action = random_strategy(board)
                board = tictactoe(board, opponent_action, 2)
            winner = check_winner(board)

        # Reward from player 1's point of view: win = 1, loss = -1, draw = 0.
        if winner == 1:
            reward = 1
        elif winner == 2:
            reward = -1
        else:
            reward = 0

        # Q-update, walking backwards so the final reward reaches earlier moves
        # within the same episode. The last move is scored with the game's reward;
        # earlier moves (which earn no immediate reward) are scored with the
        # discounted best value of the next state player 1 faced.
        next_best_value = None
        for state, action in reversed(game_memory):
            if next_best_value is None:
                target = reward
            else:
                target = reward_decay * next_best_value
            Q_table[(state, action)] += learning_rate * (target - Q_table[(state, action)])
            next_best_value = max(Q_table[(state, move)] for move in actions(state))

        # Explore less as more games have been played.
        exploration_rate *= exploration_decay

        if verbose:
            print(result_labels[winner])

    return Q_table

In [None]:
def training_loop_perfect():
    """Placeholder that is not implemented yet; it currently only prints an empty line.

    NOTE(review): presumably meant to train against ``perfect_strategy`` -- confirm.
    """
    print("")