In [852]:
%reset
import numpy as np
import random
from scipy.special import softmax
import operator

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [853]:
gamma = 0.9
# discount factor: states that are close to the end of the game are rewarded/punished 
# more than states that are towards the beginning of the game

In [854]:
def serialize(state):
    # hash function for boards, converts board to int so I can compute value of states
    serial = 0
    for i in range(9):
        serial = serial + state[i]*3**(i)
    return int(serial)

In [855]:
def de_serialize(serial):
    state = np.zeros((9,))
    for i in range(9):
        state[i] = serial%3
        serial /= 3
    return state

In [856]:
def print_state(state):
    legal = 0
    for i in range(3):
        for j in range(3):
            if state[3*i+j] == 0:
                print str(legal),
                legal += 1
            elif state[3*i+j] == 1:
                print 'X',
            else:
                print 'O',
        print '\n'
    print '\n'

In [857]:
class Player:
    def __init__(self, mode = 0):
        self.values = {}
        # compute a value for each of the states of the board. 
        # This is kind of like using a minimax tree with depth 1, 
        # for which the desirability of a future state is not computed explicitly depending upon actual wins and losses 
        # but rather learned from past experience
        self.mode = mode
        # 0 -> random legal moves
        # 1 -> training, some exploration
        # 2 -> no training, no exploration
        # 3 -> user
        
    
    def new_game(self, generation):
        self.sent = []
        self.recieved = []
        self.generation = generation
        
    def play(self, current_state, next_states):
        self.recieved += [current_state]
        if self.mode == 0:
            # playing random legal moves
            move = random.sample(range(len(next_states)), 1)[0]
            self.sent += [next_states[move]]
            return move
        elif self.mode == 1 or self.mode == 2:
            # use boltzmann style distribution to compute next move 
            # plays less optimal moves with some probability to encourage exploration
            v = np.zeros((len(next_states),))
            for i, state in enumerate(next_states):
                v[i] = self.values.get(serialize(state), 0)
            if self.mode == 1:
                move = np.argmax(np.random.multinomial(1, softmax(v)))
                self.sent += [next_states[move]]
                return move
            else:
                move = np.argmax(v)
                self.sent += [next_states[move]]
                return move
        else:
            move = input("select move [0->8]: ")
            self.sent += [next_states[move]]
            return move
        
    def game_over(self, winner):
        self.update_values(winner)
        if self.mode == 3:
            if winner:
                print('you WON!!!!')
            else:
                print('you LOST!!!')
        
    def update_values(self, winner):
        if self.mode == 1:
            if winner:
                for ind, b in enumerate(map(serialize, list(reversed(self.recieved)))):
                    self.values[b] = self.values.get(b, 0)-gamma**ind/np.log(self.generation+1)
                for ind, b in enumerate(map(serialize, list(reversed(self.sent)))):
                    self.values[b] = self.values.get(b, 0)+gamma**ind/np.log(self.generation+1)
            else:
                for ind, b in enumerate(map(serialize, list(reversed(self.recieved)))):
                    self.values[b] = self.values.get(b, 0)+gamma**ind/np.log(self.generation+1)
                for ind, b in enumerate(map(serialize, list(reversed(self.sent)))):
                    self.values[b] = self.values.get(b, 0)-gamma**ind/np.log(self.generation+1)
    
    def no_exploration(self):
        self.mode = 2
    
    def best_value_state(self):
        #print(self.values)
        sorted_values = sorted(self.values.items(), key=operator.itemgetter(1))
        #print(sorted_values)
        for i in range(10):
            print_state(de_serialize(sorted_values[-i][0]))

In [858]:
class Game:
    def __init__(self, generation, player1, player2):
        self.state = np.zeros((9,))
        # self.board is a 1D array representing the board state.
        # 0 is empty, 1 is X, 2 is O

        self.num_moves = 0
        
        self.history = []
        # list of board states that are played during the game
        
        self.generation = generation
        # generation number for value convergence. 
        # The weights I use (1/log(generation)) don't actually lead to convergence 
        # L2 norm of series should be finite for convergence
        # but whatever
        
        self.players = [player1, player2]
        if np.random.uniform() > 0.5:
            self.players[0], self.players[1] = self.players[1], self.players[0]
        self.players[0].new_game(self.generation)
        self.players[1].new_game(self.generation)
        
    def next_move(self, verbose = False):            
        next_states = self.get_next_states()
        move = self.players[self.num_moves%2].play(self.state, next_states)
        self.state = next_states[move]
        
        self.history += [serialize(self.state)]
        # append board state to history, so that the value of the state can be updated at the end of the game
        
        self.num_moves = self.num_moves+1
        if verbose:
            print_state(self.state)
        
    def get_next_states(self):
        moves = self.legal_moves()
        next_states = [self.state.copy() for i in range(len(moves))]
        for i, move in enumerate(moves):
            next_states[i][move] = self.num_moves%2+1
        return next_states
            
    def legal_moves(self):
        # returns a list of legal moves
        moves = []
        for i in range(9):
            if self.state[i] == 0:
                moves += [i]
        return moves
    
    def game_won(self):
        if (self.state[0] == self.state[1] and self.state[0] == self.state[2] and self.state[0] != 0) or \
            (self.state[0] == self.state[4] and self.state[0] == self.state[8] and self.state[0] != 0) or \
            (self.state[0] == self.state[3] and self.state[0] == self.state[6] and self.state[0] != 0) or \
            (self.state[1] == self.state[4] and self.state[1] == self.state[7] and self.state[1] != 0) or \
            (self.state[2] == self.state[4] and self.state[2] == self.state[6] and self.state[2] != 0) or \
            (self.state[2] == self.state[5] and self.state[2] == self.state[8] and self.state[2] != 0) or \
            (self.state[3] == self.state[4] and self.state[3] == self.state[5] and self.state[3] != 0) or \
            (self.state[6] == self.state[7] and self.state[6] == self.state[8] and self.state[6] != 0):
            return True
        return False
    
    def decide_winner(self):
        if not self.game_won():
            self.players[0].game_over(False)
            self.players[1].game_over(False)
        elif self.num_moves%2 == 0:
            self.players[0].game_over(False)
            self.players[1].game_over(True)
        else:
            self.players[0].game_over(True)
            self.players[1].game_over(False)
            
    

In [859]:
def play_game(generation, p1, p2):
    training_game = Game(generation, p1, p2)
    while training_game.num_moves < 9 and not training_game.game_won():
        training_game.next_move(verbose = False)
    training_game.decide_winner()

In [860]:
bot = Player(mode = 1)
random_bot = Player(mode = 0)

for i in range(1, 100000):
    play_game(i, bot, random_bot)

In [861]:
user_player = Player(mode = 3)
bot.no_exploration()
actual_game = Game(-1, user_player, bot)

while actual_game.num_moves < 9 and not actual_game.game_won():
    actual_game.next_move(verbose = True)
actual_game.decide_winner()

0 1 2 

3 X 4 

5 6 7 



select move [0->8]: 1
0 O 1 

2 X 3 

4 5 6 



X O 0 

1 X 2 

3 4 5 



select move [0->8]: 0
X O O 

0 X 1 

2 3 4 



X O O 

0 X 1 

X 2 3 



select move [0->8]: 0
X O O 

O X 0 

X 1 2 



X O O 

O X 0 

X 1 X 



you LOST!!!


In [842]:
bot.best_value_state()

0 1 2 

3 4 5 

6 7 8 



0 1 2 

3 X 4 

5 6 7 



0 O 1 

2 X 3 

X 4 5 



0 1 2 

3 X O 

4 5 X 



X 0 1 

2 X 3 

O 4 5 



0 1 2 

3 X 4 

X O 5 



0 1 2 

O X 3 

X 4 5 



O 0 1 

2 X X 

3 4 5 



0 1 2 

3 X 4 

5 X O 



0 1 O 

X X 2 

3 4 5 





In [733]:
f = {}

In [734]:
f[1] = f.get(1, 0)+1