In [1]:
import numpy as np
# Side length of the square board: Environment allocates a length x length grid
# and the players iterate over range(length) in both directions.
length=3
# Tic-Tac-Toe
"""
  Basic Terminology
    1.Agent
    2.Human
    3.Environment
"""

'\n  Basic Terminology\n    1.Agent\n    2.Human\n    3.Environment\n'

In [2]:
class Environment:
    """Tic-tac-toe board plus the rules that decide when a game has ended.

    Attributes:
        board: square numpy array; a cell holds 0 (empty), ``x`` or ``o``.
        x, o: the numeric symbols of the two players (-1 and 1).
        winner: symbol of the winning player, or None (no winner / not ended).
        ended: cached result of the last ``game_over`` evaluation.
        num_states: number of distinct board encodings, 3 ** (cells).
    """

    def __init__(self):
        self.board = np.zeros((length, length))
        self.x = -1
        self.o = 1
        self.winner = None
        self.ended = False
        self.num_states = 3 ** (length * length)

    def is_empty(self, i, j):
        """Return whether cell (i, j) is unoccupied."""
        return self.board[i, j] == 0

    def reward(self, sym):
        """Return 1 if the game is over and ``sym`` won, otherwise 0."""
        if not self.game_over():
            return 0
        return 1 if self.winner == sym else 0

    def get_state(self):
        """Encode the board as an integer by reading it as a base-3 number.

        Cells are visited row by row; the k-th cell contributes
        ``digit * 3**k`` where digit is 0 (empty), 1 (x) or 2 (o).
        """
        h = 0
        for k, cell in enumerate(self.board.flat):
            if cell == self.x:
                digit = 1
            elif cell == self.o:
                digit = 2
            else:
                digit = 0
            h += (3 ** k) * digit
        return h

    def _finish(self, winner):
        """Record a terminal position and return True."""
        self.winner = winner
        self.ended = True
        return True

    def game_over(self, force_recalculate=False):
        """Return True when a player has a full line or the board is full.

        The result is cached in ``self.ended``; pass ``force_recalculate=True``
        to re-evaluate the current board regardless of the cache. ``winner``
        and ``ended`` are always updated to match the evaluated board.
        """
        if not force_recalculate and self.ended:
            return self.ended

        size = self.board.shape[0]
        players = (self.x, self.o)

        # Rows first, then columns: a line is won when it sums to symbol * size.
        for i in range(size):
            for player in players:
                if self.board[i].sum() == player * size:
                    return self._finish(player)
        for i in range(size):
            for player in players:
                if self.board[:, i].sum() == player * size:
                    return self._finish(player)

        # Both diagonals.
        for player in players:
            if self.board.trace() == player * size:
                return self._finish(player)
            if np.fliplr(self.board).trace() == player * size:
                return self._finish(player)

        # No winner and no empty cell left: the game ended in a draw.
        if np.all(self.board != 0):
            return self._finish(None)

        # Game still running. Clear the cached flag as well, otherwise a forced
        # recalculation after a finished position would leave `ended` stale
        # and is_draw() would report a draw for an unfinished board.
        self.winner = None
        self.ended = False
        return False

    def is_draw(self):
        """Return True when the game has ended without a winner."""
        return self.ended and self.winner is None

    def draw_board(self):
        """Print the board to stdout using 'x', 'o' and '-' for empty cells."""
        size = self.board.shape[0]
        for i in range(size):
            print("-------------")
            for j in range(size):
                print(" ", end="")
                if self.board[i, j] == self.x:
                    print("x  |", end="")
                elif self.board[i, j] == self.o:
                    print("o  |", end="")
                else:
                    print("-  |", end="")
            print("")

        print("-------------")
        
    
            

In [3]:
class Agent:
    """Epsilon-greedy player backed by a table of state values.

    With probability ``eps`` the agent plays a uniformly random empty cell;
    otherwise it plays the move whose resulting state has the highest value
    in ``V``. ``V`` and ``sym`` must be provided through ``setV`` and
    ``set_symbol`` before the agent is asked to move.
    """

    def __init__(self,eps=0.1,alpha=0.5):
        self.eps = eps              # probability of taking a random move
        self.alpha = alpha          # step size used when updating V
        self.verbose = False        # when True, take_action prints its reasoning
        self.state_history = []     # states recorded since the last update

    def setV(self,V):
        """Install the value table, indexed by the state returned by env.get_state()."""
        self.V = V

    def set_symbol(self,sym):
        """Set the board symbol this agent writes when it moves."""
        self.sym = sym

    def set_verbose(self,v):
        """Enable or disable the diagnostic printing in take_action."""
        self.verbose = v

    def reset_history(self):
        """Forget every recorded state."""
        self.state_history = []

    def _greedy_move(self,env):
        """Evaluate every empty cell and return (best move, {cell: value}).

        Each candidate is placed on the board temporarily to obtain the
        resulting state and is removed again right away.
        """
        values = {}
        choice = None
        top = -1
        for row in range(length):
            for col in range(length):
                if not env.is_empty(row,col):
                    continue
                env.board[row,col] = self.sym
                state = env.get_state()
                env.board[row,col] = 0
                score = self.V[state]
                values[(row,col)] = score
                if score > top:
                    top = score
                    choice = (row,col)
        return choice, values

    def _print_values(self,env,values):
        """Print the board, showing the candidate value in each empty cell."""
        print("Taking a greedy action")
        for row in range(length):
            print("------------------")
            for col in range(length):
                if env.is_empty(row,col):
                    print("%.2f|" % values[(row,col)],end="")
                    continue
                print(" ",end="")
                cell = env.board[row,col]
                if cell == env.x:
                    print("x  |",end="")
                elif cell == env.o:
                    print("o  |",end="")
                else:
                    print("  |",end="")
            print("")
        print("------------------")

    def take_action(self,env):
        """Pick a move (random with probability eps, greedy otherwise) and play it."""
        r = np.random.rand()
        if r < self.eps:
            # Exploration: choose uniformly among the empty cells.
            if self.verbose:
                print("Taking a random action")
            candidates = [
                (row,col)
                for row in range(length)
                for col in range(length)
                if env.is_empty(row,col)
            ]
            pick = np.random.choice(len(candidates))
            next_move = candidates[pick]
        else:
            # Exploitation: choose the cell leading to the best-valued state.
            next_move, values = self._greedy_move(env)
            if self.verbose:
                self._print_values(env,values)

        env.board[next_move[0]][next_move[1]] = self.sym

    def update_state_history(self,s):
        """Record state ``s`` so that update() can adjust its value later."""
        self.state_history.append(s)

    def update(self,env):
        """Propagate the final reward backwards through the recorded states.

        Walking the history from the most recent state to the oldest, each
        value moves towards the current target by a fraction ``alpha``; the
        updated value then becomes the target for the preceding state. The
        history is cleared afterwards.
        """
        target = env.reward(self.sym)
        for prev in reversed(self.state_history):
            old = self.V[prev]
            target = old + self.alpha*(target - old)
            self.V[prev] = target
        self.reset_history()
        
      
        
                    
                    

In [4]:
class Human:
    """Interactive player whose moves are typed on standard input.

    It exposes the same methods play_game calls on an Agent; the learning
    hooks (update, update_state_history) do nothing.
    """

    def __init__(self):
        pass

    def set_symbol(self,sym):
        """Set the board symbol written when the human moves."""
        self.sym=sym

    def take_action(self,env):
        """Prompt until the user names a valid empty cell, then play it.

        Malformed input, coordinates outside the board and occupied cells are
        rejected with a message and the prompt is shown again.
        """
        rows, cols = env.board.shape
        while True:
            move=input("Enter coordinates i,j for your next move (i,j=0..2): ")
            try:
                # Raises ValueError for non-integers and for a wrong number of parts.
                i, j = (int(part) for part in move.split(','))
            except ValueError:
                print("Invalid input: enter two integers separated by a comma, e.g. 1,2")
                continue
            # Reject negatives explicitly: they would otherwise wrap around
            # to the other side of the board via negative indexing.
            if not (0 <= i < rows and 0 <= j < cols):
                print("Coordinates are outside the board, try again.")
                continue
            if env.is_empty(i,j):
                env.board[i,j]=self.sym
                break
            print("That cell is already taken, try again.")

    def update(self,env):
        """No-op: a human player does not learn from the game result."""
        pass

    def update_state_history(self,s):
        """No-op: a human player keeps no state history."""
        pass
    
    

In [5]:
def get_state_hash_and_winner(env,i=0,j=0):
    """Enumerate every board configuration reachable by filling cells from (i, j) on.

    Cells are filled in row-major order, each with every possible value
    (empty, x, o). Once the last cell has been assigned, the board is encoded
    and evaluated.

    Args:
        env: environment whose board is used as scratch space.
        i, j: cell at which this level of the recursion starts; callers
            normally leave both at 0.

    Returns:
        A list of ``(state, winner, ended)`` triples, one per configuration.
        This is the order in which initial_Vx / initial_Vo unpack them.

    Note:
        Every cell touched is set back to 0 before returning, so the board is
        empty again after a top-level call. ``env.winner`` and any flag kept
        by ``game_over`` reflect the last configuration evaluated.
    """
    result = []
    last_row = env.board.shape[0] - 1
    last_col = env.board.shape[1] - 1

    # Iterate over the three cell values. range(0, env.x, env.o) would be
    # range(0, -1, 1), which is empty and yields no states at all.
    for v in (0, env.x, env.o):
        env.board[i,j]=v

        if j == last_col:
            if i == last_row:
                # Board fully assigned: record this configuration.
                state=env.get_state()
                ended=env.game_over(force_recalculate=True)
                winner=env.winner
                result.append((state,winner,ended))
            else:
                # End of a row: continue with the first cell of the next row.
                result+=get_state_hash_and_winner(env,i+1,0)
        else:
            # Continue with the next cell of the same row.
            result+=get_state_hash_and_winner(env,i,j+1)

    env.board[i,j]=0
    return result

In [6]:
def initial_Vx(env,state_winner_triples):
    """Build the starting value table for the player using symbol ``env.x``.

    Each triple is unpacked as ``(state, winner, ended)``. A finished state is
    worth 1 when x won and 0 otherwise; an unfinished state starts at 0.5.
    States that do not appear in the triples keep the value 0.

    Returns:
        A numpy array of length ``env.num_states`` indexed by state.
    """
    values = np.zeros(env.num_states)
    for state, winner, ended in state_winner_triples:
        if not ended:
            values[state] = 0.5
            continue
        values[state] = 1 if winner == env.x else 0
    return values


In [7]:
def initial_Vo(env,state_winner_triples):
    """Build the starting value table for the player using symbol ``env.o``.

    Each triple is unpacked as ``(state, winner, ended)``. A finished state is
    worth 1 when o won and 0 otherwise; an unfinished state starts at 0.5.
    States that do not appear in the triples keep the value 0.

    Returns:
        A numpy array of length ``env.num_states`` indexed by state.
    """
    values = np.zeros(env.num_states)
    for state, winner, ended in state_winner_triples:
        if not ended:
            values[state] = 0.5
            continue
        values[state] = 1 if winner == env.o else 0
    return values

In [None]:
def play_game(p1,p2,env,draw=False):
    """Play one game on ``env``, with p1 moving first and turns alternating.

    Args:
        p1, p2: players providing take_action, update_state_history and update.
        env: environment providing game_over, get_state and draw_board.
        draw: falsy for no output; 1 prints the board before each p1 move,
            2 prints it before each p2 move. Any truthy value also prints
            the final board.

    After every move the resulting state is recorded in both players'
    histories; once the game is over both players' update() is called.
    """
    current_player = None

    while not env.game_over():
        # Hand the turn to the other player (p1 on the very first pass).
        current_player = p2 if current_player == p1 else p1

        # Show the board only ahead of the turn selected by `draw`.
        shows_for_p1 = draw == 1 and current_player == p1
        shows_for_p2 = draw == 2 and current_player == p2
        if draw and (shows_for_p1 or shows_for_p2):
            env.draw_board()

        current_player.take_action(env)

        state = env.get_state()
        for player in (p1, p2):
            player.update_state_history(state)

    if draw:
        env.draw_board()

    for player in (p1, p2):
        player.update(env)

In [None]:
# Driver: build the two agents, train them against each other, then let a
# human play against the trained x agent.
if __name__=='__main__':
    
    p1=Agent()
    p2=Agent()
    
    # This environment is only used to enumerate states and to read the
    # player symbols; every game below is played on a fresh Environment().
    env=Environment()
    state_winner_triples = get_state_hash_and_winner(env)
    
    Vx=initial_Vx(env,state_winner_triples)
    p1.setV(Vx) # initial value table for p1
    
    Vo=initial_Vo(env,state_winner_triples)
    p2.setV(Vo) # initial value table for p2
    
    # p1 plays x and p2 plays o; play_game lets its first argument move first.
    p1.set_symbol(env.x)
    p2.set_symbol(env.o)
    
    # Self-play training: T games, printing the game index every 200 games.
    T=10000
    for i in range(T):
        if(i%200==0):
            print(i)
        play_game(p1,p2,Environment())
        
    # Interactive phase: the human takes the o symbol and plays against p1.
    human = Human()
    human.set_symbol(env.o)
    while(True):
        # Verbose mode makes p1 print the values it considered for each move.
        p1.set_verbose(True)
        # draw=2 prints the board before each move of the second player (the human).
        play_game(p1,human,Environment(),draw=2)
        
        # Any answer starting with 'n' or 'N' stops; anything else (including
        # an empty answer) starts another game.
        answer = input("Play again? [Y/n]: ")
        if answer and answer.lower()[0] == 'n':
             break

0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800
5000
5200
5400
5600
5800
6000
6200
6400
6600
6800
7000
7200
7400
7600
7800
8000
8200
8400
8600
8800
9000
9200
9400
9600
9800
Taking a greedy action
------------------
0.48|0.67|0.52|
------------------
0.48|1.00|0.63|
------------------
0.45|0.07|0.48|
------------------
-------------
 -  | -  | -  |
-------------
 -  | x  | -  |
-------------
 -  | -  | -  |
-------------
