In [None]:
### Import Dependencies ###
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from torch.utils.tensorboard import SummaryWriter


In [None]:
# Numeric outcome codes. Board.get_status reports the winning marker
# (1.0 / -1.0) or 0.5 for a draw, which matches these values.
WIN, LOSS, DRAW = 1.0, -1.0, 0.5

# Initial exploration rate; copied into Board.eps when a Board is constructed.
EPS = 0.7

# TensorBoard writer shared by the training and evaluation cells.
writer = SummaryWriter()

In [None]:
class Board():
    """Tic-tac-toe board stored as a flat tensor of 9 cells.

    Attributes:
        state: float tensor of shape (9,); 0.0 = empty, 1.0 = first player,
            -1.0 = second player. Cells are indexed row by row.
        marker: 0-d tensor holding the marker of the side to move (starts at 1.0).
        game_status: 0-d tensor, 0.0 while the game is running, 1.0 once it is over.
        result: 0-d tensor; the winning marker (1.0 or -1.0), 0.5 for a draw,
            0.0 while the game is still running.
        eps: exploration rate read by the policy; initialised from the
            module-level EPS and deliberately left untouched by reset().
    """

    # Index triples of the three rows, three columns and two diagonals.
    WINNING_COMBINATIONS = ((0, 1, 2), (3, 4, 5), (6, 7, 8),
                            (0, 3, 6), (1, 4, 7), (2, 5, 8),
                            (0, 4, 8), (2, 4, 6))

    def __init__(self):
        self.reset()
        self.eps = EPS

    def reset(self):
        """Clear the board and give the move back to the first player (marker 1.0)."""
        self.state = torch.zeros(9)
        self.marker = torch.tensor(1.0)
        self.game_status = torch.tensor(0.0)
        self.result = torch.tensor(0.0)

    def play_move(self, pos):
        """Place the current marker on cell ``pos`` and switch sides.

        Raises:
            Exception: if the cell is already occupied.
        """
        if self.state[pos] != 0.0:
            raise Exception("You made an illegal move")

        self.state[pos] = self.marker
        self.marker = self.marker * -1.0

    def get_status(self):
        """Update and return ``(game_status, result)`` for the current position.

        A completed line always takes precedence over the "board is full"
        check: a win made with the ninth move used to be overwritten with a
        draw (0.5) because the full-board test ran unconditionally afterwards.
        """
        winner = None
        for a, b, c in self.WINNING_COMBINATIONS:
            if self.state[a] != 0.0 and self.state[a] == self.state[b] == self.state[c]:
                # clone() so the result does not alias (and later change with) the board.
                winner = self.state[a].clone()
                break

        if winner is not None:
            self.game_status = torch.tensor(1.0)
            self.result = winner
        elif torch.count_nonzero(self.state) == 9:
            self.game_status = torch.tensor(1.0)
            self.result = torch.tensor(0.5)
        return self.game_status, self.result

    def get_mask(self):
        """Return a boolean tensor that is True on every empty (legal) cell."""
        return self.state == 0.0









In [None]:
class TicTacToeMLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_dim: size of the input vector.
        hidden_dim: width of the hidden layer.
        output_dim: number of output units (raw, un-normalised scores).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the output-layer activations for input ``x``."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

class TicTacToePolicy:
    """Stochastic tic-tac-toe policy backed by a small MLP.

    Attributes:
        model: TicTacToeMLP mapping the 9-cell board to 9 action logits.
        optimizer: SGD optimizer over ``model`` parameters (lr 1e-2).
        eval: when True the policy plays greedily (argmax, no exploration);
            when False it samples and applies epsilon-random exploration.
    """

    def __init__(self):
        self.model = TicTacToeMLP(9, 128, 9)  # Input: 9 (3x3 board), Hidden: 128, Output: 9 (actions)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=1e-2)
        self.eval = False

    def get_action_probabilities(self, board):
        """Choose a legal move for the current position.

        Args:
            board: object exposing ``state`` (9-element tensor), ``get_mask()``
                (True on legal cells) and ``eps`` (exploration rate).

        Returns:
            ``(log_prob, move)``: the log-probability of the chosen move under
            the masked policy distribution (a tensor attached to the autograd
            graph) and the move as a Python int.
        """
        logit = self.model(board.state.clone())

        # Mask: 1 for legal actions, 0 for illegal actions.
        mask = board.get_mask().float()

        # Large negative logits drive the probability of illegal actions to zero.
        logits = logit - 1e9 * (1 - mask)
        probabilities = F.softmax(logits, dim=0)
        m = torch.distributions.categorical.Categorical(probs=probabilities)

        # The original tested the builtin `eval` (always truthy) and then
        # overwrote the argmax move with a sample, so evaluation mode never
        # played greedily. Honour the instance flag instead.
        if self.eval:
            move = torch.argmax(probabilities)
        else:
            move = m.sample()

            # Exploration rate grows as fewer legal moves remain (eps * (1 - legal/12)).
            step = torch.count_nonzero(mask)
            eps_temp = board.eps * (1 - step / 12)

            if random.random() < eps_temp:
                valid_moves = np.where(mask == 1.0)[0]
                move = torch.tensor(np.random.choice(valid_moves))

        # Exploratory moves are still scored under the policy distribution.
        log_prob = m.log_prob(move)
        return log_prob, move.item()

In [None]:
class RandomPlayer:
    """Opponent that picks uniformly at random among the empty cells."""

    def __init__(self):
        """Stateless player; nothing to initialise."""

    def get_action(self, board):
        """Return the index of a randomly chosen legal cell of ``board``."""
        legal_mask = board.get_mask() == 1
        open_cells = np.where(legal_mask)[0]
        return np.random.choice(open_cells)


In [None]:
class V(nn.Module):
    """State-value network: Linear -> ReLU -> Linear.

    Args:
        input_dim: size of the input vector.
        hidden_dim: width of the hidden layer.
        output_dim: number of output units.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the output-layer activations for input ``x``."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)


# Value network used as the baseline: 9 board cells in, one scalar out.
v_net = V(9, 128, 1)


In [None]:
# Shared game objects used by games() and the training/evaluation cells:
# the board, the learning policy (moves first) and the random opponent.
board, player_1, player_2 = Board(), TicTacToePolicy(), RandomPlayer()


def games(player="random"):
    """Play one full game between ``player_1`` and the chosen opponent.

    ``player_1`` always moves first on the shared module-level ``board``,
    which is reset at the start of every call.

    Args:
        player: opponent type. Only ``"random"`` (``player_2``) is implemented
            here.

    Returns:
        ``(states, reward, log_prob)`` where ``states`` is the list of board
        tensors seen by ``player_1`` before each of its moves, ``reward`` is
        the final ``board.get_status()`` result (1.0 if marker 1.0 completed a
        line, -1.0 if marker -1.0 did, 0.5 for a draw) and ``log_prob`` is the
        list of log-probabilities returned by the policy for its moves.

    Raises:
        ValueError: if ``player`` names an unsupported opponent. Previously
            such a call replayed player_1's own move and failed with a
            misleading "illegal move" exception.
    """
    if player != "random":
        raise ValueError("Unsupported opponent %r: only 'random' is implemented" % (player,))

    states = []
    log_prob = []

    board.reset()
    while True:
        # Snapshot the position: board.state is mutated in place by play_move,
        # so appending the tensor itself would make every entry equal the
        # final board.
        states.append(board.state.clone())
        prob, move = player_1.get_action_probabilities(board)
        log_prob.append(prob)

        board.play_move(move)
        status, reward = board.get_status()
        if status == 1.0:
            break

        move = player_2.get_action(board)
        board.play_move(move)
        status, reward = board.get_status()
        if status == 1.0:
            break
    return states, reward, log_prob





In [None]:
# Mount Google Drive at /content/drive (Colab-only); later cells write model
# checkpoints under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Adam optimizer over the value network's parameters.
v_optimizer = torch.optim.Adam(params=v_net.parameters(), lr=1e-2)

# Mean-squared error between the predicted value and the episode reward.
criterion = nn.MSELoss()


In [None]:
# Delete the local `runs` directory (previous TensorBoard logs).
# NOTE(review): the SummaryWriter is created in an earlier cell; if it logs
# under ./runs (presumably its default - confirm), removing the directory
# after the writer exists may discard the event file it is writing to.
!rm -r runs

In [None]:
###  Define Training Loop ##
num_games = 500000
POLICY_BATCH_GAMES = 512   # episodes of accumulated gradient per policy SGD step
EVAL_EVERY = 10000         # run an evaluation every this many training episodes
EVAL_GAMES = 1000          # games per evaluation
CHECKPOINT_EPISODES = (5000, 15000, 25000, 60000, 250000, 450000)
DRIVE_DIR = "/content/drive/MyDrive/"


def _evaluate_policy(n_games, opponent="random"):
    """Play ``n_games`` evaluation games and return ``(win, loss, draw)`` counts.

    Evaluation runs with ``player_1.eval`` set, exploration switched off
    (``board.eps = 0``) and autograd disabled; the previous ``eval`` flag and
    ``eps`` value are restored afterwards so training continues unaffected.
    """
    previous_eval, previous_eps = player_1.eval, board.eps
    player_1.eval = True
    board.eps = 0.0
    n_win = n_loss = n_draw = 0
    try:
        with torch.no_grad():
            for _game in range(n_games):
                _, outcome, _ = games(player=opponent)
                if outcome == 1.0:
                    n_win += 1
                elif outcome == -1.0:
                    n_loss += 1
                else:
                    n_draw += 1
    finally:
        player_1.eval = previous_eval
        board.eps = previous_eps
    return n_win, n_loss, n_draw


board.eps = 0.6
player_1.optimizer.zero_grad()
for playouts in range(num_games):
    states, reward, log_prob = games()
    writer.add_scalar('Episode Reward', reward, playouts)
    states_tensor = torch.stack(states)
    log_prob_tensor = torch.stack(log_prob)
    # Every step of the episode is credited with the final (undiscounted) reward.
    reward_tensor = torch.ones(log_prob_tensor.shape) * reward

    # --- value network update (regress state value onto the episode reward) ---
    value_loss = criterion(v_net(states_tensor).squeeze(-1), reward_tensor)
    writer.add_scalar('Value_Loss', value_loss.item(), playouts)
    v_optimizer.zero_grad()
    value_loss.backward()
    v_optimizer.step()

    # --- policy gradient with the freshly updated value net as baseline ---
    with torch.no_grad():
        baseline = v_net(states_tensor).squeeze(-1)
    rewards_baseline = reward_tensor - baseline
    grad_tensor = (-(log_prob_tensor) * rewards_baseline).sum()
    grad_tensor.backward()  # accumulates into the policy grads until the next step

    # Geometric decay of the exploration rate.
    board.eps = board.eps - board.eps / (num_games - 30000)

    if (playouts + 1) % POLICY_BATCH_GAMES == 0:
        player_1.optimizer.step()
        player_1.optimizer.zero_grad()

    if playouts in CHECKPOINT_EPISODES:
        torch.save(player_1.model.state_dict(), DRIVE_DIR + "policy" + str(playouts) + ".pt")

    if playouts % EVAL_EVERY == 0:
        win, loss, draw = _evaluate_policy(EVAL_GAMES)

        # Log true percentages (the counts are out of EVAL_GAMES, not out of 1).
        writer.add_scalar("Win_percentage_random", 100.0 * win / EVAL_GAMES, playouts)
        writer.add_scalar("Loss_percentage_random", 100.0 * loss / EVAL_GAMES, playouts)
        writer.add_scalar("Draw_percentage_random", 100.0 * draw / EVAL_GAMES, playouts)
        print("Evaluation after", playouts, "games")
        print("Win", win / EVAL_GAMES)
        print("Loss", loss / EVAL_GAMES)
        print("Draw", draw / EVAL_GAMES)

# Apply the gradient of the trailing partial batch, which would otherwise be
# dropped whenever num_games is not a multiple of POLICY_BATCH_GAMES.
if num_games % POLICY_BATCH_GAMES != 0:
    player_1.optimizer.step()
    player_1.optimizer.zero_grad()

### Final testing against random (minimax / minimax_random are evaluated in later cells) ###
player_1.eval = True
win, loss, draw = _evaluate_policy(EVAL_GAMES, opponent="random")

torch.save(player_1.model.state_dict(), DRIVE_DIR + "policy.pt")
torch.save(v_net.state_dict(), DRIVE_DIR + "value.pt")

writer.add_scalar("Win_percentage_final", 100.0 * win / EVAL_GAMES)
writer.add_scalar("Loss_percentage_final", 100.0 * loss / EVAL_GAMES)
writer.add_scalar("Draw_percentage_final", 100.0 * draw / EVAL_GAMES)
print("Evaluation after 1000 games")
print("Win", win / EVAL_GAMES)
print("Loss", loss / EVAL_GAMES)
print("Draw", draw / EVAL_GAMES)










Evaluation after 0 games
Win 0.362
Loss 0.299
Draw 0.339
Evaluation after 10000 games
Win 0.722
Loss 0.142
Draw 0.136
Evaluation after 20000 games
Win 0.762
Loss 0.135
Draw 0.103
Evaluation after 30000 games
Win 0.754
Loss 0.114
Draw 0.132
Evaluation after 40000 games
Win 0.778
Loss 0.126
Draw 0.096
Evaluation after 50000 games
Win 0.788
Loss 0.124
Draw 0.088
Evaluation after 60000 games
Win 0.801
Loss 0.103
Draw 0.096
Evaluation after 70000 games
Win 0.806
Loss 0.111
Draw 0.083
Evaluation after 80000 games
Win 0.809
Loss 0.12
Draw 0.071
Evaluation after 90000 games
Win 0.823
Loss 0.118
Draw 0.059
Evaluation after 100000 games
Win 0.847
Loss 0.091
Draw 0.062
Evaluation after 110000 games
Win 0.816
Loss 0.118
Draw 0.066
Evaluation after 120000 games
Win 0.839
Loss 0.11
Draw 0.051
Evaluation after 130000 games
Win 0.833
Loss 0.113
Draw 0.054
Evaluation after 140000 games
Win 0.842
Loss 0.105
Draw 0.053
Evaluation after 150000 games
Win 0.833
Loss 0.117
Draw 0.05
Evaluation after 160000 g

In [None]:
# Keep local copies of the policy and value-network weights on the Colab VM.
for _net, _path in ((player_1.model, "/content/weights.pt"),
                    (v_net, "/content/v_net.pt")):
    torch.save(_net.state_dict(), _path)

In [None]:
# Copy the `runs_vanilla` log directory to Google Drive.
# NOTE(review): no visible cell creates `runs_vanilla` (another cell removes a
# directory named `runs`) - confirm the source directory name.
!cp -r runs_vanilla /content/drive/MyDrive

In [None]:
# Evaluate the trained policy against the strong ("mini") opponent.
# NOTE(review): games() as defined in this notebook only moves the opponent
# when player == "random"; confirm a minimax-capable games() is defined before
# running this cell.
player_1.eval = True
n_strong_games = 1000
loss = 0
draw = 0
win = 0
_saved_eps = board.eps
board.eps = 0.0  # no random exploration while measuring playing strength
try:
    with torch.no_grad():  # evaluation only: do not build autograd graphs
        for i in range(n_strong_games):
            _, reward, _ = games(player="mini")
            if reward == 1.0:
                win += 1
            elif reward == -1.0:
                loss += 1
            else:
                draw += 1
finally:
    board.eps = _saved_eps

# Percentages are relative to the number of games played; the previous
# `win*100` / `win*100/10` expressions were 1000x / 100x too large.
writer.add_scalar("Win_percentage_strong", 100.0 * win / n_strong_games)
writer.add_scalar("Loss_percentage_strong", 100.0 * loss / n_strong_games)
writer.add_scalar("Draw_percentage_strong", 100.0 * draw / n_strong_games)
print("Evaluation after", playouts, "games")  # `playouts` is left over from the training loop
print("Win", 100.0 * win / n_strong_games)
print("Loss", 100.0 * loss / n_strong_games)
print("Draw", 100.0 * draw / n_strong_games)

KeyboardInterrupt: ignored

In [None]:
# Raw outcome counts (wins, draws, losses) left by the preceding evaluation
# cell; in the recorded run that cell was interrupted, so they do not sum to 1000.
print(win,draw,loss)

132 31 60


In [None]:
# Unpack the tictac-master archive into ./tmp, then move the contents of its
# top-level folder into the working directory (a later cell imports `tictac`).
!unzip tictac-master.zip -d ./tmp
!mv ./tmp/*/* .

Archive:  tictac-master.zip
220bbdc6103ff012ec60b5b424e1566205349588
   creating: ./tmp/tictac-master/
  inflating: ./tmp/tictac-master/.gitignore  
  inflating: ./tmp/tictac-master/Pipfile  
  inflating: ./tmp/tictac-master/Pipfile.lock  
  inflating: ./tmp/tictac-master/README.md  
  inflating: ./tmp/tictac-master/demo_get_average_values.py  
  inflating: ./tmp/tictac-master/path.bat  
  inflating: ./tmp/tictac-master/path.sh  
   creating: ./tmp/tictac-master/tests/
  inflating: ./tmp/tictac-master/tests/test_board.py  
  inflating: ./tmp/tictac-master/tests/test_mcts.py  
  inflating: ./tmp/tictac-master/tests/test_minimax.py  
  inflating: ./tmp/tictac-master/tests/test_qneural.py  
  inflating: ./tmp/tictac-master/tests/test_qtable.py  
  inflating: ./tmp/tictac-master/tests/test_random.py  
  inflating: ./tmp/tictac-master/tests/test_transform.py  
   creating: ./tmp/tictac-master/tictac/
  inflating: ./tmp/tictac-master/tictac/board.py  
  inflating: ./tmp/tictac-master/tictac/

In [None]:
# Remove the temporary extraction directory.
!rm -r tmp

In [None]:
# Ask the external tictac minimax player for a move on the current position.
# NOTE(review): the recorded run of this cell raised AttributeError.
# board.state is a torch tensor; play_minimax_move presumably expects the
# tictac package's own board object - confirm against tictac/minimax.py.
from tictac.minimax import play_minimax_move
_,move = play_minimax_move(board.state,True)
print(board.state)


AttributeError: ignored