In [None]:
### Import Dependencies ###
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from torch.utils.tensorboard import SummaryWriter


In [None]:
# --- Game configuration -------------------------------------------------
# Outcome scores. Only DRAW is read by the visible code (Board.get_status);
# WIN and LOSS are not referenced anywhere else in the visible notebook.
WIN = 1.0
LOSS = -1.0
DRAW = 1.0

# When truthy, Board.get_status negates the winning mark before reporting it.
BLACK = True

# Initial value of Board.eps, which TicTacToePolicy scales into its
# random-move probability.
EPS = 0.7

# --- Logging --------------------------------------------------------------
# TensorBoard writer; constructed without arguments, so it uses
# SummaryWriter's default log directory.
writer = SummaryWriter()

In [None]:
class Board():
    """Tic-tac-toe board stored as a flat 9-element tensor.

    Attributes:
        state: tensor of shape (9,); 0.0 marks an empty square, +1.0 / -1.0
            are the two players' marks (squares are indexed row by row).
        marker: mark that the next call to ``play_move`` will place. Starts at
            +1.0 and flips sign after every move.
        game_status: tensor(0.0) while the game is running, tensor(1.0) once
            ``get_status`` has detected a finished game.
        result: outcome written by ``get_status`` (see that method).
        eps: exploration rate, initialised from the module-level ``EPS`` and
            read by ``TicTacToePolicy``. It is deliberately NOT touched by
            ``reset`` so that a decay schedule survives across games.
    """

    # Index triples (rows, columns, diagonals) that form three in a row.
    WINNING_COMBINATIONS = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),
        (0, 3, 6), (1, 4, 7), (2, 5, 8),
        (0, 4, 8), (2, 4, 6),
    )

    def __init__(self):
        self.eps = EPS
        self.reset()

    def reset(self):
        """Start a new game: empty board, +1.0 to move, no result yet."""
        self.state = torch.zeros(9)
        self.marker = torch.tensor(1.0)
        self.game_status = torch.tensor(0.0)
        self.result = torch.tensor(0.0)

    def play_move(self, pos):
        """Place the current marker on square ``pos`` and pass the turn.

        Raises:
            Exception: if the square is already occupied.
        """
        if self.state[pos] != 0.0:
            raise Exception("You made an illegal move")

        self.state[pos] = self.marker
        self.marker = self.marker * -1.0

    def get_status(self):
        """Update and return ``(game_status, result)`` for the current board.

        A completed line takes precedence over a full board: previously a win
        that was completed on the ninth move was overwritten with ``DRAW``.

        Returns:
            game_status: tensor(1.0) if the game is over, otherwise the
                previously stored status.
            result: the winner's mark (negated when the module-level ``BLACK``
                is truthy), ``DRAW`` for a full board without a line, or the
                previously stored result if the game is still running.
        """
        for first, second, third in self.WINNING_COMBINATIONS:
            mark = self.state[first]
            if mark != 0.0 and mark == self.state[second] and mark == self.state[third]:
                self.game_status = torch.tensor(1.0)
                # clone() so the stored result is not a view into ``state``,
                # which later moves / in-place edits would silently change.
                winner = mark.clone()
                self.result = winner * -1.0 if BLACK else winner
                return self.game_status, self.result

        if torch.count_nonzero(self.state) == 9:
            self.game_status = torch.tensor(1.0)
            self.result = torch.tensor(DRAW)
        return self.game_status, self.result

    def get_mask(self):
        """Return a boolean tensor that is True on empty (legal) squares."""
        return self.state == 0.0









In [None]:
class TicTacToeMLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_dim: size of the input vector.
        hidden_dim: width of the hidden layer.
        output_dim: size of the output vector.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names are part of the state_dict keys; keep them stable.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Map ``x`` (last dimension ``input_dim``) to un-normalised outputs."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

class TicTacToePolicy:
    """Stochastic tic-tac-toe policy backed by a ``TicTacToeMLP``.

    Attributes:
        model: network mapping the 9-element board to 9 action logits.
        optimizer: SGD optimizer over ``model``'s parameters.
        eval: when True, ``get_action_probabilities`` plays the most probable
            legal move deterministically and performs no random exploration.
        default: when True, the random-move probability is fixed at 0.1;
            otherwise it is derived from ``board.eps``.
    """

    def __init__(self, default=False):
        self.model = TicTacToeMLP(9, 128, 9)  # Input: 9 (3x3 board), Hidden: 128, Output: 9 (actions)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=5e-2)
        self.eval = False
        self.default = default

    def get_action_probabilities(self, board):
        """Choose a move for the current position.

        Args:
            board: object exposing ``state`` (9-element tensor),
                ``get_mask()`` (True on empty squares) and ``eps``.

        Returns:
            (log_prob, move): log-probability of the chosen move under the
            masked policy distribution (a tensor attached to the model's
            graph) and the chosen square as a Python int.

        Raises:
            ValueError: if the board has no empty square.
        """
        # clone() so later in-place edits of board.state cannot invalidate the
        # tensor autograd saved for the backward pass.
        logits = self.model(board.state.clone())

        # Mask: 1 for legal actions, 0 for illegal actions.
        mask = board.get_mask().float()
        num_legal = int(torch.count_nonzero(mask))
        if num_legal == 0:
            raise ValueError("No legal moves available on a full board")

        # Push illegal actions to (effectively) zero probability.
        masked_logits = logits - 1e10 * (1 - mask)
        probabilities = F.softmax(masked_logits, dim=0)
        m = torch.distributions.categorical.Categorical(probs=probabilities)

        if self.eval:
            # Greedy play. The original tested the builtin ``eval`` (always
            # truthy) and then overwrote the argmax with a sample, so this
            # branch never had any effect.
            move = torch.argmax(probabilities)
        else:
            move = m.sample()

            # Random-move probability: fixed for the "default" player,
            # otherwise board.eps scaled by (1 - empty_squares / 12), i.e. it
            # grows as the board fills up.
            if self.default:
                eps_temp = 0.1
            else:
                eps_temp = board.eps * (1 - num_legal / 12)

            if random.random() < eps_temp:
                valid_moves = np.flatnonzero(mask.numpy())
                move = torch.tensor(np.random.choice(valid_moves))

        log_prob = m.log_prob(move)
        return log_prob, move.item()

In [None]:
class RandomPlayer:
    """Opponent that picks uniformly at random among the empty squares."""

    def get_action(self, board):
        """Return the index of a randomly chosen square whose mask entry is 1.

        ``board`` must expose ``get_mask()``; the draw uses NumPy's global RNG.
        """
        open_squares = np.flatnonzero(np.asarray(board.get_mask() == 1))
        return np.random.choice(open_squares)


In [None]:
class V(nn.Module):
    """State-value network: Linear -> ReLU -> Linear."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # fc1 / fc2 are the state_dict key prefixes; do not rename.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the network output for ``x`` (last dimension ``input_dim``)."""
        activated = self.fc1(x).relu()
        return self.fc2(activated)


# Value network instance: 9 board squares in, one scalar out.
v_net = V(9, 128, 1)


In [None]:
# The opponent's weights are read from Google Drive, so Drive has to be
# mounted before this cell runs. Mounting here keeps the notebook runnable
# top-to-bottom on a fresh kernel; mounting an already-mounted Drive is a no-op.
from google.colab import drive
drive.mount('/content/drive')

# Single shared board that games() resets and plays on.
board = Board()

# player_1: opponent with a fixed 0.1 random-move probability (default=True),
# initialised from a previously saved policy checkpoint.
player_1 = TicTacToePolicy(default=True)
player_1.model.load_state_dict(torch.load("/content/drive/MyDrive/policy.pt"))

# player_2: uniformly random opponent, used by games("random").
player_2 = RandomPlayer()

# player_3: freshly initialised policy; this is the one the training loop updates.
player_3 = TicTacToePolicy()


def games(player="random"):
    """Play one game on the shared ``board`` with ``player_1`` moving first.

    Args:
        player: ``"random"`` to let ``player_2`` (RandomPlayer) answer
            ``player_1``; any other value lets ``player_3`` answer.

    Returns:
        states: list of board snapshots, one per reply move, each taken right
            after ``player_1`` moved (i.e. the position the second player
            acted on).
        reward: result returned by the last ``board.get_status()`` call.
        log_prob: list with one log-probability per reply move, aligned with
            ``states``. In ``"random"`` mode the stored value is the one
            ``player_1`` returned for its own move, because RandomPlayer does
            not produce a log-probability.
    """
    states = []
    log_prob = []

    board.reset()
    while True:
        prob, move = player_1.get_action_probabilities(board)
        board.play_move(move)
        status, reward = board.get_status()
        if status == 1.0:
            break

        # play_move mutates board.state in place; without clone() every entry
        # of ``states`` would alias the same tensor and end up equal to the
        # final position.
        states.append(board.state.clone())

        if player == "random":
            move = player_2.get_action(board)
        else:
            prob, move = player_3.get_action_probabilities(board)
        log_prob.append(prob)

        board.play_move(move)
        status, reward = board.get_status()
        if status == 1.0:
            break
    return states, reward, log_prob





In [None]:
# Mount Google Drive at /content/drive (Colab only). Other cells in this
# notebook read and write files under /content/drive/MyDrive, so Drive has to
# be mounted before they run.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Optimizer and regression loss used to fit the value network `v_net`.
v_optimizer = torch.optim.Adam(params=v_net.parameters(), lr=1e-2)
criterion = nn.MSELoss()


In [None]:
# Recursively removes the local `runs` directory.
# NOTE(review): `writer = SummaryWriter()` is created in an earlier cell without
# a log_dir, and SummaryWriter's documented default log directory is under
# ./runs; deleting the directory after the writer exists may discard its event
# file -- confirm the intended cell order.
!rm -r runs

In [None]:
###  Define Training Loop ##
num_games = 100000
GAMES_PER_POLICY_STEP = 512   # policy gradients are accumulated over this many games
EVAL_INTERVAL = 10000         # evaluate every EVAL_INTERVAL training games
EVAL_GAMES = 1000             # games played per evaluation


def evaluate_policy(n_games=EVAL_GAMES):
    """Play ``n_games`` of player_1 vs player_3 and return (wins, losses, draws).

    Results are read from ``games("neural")``: 1.0 is counted as a draw,
    -1.0 as a loss and anything else as a win.
    Note: We are rewarding draws and wins as 1 to calculate ELO, only score matters
    (so with the current Board scoring, wins are counted in the draw bucket).
    """
    wins = losses = draws = 0
    # No parameter update happens here, so skip building autograd graphs.
    with torch.no_grad():
        for _ in range(n_games):
            _, result, _ = games("neural")
            if result == 1.0:
                draws += 1
            elif result == -1.0:
                losses += 1
            else:
                wins += 1
    return wins, losses, draws


board.eps = 0.6
player_3.optimizer.zero_grad()
for playouts in range(num_games):
    states, reward, log_prob = games("neural")

    # Shaped reward: negative squared distance of the game result from 0.8.
    reward = -(reward - (0.8)) ** 2
    writer.add_scalar('Episode Reward', reward, playouts)

    states_tensor = torch.stack(states)
    log_prob_tensor = torch.stack(log_prob)
    # Every move of the episode is credited with the same (final) reward.
    reward_tensor = torch.ones(log_prob_tensor.shape) * reward

    # --- value network: regress the shaped reward from each visited state ---
    # squeeze(-1) drops only the output dimension, so a single-state episode
    # still yields a 1-D prediction that matches reward_tensor's shape.
    value_loss = criterion(v_net(states_tensor).squeeze(-1), reward_tensor)
    writer.add_scalar('Value_Loss', value_loss.item(), playouts)
    v_optimizer.zero_grad()
    value_loss.backward()
    v_optimizer.step()

    # --- policy gradient with the value network as baseline ---
    # NOTE(review): as in the original, the baseline is evaluated AFTER the
    # value network was updated on this very episode -- confirm that is intended.
    baseline = v_net(states_tensor).detach().squeeze(-1)
    rewards_baseline = reward_tensor - baseline
    grad_tensor = (-(log_prob_tensor) * (rewards_baseline)).sum()
    grad_tensor.backward()   # accumulates into player_3's .grad buffers

    # Exploration decay: eps shrinks by a factor (1 - 1/(num_games-20000)) per game.
    board.eps = board.eps - board.eps / (num_games - 20000)

    if (playouts + 1) % GAMES_PER_POLICY_STEP == 0:
        player_3.optimizer.step()
        player_3.optimizer.zero_grad()

    if playouts % EVAL_INTERVAL == 0:
        # NOTE(review): both eval flags are only ever set to False in this
        # notebook; if greedy evaluation was intended, player_3.eval would have
        # to be True during the evaluation games -- confirm.
        player_3.eval = False
        # Counters use their own names: the original reused `loss`, which
        # collided with the value-loss tensor and corrupted the final report.
        wins, losses, draws = evaluate_policy(EVAL_GAMES)

        writer.add_scalar("Win_percentage", 100.0 * wins / EVAL_GAMES, playouts)
        writer.add_scalar("Loss_percentage", 100.0 * losses / EVAL_GAMES, playouts)
        writer.add_scalar("Draw_percentage", 100.0 * draws / EVAL_GAMES, playouts)
        print("Evaluation after", playouts, "games")
        print("Win", wins / EVAL_GAMES)
        print("Loss", losses / EVAL_GAMES)
        print("Draw", draws / EVAL_GAMES)
        player_1.eval = False

# Apply gradients accumulated since the last full batch; otherwise the final
# num_games % GAMES_PER_POLICY_STEP games would never influence the policy.
if num_games % GAMES_PER_POLICY_STEP != 0:
    player_3.optimizer.step()
    player_3.optimizer.zero_grad()

### Final evaluation of the trained policy (player_3) against player_1 ###

torch.save(player_3.model.state_dict(), "/content/drive/MyDrive/policy_exact.pt")
#torch.save(v_net.state_dict(),"/content/drive/MyDrive/value.pt")

# Run a fresh evaluation instead of re-reporting counters left over from the
# last periodic evaluation.
wins, losses, draws = evaluate_policy(EVAL_GAMES)
writer.add_scalar("Win_percentage_final", 100.0 * wins / EVAL_GAMES)
writer.add_scalar("Loss_percentage_final", 100.0 * losses / EVAL_GAMES)
writer.add_scalar("Draw_percentage_final", 100.0 * draws / EVAL_GAMES)
writer.flush()   # make sure buffered events reach disk before the logs are copied
print("Evaluation after", EVAL_GAMES, "games")
print("Win", wins / EVAL_GAMES)
print("Loss", losses / EVAL_GAMES)
print("Draw", draws / EVAL_GAMES)










Evaluation after 0 games
Win 0.0
Loss 0.753
Draw 0.247
Evaluation after 10000 games
Win 0.0
Loss 0.742
Draw 0.258
Evaluation after 20000 games
Win 0.0
Loss 0.761
Draw 0.239
Evaluation after 30000 games
Win 0.0
Loss 0.719
Draw 0.281
Evaluation after 40000 games
Win 0.0
Loss 0.712
Draw 0.288
Evaluation after 50000 games
Win 0.0
Loss 0.71
Draw 0.29
Evaluation after 60000 games
Win 0.0
Loss 0.724
Draw 0.276
Evaluation after 70000 games
Win 0.0
Loss 0.727
Draw 0.273
Evaluation after 80000 games
Win 0.0
Loss 0.698
Draw 0.302
Evaluation after 90000 games
Win 0.0
Loss 0.693
Draw 0.307
Evaluation after 1000 games
Win 0.0
Loss tensor(0.0092, grad_fn=<DivBackward0>)
Draw 0.307


In [None]:
# Save player_1's network weights and the value network's weights to local
# files under /content.
# NOTE(review): player_1 is the opponent whose weights were loaded from
# policy.pt; the policy stepped by the training loop is player_3 -- confirm
# that player_1 is the model meant to be saved here.
torch.save(player_1.model.state_dict(),"/content/weights.pt")
torch.save(v_net.state_dict(),"/content/v_net.pt")

In [None]:
# Copies the local directory `runs_exact_strength` into Google Drive.
# NOTE(review): no visible cell creates `runs_exact_strength` (the SummaryWriter
# in this notebook is constructed without a log_dir) -- confirm the directory name.
!cp -r runs_exact_strength /content/drive/MyDrive