In [None]:
### Import Dependencies ###
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from torch.utils.tensorboard import SummaryWriter


In [None]:
# TensorBoard writer; the training loop logs evaluation scalars through it.
writer = SummaryWriter()
# Reward values. WIN and LOSS are not referenced by name in the visible code;
# DRAW is the result Board.get_status assigns when the board fills up.
WIN = 1.0
LOSS = -1.0
DRAW = 0.5
# Initial exploration rate copied into Board.eps when a Board is constructed.
EPS = 0.7
# Device string handed to torch.device(...) for every model and tensor.
DEVICE = 'cpu'

In [None]:
class Board():
    """Tic-tac-toe board stored as a flat tensor of 9 cells.

    A cell holds 0.0 while empty, 1.0 for the player who moves first and
    -1.0 for the player who moves second. ``mark`` selects the perspective
    from which ``get_status`` reports a decisive result.
    """

    # Index triples (rows, columns, diagonals) that form a winning line.
    WINNING_COMBINATIONS = ((0, 1, 2), (3, 4, 5), (6, 7, 8),
                            (0, 3, 6), (1, 4, 7), (2, 5, 8),
                            (0, 4, 8), (2, 4, 6))

    def __init__(self, mark=1.0):
        """Create an empty board.

        Args:
            mark: 1.0 or -1.0; a win by the player with this marker is
                reported as a positive result by ``get_status``.
        """
        self.mark = mark
        self.eps = EPS
        self.reset()

    def reset(self):
        """Clear the board for a new game (``mark`` and ``eps`` are kept)."""
        self.state = torch.zeros(9)
        self.marker = torch.tensor(1.0)       # marker of the side to move
        self.game_status = torch.tensor(0.0)  # 0.0 = in progress, 1.0 = over
        self.result = torch.tensor(0.0)

    def play_move(self, pos):
        """Place the current marker on cell ``pos`` and pass the turn.

        Raises:
            Exception: if the cell is already occupied.
        """
        if self.state[pos] != 0.0:
            raise Exception("You made an illegal move")

        self.state[pos] = self.marker
        self.marker = self.marker * -1.0

    def get_status(self):
        """Return ``(game_status, result)`` for the current position.

        ``game_status`` is 1.0 once the game is over and 0.0 otherwise.
        ``result`` is the winning marker multiplied by ``self.mark`` for a
        decisive game, or ``DRAW`` when the board is full without a winner.
        """
        for a, b, c in self.WINNING_COMBINATIONS:
            if self.state[a] != 0.0 and self.state[a] == self.state[b] == self.state[c]:
                self.game_status = torch.tensor(1.0)
                self.result = self.state[a] * self.mark
                # Return immediately: a win completed on the ninth move also
                # fills the board and must not be downgraded to a draw.
                return self.game_status, self.result

        if torch.count_nonzero(self.state) == 9:
            self.game_status = torch.tensor(1.0)
            self.result = torch.tensor(DRAW)
        return self.game_status, self.result

    def get_mask(self):
        """Return a boolean tensor that is True for every empty (legal) cell."""
        return self.state == 0.0









In [None]:
class TicTacToeMLP(nn.Module):
    """Two-layer perceptron: linear -> ReLU -> linear."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names double as state_dict keys, so they stay fc1 / fc2.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw (un-normalised) outputs of the second layer."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

class TicTacToePolicy:
    """Tic-tac-toe policy backed by a ``TicTacToeMLP`` over the 9 board cells.

    Attributes:
        default: when True the policy explores with a fixed probability of
            0.1; otherwise the exploration rate is derived from ``board.eps``.
        model: network mapping the 9-cell board state to 9 action logits.
        optimizer: SGD optimizer over ``model``'s parameters.
        eval: when True the move proposed by the network is the greedy
            (argmax) one instead of a sample. Callers must set it back to
            False before training resumes.
    """

    def __init__(self,default=False):
        self.default = default
        self.model = TicTacToeMLP(9, 128, 9).to(torch.device(DEVICE))  # Input: 9 (3x3 board), Hidden: 128, Output: 9 (actions)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=1e-2)
        self.eval = False

    def get_action_probabilities(self, board):
        """Choose a legal move for the current position.

        Args:
            board: object exposing ``state`` (tensor of 9 cells), ``eps`` and
                ``get_mask()`` (True for empty cells).

        Returns:
            ``(log_prob, move)``: the log-probability of the chosen move under
            the masked policy distribution (also when the move came from the
            random-exploration branch) and the move as a Python int.
        """
        device = torch.device(DEVICE)
        logit = self.model(board.state.clone().to(device))

        # Mask: 1 for legal actions, 0 for illegal actions.
        mask = board.get_mask().float().to(device)

        # A large negative offset drives illegal actions to ~0 probability.
        logits = logit - 1e9 * (1 - mask)
        probabilities = F.softmax(logits, dim=0)
        m = torch.distributions.categorical.Categorical(probs=probabilities)

        # The original tested the builtin `eval` (always truthy) and then
        # overwrote the argmax with a sample, so the flag never took effect.
        if self.eval:
            move = torch.argmax(probabilities)
        else:
            move = m.sample()

        # Number of empty cells; exploration shrinks as more cells are open.
        open_cells = torch.count_nonzero(mask)
        if self.default:
            eps_temp = 0.1
        else:
            eps_temp = board.eps * (1 - open_cells / 12)

        # Epsilon exploration: occasionally replace the move by a uniformly
        # random legal one.
        if random.random() < eps_temp:
            valid_moves = torch.where(mask == 1.0)[0]
            move = np.random.choice(valid_moves.cpu())
            move = torch.tensor(move).to(device)

        log_prob = m.log_prob(move)
        return log_prob, move.item()

In [None]:
class RandomPlayer:
    """Opponent that picks uniformly among the currently legal cells."""

    def __init__(self):
        pass

    def get_action(self, board):
        """Return the index of a randomly chosen empty cell of ``board``."""
        legal_mask = board.get_mask() == 1
        legal_cells = np.where(legal_mask)[0]
        return np.random.choice(legal_cells)


In [None]:
class V(nn.Module):
    """Value network: one hidden ReLU layer followed by a linear read-out."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map the input through fc1 -> ReLU -> fc2 and return the result."""
        return self.fc2(F.relu(self.fc1(x)))
# Value networks mapping the 9 board cells to one scalar.
# v_net_1 is stepped by v_optimizer and serves as the baseline for attack() episodes;
# v_net_3 is stepped by a_optimizer and serves as the baseline for defense() episodes.
v_net_1 = V(9,128,1).to(torch.device(DEVICE))
v_net_3 = V(9,128,1).to(torch.device(DEVICE))



In [None]:
# NOTE: the checkpoints below are read from Google Drive, so the drive must
# already be mounted at /content/drive when this cell runs.
board = Board()

# player_1: moves first in attack()/defense(); warm-started from a checkpoint.
# map_location keeps the load working when the checkpoint was saved from a
# different device than DEVICE (e.g. saved on GPU, loaded on CPU).
player_1 = TicTacToePolicy()
player_1.model.load_state_dict(
    torch.load("/content/drive/MyDrive/attack_o_1.pt",
               map_location=torch.device(DEVICE)))

# player_2: fixed-exploration (default=True) opponent used by attack().
player_2 = TicTacToePolicy(default=True)
player_2.model.load_state_dict(
    torch.load("/content/drive/MyDrive/o_strategy450000.pt",
               map_location=torch.device(DEVICE)))

# player_3: second-moving policy; starts from random weights because the
# checkpoint load below is disabled.
player_3 = TicTacToePolicy()
#player_3.model.load_state_dict(torch.load("/content/drive/MyDrive/o_strategy250000.pt"))



def attack(adv=False):
    """Play one game with ``player_1`` moving first and record its trajectory.

    Args:
        adv: when False the opponent is ``player_2``; when True it is
            ``player_3``.

    Returns:
        ``(states, reward, log_prob)``: the positions ``player_1`` moved from,
        the final result reported by ``board.get_status()``, and the
        log-probabilities of ``player_1``'s moves (one per recorded state).
    """
    states = []
    log_prob = []
    opponent = player_3 if adv else player_2

    board.reset()
    while True:
        # Snapshot the position: play_move writes into board.state in place,
        # so storing the tensor itself would make every recorded state alias
        # the final board.
        states.append(board.state.clone())
        prob, move = player_1.get_action_probabilities(board)
        log_prob.append(prob)

        board.play_move(move)
        status, reward = board.get_status()
        if status == 1.0:
            break

        # Opponent's reply; its log-probability is not recorded.
        _, move = opponent.get_action_probabilities(board)
        board.play_move(move)
        status, reward = board.get_status()
        if status == 1.0:
            break
    return states, reward, log_prob

def defense(mark=-1.0):
    """Play one game of ``player_1`` (first) vs ``player_3`` (second) and
    record ``player_3``'s trajectory.

    Args:
        mark: kept for backward compatibility. As before, the result is always
            evaluated with ``board.mark = -1.0`` (the second player's
            perspective), whatever value is passed.

    Returns:
        ``(states, reward, log_prob)``: the positions ``player_3`` moved from,
        the final result reported by ``board.get_status()``, and the
        log-probabilities of ``player_3``'s moves (one per recorded state).
    """
    states = []
    log_prob = []

    board.reset()
    # Report results from the second player's side for the whole game.
    board.mark = -1.0
    try:
        while True:
            _, move = player_1.get_action_probabilities(board)
            board.play_move(move)
            status, reward = board.get_status()
            if status == 1.0:
                break

            # Snapshot the position: play_move writes into board.state in
            # place, so storing the tensor itself would make every recorded
            # state alias the final board.
            states.append(board.state.clone())

            prob, move = player_3.get_action_probabilities(board)
            board.play_move(move)
            status, reward = board.get_status()
            log_prob.append(prob)
            if status == 1.0:
                break
    finally:
        # Hand the shared board back in first-player perspective.
        board.mark = 1.0
    return states, reward, log_prob






In [None]:
# Mount Google Drive (Colab only). The checkpoint paths under
# /content/drive/MyDrive that are loaded and saved elsewhere in this notebook
# depend on this mount, so it has to run before those cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Mean-squared error between a value net's prediction and the episode reward.
criterion = nn.MSELoss()
# v_optimizer updates v_net_1; a_optimizer updates v_net_3.
v_optimizer = torch.optim.Adam(v_net_1.parameters(),lr =1e-2)
a_optimizer = torch.optim.Adam(v_net_3.parameters(),lr =1e-2)


In [None]:
# Remove the `runs` directory left over from previous executions.
# NOTE(review): `writer = SummaryWriter()` is created in an earlier cell; if its
# log directory lives under `runs/`, this deletes the directory the writer is
# logging into - confirm the intended cell order.
!rm -r runs

In [None]:
###  Define Training Loop ##
num_games = 100000
EVAL_GAMES = 1000    # games played per evaluation round
EVAL_EVERY = 10000   # playouts between evaluation rounds
board.eps=0.5


def _train_on_episode(episode, value_net, value_optimizer, loss_scale=1.0):
    """Fit the value baseline on one episode, then accumulate its policy gradient.

    Args:
        episode: ``(states, reward, log_prob)`` as returned by attack()/defense().
        value_net: value network used as baseline for this episode.
        value_optimizer: optimizer stepping ``value_net``.
        loss_scale: multiplier applied to the value loss before backprop.

    The policy optimizer is NOT stepped here; gradients only accumulate in the
    parameters of the policy that produced ``log_prob``.
    """
    states, reward, log_prob = episode
    device = torch.device(DEVICE)
    states_tensor = torch.stack(states).to(device)
    log_prob_tensor = torch.stack(log_prob)
    # Every step of the episode is credited with the final reward.
    reward_tensor = (torch.ones(log_prob_tensor.shape) * reward).to(device)

    # 1) Regress the value net towards the episode reward.
    #    squeeze(-1) drops only the feature axis, so the shapes still match
    #    for an episode with a single recorded step.
    value_loss = criterion(value_net(states_tensor).squeeze(-1), reward_tensor) * loss_scale
    value_optimizer.zero_grad()
    value_loss.backward()
    value_optimizer.step()

    # 2) Policy-gradient term with the freshly updated value net as baseline.
    baseline = value_net(states_tensor).detach().squeeze(-1)
    policy_loss = (-log_prob_tensor * (reward_tensor - baseline)).sum()
    policy_loss.backward()


def _evaluate(play_game, tag, playouts, policies):
    """Play EVAL_GAMES games with ``policies`` in eval mode and log the outcome.

    Args:
        play_game: zero-argument callable returning ``(states, reward, log_prob)``.
        tag: suffix of the TensorBoard scalar names.
        playouts: training step used as the x-axis value.
        policies: policies whose ``eval`` flag is raised during the evaluation
            and always lowered again afterwards.
    """
    for policy in policies:
        policy.eval = True
    win = loss = draw = 0
    try:
        # No gradients are needed for evaluation games.
        with torch.no_grad():
            for _ in range(EVAL_GAMES):
                _, reward, _ = play_game()
                if reward == 1.0:
                    win += 1
                elif reward == -1.0:
                    loss += 1
                else:
                    draw += 1
    finally:
        for policy in policies:
            policy.eval = False

    # Log real percentages (0-100) rather than raw counts times 100.
    writer.add_scalar("Win_percentage_" + tag, 100.0 * win / EVAL_GAMES, playouts)
    writer.add_scalar("Loss_percentage_" + tag, 100.0 * loss / EVAL_GAMES, playouts)
    writer.add_scalar("Draw_percentage_" + tag, 100.0 * draw / EVAL_GAMES, playouts)
    print("Evaluation after",playouts,"games")
    print("Win",win/EVAL_GAMES)
    print("Loss",loss/EVAL_GAMES)
    print("Draw",draw/EVAL_GAMES)


player_1.optimizer.zero_grad()
player_3.optimizer.zero_grad()
for playouts in range(num_games):
    # player_1 against the fixed opponent player_2.
    _train_on_episode(attack(), v_net_1, v_optimizer)

    # player_3 defending against player_1.
    _train_on_episode(defense(mark=-1.0), v_net_3, a_optimizer)

    if playouts % 1000 == 0:
        print(playouts)

    # player_1 against the learning opponent player_3 (value loss weighted 2.5x).
    _train_on_episode(attack(adv=True), v_net_1, v_optimizer, loss_scale=2.5)

    # Decay the exploration rate a little after every playout.
    board.eps = board.eps - board.eps / (num_games - 30000)

    # Policy gradients accumulate over many playouts before each step:
    # player_3 every 64 playouts, player_1 every 512.
    if (playouts + 1) % 64 == 0:
        player_3.optimizer.step()
        player_3.optimizer.zero_grad()

    if (playouts + 1) % 512 == 0:
        player_1.optimizer.step()
        player_1.optimizer.zero_grad()

    if playouts % EVAL_EVERY == 0:
        _evaluate(attack, "mined", playouts, [player_1])
        _evaluate(defense, "adverserial", playouts, [player_1, player_3])










0
Evaluation after 0 games
Win 0.699
Loss 0.168
Draw 0.133
Evaluation after 0 games
Win 0.192
Loss 0.701
Draw 0.107
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
Evaluation after 10000 games
Win 0.741
Loss 0.16
Draw 0.099
Evaluation after 10000 games
Win 0.115
Loss 0.83
Draw 0.055
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
Evaluation after 20000 games
Win 0.749
Loss 0.17
Draw 0.081
Evaluation after 20000 games
Win 0.079
Loss 0.877
Draw 0.044
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
Evaluation after 30000 games
Win 0.777
Loss 0.152
Draw 0.071
Evaluation after 30000 games
Win 0.059
Loss 0.899
Draw 0.042
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
Evaluation after 40000 games
Win 0.78
Loss 0.154
Draw 0.066
Evaluation after 40000 games
Win 0.049
Loss 0.927
Draw 0.024
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
Evaluation after 50000 games
Win 0.817
Loss 0.127
Draw 0.056
Evaluation after 50000 games
Win 0.028
Loss 0.93

In [None]:
# Persist player_1's trained weights to Google Drive (the drive must be mounted).
# Saving player_3's weights is currently disabled.
torch.save(player_1.model.state_dict(),"/content/drive/MyDrive/attack_o_60.pt")
#torch.save(player_3.model.state_dict(),"/content/adv_o-1.pt")

In [None]:
# Copy the TensorBoard logs to Google Drive.
# NOTE(review): no visible cell creates `runs_60_o` (the writer is built with
# SummaryWriter() and no explicit log dir) - presumably the log directory was
# renamed by hand; confirm before relying on this cell.
!cp -r runs_60_o /content/drive/MyDrive