In [None]:
### Import dependencies ###
import copy
import itertools
import os
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter


In [None]:
# TensorBoard summary writer.
# NOTE(review): `writer` is not referenced anywhere else in the visible notebook.
writer = SummaryWriter()
# Outcome values for a win, a loss and a draw.
# NOTE(review): these three names are not referenced in the visible notebook;
# Board.get_status produces its result values from literals instead.
WIN = 1.0
LOSS = -1.0
DRAW = 0.5
# Initial exploration rate; Board.__init__ copies it into Board.eps.
EPS = 0.7

In [None]:
class Board():
    """Tic-tac-toe board stored as a flat tensor of nine cells.

    Cells hold 0.0 (empty), 1.0 (first player) or -1.0 (second player) and
    are indexed 0..8, row by row.

    Attributes:
        state: float tensor of shape (9,) with the current cell contents.
        marker: 0-dim tensor, the value the next ``play_move`` call places.
        game_status: 0-dim tensor, 0.0 while running and 1.0 once finished.
        result: 0-dim tensor; the winner's marker (1.0 / -1.0), 0.5 for a
            draw, 0.0 while the game is undecided.
        eps: exploration rate read by the policy, taken from module-level EPS.
    """

    # Index triples forming a line: three rows, three columns, two diagonals.
    WINNING_COMBINATIONS = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),
        (0, 3, 6), (1, 4, 7), (2, 5, 8),
        (0, 4, 8), (2, 4, 6),
    )

    def __init__(self):
        self.eps = EPS
        self.reset()

    def reset(self):
        """Clear the board and give the first move to marker 1.0 (``eps`` is kept)."""
        self.state = torch.zeros(9)
        self.marker = torch.tensor(1.0)
        self.game_status = torch.tensor(0.0)
        self.result = torch.tensor(0.0)

    def play_move(self, pos):
        """Place the current marker on cell ``pos`` and pass the turn.

        Raises:
            ValueError: if the cell is already occupied. ValueError is an
                Exception subclass, so existing ``except Exception`` handlers
                keep working.
        """
        if self.state[pos] != 0.0:
            raise ValueError("You made an illegal move")

        self.state[pos] = self.marker
        self.marker = self.marker * -1.0

    def get_status(self):
        """Update and return ``(game_status, result)`` for the current position.

        A completed line always takes precedence over the board being full,
        so a win delivered by the ninth move is reported as a win rather than
        being overwritten by the draw value.
        """
        for a, b, c in self.WINNING_COMBINATIONS:
            if self.state[a] != 0.0 and self.state[a] == self.state[b] == self.state[c]:
                self.game_status = torch.tensor(1.0)
                # clone() so the result is not a live view into the board cells.
                self.result = self.state[a].clone()
                return self.game_status, self.result

        if torch.count_nonzero(self.state) == 9:
            self.game_status = torch.tensor(1.0)
            self.result = torch.tensor(0.5)
        return self.game_status, self.result

    def get_mask(self):
        """Return a boolean tensor that is True for every empty (legal) cell."""
        return self.state == 0.0









In [None]:
class TicTacToeMLP(nn.Module):
    """Two-layer perceptron: linear -> ReLU -> linear.

    Args:
        input_dim: size of the input vector.
        hidden_dim: width of the hidden layer.
        output_dim: number of output logits.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw (unnormalised) output of the second layer for ``x``."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

class TicTacToePolicy:
    """Stochastic tic-tac-toe policy backed by a TicTacToeMLP.

    Attributes:
        model: network mapping the 9 board cells to 9 action logits.
        optimizer: SGD optimizer over ``model``'s parameters.
        eval: when True the highest-probability legal move is played instead
            of a sampled one.
        state: when True, ``get_action_probabilities`` is given the raw cell
            tensor; when False it is given a Board-like object exposing
            ``state``, ``eps`` and ``get_mask()``.
    """

    def __init__(self):
        self.model = TicTacToeMLP(9, 128, 9)  # Input: 9 (3x3 board), Hidden: 128, Output: 9 (actions)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=1e-2)
        self.eval = False
        self.state = False

    def get_action_probabilities(self, board):
        """Choose a move for the given position.

        Args:
            board: a Board-like object, or the raw cell tensor when
                ``self.state`` is True.

        Returns:
            Tuple ``(log_prob, move, probabilities)``: the log-probability of
            the chosen move under the masked policy, the move as a Python
            int, and the full masked probability vector.
        """
        if self.state:
            cells = board
            mask = board == 0
        else:
            cells = board.state
            mask = board.get_mask()
        mask = mask.float()  # 1 for legal cells, 0 for occupied ones

        # Push the logits of occupied cells far down so softmax gives them ~0.
        logits = self.model(cells.clone()) - 1e9 * (1 - mask)
        probabilities = F.softmax(logits, dim=0)
        m = torch.distributions.Categorical(probs=probabilities)

        # The original tested the builtin `eval` (always truthy) and then
        # unconditionally overwrote the greedy move with a sample, so eval
        # mode never had any effect.
        if self.eval:
            move = torch.argmax(probabilities)
        else:
            move = m.sample()

        # Epsilon-greedy exploration: the rate depends on how many cells are
        # still free when a Board is given; it is fixed for raw-state input.
        step = torch.count_nonzero(mask)
        if self.state:
            eps_temp = 0.03
        else:
            eps_temp = board.eps * (1 - step / 12)

        if random.random() < eps_temp:
            valid_moves = np.where(mask == 1.0)[0]
            move = torch.tensor(np.random.choice(valid_moves))

        log_prob = m.log_prob(move)
        return log_prob, move.item(), probabilities

In [None]:
class RandomPlayer:
    """Baseline player that chooses uniformly among the legal cells."""

    def __init__(self):
        pass

    def get_action(self, board):
        """Return the index of a randomly chosen cell that ``board.get_mask()`` marks as free."""
        open_cells = np.where(board.get_mask() == 1)[0]
        return np.random.choice(open_cells)


In [None]:
board = Board()

# The checkpoints below live on Google Drive. This cell runs before the cell
# that mounts Drive, so mount it here when it is not available yet; otherwise
# a fresh "Restart & Run All" fails on the first torch.load.
if not os.path.isdir("/content/drive/MyDrive"):
    from google.colab import drive
    drive.mount("/content/drive")

# Three pre-trained policies that form the initial population.
player_1 = TicTacToePolicy()
player_2 = TicTacToePolicy()
player_3 = TicTacToePolicy()
# map_location="cpu" keeps loading working on a CPU-only runtime even if a
# checkpoint was saved from a GPU.
player_1.model.load_state_dict(torch.load("/content/drive/MyDrive/policy.pt", map_location="cpu"))
player_2.model.load_state_dict(torch.load("/content/drive/MyDrive/MMD.pt", map_location="cpu"))
player_3.model.load_state_dict(torch.load("/content/drive/MyDrive/policy450000.pt", map_location="cpu"))


# Opponent used by games(); it keeps its random initial weights because the
# checkpoint load below is disabled.
player_4 = TicTacToePolicy()
#player_4.model.load_state_dict(torch.load("/content/drive/MyDrive/policy_exact.pt"))



def games(strategy=player_1):
    """Play one game on the shared ``board``: ``strategy`` moves first, ``player_4`` second.

    Args:
        strategy: policy object exposing ``get_action_probabilities(board)``.

    Returns:
        Tuple ``(states, reward, log_prob)``:
        states   - snapshot of the board cells before each move of ``strategy``;
        reward   - the result tensor from ``board.get_status()`` at game end
                   (the winner's marker, or 0.5 for a draw);
        log_prob - log-probability of every move ``strategy`` made.
    """
    states = []
    log_prob = []
    player = strategy

    board.reset()
    while True:
        # play_move mutates board.state in place, so store a copy; appending
        # the tensor itself made every entry alias the final position.
        states.append(board.state.clone())
        prob, move, actions = player.get_action_probabilities(board)
        log_prob.append(prob)

        board.play_move(move)
        status, reward = board.get_status()
        if status == 1.0:
            break

        # Opponent's reply; its log-probabilities are not recorded.
        prob, move, actions = player_4.get_action_probabilities(board)

        board.play_move(move)
        status, reward = board.get_status()
        if status == 1.0:
            break
    return states, reward, log_prob





In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the checkpoint
# paths used by other cells of this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Score each loaded policy over 1000 games against player_4.
# board.eps = 0 makes the board-dependent exploration rate zero.
board.eps = 0.0
strategies = [player_1, player_2, player_3]
reward_list = []
for entry in strategies:
    entry.eval = True
    win, loss, draw = 0, 0, 0
    r = 0  # summed reward over all games of this policy
    for i in range(1000):
        reward = games(strategy=entry)[1]
        r += reward
        if reward == 1.0:
            win += 1
        elif reward == -1.0:
            loss += 1
        else:
            draw += 1
    print("Win", win / 1000)
    print("Loss", loss / 1000)
    print("Draw", draw / 1000)
    reward_list.append(r)
print(reward_list)



Win 0.943
Loss 0.057
Draw 0.0
Win 0.961
Loss 0.039
Draw 0.0
Win 0.961
Loss 0.039
Draw 0.0
[tensor(886.), tensor(922.), tensor(922.)]


In [None]:
# Rank the population by total reward (ascending) and keep the indices of the
# best and second-best policies, in that order.
print(reward_list)
indices = np.argsort(reward_list)
print(indices)
fittest = np.array([indices[-1], indices[-2]])
print(fittest)



[tensor(894.), tensor(912.), tensor(900.)]
[0 2 1]
[1 2]


In [None]:
## Cross Over ##

child = TicTacToePolicy()
parent_a = strategies[fittest[0]].model
parent_b = strategies[fittest[1]].model

# Uniform crossover: every entry of every parameter tensor is taken from one
# of the two parents with equal probability. Biases are inherited as well;
# previously only the weight matrices were crossed, leaving the child with
# freshly initialised random biases. The values are copied in place so the
# tensors registered with child.optimizer stay the ones the model uses.
with torch.no_grad():
    for child_param, param_a, param_b in zip(child.model.parameters(),
                                             parent_a.parameters(),
                                             parent_b.parameters()):
        crossover_point = torch.randint(0, 2, child_param.shape).bool()
        child_param.copy_(torch.where(crossover_point, param_a, param_b))

# Print the child model after crossover
print(child.model)

## Mutate ##
mutation_prob = 0.1

# Perform mutation on the weights: each weight entry receives standard-normal
# noise with probability mutation_prob.
with torch.no_grad():
    for param in child.model.parameters():
        if param.dim() > 1:  # Perform mutation only on weight matrices, not biases
            mask = torch.rand(param.shape) < mutation_prob
            # Named `noise` so this cell does not bind the name of the
            # mutation() function defined further down the notebook.
            noise = torch.randn(param.shape)
            param.add_(noise * mask)

In [None]:
# Score the child over 1000 games.
# reward_list is deliberately left untouched: it holds the population scores
# that the evolution loop indexes with `fittest`; resetting it to [] here made
# that lookup fail with an IndexError.
child.eval = True
loss = 0
draw = 0
win = 0
r = 0  # summed reward of the child
for i in range(1000):
    _, reward, _ = games(strategy=child)
    r += reward
    if reward == 1.0:
        win += 1
    elif reward == -1.0:
        loss += 1
    else:
        draw += 1
print("Win", win / 1000)
print("Loss", loss / 1000)
print("Draw", draw / 1000)
print(r)


Win 0.887
Loss 0.111
Draw 0.002
tensor(777.)


In [None]:
# Steady-state evolution over the two policies referenced by `fittest`:
# breed a child, score it over 1000 games and let it replace a parent when it
# scores higher.
for i in range(100000):
    # fittest[0] is the best policy, fittest[1] the runner-up; the runner-up
    # is the one that gets displaced, hence "weakest".
    weakest = strategies[fittest[1]]
    weakest_strength = reward_list[fittest[1]]

    strongest = strategies[fittest[0]]
    strongest_strength = reward_list[fittest[0]]

    child = TicTacToePolicy()

    with torch.no_grad():
        # Uniform crossover over every parameter tensor. Biases are inherited
        # too (previously they stayed at their random initial values), and the
        # values are copied in place so child.optimizer keeps referencing the
        # tensors the model actually uses.
        for child_param, strong_param, weak_param in zip(child.model.parameters(),
                                                         strongest.model.parameters(),
                                                         weakest.model.parameters()):
            crossover_point = torch.randint(0, 2, child_param.shape).bool()
            child_param.copy_(torch.where(crossover_point, strong_param, weak_param))

        ## Mutate ##
        mutation_prob = 0.1

        # Perform mutation on the weights
        for param in child.model.parameters():
            if param.dim() > 1:  # Perform mutation only on weight matrices, not biases
                mask = torch.rand(param.shape) < mutation_prob
                noise = torch.randn(param.shape)
                param.add_(noise * mask)

    child.eval = True
    loss = 0
    draw = 0
    win = 0
    r = 0
    # Separate index name so the outer generation counter `i` is not overwritten.
    for game_idx in range(1000):
        _, reward, _ = games(strategy=child)
        r += reward
        if reward == 1.0:
            win += 1
        elif reward == -1.0:
            loss += 1
        else:
            draw += 1

    if r > weakest_strength:
        if r > strongest_strength:
            # New champion: the old best moves down to the runner-up slot.
            strategies[fittest[1]] = strategies[fittest[0]]
            reward_list[fittest[1]] = reward_list[fittest[0]]
            strategies[fittest[0]] = child
            reward_list[fittest[0]] = r
        else:
            strategies[fittest[1]] = child
            reward_list[fittest[1]] = r
    print(reward_list[fittest[0]] / 1000)




NameError: ignored

In [None]:
# Remove the local `runs` directory and everything in it.
# NOTE(review): presumably the TensorBoard log directory created by SummaryWriter() - confirm.
!rm -r runs

In [None]:
# Write player_2's network weights to the same Drive file (MMD.pt) that
# player_2 was loaded from, overwriting that checkpoint.
torch.save(player_2.model.state_dict(),"/content/drive/MyDrive/MMD.pt")

In [None]:
# Copy the local `runs_mmd` directory to Google Drive.
# NOTE(review): nothing in the visible notebook creates `runs_mmd`; confirm where it comes from.
!cp -r runs_mmd /content/drive/MyDrive

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import operator

# Zero the exploration rate on the shared board, so policies that derive their
# epsilon from board.eps never take the random-move branch.
board.eps=0


# Function to perform selection operation
def selection(policies, performance, num_selected=3):
    """Return the best-scoring policies, highest score first.

    Args:
        policies: candidate policies.
        performance: one score per policy, in the same order as ``policies``.
        num_selected: how many policies to keep (defaults to 3).

    Returns:
        A new list with up to ``num_selected`` policies. Policies with equal
        scores keep their original relative order.
    """
    # Sort on the score only, so the policies themselves are never compared.
    ranked = sorted(zip(performance, policies), key=operator.itemgetter(0), reverse=True)
    return [policy for _, policy in ranked[:num_selected]]

# Function to perform crossover operation
def crossover(policies):
    """Create one child per unordered pair of parents by averaging their parameters.

    Works for any ``nn.Module`` whose parents share the same architecture:
    the child starts as a deep copy of the first parent and each of its
    parameters is then set to the element-wise mean of the two parents'.
    The parents themselves are not modified.

    Args:
        policies: parent networks.

    Returns:
        List of child networks, ordered like the pairs (0,1), (0,2), (1,2), ...
        Empty when fewer than two parents are given.
    """
    new_policies = []
    for parent_a, parent_b in itertools.combinations(policies, 2):
        child_policy = copy.deepcopy(parent_a)
        with torch.no_grad():
            for child_param, param_a, param_b in zip(child_policy.parameters(),
                                                     parent_a.parameters(),
                                                     parent_b.parameters()):
                child_param.copy_(torch.stack([param_a, param_b]).mean(dim=0))
        new_policies.append(child_policy)
    return new_policies

# Function to perform mutation operation
def mutation(policies, mutation_rate, noise_scale=0.1):
    """Randomly perturb some of the given networks in place.

    Each network is mutated with probability ``mutation_rate``; a mutated
    network gets Gaussian noise scaled by ``noise_scale`` added to every
    parameter (weights and biases).

    Args:
        policies: networks to consider; they are modified in place.
        mutation_rate: per-network probability of being mutated.
        noise_scale: multiplier applied to the standard-normal noise.

    Returns:
        The same ``policies`` list that was passed in.
    """
    with torch.no_grad():
        for policy in policies:
            if random.random() < mutation_rate:
                # parameters() yields weights and biases layer by layer, so
                # no layer names need to be hard-coded.
                for param in policy.parameters():
                    param.add_(torch.randn_like(param) * noise_scale)
    return policies

def evaluate(policy, n_games=1000):
    """Score a network by the summed reward of ``n_games`` games played via games().

    The weights are copied into a fresh TicTacToePolicy, so the network that
    is passed in is not touched.

    Args:
        policy: network whose ``state_dict()`` matches TicTacToePolicy().model.
        n_games: number of games to play (defaults to 1000).

    Returns:
        The total reward as a Python float (0.0 when ``n_games`` is 0).
    """
    dummy = TicTacToePolicy()
    dummy.model.load_state_dict(policy.state_dict())
    r = 0
    # Nothing is back-propagated here, so skip building autograd graphs.
    with torch.no_grad():
        for _ in range(n_games):
            r += games(dummy)[1]

    #print(r.item()/1000)
    return float(r)


# Generate initial set of policies
# Generate initial set of policies
# NOTE(review): the four size settings below are not read by any code in this cell.
input_size = 9
hidden_size = 128
output_size = 9
num_policies = 5

policies = [player_1.model,player_2.model,player_3.model]

# Evaluate the performance of each policy
performance = [evaluate(policy) for policy in policies]  # Function to evaluate the performance

# Main loop for selection, crossover, and mutation
num_iterations = 1000
mutation_rate = 0.1

for iteration in range(num_iterations):
    # Draw that decides whether this generation breeds by crossover.
    c = np.random.rand()

    # Perform selection
    selected_policies = selection(policies, performance)

    # Perform crossover
    if(c>0.8):
      crossed_policies = crossover(selected_policies)
    else:
      # Work on copies: mutation() modifies networks in place, and aliasing the
      # selected list here used to corrupt the elites (and the player_* models)
      # while putting every network into the population twice.
      crossed_policies = [copy.deepcopy(policy) for policy in selected_policies]

    # Perform mutation
    mutated_policies = mutation(crossed_policies, mutation_rate)

    # Update the population
    policies = selected_policies + mutated_policies

    # Evaluate the performance of each policy
    performance = [evaluate(policy) for policy in policies]  # Function to evaluate the performance
    print(performance)

    ranked = sorted(zip(performance, policies), key=lambda x: x[0], reverse=True)
    sorted_policies = [policy for _, policy in ranked]

    # Keep the best three policies, and keep their scores aligned with them so
    # the next selection() call pairs every policy with its own score.
    policies = sorted_policies[0:3]
    performance = [score for score, _ in ranked[0:3]]


[924.0, 874.0, 870.0, 896.0, 852.0, 904.0]
[888.0, 898.0, 890.0, 902.0, 886.0, 878.0]
[906.0, 918.0, 908.0, 884.0, 900.0, 920.0]
[924.0, 908.0, 886.0, 920.0, 910.0, 902.0]
[882.0, 918.0, 902.0, 896.0, 888.0, 922.0]
[920.0, 882.0, 900.0, 908.0, 902.0, 880.0]
[882.0, 902.0, 910.0, 918.0, 902.0, 904.0]
[882.0, 910.0, 918.0, 888.0, 868.0, 900.0]
[918.0, 912.0, 900.0, 890.0, 902.0, 880.0]
[884.0, 910.0, 916.0, 900.0, 910.0, 920.0]
[902.0, 882.0, 882.0, 886.0, 862.0, 894.0]
[896.0, 904.0, 916.0, 898.0, 916.0, 930.0]
[886.0, 874.0, 898.0, 874.0, 922.0, 856.0]
[894.0, 898.0, 906.0, 896.0, 908.0, 896.0]
[890.0, 887.5, 878.0, 918.0, 876.0, 892.0]
[878.0, 902.0, 902.0, 896.0, 900.0, 886.0]
[908.0, 890.0, 898.0, 898.0, 894.0, 888.0]
[886.0, 912.0, 882.0, 884.0, 886.0, 906.0]
[896.0, 886.0, 896.0, 906.0, 854.0, 882.0]
[902.0, 900.0, 918.0, 908.0, 888.0, 914.0]
[924.0, 884.0, 898.0, 912.0, 886.0, 852.0]
[888.0, 904.0, 896.0, 896.0, 924.0, 896.0]
[876.0, 912.0, 868.0, 890.0, 888.0, 876.0]
[870.0, 880

KeyboardInterrupt: ignored