In [10]:
from game import Game
from players import *
import numpy as np

# Train.py

IN this section we would like to decide the player neural network architecture and then we would like to train it against a random player and see how it performors.

In [11]:
from numpy import unravel_index
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [12]:
class From_Pos_Net(torch.nn.Module):
    def __init__(self, gamma=0.9):
        super().__init__()
        self.optimizer = None
        self.gamma = gamma
        self.p1 = nn.Linear(
            26, 100
        )  # 26 because we have 25 integers and 1 int for player id
        self.p2 = nn.Linear(100, 100)
        self.p3 = nn.Linear(100, 100)
        self.p4 = nn.Linear(100, 25)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def forward(self, x):
        x = torch.tensor(x, dtype=torch.float32).to(self.device)
        x = F.relu(self.p1(x))
        x = F.relu(self.p2(x))
        x = F.relu(self.p3(x))
        from_pos = F.softmax(self.p4(x), dim=-1)

        # During the train step he should learn to take the from_pos using the professor convention

        return from_pos

    def train_net(self, state, next_state, from_pos, target):
        if target != 1:
            target += self.gamma * torch.max(self.forward(next_state))

        output = self.forward(state)
        target_f = output.clone()
        target_f[np.argmax(from_pos)] = target
        target_f.detach()
        self.optimizer.zero_grad()
        loss = F.mse_loss(output, target_f)
        loss.backward()
        self.optimizer.step()

In [13]:
class Action_Net(torch.nn.Module):
    def __init__(self, gamma=0.9):
        super().__init__()
        self.optimizer = None
        self.gamma = gamma
        # network for the action
        self.a1 = nn.Linear(27, 50)
        self.a2 = nn.Linear(50, 50)
        self.a3 = nn.Linear(50, 25)
        self.a4 = nn.Linear(25, 4)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def forward(self, y):
        y = torch.tensor(y, dtype=torch.float32).to(self.device)
        y = F.relu(self.a1(y))
        y = F.relu(self.a2(y))
        y = F.relu(self.a3(y))
        act = F.softmax(self.a4(y), dim=-1)

        return act

    def train_net(self, state, next_state, action, target):
        if target != 1:
            target += self.gamma * torch.max(self.forward(next_state))

        output = self.forward(state)
        target_f = output.clone()
        target_f[np.argmax(action)] = target
        target_f.detach()
        self.optimizer.zero_grad()
        loss = F.mse_loss(output, target_f)
        loss.backward()
        self.optimizer.step()
        return

In [14]:
class DeepQTrain(Player):
    def __init__(self, params):
        self._epsilon = 0.3
        self.train_mode = True
        self.file_name = ""
        self.learning_rate = params["learning_rate"]
        self.weight_path = params["weight_path"]
        self.memory = []
        self.from_pos_net = From_Pos_Net()
        self.from_pos_net.optimizer = optim.Adam(
            self.from_pos_net.parameters(),
            weight_decay=0,
            lr=self.learning_rate,
        )
        self.action_net = Action_Net()
        self.action_net.optimizer = optim.Adam(
            self.action_net.parameters(), weight_decay=0, lr=self.learning_rate
        )

        DEVICE = "cpu"
        self.from_pos_net.to(DEVICE)
        self.action_net.to(DEVICE)

    def set_epsilon(self, eps: int) -> None:
        self._epsilon = eps

    def get_epsilon(self) -> int:
        return self._epsilon

    def make_move(self, game: "Game") -> tuple[tuple[int, int], Move]:
        input = game.get_board()
        from_pos_net_input = np.append(input, game.get_current_player())

        if self.train_mode and np.random.random() < self._epsilon:
            possible_moves = game.get_possible_moves()
            index = np.random.choice(len(possible_moves))
            move = possible_moves[index]
        else:
            from_pos = self.from_pos_net(from_pos_net_input)
            from_pos = from_pos.reshape(5, 5)  # we reshape it in matrix form.
            from_pos = unravel_index(
                from_pos.cpu().argmax(), from_pos.shape
            )  # we take the position of the matrix we are interested into.
            # this will be part of the additional input to the action net.
            act_net_input = np.append(input, from_pos)
            act = self.action_net.forward(act_net_input)
            act = act.cpu().argmax().numpy()

            move = (from_pos, Move(act))

        # vorrei la memoria fatta da (board_state+player, (move, act))
        self.memory.append((from_pos_net_input, move))

        return move

    def back_prop(self, reward: int):
        for idx in range(len(self.memory) - 1):
            curr_state, move = self.memory[idx]
            curr_pos, curr_move = move
            next_state, n_move = self.memory[idx + 1]
            next_pos, _ = n_move
            r = 0 if idx < len(self.memory) - 2 else reward
            self.from_pos_net.train_net(curr_state, next_state, curr_pos, r)
            self.action_net.train_net(
                np.append(curr_state[:-1], curr_pos),
                np.append(next_state[:-1], next_pos),
                curr_move,
                r,
            )

        self.memory = []

    def save_policy(self):
        from_pos_weights = self.from_pos_net.state_dict()
        act_weights = self.action_net.state_dict()
        torch.save(from_pos_weights, "from_pos_net.h5")
        torch.save(act_weights, "action_net.h5")
        pass







### New Game subclass

In [15]:
from tqdm import tqdm

In [16]:
class GameTrainer(Game):
    def __init__(self) -> None:
        super().__init__()

    def print(self) -> None:
        # os.system("cls||clear")
        pass

    def __acceptable_slides(self, from_position: tuple[int, int]):
        """When taking a piece from {from_position} returns the possible moves (slides)"""
        acceptable_slides = [Move.BOTTOM, Move.TOP, Move.LEFT, Move.RIGHT]
        axis_0 = from_position[0]  # axis_0 = 0 means uppermost row
        axis_1 = from_position[1]  # axis_1 = 0 means leftmost column

        if axis_0 == 0:  # can't move upwards if in the top row...
            acceptable_slides.remove(Move.TOP)
        elif axis_0 == 4:
            acceptable_slides.remove(Move.BOTTOM)

        if axis_1 == 0:
            acceptable_slides.remove(Move.LEFT)
        elif axis_1 == 4:
            acceptable_slides.remove(Move.RIGHT)
        return acceptable_slides

    def get_possible_moves(self):
        # __acceptable_slides -> prende from_pos e ritorna le slides possibili.
        # for solo sugli element di contorno. e prendiamo le posizion. poi abbiamo acceptable_slides che ci dice le slide possivbili.
        moves = []
        for row in [0, 4]:
            for col in range(5):
                if (
                    self._board[row, col] == self.current_player_idx
                    or self._board[row, col] == -1
                ):
                    slides = self.__acceptable_slides((row, col))
                    for slide in slides:
                        moves.append(((col, row), slide))
                if (
                    self._board[col, row] == self.current_player_idx
                    or self._board[col, row] == -1
                ):
                    slides = self.__acceptable_slides((col, row))
                    for slide in slides:
                        moves.append(((row, col), slide))
        return moves

    def play(self, player1: Player, player2: Player) -> int:
        self._board = np.full((5, 5), -1, dtype=np.int8)
        self.current_player_idx = -1
        players = [player1, player2]
        winner = -3
        n_move = 0
        while winner < 0 and n_move < 150:
            self.current_player_idx += 1
            self.current_player_idx %= len(players)
            ok = False
            in_loop = 0
            while not ok:
                in_loop += 1
                from_pos, slide = players[self.current_player_idx].make_move(
                    self
                )
                ok = self._Game__move(from_pos, slide, self.current_player_idx)
                if in_loop > 200:
                    pass
            n_move += 1
            winner = self.check_winner()
        return winner

    # look at the problem of the starting position.
    # The idea could be to leave the play as it is, then we can make our players play as first then as second.
    # we can also make array players shuffle. -> i'll go with this solution
    def train(self, trainee: Player, trainer: Player, epochs: int) -> None:
        if not trainee.file_name:
            print("starting full exploration mode")
            trainee.set_epsilon(1)
        # if isinstance(trainer, RLayer) and not trainer.file_name:
        # trainer.set_epsilon(1)
        players = [trainee, trainer]
        winning_reward = 1
        losing_reward = -3
        first_draw_reward = 0.1
        second_draw_reward = 0.5
        bar = tqdm(total=epochs, desc="Epoch")
        for ep in range(epochs):
            if ep % (epochs // 30) == 0:
                old_eps = trainee.get_epsilon()
                new_eps = old_eps - 0.1 if old_eps > 0.3 else old_eps
                # eps decrease at the same rate for both.
                trainee.set_epsilon(new_eps)
                # if isinstance(trainer, RLayer):
                # trainer.set_epsilon(new_eps)
            np.random.shuffle(players)
            winner_idx = self.play(players[0], players[1])
            loser_idx = (winner_idx + 1) % 2
            if winner_idx != -1:
                if isinstance(players[winner_idx], DeepQTrain):
                    players[winner_idx].back_prop(winning_reward)
                if isinstance(players[loser_idx], DeepQTrain):
                    players[loser_idx].back_prop(losing_reward)
            else:
                if isinstance(players[0], DeepQTrain):
                    # if first start draws, not very good.
                    players[0].back_prop(first_draw_reward)
                if isinstance(players[1], DeepQTrain):
                    # if second starting draws, good for him
                    players[1].back_prop(second_draw_reward)
            bar.update(1)
        if isinstance(trainee, DeepQTrain):
            trainee.save_policy()
        # if isinstance(trainer, RLayer):
        #     trainer.save_policy()
        return

# Prova

In [17]:
game = GameTrainer()
trainee = DeepQTrain({"learning_rate": 0.001, "weight_path": "ciao"})
trainer = RandomPlayer()

In [18]:
game.train(trainee, trainer, 3000)

starting full exploration mode


Epoch: 100%|██████████| 3000/3000 [04:11<00:00, 11.92it/s]


In [19]:
trainee.is_training = False
n_game = 1000

print("starting evaluation")
bar = tqdm(total=n_game * 2, desc="Game #")

wins_as_first = 0
for _ in range(n_game):
    winner = game.play(trainee, RandomPlayer())
    if winner == 0:
        wins_as_first += 1
    bar.update(1)


wins_as_second = 0
for _ in range(n_game):
    winner = game.play(RandomPlayer(), trainee)
    if winner == 1:
        wins_as_second += 1
    bar.update(1)

starting evaluation




In [None]:
print("")
print(f"Wins as first: {wins_as_first/n_game:.2f}%")
print(f"Wins as second: {wins_as_second/n_game:.2f}%")

print(f"total percentage: {(wins_as_first + wins_as_second)/(n_game*2):.2f}%")


Wins as first: 0.77%
Wins as second: 0.70%
total percentage: 0.74%


