<a href="https://colab.research.google.com/github/aureliendersy/Power4_AI/blob/main/Power4_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The goal of this notebook is to implement the game of Power Four (Connect Four) and to train an AI to play it.

# Setting up the game

In [None]:
!pip install colorama



In [None]:
from colorama import Fore
from colorama import Style
from random import sample
import numpy as np

class Player:
    """A participant in the game; by default a human who types moves in.

    Attributes:
        name: Display name used in prompts and messages.
        piece: Marker character drawn on the board for this player.
        is_human: True when the moves are entered interactively.
    """

    def __init__(self, name, marker, human=True):
        self.name = name
        self.piece = marker
        self.is_human = human

    def display_player(self):
        """Print a one-line introduction of the player and their marker."""
        print('Player '+self.name+', playing with the '+self.piece+' markers')

    def ask_move(self, valid_moves, board_state):
        """Prompt until the user enters a playable 1-based column.

        Args:
            valid_moves: 0-based indices of the columns that can still
                receive a piece.
            board_state: Current grid. Unused here; kept so that every
                player type exposes the same signature.

        Returns:
            The accepted answer as a string holding a 1-based column
            number (callers convert it with ``int``).
        """
        answer = input('Player ' + self.name + ' where do you want to place your piece ?')

        # Both checks live in one loop so that every new answer is validated
        # from scratch: previously a non-numeric reply to the "new column"
        # prompt reached int() unchecked and raised ValueError.
        while True:
            try:
                column = int(answer)
            except ValueError:
                answer = input('Please enter a positive numerical column')
                continue

            if column - 1 in valid_moves:
                return answer

            print('This is not a valid column')
            answer = input('Please enter a new column :')


class Computer(Player):
    """Base class for non-human players (``is_human`` is False)."""

    def __init__(self, name, marker):
        # Zero-argument super() resolves to Player.__init__. The previous
        # super(Player, self) skipped Player and reached object.__init__,
        # which rejects the extra arguments.
        super().__init__(name, marker, human=False)


class RandomComputer(Computer):
    """Computer player that picks uniformly among the playable columns."""

    def __init__(self, name, marker):
        # Goes through Computer.__init__ so that is_human ends up False.
        super().__init__(name, marker)

    def ask_move(self, valid_moves, board_state):
        """Return a random playable column, 1-based.

        Args:
            valid_moves: 0-based indices of the columns that can still
                receive a piece.
            board_state: Current grid; ignored by this player.
        """
        return sample(valid_moves, 1)[0] + 1


class TrainedComputer(Computer):
    """Computer player that delegates the choice of column to an agent.

    The agent must expose ``act(observation)`` returning a 0-based column.
    """

    def __init__(self, name, marker, agent):
        super().__init__(name, marker)
        self.agent = agent

    def ask_move(self, valid_moves, board_state):
        """Return the agent's column (1-based), or a fallback if it is full.

        Args:
            valid_moves: 0-based indices of the columns that can still
                receive a piece.
            board_state: Current grid; flattened before being handed to
                the agent.
        """
        # Query the agent once and reuse the answer: a second call could
        # return a different column than the one that was just validated.
        desired_move = self.agent.act(board_state.flatten())

        if desired_move in valid_moves:
            return desired_move + 1
        # The agent chose a full column: fall back to the last playable one.
        return valid_moves[-1] + 1


class Board:
    """Square grid on which pieces are dropped column by column.

    Cells hold 0 (empty), 1 (first player) or any other value (second
    player; the game uses -1). Row 0 is the top of the board.
    """

    def __init__(self, grid_size=8):
        self.grid_size = grid_size
        self.grid = np.zeros((self.grid_size, self.grid_size), dtype=int)

    def display_grid(self, markers=None):
        """Print the grid, followed by the 1-based column numbers.

        Args:
            markers: Two marker strings, for the pieces valued 1 and for
                the other pieces respectively. Defaults to ``['x', 'x']``.
        """
        if markers is None:
            markers = ['x', 'x']

        for line in self.grid:
            cells = []
            for piece in line:
                if piece == 0:
                    cells.append('.')
                elif piece == 1:
                    cells.append('\033[34m' + markers[0] + '\033[0m')
                else:
                    cells.append('\033[35m' + markers[1] + '\033[0m')
            print('|' + '|'.join(cells) + '|')

        # Header derived from the actual board width instead of a fixed 1..8.
        print(' ' + ' '.join(str(col) for col in range(1, self.grid_size + 1)))
        print('\n')

    def valid_move(self, col):
        """Return True when column ``col`` (0-based) still has a free cell."""
        return bool(self.grid[0][col] == 0)

    def place_piece(self, piece, col):
        """Drop ``piece`` into the lowest free cell of column ``col``.

        Raises:
            IndexError: If the column is already full.
        """
        for row in range(len(self.grid) - 1, -1, -1):
            if self.grid[row][col] == 0:
                self.grid[row][col] = piece
                return
        # Explicit failure instead of silently walking through negative
        # indices until numpy complains.
        raise IndexError('column ' + str(col) + ' is full')

    def check_victory(self):
        """Return True when four equal non-empty cells are aligned.

        Vertical, horizontal and both diagonal alignments are checked.
        """
        size = len(self.grid)
        # (row step, column step): down, right, down-right, up-right.
        directions = ((1, 0), (0, 1), (1, 1), (-1, 1))

        for row in range(size):
            for col in range(size):
                piece = self.grid[row][col]
                if piece == 0:
                    continue
                for d_row, d_col in directions:
                    end_row = row + 3 * d_row
                    end_col = col + 3 * d_col
                    if not (0 <= end_row < size and 0 <= end_col < size):
                        continue
                    if all(self.grid[row + k * d_row][col + k * d_col] == piece
                           for k in range(1, 4)):
                        return True
        return False

    def board_full(self):
        """Return True when no cell of the grid is empty."""
        return bool(self.grid.all())

class Game:
    """A two-player match played on a single board.

    The first player added plays the pieces valued 1, the second one the
    pieces valued -1.
    """

    def __init__(self, show_game=True, grid_size=8):
        """Create an empty game.

        Args:
            show_game: When True, print the introduction, the board after
                each move and the final result.
            grid_size: Side length of the square board.
        """
        self.players = []
        self.board = Board(grid_size)
        self.show_game = show_game

    def add_player(self, player):
        """Register ``player``; the order of registration is the order of play."""
        self.players.append(player)

    def introduce_game(self):
        """Print the players and an empty board of the size in use."""
        print('Let\'s introduce our players: ')
        for player in self.players:
            player.display_player()
        print('We will be playing with the standard grid as shown:')
        Board(self.board.grid_size).display_grid()

    def play_game(self):
        """Play until one player wins or the board is full.

        Returns:
            1 or 2 for the winning player (in order of registration),
            0 for a draw. None if the board already showed a victory
            before the first move.
        """
        markers_players = [player.piece for player in self.players]
        col_nums = range(0, len(self.board.grid))
        pieces = (1, -1)

        if self.show_game:
            self.introduce_game()

        # A single alternating loop: the board-full test now runs before
        # every move, so the second player is never asked to play on a full
        # board (possible whenever the number of cells is odd).
        turn = 0
        while not self.board.board_full() and not self.board.check_victory():
            player = self.players[turn]
            playable = [move for move in col_nums if self.board.valid_move(move)]
            column = player.ask_move(playable, self.board.grid)
            # Players answer with 1-based columns; the board is 0-based.
            self.board.place_piece(pieces[turn], int(column) - 1)

            if self.show_game:
                self.board.display_grid(markers_players)

            if self.board.check_victory():
                if self.show_game:
                    print('Victory for player ' + player.name)
                return turn + 1

            turn = 1 - turn

        if not self.board.check_victory():
            if self.show_game:
                print('The board is full and it is a draw')
            return 0
        return None


In [None]:
# Two interactive players and two random-move bots.
# The second argument is the marker drawn on the board for that player.
Apolline=Player('Apolline','x')
Aurelien=Player('Aurelien','O')
Bot1 = RandomComputer('Bot1', 'X')
Bot2 = RandomComputer('Bot2', 'O')

In [None]:
# Standalone board with the default grid size.
board1= Board()

In [None]:
# Play one silent game between the two random bots.
# play_game() returns 1 or 2 for the winning player and 0 for a draw.
game_real=Game(show_game=False)
game_real.add_player(Bot1)
game_real.add_player(Bot2)
game_real.play_game()

1

In [None]:
from collections import Counter

# Baseline: 1000 silent games between the two random bots, then a tally of
# the outcomes (1 / 2 = winning player, 0 = draw).
results = []
for i in range(1000):
    game_real = Game(show_game=False)
    for bot in (Bot1, Bot2):
        game_real.add_player(bot)
    result = game_real.play_game()
    results += [result]

Counter(results)

Counter({1: 429, 2: 571})

# AI for the computer with RL

## Imports 

In [None]:
!pip install chainerrl



In [None]:
import gym
import random
from gym import spaces
from copy import deepcopy
import logging
import os

import chainer
from chainer import functions as F
import gym.spaces

import chainerrl

## Environment wrapper

In [None]:
class Power4Env(gym.Env):
    """Gym wrapper around ``Board`` for a two-player drop game.

    Observations are the flattened grid (values -1, 0 and 1). Actions
    passed to ``step`` are ``(piece, column)`` pairs because the same
    environment is driven alternately by both players.
    """

    def __init__(self, grid_size=8):
        super().__init__()
        self.grid_size = grid_size
        self.board = Board(grid_size)

        self.action_space = spaces.Discrete(grid_size)
        # reset() and step() return the flattened grid, so the declared
        # shape is the flat one as well (same number of entries as before).
        self.observation_space = spaces.Box(
            low=-1, high=1, shape=(grid_size * grid_size,), dtype=np.int32)

    def reset(self):
        """Start from an empty board and return the flattened grid."""
        self.board = Board(self.grid_size)
        return self.board.grid.flatten()

    def render(self, mode="human"):
        """Print the board with the default markers."""
        self.board.display_grid()

    def close(self):
        pass

    def step(self, action):
        """Play one piece.

        Args:
            action: ``(piece, column)`` with the piece value to place and
                the 0-based column requested.

        Returns:
            ``(observation, reward, done, info)`` where the reward is 1 if
            this move completes an alignment and 0 otherwise.

        Raises:
            IndexError: If no column can take a piece any more.
        """
        player, column = action
        column = int(column)

        if not self.board.valid_move(column):
            # Requested column is full: play the last playable column
            # instead of rejecting the move.
            valid_moves = [move for move in range(self.grid_size) if self.board.valid_move(move)]
            column = valid_moves[-1]
        self.board.place_piece(player, column)

        # Scan the board once and reuse the result for reward and done.
        won = self.board.check_victory()
        done = bool(won or self.board.board_full())
        reward = 1 if won else 0
        info = {}

        return self.board.grid.flatten(), reward, done, info

In [None]:
# Shared environment instance; its spaces are reused below to size the
# Q-functions and to sample random actions.
env = Power4Env()
obs_space = env.observation_space
action_space = env.action_space
print('Observation space:', obs_space)
print('Action space:', action_space)

Observation space: Box(-1, 1, (8, 8), int32)
Action space: Discrete(8)


In [None]:
# Smoke test: reset the environment, play one random column for the
# piece valued 1, and print what step() returns.
obs = env.reset()
env.render()
print('initial observation:', obs)

action = env.action_space.sample()
# step() expects a (piece, column) pair.
obs, r, done, info = env.step((1, action))
env.render()
print('next observation:', obs)
print('reward:', r)
print('done:', done)
print('info:', info)

|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
 1 2 3 4 5 6 7 8


initial observation: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|[34mx[0m|
 1 2 3 4 5 6 7 8


next observation: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
reward: 0
done: False
info: {}


## Define the RL agents

In [None]:
# One Q-function per player. Sized from the environment: as many inputs as
# there are observation entries, as many outputs as there are columns.
q_func_player1 = chainerrl.q_functions.FCStateQFunctionWithDiscreteAction(
    obs_space.low.size, action_space.n,
    n_hidden_layers=2, n_hidden_channels=248)
# NOTE(review): presumably requires a CUDA device with id 0 — confirm before running on CPU.
q_func_player1.to_gpu(0)

q_func_player2 = chainerrl.q_functions.FCStateQFunctionWithDiscreteAction(
    obs_space.low.size, action_space.n,
    n_hidden_layers=2, n_hidden_channels=248)
q_func_player2.to_gpu(0)

# Separate optimizer per network.
optimizer_dqn_1 = chainer.optimizers.Adam(eps=1e-2)
optimizer_dqn_1.setup(q_func_player1)

optimizer_dqn_2 = chainer.optimizers.Adam(eps=1e-2)
optimizer_dqn_2.setup(q_func_player2)
# Set the discount factor that discounts future rewards.
gamma_dqn = 0.95

# Use epsilon-greedy for exploration
# Random actions are drawn from the full action space, so they may target a
# full column.
explorer_1 = chainerrl.explorers.ConstantEpsilonGreedy(
    epsilon=0.3, random_action_func=env.action_space.sample)

explorer_2 = chainerrl.explorers.ConstantEpsilonGreedy(
    epsilon=0.3, random_action_func=env.action_space.sample)

# DQN uses Experience Replay.
# Specify a replay buffer and its capacity.
replay_buffer_1 = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)
replay_buffer_2 = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)

# Type conversion
# Observations are cast to float32 before being handed to the agents.
phi = lambda x: x.astype(np.float32, copy=False)

# Now create an agent that will interact with the environment.
# Both agents share the same hyper-parameters and differ only in their
# network, optimizer, replay buffer and explorer.
agent_dqn_player1 = chainerrl.agents.DoubleDQN(
    q_func_player1, optimizer_dqn_1, replay_buffer_1, gamma_dqn, explorer_1,
    replay_start_size=500, update_interval=1,
    target_update_interval=100,phi=phi)

agent_dqn_player2 = chainerrl.agents.DoubleDQN(
    q_func_player2, optimizer_dqn_2, replay_buffer_2, gamma_dqn, explorer_2,
    replay_start_size=500, update_interval=1,
    target_update_interval=100,phi=phi)

# Train the agents sequentially

In [None]:
import logging
import sys
# Send INFO-level (and above) log records to stdout, with an empty format string.
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

Optionally preload previously saved agents to continue training from them.


In [None]:
#agent_dqn_player1.load('agent1c')
#agent_dqn_player2.load('agent2c')

In [None]:
# Self-play training. Only one agent learns at a time; the other one is a
# frozen opponent, and the roles swap every `individual_agent_train` episodes.
n_episodes = 100000
individual_agent_train = 500

agent_players = [agent_dqn_player1, agent_dqn_player2]
player_pieces = [1 , -1]

# Play against the random agent sometimes
random_agent_cycle = 50

train_agent_ep  = 0
train_agent_index = 0


for i in range(1, n_episodes + 1):

    # At the start of every block of 1000 episodes the opponent plays
    # random valid columns instead of using its own network.
    random_opponent = i % 1000 <= random_agent_cycle

    # Swap the learning agent once it has had its share of episodes.
    if train_agent_ep >= individual_agent_train:
        train_agent_ep = 0
        train_agent_index = 1 - train_agent_index

    obs = env.reset()
    reward = 0
    done = False
    player = 0

    while not done:
        learner_turn = player == train_agent_index

        if learner_turn:
            # The learner receives the reward left by the previous move.
            action = agent_players[player].act_and_train(obs, reward)
        elif random_opponent:
            valid_moves = [move for move in range(env.grid_size) if env.board.valid_move(move)]
            action = sample(valid_moves, 1)[0]
        else:
            action = agent_players[player].act(obs)

        obs, step_reward, done, _ = env.step((player_pieces[player], action))
        # An opponent's win is seen as a negative reward by the learner.
        reward = step_reward if learner_turn else -step_reward

        player = 1 - player

    agent_players[train_agent_index].stop_episode_and_train(obs, reward, done)
    train_agent_ep += 1

    if i % 1000 == 0:
        print(f'Finished episode {i}')

print('Finished.')

Finished episode 1000
Finished episode 2000
Finished episode 3000
Finished episode 4000
Finished episode 5000
Finished episode 6000
Finished episode 7000
Finished episode 8000
Finished episode 9000
Finished episode 10000
Finished episode 11000
Finished episode 12000
Finished episode 13000
Finished episode 14000
Finished episode 15000
Finished episode 16000
Finished episode 17000
Finished episode 18000
Finished episode 19000
Finished episode 20000
Finished episode 21000
Finished episode 22000
Finished episode 23000
Finished episode 24000
Finished episode 25000
Finished episode 26000
Finished episode 27000
Finished episode 28000
Finished episode 29000
Finished episode 30000
Finished episode 31000
Finished episode 32000
Finished episode 33000
Finished episode 34000
Finished episode 35000
Finished episode 36000
Finished episode 37000
Finished episode 38000
Finished episode 39000
Finished episode 40000
Finished episode 41000
Finished episode 42000
Finished episode 43000
Finished episode 440

# Saving and loading agents

## Saving

In [None]:
# Save each agent to its own directory ('agent1d' and 'agent2d')
agent_dqn_player1.save('agent1d')
agent_dqn_player2.save('agent2d')

## Loading

In [None]:
# Restore both agents from the directories written by the saving cell.
agent_dqn_player1.load('agent1d')
agent_dqn_player2.load('agent2d')

# Testing the agents on a game

In [None]:
# Wrap each trained agent in a player, plus one random bot per seat to
# serve as a baseline opponent.
SmartBot1 = TrainedComputer('SmartBot1', 'X', agent_dqn_player1)
SmartBot2 = TrainedComputer('SmartBot2', 'O', agent_dqn_player2)
RandomBot1 = RandomComputer('RandBot1', 'X')
RandomBot2 = RandomComputer('RandBot2', 'O')

In [None]:
# Displayed game between the two trained agents; the board is printed
# after every move.
game_real=Game(show_game=True)
game_real.add_player(SmartBot1)
game_real.add_player(SmartBot2)
game_real.play_game()

Let's introduce our players: 
Player SmartBot1, playing with the X markers
Player SmartBot2, playing with the O markers
We will be playing with the standard grid as shown:
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
 1 2 3 4 5 6 7 8


|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|[34mX[0m|.|.|.|.|.|.|.|
 1 2 3 4 5 6 7 8


|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|[34mX[0m|.|.|.|[35mO[0m|.|.|.|
 1 2 3 4 5 6 7 8


|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|[34mX[0m|[34mX[0m|.|.|[35mO[0m|.|.|.|
 1 2 3 4 5 6 7 8


|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.

2

In [None]:
# Trained first player against a random second player: 1000 silent games,
# then a tally of the outcomes.
results = []
for i in range(1000):
    game_real = Game(show_game=False)
    for contender in (SmartBot1, RandomBot2):
        game_real.add_player(contender)
    outcome = game_real.play_game()
    results += [outcome]

Counter(results)

Counter({1: 893, 2: 107})

In [None]:
# Random first player against the trained second player: 1000 silent
# games, then a tally of the outcomes.
results = []
for i in range(1000):
    game_real = Game(show_game=False)
    for contender in (RandomBot1, SmartBot2):
        game_real.add_player(contender)
    outcome = game_real.play_game()
    results += [outcome]

Counter(results)

Counter({1: 145, 2: 855})

In [None]:
  # Interactive game: a human plays first against the trained second player.
  # The human is created here with an 'X' marker, because the module-level
  # Aurelien player uses 'O', the same marker as SmartBot2.
  game_real=Game(show_game=True)
  game_real.add_player(Player('Aurelien', 'X'))
  game_real.add_player(SmartBot2)
  results.append(game_real.play_game())

Let's introduce our players: 
Player Aurelien, playing with the O markers
Player SmartBot2, playing with the O markers
We will be playing with the standard grid as shown:
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
 1 2 3 4 5 6 7 8


Player Aurelien where do you want to place your piece ?4
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|[34mO[0m|.|.|.|.|
 1 2 3 4 5 6 7 8


|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|[34mO[0m|[35mO[0m|.|.|.|
 1 2 3 4 5 6 7 8


Player Aurelien where do you want to place your piece ?5
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|.|.|.|.|
|.|.|.|.|[34mO[0m|.|.|.|
|.|.|.|[34mO[0m|[35mO[0m|.|.|.|
 1 2 3 4 5 6 7 8


|.|.|.|.|