In [1]:
from fourInRowGame import Chip, FourInRowGame
import models
import numpy as np
import torch
import copy
import sys
import tqdm
import torch.nn as nn
import os

In [2]:
# Create a game
nrow = 6
ncol = 7
num_states = [nrow, ncol]
env = FourInRowGame(nrow, ncol)

In [3]:
def exploration_policy(eps):
    act = np.random.choice(['model','random'], p = [1 - eps, eps])
    return act


def get_action(model, state, eps):
    #Get the action based on greedy epsilon policy
    act = exploration_policy(eps)
    #Get predictions
    preds = model(state)
    weights = preds.clone().numpy(force=True)
    if act == 'model':
        action = np.argmax(weights)
    elif act == 'random':
        action = np.random.randint(0, 7)
    else:
        raise ValueError(f"act is {act}. Manno")
    return int(action), weights


def check_valid(action, weights, env):
    cont = True
    while cont:    
        if env.column_height(action) >= 6:
            if np.isfinite(weights).sum() != 0:
                weights[action] = np.NINF
                action = np.argmax(weights)
            else:
                raise ValueError(f"Someone managed to violate the rules. I can not move and therefore the game has already ended!")
        else:
            cont = False    
    return action


def get_reward(winner, terminated):
    if not terminated:
        reward = 0.1
    else:
        if winner == 1:  # Model won
            reward = 0.
        elif winner == -1:  # Model lost
            reward = 1000.
        else:  # Model drew
            reward = 10.
    return reward


def make_step(env, action, chip1, chip2, model, mode="self", trans_eps = 0):
    terminated = False
    # Model is player 1 and makes a step
    env.drop(chip1, action)
    if env.check_for_victory():
        winner = 1
        terminated = True
    else:
        not_full_cols = 0
        for col in range(6):
           if env.column_height(col) <= 5:
               not_full_cols += 1
        if not_full_cols == 0:
            terminated = True
        winner = 0
    if not terminated:
        if "trans" in mode: 
            mode = np.random.choice(['self','rng'], p = [1 - trans_eps, trans_eps])
        if "self" in mode:
            with torch.no_grad():
                state = torch.tensor(env.get_simple_slots(), dtype=torch.float)
                preds = model(state)
                weights = preds.clone().numpy(force=True)
                action = int(np.argmax(weights))
                action = check_valid(action, weights, env)
                env.drop(chip2, action)
        elif "rng" in mode:
            # Opponent is a random player and makes step
            pos_acts = []
            for action in range(0,6):
                if env.column_height(action) <= 5:
                    pos_acts.append(action)
            pos_acts = np.array(pos_acts)
            if pos_acts.size != 0:
                action2 = np.random.choice(pos_acts)
                env.drop(chip2, action2)
            else:
                raise ValueError("Choices Empty, your draw check sucks.")
        if env.check_for_victory():
            winner = -1
            terminated = True
        else:
            no_full_cols = 0
            for col in range(6):
               if env.column_height(col) <= 5:
                   no_full_cols += 1
            if no_full_cols == 0:
                terminated = True
            winner = 0
    return env, winner, terminated



def train_step(model, criterion, optimizer, states, actions, rewards):    
    preds = model(states)
    loss = criterion(preds, actions) * np.mean(-rewards)
    optimizer.step()
    

In [4]:
def main(env, model, learn_rate, criterion, eps, eps_rate, eps_min, mode, chip_model, chip_opponent, path="model_name"):
    # Set optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate)
    counter = 0
    encounters = 10000
    trans_eps = 1
    pbar = tqdm.tqdm(range(encounters))
    for episode in pbar:
        # Reset the environment to an empty board
        env.reset()
        states = torch.ones((1,6,7))
        acts = torch.ones((1,7))
        rewards = []
        eps = eps * eps_rate
        trans_eps = trans_eps * 0.999
        if eps <= eps_min:
            eps = eps_min
        terminated = False
        model_is_first = np.random.randint(2)
        if model_is_first == 0:
            action = np.random.randint(0, 7)
            env.drop(chip_opponent, action)
        while not terminated:
            states = torch.cat((states,(torch.tensor(env.get_simple_slots(), dtype=torch.float)).unsqueeze(0)))
            action, weights = get_action(model, states[-1], eps)
            action = check_valid(action, weights, env)
            env, winner, terminated = make_step(env, action, chip_model, chip_opponent, model, mode, trans_eps)
            reward = get_reward(winner, terminated)
            action_ohe = torch.full((1,7), 0.05)
            action_ohe[:,action] = 0.7
            acts = torch.cat((acts, action_ohe))
            rewards.append(reward)
            if winner == 1:
                counter += 1
        preds = model(states[1:])
        if preds.ndim >= 3:
            preds = preds.squeeze()
        loss = criterion(preds, acts[1:]) * np.mean(rewards)
        optimizer.step()
        pbar.set_description(f"Loss: {loss}")
    torch.save(model.state_dict(), path)
    print(f"The model won {counter/encounters}% of its encounters")
    return model

In [5]:
# model_structure = models.FCmedium
env = FourInRowGame(6, 7)
learn_rate = 0.001
criterion = nn.CrossEntropyLoss()
eps = 1
eps_rate = 0.995
eps_min = 0.1
mode = "trans"
chip_model = Chip.RED
chip_opponent = Chip.YELLOW
model = models.CNNmedium(1, env.columns)
model_name = "CNNmedium"
for i in range(1):
    path = f"{model_name}/Iteration_{i}"
    if not os.path.exists(model_name):
        os.makedirs(model_name)
    path_pre = f"{model_name}/Iteration_{i-1}"
    if os.path.exists(path_pre):
        model = models.CNNmedium(1, env.columns)
        model.load_state_dict(torch.load(path_pre))
    model = main(env, model, learn_rate, criterion, eps, eps_rate, eps_min, mode, chip_model, chip_opponent, path=path)

Loss: 0.18043816089630127: 100%|█████████████████████████████████████████████████| 10000/10000 [04:13<00:00, 39.46it/s]

The model won 0.5049% of its encounters



