Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty work than not to commit at all)

In [24]:
import numpy as np 
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice
from copy import deepcopy
from tqdm.notebook import tqdm
from scipy import stats


In [2]:
# Magic square used to number the nine board cells (row-major order):
#
# 2 | 7 | 6
# --+---+--
# 9 | 5 | 1
# --+---+--
# 4 | 3 | 8
#
# Every row, column and diagonal of this square sums to 15.
MAGIC = [
    2, 7, 6,
    9, 5, 1,
    4, 3, 8,
]

In [3]:
# A game state: `x` and `o` hold the magic-square values taken by each player.
State = namedtuple('State', 'x o')

In [4]:
def print_board(pos : State) :
    """Print the board as three text rows followed by a blank line.

    A cell is shown as 'X' or 'O' when its MAGIC value is in ``pos.x`` or
    ``pos.o`` respectively, and as '.' otherwise.
    """
    for row_start in range(0, 9, 3) :
        symbols = []
        for cell in MAGIC[row_start:row_start + 3] :
            if cell in pos.x :
                symbols.append('X')
            elif cell in pos.o :
                symbols.append('O')
            else :
                symbols.append('.')
        print(''.join(symbols))
    print()

In [5]:
def win(elements) :
    """Return True when any three of the given values sum to 15."""
    for triple in combinations(elements, 3) :
        if sum(triple) == 15 :
            return True
    return False

def state_value(position : State) :
    """Score a position: 1 if the x player has won, -1 if the o player has won, 0 otherwise."""
    # x is checked first, so it takes precedence if both sets contain a line.
    for marks, score in ((position.x, 1), (position.o, -1)) :
        if win(marks) :
            return score
    return 0

In [6]:
def random_game() :
    """Play one game with uniformly random moves and return its trajectory.

    X moves first. The returned list holds a deep copy of the state after
    every move; the game stops as soon as a player wins or no cell is left.
    """
    history = []
    board = State(set(), set())
    free = set(range(1, 10))
    while free :
        # One round: X moves, then O (if a cell is still free).
        for marks in (board.x, board.o) :
            if not free :
                break
            move = choice(list(free))
            marks.add(move)
            # Snapshot the board, since the sets keep being mutated.
            history.append(deepcopy(board))
            free.remove(move)
            if win(marks) :
                return history

    return history


In [7]:
# Learn state values from random self-play: every state visited in a game is
# nudged towards that game's final outcome (1 = X won, -1 = O won, 0 = draw).
value_dictionary = defaultdict(float)
epsilon = 0.001  # step size of the incremental update

for steps in tqdm(range(100_000)):
    trajectory = random_game()
    final_reward = state_value(trajectory[-1])
    for state in trajectory:
        # Sets are not hashable, so the dictionary key uses frozensets.
        hashable_state = (frozenset(state.x), frozenset(state.o))
        previous = value_dictionary[hashable_state]
        value_dictionary[hashable_state] = previous + epsilon * (final_reward - previous)

  0%|          | 0/100000 [00:00<?, ?it/s]

In [8]:
# Show the ten states with the highest learned value.
sorted(
    value_dictionary.items(),
    key=lambda entry: entry[1],
    reverse=True,
)[:10]

[((frozenset({5}), frozenset()), 0.5161358325738735),
 ((frozenset({5}), frozenset({3})), 0.44638041049170357),
 ((frozenset({5}), frozenset({1})), 0.43392784060987294),
 ((frozenset({5}), frozenset({9})), 0.41540157043729864),
 ((frozenset({5}), frozenset({7})), 0.41084269218460745),
 ((frozenset({1, 2, 4, 7, 9}), frozenset({3, 5, 6, 8})), 0.40563411037998426),
 ((frozenset({3, 5, 6, 7, 9}), frozenset({1, 2, 4, 8})), 0.40503914952951375),
 ((frozenset({1, 2, 4, 5, 6}), frozenset({3, 7, 8, 9})), 0.40325069125445406),
 ((frozenset({1, 3, 5, 6, 9}), frozenset({2, 4, 7, 8})), 0.3954383734429431),
 ((frozenset({1, 5, 7, 8, 9}), frozenset({2, 3, 4, 6})), 0.39483320664959265)]

In [28]:
def next_possible_states(cur_state : State, player) :
    """List every state reachable from ``cur_state`` by one move of ``player``.

    Each result is a hashable ``(frozenset_of_x, frozenset_of_o)`` tuple.
    When ``player`` is 'X' the move is added to the x set; for any other
    value it is added to the o set.
    """
    taken_x, taken_o = cur_state.x, cur_state.o
    free_cells = set(range(1, 10)) - taken_x - taken_o
    successors = []
    for cell in free_cells :
        if player == 'X' :
            successors.append((frozenset(taken_x | {cell}), frozenset(taken_o)))
        else :
            successors.append((frozenset(taken_x), frozenset(taken_o | {cell})))

    return successors

def policy(cur_state : State, player, value_dictionary : dict) :
    """Return the state reached by the best known move for ``player``.

    Every state reachable in one move is looked up in ``value_dictionary``.
    'X' picks the state with the highest learned value, any other player the
    one with the lowest. States that are absent from the dictionary are
    ignored; if none of the reachable states is known, one is picked at
    random. Ties keep the first candidate encountered.

    Args:
        cur_state: current position, with set-valued ``x`` and ``o`` fields.
        player: 'X' to maximize the value, anything else to minimize it.
        value_dictionary: mapping from ``(frozenset, frozenset)`` states to
            their learned value. It is only read, never modified.

    Returns:
        A new ``State`` (with mutable sets) describing the chosen position.
    """
    possible_states = next_possible_states(cur_state, player)
    # X wants the value as high as possible, O as low as possible.
    maximizing = player == 'X'
    best_state = None
    best_reward = None
    for candidate in possible_states :
        # .get() keeps the lookup read-only: indexing a defaultdict would
        # insert a 0.0 entry for every unseen state, and indexing a plain
        # dict would raise KeyError. Missing states are detected by absence
        # rather than by a value of 0, so a learned value of exactly 0
        # (e.g. a state only ever seen in draws) is still considered.
        reward = value_dictionary.get(candidate)
        if reward is None :
            continue
        if best_reward is None :
            is_better = True
        elif maximizing :
            is_better = reward > best_reward
        else :
            is_better = reward < best_reward
        if is_better :
            best_reward = reward
            best_state = candidate

    # If none of the next states is in the dict, play randomly
    if best_state is None :
        best_state = choice(possible_states)

    return State(set(best_state[0]), set(best_state[1]))

In [31]:
def random_strat(state : State, player, value_dictionary = None) :
    """Random agent: return a uniformly chosen state reachable by one move of ``player``.

    ``value_dictionary`` is accepted only so the signature matches ``policy``;
    it is not used.
    """
    picked_x, picked_o = choice(next_possible_states(state, player))
    return State(set(picked_x), set(picked_o))

In [29]:
# Ask the learned policy for O's move in a hand-picked position.
state = State({1, 3, 8}, {4, 9})
player = 'O'
policy(state, player, value_dictionary)

State(x={8, 1, 3}, o={9, 2, 4})

In [42]:
def compute_conf_interval(sample, conf_level) :
    """Return ``[low, high]``, the confidence interval of the mean of ``sample``.

    The half-width is ``quantile * sd / sqrt(n)``, where ``sd`` is the sample
    standard deviation (``ddof=1``). The quantile comes from Student's t
    distribution with ``n - 1`` degrees of freedom when ``n <= 30`` and from
    the standard normal distribution otherwise.
    """
    size = len(sample)
    center = np.mean(sample)
    spread = np.std(sample, ddof=1)
    alpha = 1 - conf_level
    # Small samples use Student's t, larger ones the normal approximation.
    distribution = stats.t(df=size-1) if size <= 30 else stats.norm()
    quantile = distribution.ppf(1 - alpha/2)
    half_width = quantile*spread/np.sqrt(size)

    return [center - half_width, center + half_width]

def are_disjoint(itv1, itv2) -> bool :
    """Return True when the two ``[low, high]`` intervals do not overlap.

    Intervals that merely touch at an endpoint count as overlapping.
    """
    first_above_second = itv1[0] > itv2[1]
    second_above_first = itv2[0] > itv1[1]
    return bool(first_above_second or second_above_first)

In [35]:
def play_game(strat_1 : callable, strat_2 : callable, st1_is_first : bool, value_dictionary=None) :
    """Simulate one game between two strategies.

    Each strategy is called as ``strat(state, player, value_dictionary)`` and
    must return the state after its move. The strategy that moves first plays
    'X'; ``st1_is_first`` selects whether that is ``strat_1`` or ``strat_2``.

    Returns:
        1 if ``strat_1`` wins, -1 if ``strat_2`` wins, 0 for a draw.
    """
    all_cells = set(range(1, 10))
    turn_order = (strat_1, strat_2) if st1_is_first else (strat_2, strat_1)
    # Result from strat_1's point of view when the X player wins.
    x_win_result = 1 if st1_is_first else -1

    state = State(set(), set())
    turn = 0
    while all_cells - state.x - state.o :
        side = turn % 2
        state = turn_order[side](state, 'XO'[side], value_dictionary)
        if win(state.x) :
            return x_win_result
        if win(state.o) :
            return -x_win_result
        turn += 1

    return 0



In [39]:
# One game with the random agent moving first (as X) and the policy as O.
play_game(policy, random_strat, st1_is_first=False, value_dictionary=value_dictionary)

1

In [58]:
def test_policy(policy : callable, value_dictionary, games_per_side=50, conf_level=0.95, max_rounds=None) :
    """Benchmark a policy agent against the random player.

    Games are played in rounds. In each round the policy plays
    ``games_per_side`` games moving first (as 'X') and ``games_per_side``
    games moving second (as 'O'). Rounds are repeated until the confidence
    intervals of the per-round policy wins and random-player wins are
    disjoint. Draws are counted for neither side.

    Args:
        policy: strategy under test, called as
            ``policy(state, player, value_dictionary)``.
        value_dictionary: learned state values forwarded to the strategies.
        games_per_side: games per round for each starting order.
        conf_level: confidence level of the two intervals.
        max_rounds: optional upper bound on the number of rounds; ``None``
            (the default) keeps playing until the intervals separate.

    Returns:
        The mean number of policy wins per round (out of
        ``2 * games_per_side`` games).

    Raises:
        ValueError: if ``max_rounds`` is given and smaller than 1.
    """
    if max_rounds is not None and max_rounds < 1 :
        raise ValueError("max_rounds must be at least 1")

    policy_wins = list()
    random_wins = list()
    while max_rounds is None or len(policy_wins) < max_rounds :
        cnt_policy = 0
        cnt_random = 0
        # First the policy moves first, then the random player does.
        for policy_first in (True, False) :
            for _ in range(games_per_side) :
                outcome = play_game(policy, random_strat, st1_is_first=policy_first, value_dictionary=value_dictionary)
                if outcome == 1 :
                    cnt_policy += 1
                elif outcome == -1 :
                    # Only real losses count for the random player; a draw
                    # (outcome 0) is credited to nobody.
                    cnt_random += 1
        policy_wins.append(cnt_policy)
        random_wins.append(cnt_random)
        # A single round gives no variance estimate, so keep sampling.
        if len(policy_wins) == 1 :
            continue
        itv_policy = compute_conf_interval(policy_wins, conf_level)
        itv_random = compute_conf_interval(random_wins, conf_level)
        if are_disjoint(itv_policy, itv_random) :
            # The interval is symmetric, so its midpoint is the sample mean.
            return np.mean(itv_policy)

    # Round budget exhausted without separation: report the mean so far.
    return np.mean(policy_wins)

In [61]:
# Benchmark the learned policy against the random player.
test_policy(policy=policy, value_dictionary=value_dictionary)

93.5

In [None]:
""" 
TODO :
- Search for a better alternative to Monte Carlo
""" 

In [27]:
# Scratch check: a frozenset converts back into a mutable set.
a = frozenset((1, 2))
set(a)

{1, 2}