## A markov chain based rock-paper-scissors game playing agent implementation.

In [5]:
import numpy as np
import pandas as pd
from collections import Counter

In [6]:
def compare(action_A, action_B):
    """ Compares actions performed by the two players and determines
    the winner.
    
    :param action_A: Action performed by player 'A'.
    :param action_B: Action performed by player 'B'.
    :returns String: 'A' if player 'A' won,'B' if player 'B' won,'draw' otherwise.
    """
    beats = {'R': ['S'], 'P': ['R'], 'S': ['P']}
    if action_B in beats[action_A]:
        return 'A'
    if action_A in beats[action_B]:
        return 'B'
    return 'draw'


def simulate(states, states_to_indices, transition_prob_A, transition_prob_B,
         init_state_A, init_state_B, learn=False, n_iter=50, verbose=True):
    """ Simulates ''n_iter'' rock-paper-scissors games between two competing players. Learning is 
    performed by estimating the transition matrix of player 'B' based on its previous moves, therefore
    predicting the next action of this player and choosing an action of player 'A' that would win the
    round if the prediction were correct.
    
    :param states: A list of possible MDP states.
    :param states_to_indices: A dictionary mapping state names (from ''states'') to 
    indices in the transition matrices.
    :param transition_prob_A: A matrix of transition probabilities for player 'A'.
    :param transition_prob_B: A matrix of transition probabilities for player 'B'.
    :param init_state_A: The initial state of player 'A'.
    :param init_state_B: The initial state of player 'B'.
    :param learn: If True will perform transition matrix updates for player 'A' (default=False).
    :param n_iter: Number of games to simulate (default=50).
    :param verbose: If True prints the results of each game to stdout. 
    :returns: If ''learn'' is True, returns a tuple of ('outcomes', 'counts'), where 'outcomes' is
    a list of strings indicating the winner of each round, and 'counts' is a matrix of observed B's 
    transitions. Otherwise, only 'outcomes' is returned.
    """
    outcomes = []
    state_A = init_state_A
    state_B = init_state_B
    
    if learn:
        # Initialize the counts that are meant to approximate 
        # player B's transition matrix to zeros
        counts_B = np.zeros((3, 3))
        # Dictionary used to determine the action of 'A' that 
        # counters the predicted action of 'B'
        beats = {'R': ['S'], 'P': ['R'], 'S': ['P']}
        # Store the index of the previous B's action on each round
        prev_state_B_index = None
        
    for game in range(1, n_iter+1):
        winner = compare(state_A, state_B)
        outcomes.append(winner)
        
        if verbose:
            print(f'Game: {game} \tAction A: {state_A}\tAction B: {state_B}\tWinner: {winner}')
        
        # Map states to indices into the transition matrices
        state_A_index = states_to_indices[state_A]
        state_B_index = states_to_indices[state_B]
        
        if learn and prev_state_B_index is None:
            # Actions in the first round are sampled randomly 
            # for both players
            prev_state_B_index = state_B_index
            state_A = np.random.choice(states)[0]
            state_B = np.random.choice(states, p=transition_prob_B[state_B_index])
        if learn and prev_state_B_index is not None:
            # Increment the count of the observed transition of player B from state 
            # in the previous round to state in this round
            counts_B[prev_state_B_index][state_B_index] += 1
            # Normalize the counts for transitions from the previous state of player B 
            # to a probability distribution over the next possible state
            states_distribution_B = counts_B[prev_state_B_index] / counts_B[prev_state_B_index].sum()
            # Sample a state from this distribution to predict B's next move
            predicted_state_B = np.random.choice(states, p=states_distribution_B)
            prev_state_B_index = state_B_index
            
            # Pick a new A's state that counters the predicted B's state
            state_A = beats[predicted_state_B][0]
            # Player 'B' still moves according to the random transition matrix
            state_B = np.random.choice(states, p=transition_prob_B[state_B_index])           
        else:
            state_A = np.random.choice(states, p=transition_prob_A[state_A_index])
            state_B = np.random.choice(states, p=transition_prob_B[state_B_index])
    if learn:
        return outcomes, counts_B
    else:
        return outcomes

## Simulation without learning

In [7]:
states_to_indices = {'R': 0, 'P': 1, 'S': 2}
states = list(states_to_indices.keys())

# Generate random transition matrices for both players
uniform_samples_A = np.random.uniform(size=(3, 3))
transition_prob_A = uniform_samples_A / np.sum(uniform_samples_A, axis=1, keepdims=True)

uniform_samples_B = np.random.uniform(size=(3, 3))
transition_prob_B = uniform_samples_B / np.sum(uniform_samples_B, axis=1, keepdims=True)

# Sample a random initial state for both players
init_state_A = np.random.choice(states)[0]
init_state_B = np.random.choice(states)[0]

outcomes = simulate(states, states_to_indices, transition_prob_A, transition_prob_B, init_state_A, init_state_B, learn=False, n_iter=500, verbose=False)

n_games = len(outcomes)
outcomes = Counter(outcomes)
A_won = float(outcomes['A'] / n_games)
A_lost = float(outcomes['B'] / n_games)
draw = float(outcomes['draw'] / n_games)
print(f'\nPlayer A:\n\tWon: {round(A_won, 1) * 100}%\n\tDraw: {round(draw, 1) * 100}%\n\tLost: {round(A_lost, 1) * 100}%')

print(f'\nPlayer A transition matrix:\n{pd.DataFrame(transition_prob_A, index=states, columns=states)}')
print(f'\nPlayer B transition matrix:\n{pd.DataFrame(transition_prob_B, index=states, columns=states)}')


Player A:
	Won: 30.0%
	Draw: 30.0%
	Lost: 30.0%

Player A transition matrix:
          R         P         S
R  0.405356  0.509672  0.084972
P  0.305782  0.336625  0.357593
S  0.352444  0.322806  0.324749

Player B transition matrix:
          R         P         S
R  0.247000  0.361835  0.391165
P  0.247401  0.309909  0.442690
S  0.389238  0.120834  0.489928


## Simulation with learning

In [10]:
# Hardcoded player B's transition matrix, so as to
# create some pattern which player A may learn.
transition_prob_B = np.array([[0.1, 0.1, 0.8],
                              [0.7, 0.05, 0.25],
                              [0.1, 0.9, 0.0]])

outcomes, counts = simulate(states, states_to_indices, None, transition_prob_B, init_state_A, init_state_B, learn=True, n_iter=500, verbose=False)
predicted_B_transition_prob = counts / counts.sum(axis=1, keepdims=True)

n_games = len(outcomes)
outcomes = Counter(outcomes)
A_won = float(outcomes['A'] / n_games)
A_lost = float(outcomes['B'] / n_games)
draw = float(outcomes['draw'] / n_games)
print(f'\nPlayer A:\n\tWon: {round(A_won, 1) * 100}%\n\tDraw: {round(draw, 1) * 100}%\n\tLost: {round(A_lost, 1) * 100}%')

print(f'\nPredicted player B transition matrix (determines the moves of player A):\n{pd.DataFrame(predicted_B_transition_prob, index=states, columns=states)}')
print(f'\nTrue player B transition matrix:\n{pd.DataFrame(transition_prob_B, index=states, columns=states)}')


Player A:
	Won: 30.0%
	Draw: 50.0%
	Lost: 20.0%

Predicted player B transition matrix (determines the moves of player A):
          R         P         S
R  0.127389  0.063694  0.808917
P  0.687861  0.075145  0.236994
S  0.111765  0.882353  0.005882

True player B transition matrix:
     R     P     S
R  0.1  0.10  0.80
P  0.7  0.05  0.25
S  0.1  0.90  0.00
