# 0. Installation of the environment

To use the environment, you first have to install the package gym-foo by running **pip install -e gym-foo** from the directory where the `gym-foo` folder is located (tested at least on Linux).

In [132]:
import gym
import numpy
# Build the custom environment. Every keyword argument after the id is
# forwarded to the __init__ method of the environment, so you can tune them here.
# NOTE(review): presumably cr/pr are the collective/personal rewards, np the
# number of players and ng the number of turns per game -- confirm in gym-foo.
env = gym.make(
    'gym_foo:foo-v0',
    cr=7,
    pr=10,
    np=2,
    ng=10,
    alphas=[.5, .5],
)

# 1. Basic example 

In [113]:
# Play a single episode of at most 10 turns in which every player draws a
# uniformly random binary action on each turn.
# Tip: call env.render() or print observation / reward inside the inner loop
# to inspect the game turn by turn.
for i_episode in range(1):
    observation = env.reset()
    for t in range(10):
        # One integer in {0, 1} per player (the upper bound of randint is exclusive).
        action = numpy.random.randint(low=0, high=2, size=env.num_of_players)
        observation, reward, done, info = env.step(action)
        if not done:
            continue
        # The environment reported the end of the episode: close it and stop.
        env.close()
        break

End of the 10 turns.
 iter | p1 | p2 
     1| 0  | 1  
     2| 1  | 1  
     3| 1  | 0  
     4| 0  | 0  
     5| 1  | 0  
     6| 1  | 0  
     7| 0  | 1  
     8| 1  | 1  
     9| 0  | 1  
    10| 1  | 0  
scores|123 |113 
 util |123 |113 

p1 has won !


# 2. Test with Q-Learning and tables
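We initialize the Q-tables with the reward of a 1-turn game, according to the number of cooperating agents and the action the agent has chosen (this initialization is used in section 2.2; section 2.1 starts from all-zero tables).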

We implement a function to improve the readability of the result tables.

In [148]:
def interpretation(table):
    """Print a Q-table as an aligned, human-readable text table.

    Parameters
    ----------
    table : iterable of rows (e.g. a 2-D numpy array or a list of lists)
        One row per state. The row index is printed in the first column
        ("num of cooperating players"). Each row must contain at least two
        values: the Q-value of cooperating (index 0) followed by the Q-value
        of not cooperating (index 1).

    Returns
    -------
    None
        The table is written to standard output with print.
    """
    print("num of cooperating players | Q-value if cooperating | Q-value if not")
    # The row index is the state; the two row entries fill the Q-value columns.
    for i, line in enumerate(table):
        print("{0:^26} | {1:^22} | {2:^14}".format(i, *line))

## 2.1 $\epsilon$-greedy selection, $\epsilon$ decreasing

In [133]:
# Independent tabular Q-learning (one table per player) with an epsilon-greedy
# policy whose exploration probability is epsilon / (episode number).
# Table layout: one row per state, column 0 = cooperate, column 1 = do not cooperate.


def cooperating_others(observation, index):
    """Return the state seen by player `index`.

    The only relevant information is the number of agents who chose to
    cooperate, so the state is simplified to that count: the number of players
    other than `index` minus the sum of their entries in `observation`
    (assuming entries are 0 for cooperation and 1 otherwise -- TODO confirm
    against the environment).
    """
    return env.num_of_players - 1 - (sum(observation) - observation[index])


tables = [numpy.zeros((env.num_of_players, 2)) for _ in env.player]
alpha = .9     # learning rate
gamma = .85    # discount factor
epsilon = 0.5  # exploration probability of the first episode

for i_episode in range(100000):
    env.reset()
    # We start with the idea that everyone is cooperating. That is a choice.
    s_t1 = [0 for _ in env.player]
    # Probability of playing the greedy action during this episode.
    greedy_probability = 1 - epsilon/(i_episode + 1)

    for t in range(10):
        # Tip: call env.render() here to watch the game.
        a_t1 = numpy.zeros(env.num_of_players, dtype=int)
        for k, q_table in enumerate(tables):
            # Look up the best known action for the current state of player k.
            greedy_action = numpy.argmax(q_table[cooperating_others(s_t1, k)])
            if greedy_probability > numpy.random.uniform():
                a_t1[k] = greedy_action
            else:
                # Only two actions exist, so exploring means playing the other one.
                a_t1[k] = 1 - greedy_action

        s_t2, r_t1, done, info = env.step(a_t1)

        # Q-learning update of every player's table.
        for k, q_table in enumerate(tables):
            state_t1 = cooperating_others(s_t1, k)
            state_t2 = cooperating_others(s_t2, k)
            chosen = a_t1[k]
            td_target = r_t1[k] + gamma * max(q_table[state_t2])
            q_table[state_t1][chosen] += alpha * (td_target - q_table[state_t1][chosen])

        s_t1 = s_t2

        if done:
            break

In [136]:
# Print every player's name followed by the Q-table that player has learned.
for player_index, player in enumerate(env.player):
    print(player.name)
    interpretation(tables[player_index])

num of cooperating players | Q-value if cooperating | Q-value if not
            0              |   11.431453563011264   | 59.977234732446306
            1              |   3.024878553867845    | 108.39577659117276


## 2.2 Softmax selection 

In [146]:
# Independent tabular Q-learning (one table per player) with softmax
# (Boltzmann) action selection.
# Table layout: one row per state (number of other cooperating players),
# column 0 = cooperate, column 1 = do not cooperate.
tables = [numpy.zeros((env.num_of_players,2)) for player in env.player]
for table in tables:
    # Initialise every *state row* with the reward of a 1-turn game.
    # Iterate over shape[0] (states): shape[1] is the number of actions and
    # only matches the number of states by coincidence in a 2-player game.
    for i in range(table.shape[0]):
        table[i][0], table[i][1] = env.collective_reward*(i+1),env.collective_reward*i+env.personal_reward

alpha = .9   # learning rate
gamma = .85  # discount factor
tau = 1      # softmax temperature: higher values make the choice more random

for i_episode in range(100000):
    s_t1 = env.reset()

    for t in range(10):
        # Tip: call env.render() here to watch the game.
        if t == 0:
            s_t1 = [0 for player in env.player] # We start with the idea that everyone is cooperating. That is a choice.

        a_t1 = numpy.zeros(env.num_of_players, dtype = "int")
        for i,player in enumerate(env.player):
            # The only relevant information is the number of agents who chose to cooperate.
            # We simplify the state with the number of cooperating agents.
            # We will update a table in order to find the best action according to the number of cooperating players

            coop = env.num_of_players - 1 - (sum(s_t1) - s_t1[i])
            # Boltzmann weights exp(Q / tau). Subtracting the row maximum before
            # exponentiating leaves the probabilities unchanged but prevents
            # overflow (and the resulting NaN probability) for large Q-values.
            preferences = tables[i][coop] / tau
            weights = numpy.exp(preferences - preferences.max())
            if weights[0]/weights.sum() > numpy.random.uniform():
                a_t1[i] = 0
            else:
                a_t1[i] = 1

        s_t2, r_t1, done, info = env.step(a_t1)
        for i,player in enumerate(env.player):
            tab = tables[i]
            coop_t1, coop_t2 = env.num_of_players - 1 - (sum(s_t1) - s_t1[i]), env.num_of_players - 1 -(sum(s_t2) - s_t2[i])

            # Q-learning update towards the reward plus the discounted best next value.
            tab[coop_t1][a_t1[i]] += alpha * (r_t1[i] + gamma * max(tab[coop_t2]) - tab[coop_t1][a_t1[i]])

        s_t1 = s_t2

        if done:
            break

In [150]:
# Print every player's name followed by the Q-table that player has learned.
for player_index, player in enumerate(env.player):
    print(player.name)
    interpretation(tables[player_index])

p1
num of cooperating players | Q-value if cooperating | Q-value if not
            0              |   5.449937367191885    | 62.25362965196577
            1              |   5.203359994263341    | 39.58310395204195
p2
num of cooperating players | Q-value if cooperating | Q-value if not
            0              |   7.819217800007606    | 4.628774978267523
            1              |   16.846841713121172   | 3.072675031502081
