In [1]:
import numpy as np
from mushroom_rl.algorithms.value import DQN
from mushroom_rl.core import Core
from mushroom_rl.environments import CartPole
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.utils.dataset import compute_J
from mushroom_rl.utils.parameters import Parameter
from mushroom_rl.approximators.parametric import TorchApproximator
from mushroom_rl.utils.dataset import compute_metrics
from mushroom_rl.utils.parameters import LinearParameter, Parameter

# Environment used by every later cell (agent construction, learning, evaluation).
# gamma=0.9 is presumably the discount factor -- confirm against the CartPole docs.
mdp = CartPole(gamma=0.9)

In [2]:
# Scratch cell: 0 is falsy, so the branch below is skipped and the cell displays 0.
# NOTE(review): `w+1` discards its result, so even a truthy `w` would be left
# unchanged -- confirm whether `w += 1` was intended or drop this cell.
w = 0
if w :
    w+1
w

0

In [3]:
def print_epoch(epoch):
    """Print a three-line banner announcing ``epoch``.

    The banner is a row of '#', the text ``Epoch:  <epoch>`` and a row of '-'.

    Args:
        epoch: identifier of the epoch; rendered with ``str``.
    """
    banner = (
        '################################################################',
        'Epoch:  ' + str(epoch),
        '----------------------------------------------------------------',
    )
    print('\n'.join(banner))


def get_stats(dataset):
    """Compute the metrics of ``dataset``, print them and return them.

    Args:
        dataset: collection of samples handed unchanged to ``compute_metrics``.

    Returns:
        Whatever ``compute_metrics`` returns. It is interpolated into a format
        string with four placeholders (min, max and mean reward, games
        completed), so it must be a 4-tuple.
    """
    score = compute_metrics(dataset)
    template = ('min_reward: %f, max_reward: %f, mean_reward: %f,'
                ' games_completed: %d')
    print(template % score)
    return score

# Results of every evaluation, appended in chronological order.
scores = []

# Training schedule.
train_frequency = 1              # passed to core.learn as n_steps_per_fit
evaluation_frequency = 7500      # n_steps given to core.learn in each epoch
target_update_frequency = 500    # forwarded to DQN (divided by train_frequency)
initial_replay_size = 5000       # steps collected before the first evaluation
max_replay_size = 50000          # forwarded to DQN
test_samples = 500               # n_steps of each evaluation in the training cell
max_steps = 52500                # max_steps // evaluation_frequency = epoch count


# Exploration parameters.
epsilon = LinearParameter(value=1., threshold_value=.1, n=10000)  # while learning
epsilon_test = Parameter(value=.05)                               # while evaluating
epsilon_random = Parameter(value=1)                               # initial policy

# Policy
pi = EpsGreedy(epsilon=epsilon_random)

In [4]:
from mushroom_rl.utils.callbacks.collect_dataset import CollectDataset
import torch
import torch.nn as nn
import torch.nn.functional as F

class Network(nn.Module):
    """Fully connected Q-network with two hidden layers of 16 ReLU units.

    The network maps a batch of states to one value per output unit. When
    an action batch is also given, only the value of the selected action is
    returned for each sample.
    """

    def __init__(self, input_shape, output_shape, **kwargs):
        """Build and initialise the three linear layers.

        Args:
            input_shape: shape of one observation; its last entry is used as
                the number of input features.
            output_shape: shape of the output; its first entry is used as
                the number of output units.
            **kwargs: accepted for interface compatibility and ignored.
        """
        super().__init__()

        n_input = input_shape[-1]
        n_output = output_shape[0]

        self.hl0 = nn.Linear(n_input, 16)
        self.hl1 = nn.Linear(16, 16)
        self.hl2 = nn.Linear(16, n_output)

        # Hidden layers are followed by ReLU, so they use the ReLU gain.
        relu_gain = nn.init.calculate_gain('relu')
        nn.init.xavier_uniform_(self.hl0.weight, gain=relu_gain)
        nn.init.xavier_uniform_(self.hl1.weight, gain=relu_gain)
        # The output layer is not followed by any non-linearity, so the
        # matching gain is the linear one rather than the ReLU one.
        nn.init.xavier_uniform_(self.hl2.weight,
                                gain=nn.init.calculate_gain('linear'))

    def forward(self, state, action=None):
        """Compute the network output for a batch of states.

        Args:
            state: tensor whose last dimension matches the input features;
                it is cast to float before the first layer.
            action: optional integer tensor of shape (batch, 1) holding the
                index to pick from every output row.

        Returns:
            The full (batch, n_output) output when ``action`` is None,
            otherwise a 1-D tensor with the selected value of each sample.
        """
        h = F.relu(self.hl0(state.float()))
        h = F.relu(self.hl1(h))
        q = self.hl2(h)

        if action is None:
            return q

        # Squeeze only the gathered column axis: a bare squeeze would also
        # drop the batch axis when the batch holds a single sample.
        return q.gather(1, action.long()).squeeze(1)


network = Network


In [5]:
# Optimizer: Adam with a learning rate of 1e-3.
optimizer = {
    'class': torch.optim.Adam,
    'params': {'lr': 1e-3},
}

# Approximator: the network class defined above, with shapes taken from the MDP.
approximator_params = {
    'input_shape': mdp.info.observation_space.shape,
    'output_shape': (mdp.info.action_space.n,),
    'n_actions': mdp.info.action_space.n,
    'network': network,
    'optimizer': optimizer,
    'loss': F.smooth_l1_loss,
}
approximator = TorchApproximator

# Keyword arguments forwarded to the DQN constructor.
algorithm_params = {
    'batch_size': 32,
    'target_update_frequency': target_update_frequency // train_frequency,
    'replay_memory': None,
    'initial_replay_size': initial_replay_size,
    'max_replay_size': max_replay_size,
}

# Agent and the core that runs it on the environment.
agent = DQN(mdp.info, pi, approximator,
            approximator_params=approximator_params,
            **algorithm_params)
core = Core(agent, mdp)


In [6]:
# Warm-up: collect initial_replay_size steps with the initial policy, using a
# single fit interval that spans the whole collection.
print_epoch(0)
core.learn(n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size)

# Score recorded before the epoch loop starts, under the evaluation epsilon.
pi.set_epsilon(epsilon_test)
dataset = core.evaluate(n_steps=test_samples)
scores.append(get_stats(dataset))

# Alternate learning and evaluation for max_steps // evaluation_frequency epochs.
for n_epoch in range(1, 1 + max_steps // evaluation_frequency):
    print_epoch(n_epoch)

    # Learning phase: switch to the training epsilon and learn.
    print('- Learning:')
    pi.set_epsilon(epsilon)
    core.learn(n_steps=evaluation_frequency, n_steps_per_fit=train_frequency)

    # Evaluation phase: switch to the evaluation epsilon and record the score.
    print('- Evaluation:')
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=test_samples)
    scores.append(get_stats(dataset))

################################################################
Epoch:  0
----------------------------------------------------------------


                                                    

min_reward: -1.000000, max_reward: -1.000000, mean_reward: -1.000000, games_completed: 108
################################################################
Epoch:  1
----------------------------------------------------------------
- Learning:


                                                    

- Evaluation:


                                                  

min_reward: -1.000000, max_reward: -1.000000, mean_reward: -1.000000, games_completed: 9
################################################################
Epoch:  2
----------------------------------------------------------------
- Learning:


                                                    

- Evaluation:


                                                  

min_reward: -1.000000, max_reward: -1.000000, mean_reward: -1.000000, games_completed: 7
################################################################
Epoch:  3
----------------------------------------------------------------
- Learning:


                                                    

- Evaluation:


                                                  

min_reward: -1.000000, max_reward: -1.000000, mean_reward: -1.000000, games_completed: 4
################################################################
Epoch:  4
----------------------------------------------------------------
- Learning:


                                                    

- Evaluation:


                                                  

min_reward: -1.000000, max_reward: -1.000000, mean_reward: -1.000000, games_completed: 7
################################################################
Epoch:  5
----------------------------------------------------------------
- Learning:


                                                    

- Evaluation:


                                                  

min_reward: -1.000000, max_reward: -1.000000, mean_reward: -1.000000, games_completed: 8
################################################################
Epoch:  6
----------------------------------------------------------------
- Learning:


                                                    

- Evaluation:


                                                  

min_reward: -1.000000, max_reward: -1.000000, mean_reward: -1.000000, games_completed: 11
################################################################
Epoch:  7
----------------------------------------------------------------
- Learning:


                                                    

- Evaluation:


                                                  

min_reward: -1.000000, max_reward: -1.000000, mean_reward: -1.000000, games_completed: 9




In [7]:
# Run a 3000-step evaluation with rendering enabled. This overwrites `dataset`,
# which the following cells write to disk and split into columns.
dataset = core.evaluate(n_steps=3000, render=True)

                                                   

In [8]:
# Dump every sample of `dataset` to experiences.txt, one record per line:
# the first six fields separated by ', ' and terminated by ';'.
# The context manager closes the file even if a write raises midway.
with open('experiences.txt', 'w') as f:
    for i in dataset:
        f.write('%s, %s, %s, %s, %s, %s' % (i[0], i[1], i[2], i[3], i[4], i[5])
                + ';\n')

In [13]:
# Transpose the dataset: one tuple per sample field instead of one per sample.
# Materialised as a list because zip() yields a one-shot iterator: unpacking it
# once would leave nothing for any later cell that iterates over `c` again.
c = list(zip(*dataset))

In [14]:
# Unpack the six per-field columns of the transposed dataset.
# Presumably state, action, reward, next state, absorbing flag and last-step
# flag -- confirm against the dataset format of the library.
s, a, r, ns, ab, d = c

In [43]:
# Re-assemble per-sample tuples from five of the six columns (`ab` is left out).
# zip() is lazy and single-use: wrap it in list() if it must be iterated twice.
x = zip(s, a, r, ns, d)

In [12]:
# Print every column of the transposed dataset in full.
# NOTE(review): if `c` is a one-shot zip iterator that an earlier cell already
# consumed, list(c) is empty and nothing is printed -- this cell's execution
# count (12) is lower than that of the cells above it, so confirm the run order.
for i in list(c):
    print(i)