In [4]:
# ! pip install gym
# ! pip3 install box2d-py
# ! pip install stable-baselines[mpi]==2.10.0 box2d box2d-kengz
# ! pip install 'ribs[all]' gym~=0.17.0 Box2D~=2.3.10 tqdm

In [2]:
import gym
# Shared LunarLander-v2 environment: later cells read its observation/action
# spaces to size the network and roll out episodes on it.
env = gym.make("LunarLander-v2")

import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
class Net(nn.Module):
    """Policy network made of a single fully connected layer.

    The layer maps an observation vector straight to one raw score per
    action; no activation is applied inside the model.

    Args:
        obs_size: Number of input features per observation.
        hidden_size: Accepted so the constructor signature stays stable, but
            not used by the current single-layer architecture.
        n_actions: Number of output scores (one per action).
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        # The only trainable layer: observation features -> action scores.
        self.fc1 = nn.Linear(in_features=obs_size, out_features=n_actions)

    def forward(self, x):
        """Return the raw action scores produced by ``fc1`` for input ``x``."""
        scores = self.fc1(x)
        return scores

In [4]:
def generate_batch(env, batch_size, t_max=1000, policy=None):
    """Play ``batch_size`` episodes, sampling every action from a policy network.

    At each step the current observation is fed to the policy, the returned
    scores are turned into probabilities with a softmax, and an action is
    drawn from that distribution with ``np.random.choice``.

    Args:
        env: Environment exposing ``reset()`` -> observation and
            ``step(action)`` -> ``(observation, reward, done, info)``.
        batch_size: Number of episodes to play.
        t_max: Maximum number of steps per episode.
        policy: Callable mapping a ``(1, obs_size)`` float tensor to a
            ``(1, n_actions)`` tensor of scores. When ``None`` the
            notebook-level ``net`` is used, which must already be defined.

    Returns:
        Tuple ``(batch_states, batch_actions, batch_rewards)``: two lists of
        per-episode lists (visited observations and chosen actions) and a list
        with the total reward of each episode. All three always contain
        exactly ``batch_size`` entries.
    """
    if policy is None:
        # Backward-compatible default: fall back to the module-level model.
        policy = net

    softmax = nn.Softmax(dim=1)
    batch_states, batch_actions, batch_rewards = [], [], []

    for _ in range(batch_size):
        states, actions = [], []
        total_reward = 0
        s = env.reset()
        for _step in range(t_max):
            # Data collection is inference only, so skip building an autograd
            # graph for every environment step.
            with torch.no_grad():
                s_v = torch.as_tensor(s, dtype=torch.float32).unsqueeze(0)
                act_probs = softmax(policy(s_v)).numpy()[0]
            a = np.random.choice(len(act_probs), p=act_probs)
            new_s, r, done, info = env.step(a)
            states.append(s)
            actions.append(a)
            total_reward += r
            s = new_s
            if done:
                break
        # Record the episode whether it terminated or was cut off at t_max;
        # dropping truncated episodes would silently shrink (or empty) the batch.
        batch_states.append(states)
        batch_actions.append(actions)
        batch_rewards.append(total_reward)
    return batch_states, batch_actions, batch_rewards

In [5]:
def filter_batch(states_batch, actions_batch, rewards_batch, percentile=50):
    """Select the state/action pairs of the best-rewarded ("elite") sessions.

    A session is elite when its total reward is greater than or equal to the
    ``percentile``-th percentile of ``rewards_batch``. The comparison is
    inclusive so that sessions sitting exactly on the threshold are kept;
    a strict comparison returns nothing at all when every reward is tied.

    Args:
        states_batch: List of sessions, each a sequence of states.
        actions_batch: List of sessions, each a sequence of actions aligned
            with the corresponding entry of ``states_batch``.
        rewards_batch: Total reward of each session.
        percentile: Percentile (0-100) used as the reward threshold.

    Returns:
        Tuple ``(elite_states, elite_actions)``: flat lists concatenating the
        states and actions of the elite sessions in their original order.
        Both lists are empty when ``rewards_batch`` is empty.
    """
    if len(rewards_batch) == 0:
        # No sessions means no threshold can be computed and nothing is elite.
        return [], []

    reward_threshold = np.percentile(rewards_batch, percentile)

    elite_states = []
    elite_actions = []
    for session_states, session_actions, session_reward in zip(
        states_batch, actions_batch, rewards_batch
    ):
        if session_reward >= reward_threshold:
            elite_states.extend(session_states)
            elite_actions.extend(session_actions)

    return elite_states, elite_actions

In [6]:
# --- Hyperparameters ---------------------------------------------------------
batch_size = 100         # episodes played per training iteration
session_size = 500       # maximum number of training iterations
percentile = 80          # reward percentile a session must reach to be "elite"
hidden_size = 200        # forwarded to Net's constructor
learning_rate = 0.0025   # Adam step size
completion_score = 200   # stop once the mean batch reward exceeds this value

n_states = env.observation_space.shape[0]
n_actions = env.action_space.n

# Policy network, loss and optimiser. CrossEntropyLoss is fed the raw scores
# returned by the network together with the elite actions as class targets.
net = Net(n_states, hidden_size, n_actions)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=learning_rate)

for i in range(session_size):
    # Play a batch of episodes with the current policy and keep the best ones.
    batch_states, batch_actions, batch_rewards = generate_batch(env, batch_size, t_max=5000)
    elite_states, elite_actions = filter_batch(batch_states, batch_actions, batch_rewards, percentile)

    if elite_states:
        # Stack into ndarrays first: building a tensor directly from a list of
        # ndarrays converts element by element and is very slow.
        tensor_states = torch.as_tensor(np.array(elite_states), dtype=torch.float32)
        tensor_actions = torch.as_tensor(np.array(elite_actions), dtype=torch.long)

        # One supervised step: make the policy imitate the elite actions.
        optimizer.zero_grad()
        action_scores_v = net(tensor_states)
        loss_v = objective(action_scores_v, tensor_actions)
        loss_v.backward()
        optimizer.step()
        loss_value = loss_v.item()
    else:
        # No session passed the filter (e.g. tied rewards): skip the update
        # instead of crashing on an empty tensor, and report NaN as the loss.
        loss_value = float("nan")

    # Show results for this iteration.
    mean_reward, threshold = np.mean(batch_rewards), np.percentile(batch_rewards, percentile)
    print("%d: loss=%.3f, reward_mean=%.1f,reward_threshold=%.1f" % (i, loss_value, mean_reward, threshold))

    # Stop as soon as the mean batch reward exceeds the completion score.
    if mean_reward > completion_score:
        print("Environment has been successfullly completed!")
        break

0: loss=1.374, reward_mean=-231.4,reward_threshold=-105.3
1: loss=1.369, reward_mean=-254.4,reward_threshold=-128.6
2: loss=1.373, reward_mean=-243.0,reward_threshold=-104.2
3: loss=1.378, reward_mean=-265.8,reward_threshold=-123.3
4: loss=1.371, reward_mean=-230.1,reward_threshold=-104.6
5: loss=1.373, reward_mean=-221.7,reward_threshold=-94.2
6: loss=1.371, reward_mean=-255.9,reward_threshold=-123.3
7: loss=1.368, reward_mean=-212.6,reward_threshold=-96.1
8: loss=1.371, reward_mean=-218.0,reward_threshold=-100.9
9: loss=1.382, reward_mean=-243.1,reward_threshold=-117.6
10: loss=1.375, reward_mean=-221.5,reward_threshold=-100.8
11: loss=1.378, reward_mean=-219.1,reward_threshold=-95.0
12: loss=1.374, reward_mean=-244.9,reward_threshold=-107.5
13: loss=1.366, reward_mean=-236.7,reward_threshold=-101.5
14: loss=1.375, reward_mean=-217.0,reward_threshold=-101.0
15: loss=1.376, reward_mean=-216.7,reward_threshold=-102.1
16: loss=1.370, reward_mean=-210.4,reward_threshold=-108.5
17: loss=1

In [11]:
import gym.wrappers

# Record one rollout of the current policy to the "videos3" directory.
# A fresh environment is wrapped instead of the shared `env`, so the training
# environment is neither re-wrapped when this cell is re-run nor closed below.
monitor_env = gym.wrappers.Monitor(gym.make("LunarLander-v2"), directory="videos3", force=False)
try:
    generate_batch(monitor_env, 1, t_max=500)
finally:
    # Always release the recorder, even if the rollout raises.
    monitor_env.close()



NameError: name 'net' is not defined

: 