In [4]:
# ! pip install gym
# ! pip3 install box2d-py
# ! pip install stable-baselines[mpi]==2.10.0 box2d box2d-kengz
# ! pip install 'ribs[all]' gym~=0.17.0 Box2D~=2.3.10 tqdm

In [3]:
import gym
# Shared LunarLander environment; later cells read its spaces and roll out episodes on it.
env = gym.make("LunarLander-v2")

import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [4]:
# Hyperparameters read by the training cell further down.
# Number of episodes rolled out per training iteration.
batch_size = 100
# Upper bound on the number of training iterations.
session_size = 500
# Reward percentile used as the cut-off when selecting elite episodes.
percentile = 80
# Width of the policy network's hidden layer.
hidden_size = 200
# Learning rate passed to the Adam optimizer.
learning_rate = 0.0025
# Training stops once the mean batch reward exceeds this value.
completion_score = 200

# BATCH_SIZE = 100
# SESSION_SIZE = 500
# PERCENTILE = 80
# LEARNING_RATE = 0.0025
# COMPLETION_SCORE = 200

In [5]:
class Net(nn.Module):
    """Small fully connected policy network: observation -> one raw score per action.

    The network returns unnormalised scores; callers apply softmax (for sampling)
    or a cross-entropy loss (for training) themselves.

    Args:
        obs_size: Length of the observation vector.
        hidden_size: Number of units in the single hidden layer.
        n_actions: Number of output scores (one per discrete action).
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        self.fc1 = nn.Linear(obs_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, n_actions)

    def forward(self, x):
        """Return the action scores for a batch of observations ``x``."""
        hidden = torch.relu(self.fc1(x))
        scores = self.fc2(hidden)
        return scores

In [6]:
def generate_batch(env, batch_size, t_max=1000, policy=None):
    """Roll out ``batch_size`` episodes, sampling actions from the policy's softmax.

    Args:
        env: Environment exposing ``reset() -> obs`` and
            ``step(action) -> (obs, reward, done, info)``.
        batch_size: Number of episodes to play.
        t_max: Maximum number of steps per episode; an episode that reaches this
            limit without ``done`` is kept as a truncated episode.
        policy: Callable mapping a ``(1, obs_size)`` float tensor to
            ``(1, n_actions)`` scores. Defaults to the notebook-level ``net``.

    Returns:
        ``(batch_states, batch_actions, batch_rewards)``: per-episode lists of
        visited states, per-episode lists of chosen actions, and the total
        reward of each episode.
    """
    if policy is None:
        # Fall back to the network defined at notebook level, as the original
        # implementation always did.
        policy = net

    batch_states, batch_actions, batch_rewards = [], [], []

    for _ in range(batch_size):
        states, actions = [], []
        total_reward = 0
        s = env.reset()
        for _ in range(t_max):
            # Add a leading batch dimension of 1 for the network.
            s_v = torch.as_tensor(s, dtype=torch.float32).unsqueeze(0)
            # Pure inference: no autograd graph is needed while sampling.
            with torch.no_grad():
                act_probs = torch.softmax(policy(s_v), dim=1).numpy()[0]
            a = np.random.choice(len(act_probs), p=act_probs)
            new_s, r, done, info = env.step(a)
            states.append(s)
            actions.append(a)
            total_reward += r
            s = new_s
            if done:
                break
        # Record the episode whether it terminated or was cut off at t_max;
        # only an episode with no steps at all (t_max <= 0) is skipped.
        if states:
            batch_states.append(states)
            batch_actions.append(actions)
            batch_rewards.append(total_reward)
    return batch_states, batch_actions, batch_rewards

In [7]:
def filter_batch(states_batch, actions_batch, rewards_batch, percentile=50):
    """Select the states and actions of the best-rewarded ("elite") episodes.

    An episode is elite when its total reward is greater than or equal to the
    ``percentile``-th percentile of ``rewards_batch``. Using ``>=`` guarantees at
    least one elite episode for any non-empty batch, including the case where
    several (or all) episodes share the top reward.

    Args:
        states_batch: Per-episode lists of states.
        actions_batch: Per-episode lists of actions, aligned with ``states_batch``.
        rewards_batch: Total reward of each episode.
        percentile: Percentile (0-100) used as the reward cut-off.

    Returns:
        ``(elite_states, elite_actions)``: flat lists concatenating the states and
        actions of every elite episode, in batch order.
    """
    elite_states = []
    elite_actions = []

    # Nothing to rank: avoid calling np.percentile on an empty sequence.
    if len(rewards_batch) == 0:
        return elite_states, elite_actions

    reward_threshold = np.percentile(rewards_batch, percentile)

    for states, actions, reward in zip(states_batch, actions_batch, rewards_batch):
        if reward >= reward_threshold:
            elite_states.extend(states)
            elite_actions.extend(actions)

    return elite_states, elite_actions

In [8]:
# Read the problem dimensions from the environment's spaces and report them.
n_states, n_actions = env.observation_space.shape[0], env.action_space.n
print(f"No of States - {n_states}", f"No of Actions - {n_actions}", sep="\n")

No of States - 8
No of Actions - 4


In [6]:
# Problem dimensions: observation vector length and number of discrete actions.
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n

# Policy network mapping an observation to one raw score per action.
net = Net(n_states, hidden_size, n_actions)

# Cross-entropy between the network's action scores and the actions actually
# taken in the elite episodes.
objective = nn.CrossEntropyLoss()

optimizer = optim.Adam(params=net.parameters(), lr=learning_rate)

for i in range(session_size):
    # Play a fresh batch of episodes with the current policy and keep the best ones.
    batch_states, batch_actions, batch_rewards = generate_batch(env, batch_size, t_max=5000)
    elite_states, elite_actions = filter_batch(batch_states, batch_actions, batch_rewards, percentile)

    mean_reward = np.mean(batch_rewards)
    threshold = np.percentile(batch_rewards, percentile)

    if len(elite_states) == 0:
        # filter_batch can return nothing (e.g. every episode tied at the
        # threshold); an empty tensor would break the forward/loss step.
        print("%d: no elite samples, update skipped, reward_mean=%.1f,reward_threshold=%.1f"
              % (i, mean_reward, threshold))
    else:
        optimizer.zero_grad()
        # Stack into a single ndarray first: building a tensor directly from a
        # list of arrays is much slower.
        tensor_states = torch.as_tensor(np.asarray(elite_states), dtype=torch.float32)
        tensor_actions = torch.as_tensor(np.asarray(elite_actions), dtype=torch.long)
        action_scores_v = net(tensor_states)
        loss_v = objective(action_scores_v, tensor_actions)
        loss_v.backward()
        optimizer.step()
        print("%d: loss=%.3f, reward_mean=%.1f,reward_threshold=%.1f" % (i, loss_v.item(), mean_reward, threshold))

    # Stop as soon as the mean batch reward exceeds the completion score.
    if mean_reward > completion_score:
        print("Environment has been successfullly completed!")
        break

0: loss=1.384, reward_mean=-167.9,reward_threshold=-82.3
1: loss=1.384, reward_mean=-168.3,reward_threshold=-96.8
2: loss=1.380, reward_mean=-172.8,reward_threshold=-95.4
3: loss=1.376, reward_mean=-169.3,reward_threshold=-92.7
4: loss=1.372, reward_mean=-164.8,reward_threshold=-87.4
5: loss=1.373, reward_mean=-167.6,reward_threshold=-81.2
6: loss=1.376, reward_mean=-161.8,reward_threshold=-82.8
7: loss=1.367, reward_mean=-145.2,reward_threshold=-80.1
8: loss=1.353, reward_mean=-163.4,reward_threshold=-80.6
9: loss=1.365, reward_mean=-167.0,reward_threshold=-76.7
10: loss=1.357, reward_mean=-182.1,reward_threshold=-93.0
11: loss=1.342, reward_mean=-184.9,reward_threshold=-83.4
12: loss=1.345, reward_mean=-186.9,reward_threshold=-75.9
13: loss=1.324, reward_mean=-166.8,reward_threshold=-66.9
14: loss=1.332, reward_mean=-170.9,reward_threshold=-75.7
15: loss=1.324, reward_mean=-163.9,reward_threshold=-62.1
16: loss=1.315, reward_mean=-149.6,reward_threshold=-69.1
17: loss=1.309, reward_m

In [7]:
import gym.wrappers

# Record a single rollout of the current policy into "videos1".
# A fresh environment under its own name is wrapped so the shared training `env`
# is neither rebound nor closed; this cell and the training cell stay re-runnable.
# NOTE(review): force=False is kept from the original; presumably Monitor refuses
# to reuse a directory that already holds recordings - confirm before re-running.
record_env = gym.wrappers.Monitor(gym.make("LunarLander-v2"), directory="videos1", force=False)
try:
    generate_batch(record_env, 1, t_max=500)
finally:
    # Always close the monitor, even if the rollout raises.
    record_env.close()



: 