<a href="https://colab.research.google.com/github/abyaadrafid/Representation_Learning_RL/blob/main/Learn_Env_From_Representation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt install swig cmake libopenmpi-dev zlib1g-dev
!pip install stable-baselines[mpi]==2.10.0 box2d box2d-kengz

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libopenmpi-dev is already the newest version (2.1.1-8).
cmake is already the newest version (3.10.2-1ubuntu2.18.04.2).
zlib1g-dev is already the newest version (1:1.2.11.dfsg-0ubuntu2.1).
zlib1g-dev set to manually installed.
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  swig3.0
Suggested packages:
  swig-doc swig-examples swig3.0-examples swig3.0-doc
The following NEW packages will be installed:
  swig swig3.0
0 upgraded, 2 newly installed, 0 to remove and 49 not upgraded.
Need to get 1,100 kB of archives.
After this operation, 5,822 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig3.0 amd64 3.0.12-1 [1,094 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig amd64 3.0.12-1 [

In [2]:
# Mount Google Drive at /content/drive; the pretrained DQN weights used below
# are read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import gym
from gym.spaces import Discrete
import torch
from collections import deque, defaultdict, namedtuple
import random
import numpy as np
import torch.nn as nn
from torch.optim import Adam
from tqdm.notebook import tqdm
import torch.nn.functional as F

In [4]:
# Run-wide hyperparameters and settings.
MAX_EPISODES = 1000      # capacity of the collector's episode memory
MAX_EPISODE_LEN = 100    # maximum number of steps stored per episode
BATCH_SIZE = 15
EMBEDDING_SIZE = 128     # width of the state / action embeddings
SEED = 0
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Seed every RNG the notebook draws from (episode sampling uses `random`,
# memory replacement uses `np.random`, weight initialisation uses torch) so
# that a fresh "Restart & Run All" is repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [5]:
# Build the environment and seed it from the shared SEED constant instead of a
# duplicated literal, so changing SEED in the config cell is enough.
env = gym.make('LunarLander-v2')
env.seed(SEED)
print(env.action_space)
print(env.observation_space)

Discrete(4)
Box(-inf, inf, (8,), float32)


In [6]:
class DQN(nn.Module):
  """Fully connected Q-network: state -> fc1 -> fc2 -> one value per action."""

  def __init__(self, state_size, fc1_size, fc2_size, action_size, seed):
    super().__init__()
    # manual_seed reseeds the global torch RNG and returns its generator,
    # which is kept on the instance.
    self.seed = torch.manual_seed(seed)

    # Layers are created in input-to-output order, then chained with ReLUs.
    first_hidden = nn.Linear(state_size, fc1_size)
    second_hidden = nn.Linear(fc1_size, fc2_size)
    q_head = nn.Linear(fc2_size, action_size)
    self.layers = nn.Sequential(first_hidden, nn.ReLU(), second_hidden, nn.ReLU(), q_head)

  def forward(self, x):
    """Return the network output for the input batch `x`."""
    q_values = self.layers(x)
    return q_values

In [7]:
class RandomAgent() :
  """Agent that ignores observations and samples uniformly from the action space."""

  def __init__(self, seed : int, action_space : Discrete) :
    """
    seed         : value used to seed the action space's sampler.
    action_space : space whose `sample()` supplies the actions.
    """
    self.seed = seed
    self.action_space = action_space
    # The seed used to be stored but never applied, so sampled actions were
    # not reproducible; seed the space's own RNG.
    self.action_space.seed(seed)

  def act(self, observation = None) :
    """Return a random action; `observation` is accepted only for interface parity."""
    return self.action_space.sample()

In [8]:
class TrainedAgent(nn.Module):
  """Greedy policy backed by a DQN whose weights are loaded from `path`."""

  def __init__(self, path, state_size = env.observation_space.shape[0], fc1_size = 128, fc2_size = 256, action_size = env.action_space.n):
    """
    path        : file containing a state_dict compatible with the DQN built here.
    state_size  : observation dimensionality (defaults to the global env's).
    fc1_size    : width of the first hidden layer.
    fc2_size    : width of the second hidden layer.
    action_size : number of discrete actions (defaults to the global env's).
    """
    super(TrainedAgent, self).__init__()
    self.network = DQN(state_size, fc1_size, fc2_size, action_size , 0)
    self._load_weights(path)

  def _load_weights(self, path):
    """Load the checkpoint into the network and switch it to eval mode."""
    # load_state_dict copies values into the existing parameters, so loading
    # onto the CPU is sufficient whatever device the checkpoint was saved from.
    state_dict = torch.load(path, map_location=torch.device('cpu'))
    self.network.load_state_dict(state_dict)
    self.network.eval()

  def act(self, state):
    """Return the index of the highest-valued action for `state`.

    `state` may be a numpy array or a tensor; it is converted to the
    network's own dtype and device so the agent keeps working after
    `.to(device)` has moved the network.
    """
    reference = next(self.network.parameters())
    state = torch.as_tensor(state, dtype=reference.dtype, device=reference.device)
    # Action selection needs no gradients; skip building the autograd graph.
    with torch.no_grad():
      q_values = self.network(state)
    return np.argmax(q_values.cpu().numpy())

In [9]:
# Settings consumed by ExperienceCollector via config.get(...).
collector_config = {
    "seed" : SEED,  # shared run seed instead of a duplicated literal
    "env" : env,
    "agent" : "trained",  # "random" or "trained"
    "agent_weights_path" : "/content/drive/MyDrive/dqn_weights.pt",
    "max_episodes" : MAX_EPISODES,
    "max_episode_len" : MAX_EPISODE_LEN,
    "action_space" : env.action_space,
}

In [10]:
class ExperienceCollector():
  """Runs an agent in an environment and stores whole episodes for later sampling.

  Each stored episode is a list of [state, action, reward, next_state, done]
  steps. `sample()` returns one episode as tensors padded to a fixed length.
  """

  def __init__(self, config : dict):
    """Read settings from `config`.

    Keys: "seed", "env", "agent" ("random" or "trained"), "max_episode_len",
    "max_episodes" (None means unbounded memory), "action_space",
    "agent_weights_path".
    """
    self.seed = config.get("seed",0)
    self.env = config.get("env")
    self.agent_type = config.get("agent", "random")
    self.max_episode_len = config.get("max_episode_len", 300)
    self.max_episodes = config.get("max_episodes")
    self.action_space = config.get("action_space", Discrete(4))
    self.memory = []
    self.agent_weights = config.get("agent_weights_path", None)

    self.agent = self._make_agent()
    self.current_episode = 0

  def __len__(self):
    """Number of episodes currently held in memory."""
    return len(self.memory)

  def _make_agent(self):
    """Instantiate the agent selected by `agent_type`.

    Raises ValueError for an unknown type instead of silently returning None,
    which would only surface later as an AttributeError inside collect().
    """
    if self.agent_type == "random" :
      return RandomAgent(self.seed, self.action_space)
    if self.agent_type == "trained" :
      return TrainedAgent(self.agent_weights)
    raise ValueError(f'Unknown agent type: {self.agent_type!r}')

  def add_episode(self, episode):
    """Store `episode`; once the memory is full, overwrite a random slot."""
    # A missing "max_episodes" (None) is treated as "no limit" rather than
    # crashing on the comparison.
    memory_full = self.max_episodes is not None and self.current_episode >= self.max_episodes
    if memory_full :
      index = np.random.randint(0, len(self.memory))
      self.memory[index] = episode
    else :
      self.memory.append(episode)
    self.current_episode +=1

  def sample(self):
    """Return one uniformly chosen episode as padded tensors on `device`.

    Returns (states, actions, rewards, next_states, dones), each with leading
    dimension `max_episode_len`. Steps past the end of the episode are padded
    with zero states/rewards, action -1 and done 1.

    Raises ValueError when the memory is empty.
    """
    if not self.memory :
      raise ValueError('Cannot sample from an empty memory; call collect() first')

    # Size the buffers from this collector's own settings so they always
    # agree with what collect() stored, whatever the notebook globals are.
    max_len = self.max_episode_len
    state_size = self.env.observation_space.shape[0]

    states = torch.zeros((max_len, state_size))
    actions = torch.full((max_len, ), fill_value = -1, dtype = torch.float32)
    rewards = torch.zeros((max_len))
    next_states = torch.zeros((max_len, state_size))
    dones = torch.ones((max_len), dtype = torch.float32)

    # Truncate defensively in case a longer episode was added by hand.
    episode = random.choice(self.memory)[:max_len]

    for index, (state, action, reward, next_state, done) in enumerate(episode) :
      states[index] = torch.as_tensor(state, dtype = torch.float32)
      actions[index] = action
      rewards[index] = reward
      next_states[index] = torch.as_tensor(next_state, dtype = torch.float32)
      dones[index] = done

    return states.to(device), actions.to(device), rewards.to(device), next_states.to(device), dones.to(device)


  def collect(self, num_episodes : int = 0, verbose = False) :
    """Roll out `num_episodes` episodes with the agent and store each one.

    An episode ends when the environment reports done or after
    `max_episode_len` steps, whichever comes first.
    """
    for _ in range(num_episodes) :
      trajectory = []
      state = self.env.reset()
      done = False

      while not done and len(trajectory) < self.max_episode_len :
        action = self.agent.act(state)
        next_state, reward, done, _info = self.env.step(action)

        # done is stored as 0/1 so it can be written into a float tensor.
        trajectory.append([state, action, reward, next_state, int(done)])
        state = next_state

      self.add_episode(trajectory)
    if verbose :
      print(f'{num_episodes} episodes added to memory')

In [11]:
# Build the collector from the config above and pre-fill its memory with
# 100 episodes played by the configured agent.
collector = ExperienceCollector(collector_config)
collector.collect(100, verbose = True)

100 episodes added to memory


In [12]:
# Number of episodes currently stored in the collector's memory.
len(collector)

100

In [13]:
# Smoke test: draw one padded episode; the result is discarded.
_ = collector.sample()

In [14]:
class StateEncoder(nn.Module) :
  """Two-layer MLP mapping a state vector to an embedding of size `embedding_size`."""

  def __init__(self, state_size : int, embedding_size : int, fc1_size : int = 64):
    super().__init__()
    input_layer = nn.Linear(state_size, fc1_size)
    output_layer = nn.Linear(fc1_size, embedding_size)
    self.state_enc = nn.Sequential(input_layer, nn.ReLU(), output_layer)

  def forward(self, state):
    """Return the embedding of `state`."""
    embedding = self.state_enc(state)
    return embedding

In [15]:
class ActionEncoder(nn.Module) :
  """Two-layer MLP that embeds a scalar action value into `embedding_size` features."""

  def __init__(self, embedding_size,fc1_size :int = 16) :
    super().__init__()
    input_layer = nn.Linear(1, fc1_size)
    output_layer = nn.Linear(fc1_size, embedding_size)
    self.action_enc = nn.Sequential(input_layer, nn.ReLU(), output_layer)

  def forward(self, action) :
    """Embed `action`; a trailing feature axis of size 1 is appended first."""
    action_column = action.unsqueeze(-1)
    return self.action_enc(action_column)


In [16]:
class StateDecoder(nn.Module) :
  """Two-layer MLP mapping an embedding back to a vector of size `state_size`."""

  def __init__(self, state_size : int, embedding_size : int, fc1_size : int = 64):
    super().__init__()
    input_layer = nn.Linear(embedding_size, fc1_size)
    output_layer = nn.Linear(fc1_size, state_size)
    self.state_dec = nn.Sequential(input_layer, nn.ReLU(), output_layer)

  def forward(self, state):
    """Decode the embedding passed as `state`."""
    reconstruction = self.state_dec(state)
    return reconstruction

In [17]:
class RewardModel(nn.Module) :
  """Two-layer MLP producing one scalar from an input of width 2 * embedding_size."""

  def __init__(self, embedding_size=16, fc1_size = 8):
    super().__init__()
    # The input width is doubled: WorldModel feeds this model the concatenated
    # state and action embeddings.
    input_layer = nn.Linear(embedding_size*2, fc1_size)
    output_layer = nn.Linear(fc1_size, 1)
    self.layers = nn.Sequential(input_layer, nn.ReLU(), output_layer)

  def forward(self, x):
    """Return the scalar prediction for each row of `x` (last dimension 1)."""
    prediction = self.layers(x)
    return prediction

In [18]:
class DonePredictor(nn.Module) :
  """Two-layer MLP with a sigmoid output, giving one value in (0, 1) per input row."""

  def __init__(self, embedding_size=16, fc1_size = 8):
    super().__init__()
    input_layer = nn.Linear(embedding_size, fc1_size)
    output_layer = nn.Linear(fc1_size, 1)
    self.layers = nn.Sequential(input_layer, nn.ReLU(), output_layer, nn.Sigmoid())

  def forward(self, x):
    """Return the sigmoid-squashed prediction for each row of `x`."""
    probability = self.layers(x)
    return probability

In [19]:
class WorldModel(nn.Module) :
  """Recurrent model predicting next state, reward and done from embeddings.

  The state and action embeddings are concatenated and fed to a GRU. The GRU
  output drives the next-state and done heads; the reward head reads the
  concatenated embeddings directly.
  """

  def __init__(self, embedding_size, hidden_size= 16, state_size = env.observation_space.shape[0]) :
    """
    embedding_size : width of each of the state and action embeddings.
    hidden_size    : width of the GRU hidden state.
    state_size     : dimensionality of the predicted next state.
    """
    super(WorldModel, self).__init__()
    self.hidden_size = hidden_size
    self.gru = nn.GRU(input_size = embedding_size*2, hidden_size = hidden_size)
    self.state_predictor = nn.Sequential(
        nn.Linear(hidden_size, embedding_size),
        nn.ReLU(),
        nn.Linear(embedding_size, state_size)
    )
    self.reward_model = RewardModel(embedding_size)
    self.done_model = DonePredictor(hidden_size)
    self.init_hidden()

  def init_hidden(self):
    """Reset the recurrent state to zeros (call at the start of every episode)."""
    # Follow the module's own parameters rather than the notebook-global
    # `device`, so the model also works when it lives on a different device.
    reference = next(self.parameters())
    self.hidden = torch.zeros(1, self.hidden_size, device = reference.device, dtype = reference.dtype)

  def forward(self, encoded_states, actions):
    """Predict (next_state, reward, done) for a sequence of embedded steps.

    encoded_states, actions : embeddings concatenated along the last axis.
    Returns the predicted states plus rewards and dones with their trailing
    singleton dimension removed.
    """
    inputs = torch.cat([encoded_states, actions], dim = -1)
    # `.to()` on a Module does not move plain tensor attributes, so align the
    # stored hidden state with the inputs before using it.
    hidden = self.hidden.to(device = inputs.device, dtype = inputs.dtype)
    output, hidden = self.gru(inputs, hidden)
    # Keep only the values for the next call. Holding on to the graph would
    # grow memory over a long rollout and break a later backward() through an
    # already-freed graph. Gradients for this call's outputs are unaffected.
    self.hidden = hidden.detach()
    rewards = self.reward_model(inputs)
    dones = self.done_model(output)
    state = self.state_predictor(output)

    return state, rewards.squeeze(-1), dones.squeeze(-1)

In [20]:
# Encoder from raw observations to EMBEDDING_SIZE-wide embeddings, placed on `device`.
state_encoder = StateEncoder(env.observation_space.shape[0], EMBEDDING_SIZE).to(device)

In [21]:
# Decoder from EMBEDDING_SIZE-wide embeddings back to observation-sized vectors, placed on `device`.
state_decoder = StateDecoder(env.observation_space.shape[0], EMBEDDING_SIZE).to(device)

In [22]:
# Encoder from scalar action values to EMBEDDING_SIZE-wide embeddings, placed on `device`.
action_encoder = ActionEncoder(EMBEDDING_SIZE).to(device)

In [23]:
# Recurrent world model over the concatenated state/action embeddings, placed on `device`.
wm = WorldModel(EMBEDDING_SIZE, hidden_size = 16, state_size = env.observation_space.shape[0]).to(device)

In [24]:
# One independent MSE criterion per training target.
transition_loss, reward_loss, reconstruction_loss, done_loss = (nn.MSELoss() for _ in range(4))

# A single Adam optimizer over the parameters of all four trainable modules.
optimizer = Adam([
    parameter
    for module in (state_encoder, wm, action_encoder, state_decoder)
    for parameter in module.parameters()
])

In [25]:
import time
import statistics

In [26]:
def wm_learn_env(num_episodes, collect_every = 500, collect_number = 500, print_every = 50) :
  """Train the encoders, decoder and world model on episodes drawn from `collector`.

  Each iteration samples one padded episode and takes one optimizer step on
  the sum of four MSE terms: next-state prediction, reward prediction, state
  reconstruction and done prediction.

  num_episodes   : number of training iterations (one sampled episode each).
  collect_every  : collect fresh episodes whenever episode % collect_every == 0
                   (this includes iteration 0).
  collect_number : how many episodes to collect each time.
  print_every    : print running average losses whenever episode % print_every == 0.

  Returns five lists of per-iteration values:
  (transition, reward, reconstruction, done, total).
  """
  t_losses, r_losses, rec_losses, d_losses, losses = [], [], [], [], []

  for episode in tqdm(range(num_episodes)) :
    # Clear gradients through the optimizer so every optimized module is
    # reset. Zeroing the modules one by one previously skipped state_decoder,
    # whose gradients then accumulated from one iteration to the next.
    optimizer.zero_grad()
    wm.init_hidden()
    # sample() always returns a 5-tuple of tensors (or raises), so the former
    # `states is None` early return was dead code and has been removed.
    states, actions, rewards, next_states, dones = collector.sample()

    encoded_states = state_encoder(states)
    decoded_states = state_decoder(encoded_states)
    encoded_actions = action_encoder(actions)

    predicted_next_states, predicted_rewards, predicted_dones = wm(encoded_states, encoded_actions)

    t_loss = transition_loss(predicted_next_states, next_states)
    r_loss = reward_loss(predicted_rewards, rewards)
    rec_loss = reconstruction_loss(states, decoded_states)
    d_loss = done_loss(dones, predicted_dones)
    loss = t_loss + r_loss +rec_loss + d_loss

    loss.backward()
    optimizer.step()

    losses.append(loss.item())
    t_losses.append(t_loss.item())
    r_losses.append(r_loss.item())
    rec_losses.append(rec_loss.item())
    d_losses.append(d_loss.item())

    if episode % print_every == 0 :
      print(f'Episode {episode} :\n\
      Avg Transition_loss : {statistics.mean(t_losses)} \n\
      Avg Reward_loss : {statistics.mean(r_losses)}\n\
      Avg Reconstruction_loss : {statistics.mean(rec_losses)}\n\
      Avg End loss : {statistics.mean(d_losses)}')

    if episode % collect_every == 0 :
      collector.collect(collect_number, verbose=False)

  return t_losses, r_losses, rec_losses, d_losses,losses

In [None]:
# Run 100000 training iterations, printing running average losses every 5000;
# the five returned lists hold the per-iteration loss histories.
t_loss, r_loss, rec_loss, e_loss,loss = wm_learn_env(100000, print_every=5000)

  0%|          | 0/100000 [00:00<?, ?it/s]

Episode 0 :
      Avg Transition_loss : 1.7648247480392456 
      Avg Reward_loss : 183.76171875
      Avg Reconstruction_loss : 1.6672707796096802
      Avg End loss : 0.21731612086296082


In [None]:
# Mean of the total loss over all training iterations.
statistics.mean(loss)

In [None]:
# Load the pretrained DQN policy from Drive and move it to `device`.
agent = TrainedAgent('/content/drive/MyDrive/dqn_weights.pt').to(device)

In [None]:
def run_episode(state, agent, num_steps =300, done_threshold = 0.5):
  """Roll out an imagined episode inside the learned world model.

  Starting from `state`, the agent picks an action, the world model predicts
  the next state, reward and done probability, and the predicted state
  becomes the next input.

  state          : initial state tensor on `device`, in the form state_encoder accepts.
  agent          : object whose `act(state)` returns an action.
  num_steps      : maximum number of imagined steps.
  done_threshold : the rollout stops once the predicted done probability exceeds this.

  Returns the sum of predicted rewards over the steps taken.
  """
  total_reward = 0.0
  # Start every rollout from a fresh recurrent state, as training does.
  wm.init_hidden()

  # Pure inference: no autograd graph is needed across the rollout.
  with torch.no_grad() :
    for _ in tqdm(range(num_steps)) :
      action = agent.act(state)
      action = torch.tensor(action, dtype=torch.float32).unsqueeze(0).to(device)
      encoded_action = action_encoder(action)
      encoded_state = state_encoder(state)

      # Argument order must match WorldModel.forward(encoded_states, actions);
      # it was previously swapped, feeding the model inputs laid out
      # differently from training.
      next_state, reward, done = wm(encoded_state, encoded_action)
      state = next_state.clone().detach()

      total_reward += reward.detach().item()
      # `done` is a sigmoid probability. argmax over a single value is always
      # 0, so the old check could never end the episode; threshold it instead.
      if bool((done > done_threshold).any()) : break

  return total_reward


In [None]:
# Imagined rollout from a real initial observation, reshaped to one row and moved to `device`.
run_episode(torch.Tensor(env.reset()).reshape(1,-1).to(device), agent)