# Download dependencies and import libraries

In [20]:
!pip install gym > /dev/null 2>&1

!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1

In [21]:
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1



In [22]:
!pip3 install box2d-py
!pip3 install gym[Box_2D]



In [23]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only

import torch
import torch.nn as nn
import torch.nn.functional as F 
from torch import optim
import numpy as np
import pandas as pd

import seaborn as sns
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay
from IPython.display import clear_output
from pathlib import Path

import random, os.path, math, glob, csv, base64, itertools, sys
from pprint import pprint

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import io
from IPython.display import HTML

from torch.utils.tensorboard import SummaryWriter

from pyvirtualdisplay import Display


# Useful functions

In [24]:
def show_video(directory):
  """
  Visualize the environments.
  ---
  INPUTS
    directory: (str) Ppath for the save directory.
  """
  html = []
  for mp4 in Path(directory).glob("*.mp4"):
      video_b64 = base64.b64encode(mp4.read_bytes())
      html.append('''<video alt="{}" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>'''.format(mp4, video_b64.decode('ascii')))
  ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(html)))

def display_env(env_name = 'LunarLander-v2', savePath = "./gym-results"):
  """
  Display one episode of gym environment.
  ---
  INPUTS
    env_name: (str) environment name as prescribed in gym library.
    savePath: (str) path to save the video.
  """
  # load env.
  env = gym.make(env_name)
  # wrap env in order to save our experiment on a file.
  env = Monitor(env, savePath, force=True, video_callable=lambda episode: True)

  done = False
  obs = env.reset()
  while not done:
      action = env.action_space.sample()
      obs, reward, done, info = env.step(action)
  env.close()
  show_video(savePath)
  
def fix_seed(seed):
  """
  Fix the ramdom seed.
  ---
  INPUTS
    seed: (int) the random seed.
  """
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

# Basic Setups

In [25]:
# prepare the visualisation window
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7f4da1ee84d0>

In [26]:
# fix the seed
fix_seed(0)

In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [29]:
display_env(env_name = 'LunarLander-v2', savePath = "./gym-results")

In [None]:
import urllib.request
urllib.request.urlretrieve('http://www.atarimania.com/roms/Roms.rar','Roms.rar')
!pip install unrar
!unrar x Roms.rar
!mkdir rars
!mv HC\ ROMS.zip   rars
!mv ROMS.zip  rars
!python -m atari_py.import_roms rars

# Network

In [30]:
class DQN(nn.Module):
  def __init__(self, input_dim, hid1_dim, hid2_dim, n_actions):
    super(DQN, self).__init__()
    """
    The DQN network Class.
    ---
    INPUTS:
      input_dim: (int) the state dimensionss.
      hid1_dim: (int) the first hidden dimension.
      hid2_dim: (int) the second hidden dimension.
      n_actions: (int) the action dimensions.
    """
    self.input_dim = input_dim
    self.hid1_dim = hid1_dim
    self.hid2_dim = hid2_dim
    self.n_actions = n_actions
    self.net = nn.Sequential(
            nn.Linear(self.input_dim, self.hid1_dim),
            nn.ReLU(),
            nn.Linear(self.hid1_dim, self.hid2_dim),
            nn.ReLU(),
            nn.Linear(self.hid2_dim, self.n_actions)
        )

  def forward(self, state):
    return self.net(state.float())

# Replay Buffer

In [31]:
class ReplayBuffer:
  def __init__(self,buffer_size,state_dims):
    """
    The Replay Buffer Class. 
    First in first out buffer, used as memory.
    Divided into 5 different buffers:
      1- state_buffer: (numpy.ndarray) the current states buffer.
      2- next_state_buffer: (numpy.ndarray) the next states buffer.
      3- action_buffer: (numpy.ndarray) the actions buffer.
      4- reward_buffer: (numpy.ndarray) the reward buffer.
      5- termination_buffer: (numpy.ndarray) the termination buffer. Used to track with end of episods actions.
    ---
    INPUTS
      buffer_size: (int) the buffer max size.
      state_dims: (int) the state dimensions.
    """
    self.state_dims = state_dims
    self.buffer_size = buffer_size
    # counter to keep track of the buffer index
    self.counter = 0
    # initialise the buffers
    self.state_buffer = np.zeros((buffer_size,*self.state_dims), dtype=np.float32)
    self.next_state_buffer = np.zeros((buffer_size,*self.state_dims), dtype=np.float32)
    self.action_buffer = np.zeros(buffer_size, dtype=np.float32)
    self.reward_buffer = np.zeros(buffer_size, dtype=np.float32)
    self.termination_buffer = np.zeros(buffer_size, dtype=np.bool)

  def push(self, state, action, reward, next_state, done):
    """
    Push state, action, reward, next_state, done into the reply buffer
    ---
    state, action, reward, next_state, done: (numpy.ndarray) 
    """
    # mode buffer_size to reset the index when exceed the max size.
    index = self.counter % self.buffer_size
    # fill the buffers
    self.state_buffer[index] = state
    self.next_state_buffer[index] = next_state
    self.action_buffer[index] = action
    self.reward_buffer[index] = reward
    self.termination_buffer[index] = done
    # increase the conter
    self.counter += 1

  def sample(self,batch_size, device):
    """
    Sample batch of inputs.
    ---
    batch_size: (int) the batch size.
    device: (str) the available device.
    """
    filled_buffer = min(self.buffer_size, self.counter)
    batch_idx = np.random.choice(filled_buffer, batch_size, replace=False)

    # convert to tensor
    state = torch.tensor(self.state_buffer[batch_idx]).to(device)
    actions = torch.tensor(self.action_buffer[batch_idx]).to(device)
    rewards = torch.tensor(self.reward_buffer[batch_idx]).to(device)
    next_states = torch.tensor(self.next_state_buffer[batch_idx]).to(device)
    terminations = torch.tensor(self.termination_buffer[batch_idx]).to(device)

    return state, actions, rewards, next_states, terminations

  def __len__(self):
    return self.buffer_size

# Agent

In [32]:
class Agent:
  def __init__(self, env_name, gamma=0.99, epsilon=1, epsilon_min=0.01, epsilon_decrement=0.001, learning_rate=0.0001, batch_size=128,
               n_episodes = 700, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128, path=None, tb_path=None, device = 'cpu', printLog = False ):
    """
    The Agent Clsss.
    ---
    INPUTS:
      env_name: (str) the environment name.
      gamma: (float) bellman equation constant.
      epsilon: (float) epsilon greedy statring value.
      epsilon_min: (float) min epsilon value.
      epsilon_decrement: (float) epsilon decrement.
      learning_rate: (float) learning rate.
      batch_size: (int) batch size.
      n_episodes: (int) number of games.
      n_steps: (int) max number of steps per episode.
      buffer_size: (int) max buffer size.
      hid1_dim: (int) first hidden dimension.
      hid2_dim: (int) second hidden dimension.
      path: (str) path for the saving directory.
      tb_path: (str) path for the tensorboard directory.
      device: (str) the available device.
      printLog: (bool) true to print the log while training.

    """
    self.device = device
    self.env_name = env_name
    self.env = gym.make(self.env_name)
    self.gamma = gamma
    self.epsilon = epsilon
    self.epsilon_min = epsilon_min
    self.epsilon_decrement = epsilon_decrement
    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.n_episodes = n_episodes
    self.n_steps = n_steps
    self.buffer_size = buffer_size
    self.hid1_dim = hid1_dim
    self.hid2_dim = hid2_dim
    self.state_dims = self.env.observation_space.shape[0]
    self.path = path
    self.tb_path = tb_path
    self.printLog = printLog
    # inialise tensorboard writer
    self.sw = SummaryWriter(self.tb_path)
    # initialise the buffer
    self.buffer = ReplayBuffer(self.buffer_size,[self.state_dims])
    # initialise the networks
    self.dqn = DQN(self.state_dims, self.hid1_dim, self.hid2_dim, self.env.action_space.n).to(self.device)
    self.target_net = DQN(self.state_dims, self.hid1_dim, self.hid2_dim, self.env.action_space.n).to(self.device)
    self.target_net.load_state_dict(self.dqn.state_dict())
    self.target_net.eval()
    # initialise the loss
    self.loss_fn = torch.nn.MSELoss()
    self.optimizer = torch.optim.Adam(self.dqn.parameters(), lr=self.learning_rate)
    # list to save the reward per epoch
    self.rewards = [] 
    self.periodic_reward = 0

  def epsilon_greedy_action(self, state, env):
    """
    Epsilon greedy function.
    explore or learn depending, depend on epsilon.
    ---
    INPUT
      state: the current state.
      env: the environment.
    OUTPUT
      action: the current action.
    """
    explore = np.random.uniform() < self.epsilon
    if explore:
      action = env.action_space.sample()
    else:
      state = torch.tensor([ state ]).to(self.device)
      actions = self.dqn.forward(state)
      action = torch.argmax(actions).item()
    return action
    
  def step(self, state, env):
    """
    One step of gradient update.
    ---
    INPUTS
      state: the current state.
      env: the environment.
    OUTPUTS
      next_state: next state.
      env: the environment.
      done: (bool) true if the episode ended.

    """
    # do one env step and add the results to the memory
    self.dqn.eval()
    action = self.epsilon_greedy_action(state, env)
    next_state, reward, done, _ = env.step(action)
    self.buffer.push(state,action,reward,next_state,done)
    self.dqn.train()
    # calculate the accomulated reward per episode
    self.periodic_reward += reward

    # clear the gradient graph  
    self.optimizer.zero_grad()
    # sample one batch form the memory
    state_batch, action_batch, reward_batch, next_state_batch, termination_batch = self.buffer.sample(self.batch_size, self.device)
    state_batch, action_batch, reward_batch, next_state_batch, termination_batch = state_batch, action_batch, reward_batch, next_state_batch, termination_batch
    # one step of bellman equation
    q_eval = self.dqn(state_batch)[np.arange(self.batch_size),action_batch.long()] 
    q_next = self.target_net(next_state_batch).detach()
    q_next[termination_batch] = 0.0 # for terminal y = r
    target = reward_batch + self.gamma * q_next.max(dim=1)[0] 
    # calculate the loss
    loss = self.loss_fn(target,q_eval).to(self.device)
    loss.backward()
    # update the parameters
    self.optimizer.step() 

    # decrease epsilon or keep the minimum value
    if self.epsilon > self.epsilon_min:
      self.epsilon = self.epsilon - self.epsilon_decrement
    else: 
      self.epsilon = self.epsilon_min

    return next_state, env, done

  def episode(self, env):
    """
    One episode. And update the target network.
    ---
    INPUT
      env: the environment.
    """
    # reset the environment
    state = env.reset()
    done = False
    step = 0
  
    while not done and step < self.n_steps:
      state, env, done = self.step(state, env)
      step += 1

    # update the targer network
    self.target_net.load_state_dict(self.dqn.state_dict())


  def train(self):
    """
    The training function.
    """
    # make sure the size of the memory if larger than the batch size.
    state = self.env.reset()
    for _ in range(self.batch_size):
      self.dqn.eval()
      action = self.env.action_space.sample() #self.epsilon_greedy_action(state)
      next_state, reward, done, _ = self.env.step(action)
      self.buffer.push(state,action,reward,next_state,done)
      state = next_state

    # training loop
    for i in range(self.n_episodes):
      save_path = self.path + "/#" + str(i)
      self.env.reset() #gym.make(self.env_name)
      env = self.env
      if (i % 100) == 0:
        env = Monitor(env, save_path, force=True, video_callable=lambda episode: True)
      self.episode(env)
      if ((i + 1) % 100 == 0) and self.printLog:
        avg_reward = np.mean(self.rewards[-50:])
        print('EPISODE ', i+1 , 'reward %.2f' % self.periodic_reward, 'average reward %0.2f'% avg_reward, 'epssilon %0.2f' % self.epsilon)
        self.env.close()
        ### show_video(save_path)
        print()
      # update tensor board
      self.rewards.append(self.periodic_reward)
      self.sw.add_scalar('periodic reward per episode', self.periodic_reward, i)
      avg_reward = np.mean(self.rewards[-50:])
      self.sw.add_scalar('average reward per episode', avg_reward, i)
      self.sw.add_scalar('ebs_history per episode', self.epsilon, i)
      self.periodic_reward = 0


# Acrobot-v1

In [None]:
%load_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/Acrobot-v1/tb_DQN

In [None]:
!mkdir /content/drive/MyDrive/Acrobot-v1
!mkdir /content/drive/MyDrive/Acrobot-v1/log_DQN
!mkdir /content/drive/MyDrive/Acrobot-v1/tb_DQN

In [None]:
agent1 = Agent(env_name="Acrobot-v1", path="/content/drive/MyDrive/Acrobot-v1/log_DQN", tb_path="/content/drive/MyDrive/Acrobot-v1/tb_DQN")
agent1.train()

# MountainCar-v0

In [None]:
%load_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/MountainCar-v0/tb_DQN

In [None]:
!mkdir /content/drive/MyDrive/MountainCar-v0
!mkdir /content/drive/MyDrive/MountainCar-v0/log_DQN
!mkdir /content/drive/MyDrive/MountainCar-v0/tb_DQN

agent1 = Agent(env_name="MountainCar-v0", path="/content/drive/MyDrive/MountainCar-v0/log_DQN", tb_path="/content/drive/MyDrive/MountainCar-v0/tb_DQN")
agent1.train()

# Pong-ram-v0

In [None]:
!kill 476
%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/Pong-ram-v0/tb_DQN

In [None]:
!mkdir /content/drive/MyDrive/Pong-ram-v0
!mkdir /content/drive/MyDrive/Pong-ram-v0/log_DQN
!mkdir /content/drive/MyDrive/Pong-ram-v0/tb_DQN

agent1 = Agent(env_name="Pong-ram-v0", path="/content/drive/MyDrive/Pong-ram-v0/log_DQN", tb_path="/content/drive/MyDrive/MountainCar-v0/tb_DQN")
agent1.train()

# pong-ram-v0_exp2


In [None]:
# !rm -r /content/drive/MyDrive/Pong-ram-v0_exp2/tb_DQN/*

In [None]:
%load_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/Pong-ram-v0_exp2/tb_DQN

In [None]:
# !mkdir /content/drive/MyDrive/Pong-ram-v0_exp2
# !mkdir /content/drive/MyDrive/Pong-ram-v0_exp2/log_DQN
# !mkdir /content/drive/MyDrive/Pong-ram-v0_exp2/tb_DQN

agent1 = Agent(env_name="Pong-ram-v0", epsilon_decrement=0.000001, learning_rate=0.0001, 
               batch_size=128,n_episodes = 1000, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
               path="/content/drive/MyDrive/Pong-ram-v0_exp2/log_DQN", tb_path="/content/drive/MyDrive/Pong-ram-v0_exp2/tb_DQN")
agent1.train()

KeyboardInterrupt: ignored

# pong-ram-v0_exp3

In [None]:
# learning rate scheduler, learning_rate=0.0001, 4 actions per step, decrease epsilone after the episode not the step

In [None]:
%load_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/Pong-ram-v0_exp3/tb_DQN

In [None]:
# !rm -r /content/drive/MyDrive/Pong-ram-v0_exp3/tb_DQN/*

In [None]:
# !mkdir /content/drive/MyDrive/Pong-ram-v0_exp3
# !mkdir /content/drive/MyDrive/Pong-ram-v0_exp3/log_DQN
# !mkdir /content/drive/MyDrive/Pong-ram-v0_exp3/tb_DQN

agent1 = Agent(env_name="Pong-ram-v0", epsilon_decrement=0.0011, learning_rate=0.0001, 
               batch_size=128,n_episodes = 1000, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
               path="/content/drive/MyDrive/Pong-ram-v0_exp3/log_DQN", tb_path="/content/drive/MyDrive/Pong-ram-v0_exp3/tb_DQN")
agent1.train()

# pong-ram-v0_exp4

In [None]:
# No learnin rate scheduler, learning_rate=0.0005, 1 actions per step, decrease epsilone the step with epsilon_decrement=0.0000005

In [None]:
# !rm -r /content/drive/MyDrive/Pong-ram-v0_exp4/tb_DQN/*

In [None]:
%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/Pong-ram-v0_exp4/tb_DQN

In [None]:
# !mkdir /content/drive/MyDrive/Pong-ram-v0_exp4
# !mkdir /content/drive/MyDrive/Pong-ram-v0_exp4/log_DQN
# !mkdir /content/drive/MyDrive/Pong-ram-v0_exp4/tb_DQN

agent1 = Agent(env_name="Pong-ram-v0", epsilon_decrement=0.0000005, learning_rate=0.0005, 
               batch_size=128,n_episodes = 10000, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
               path="/content/drive/MyDrive/Pong-ram-v0_exp4/log_DQN", tb_path="/content/drive/MyDrive/Pong-ram-v0_exp4/tb_DQN")
agent1.train()

KeyboardInterrupt: ignored

# pong-ram-v0_exp5

In [None]:
# No learnin rate scheduler, learning_rate=0.0003, 1 actions per step, decrease epsilone the step with epsilon_decrement=0.0000005

In [None]:
%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/Pong-ram-v0_exp5/tb_DQN

In [None]:
!mkdir /content/drive/MyDrive/Pong-ram-v0_exp5
!mkdir /content/drive/MyDrive/Pong-ram-v0_exp5/log_DQN
!mkdir /content/drive/MyDrive/Pong-ram-v0_exp5/tb_DQN

agent1 = Agent(env_name="Pong-ram-v0", epsilon_decrement=0.0000005, learning_rate=0.0003, 
               batch_size=128,n_episodes = 10000, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
               path="/content/drive/MyDrive/Pong-ram-v0_exp5/log_DQN", tb_path="/content/drive/MyDrive/Pong-ram-v0_exp5/tb_DQN")
agent1.train()

KeyboardInterrupt: ignored

# pong-ram-v0_exp6

In [None]:
# No learnin rate scheduler, learning_rate=0.0001, 1 actions per step, decrease epsilone the step with epsilon_decrement=0.0000005

In [None]:
%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/Pong-ram-v0_exp6/tb_DQN

In [None]:
!mkdir /content/drive/MyDrive/Pong-ram-v0_exp6
!mkdir /content/drive/MyDrive/Pong-ram-v0_exp6/log_DQN
!mkdir /content/drive/MyDrive/Pong-ram-v0_exp6/tb_DQN

agent1 = Agent(env_name="Pong-ram-v0", epsilon_decrement=0.0000005, learning_rate=0.0001, 
               batch_size=128,n_episodes = 10000, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
               path="/content/drive/MyDrive/Pong-ram-v0_exp6/log_DQN", tb_path="/content/drive/MyDrive/Pong-ram-v0_exp6/tb_DQN")
agent1.train()

KeyboardInterrupt: ignored

# pong-ram-v0_exp7

In [None]:
# No learnin rate scheduler, batch_size = 256, learning_rate=0.00001, 1 actions per step, decrease epsilone the step with epsilon_decrement=0.0000005

In [None]:
%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/Pong-ram-v0_exp7/tb_DQN

In [None]:
!mkdir /content/drive/MyDrive/Pong-ram-v0_exp7
!mkdir /content/drive/MyDrive/Pong-ram-v0_exp7/log_DQN
!mkdir /content/drive/MyDrive/Pong-ram-v0_exp7/tb_DQN

agent1 = Agent(env_name="Pong-ram-v0", epsilon_decrement=0.0000005, learning_rate=0.00001, 
               batch_size=256,n_episodes = 10000, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
               path="/content/drive/MyDrive/Pong-ram-v0_exp7/log_DQN", tb_path="/content/drive/MyDrive/Pong-ram-v0_exp7/tb_DQN")
agent1.train()

# pong-ram-v0_exp8

In [None]:
# No learnin rate scheduler, batch_size = 256, learning_rate=0.000015, 1 actions per step, decrease epsilone the step with epsilon_decrement=1.25e-7

In [None]:
%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/Pong-ram-v0_exp8/tb_DQN

In [None]:
!mkdir /content/drive/MyDrive/Pong-ram-v0_exp8
!mkdir /content/drive/MyDrive/Pong-ram-v0_exp8/log_DQN
!mkdir /content/drive/MyDrive/Pong-ram-v0_exp8/tb_DQN

agent1 = Agent(env_name="Pong-ram-v0", epsilon_decrement=1.25e-7, learning_rate=0.000015, 
               batch_size=256,n_episodes = 10000, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
               path="/content/drive/MyDrive/Pong-ram-v0_exp8/log_DQN", tb_path="/content/drive/MyDrive/Pong-ram-v0_exp8/tb_DQN")
agent1.train()

torch.save(agent1.state_dict(), "/content/drive/MyDrive/Pong-ram-v0_exp8")

mkdir: cannot create directory ‘/content/drive/MyDrive/Pong-ram-v0_exp8’: File exists
mkdir: cannot create directory ‘/content/drive/MyDrive/Pong-ram-v0_exp8/log_DQN’: File exists
mkdir: cannot create directory ‘/content/drive/MyDrive/Pong-ram-v0_exp8/tb_DQN’: File exists


In [None]:

torch.save(agent1.state_dict(), "/content/drive/MyDrive/Pong-ram-v0_exp8/agent.pth")

# pong-ram-v0_exp9

In [None]:
# !rm -r /content/drive/MyDrive/Pong-ram-v0_exp9/tb_DQN/*

In [None]:
!mkdir /content/drive/MyDrive/Pong-ram-v0_exp9
!mkdir /content/drive/MyDrive/Pong-ram-v0_exp9/log_DQN
!mkdir /content/drive/MyDrive/Pong-ram-v0_exp9/tb_DQN

In [None]:
%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/Pong-ram-v0_exp9/tb_DQN

In [None]:
agent1 = Agent(env_name="Pong-ram-v0", epsilon_min=0.02, epsilon_decrement=5e-8, learning_rate=0.000015, 
               batch_size=256,n_episodes = 30000, n_steps = 10000, buffer_size = 150000, hid1_dim=250, hid2_dim=128,  
               path="/content/drive/MyDrive/Pong-ram-v0_exp9/log_DQN", tb_path="/content/drive/MyDrive/Pong-ram-v0_exp9/tb_DQN")
agent1.train()

torch.save(agent1.state_dict(), "/content/drive/MyDrive/Pong-ram-v0_exp9")

# LurantLander discrete

In [None]:
!mkdir /content/drive/MyDrive/LunarLander-v2/lr
!mkdir /content/drive/MyDrive/LunarLander-v2/lr/0.0001
# !mkdir /content/drive/MyDrive/LunarLander-v2/batch
# !mkdir /content/drive/MyDrive/LunarLander-v2/lr/#1
# !mkdir /content/drive/MyDrive/LunarLander-v2/log_DQN
# !mkdir /content/drive/MyDrive/LunarLander-v2/tb_DQN

In [None]:
%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/LunarLander-v2/lr/0.0001


In [None]:
agent1 = Agent(env_name="LunarLander-v2", epsilon_decrement=5e-4 , learning_rate=0.003, 
               batch_size=64,n_episodes = 500, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
               path="", tb_path="/content/drive/MyDrive/LunarLander-v2/lr/0.0001")
agent1.train()

torch.save(agent1.state_dict(), "/content/drive/MyDrive/LunarLander-v2/lr/0.0001")

In [None]:

torch.save(agent1.target_net.state_dict(), "/content/drive/MyDrive/LunarLander-v2/lr/0.0001" + "/model.pt")

In [None]:
# !rm -r /content/drive/MyDrive/LunarLander-v2/lr/0.0001/*

In [None]:
# lr = 0.1

lr =  0.1
path = "/content/drive/MyDrive/LunarLander-v2/lr/" + str(lr)
!mkdir path

%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/LunarLander-v2/lr/{lr}

# agent1 = Agent(env_name="LunarLander-v2", epsilon_decrement=5e-4 , learning_rate=lr, 
#                batch_size=64,n_episodes = 500, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
#                path="", tb_path=path)
# agent1.train()

# torch.save(agent1.target_net.state_dict(), path + "/model.pt")

In [None]:
# lr = 0.01

lr =  0.01
path = "/content/drive/MyDrive/LunarLander-v2/lr/" + str(lr)
!mkdir path

%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/LunarLander-v2/lr/{lr}

# agent1 = Agent(env_name="LunarLander-v2", epsilon_decrement=5e-4 , learning_rate=lr, 
#                batch_size=64,n_episodes = 500, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
#                path="", tb_path=path)
# agent1.train()

# torch.save(agent1.target_net.state_dict(), path + "/model.pt")

In [None]:
# lr = 0.001

lr =  0.001
path = "/content/drive/MyDrive/LunarLander-v2/lr/" + str(lr)
!mkdir path

%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/LunarLander-v2/lr/{lr}

# agent1 = Agent(env_name="LunarLander-v2", epsilon_decrement=5e-4 , learning_rate=lr, 
#                batch_size=64,n_episodes = 500, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
#                path="", tb_path=path)
# agent1.train()

# torch.save(agent1.target_net.state_dict(), path + "/model.pt")

In [None]:
# lr = 0.0001

lr =  0.0001
path = "/content/drive/MyDrive/LunarLander-v2/lr/" + str(lr)
!mkdir path

%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/LunarLander-v2/lr/{lr}

# agent1 = Agent(env_name="LunarLander-v2", epsilon_decrement=5e-4 , learning_rate=lr, 
#                batch_size=64,n_episodes = 500, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
#                path="", tb_path=path)
# agent1.train()

# torch.save(agent1.target_net.state_dict(), path + "/model.pt")

In [None]:
# lr = 0.00001

lr =  0.00001
path = "/content/drive/MyDrive/LunarLander-v2/lr/" + str(lr)
!mkdir path

%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/LunarLander-v2/lr/{lr}

# agent1 = Agent(env_name="LunarLander-v2", epsilon_decrement=5e-4 , learning_rate=lr, 
#                batch_size=64,n_episodes = 500, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
#                path="", tb_path=path)
# agent1.train()

# torch.save(agent1.target_net.state_dict(), path + "/model.pt")

In [None]:
!mkdir /content/drive/MyDrive/LunarLander-v2/bz

In [None]:
# bz = 128

bz =  128
path = "/content/drive/MyDrive/LunarLander-v2/bz/" + str(bz)
!mkdir path

%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/LunarLander-v2/bz/{bz}

# agent1 = Agent(env_name="LunarLander-v2", epsilon_decrement=5e-4 , learning_rate=0.001, 
#                batch_size=bz,n_episodes = 500, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
#                path="", tb_path=path)
# agent1.train()

# torch.save(agent1.target_net.state_dict(), path + "/model.pt")

In [None]:
# bz = 256

bz =  256
path = "/content/drive/MyDrive/LunarLander-v2/bz/" + str(bz)
!mkdir path

%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/LunarLander-v2/bz/{bz}

# agent1 = Agent(env_name="LunarLander-v2", epsilon_decrement=5e-4 , learning_rate=0.001, 
#                batch_size=bz,n_episodes = 500, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
#                path="", tb_path=path)
# agent1.train()

# torch.save(agent1.target_net.state_dict(), path + "/model.pt")

In [None]:
# bz = 512

bz =  512
path = "/content/drive/MyDrive/LunarLander-v2/bz/" + str(bz)
!mkdir path

%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/LunarLander-v2/bz/{bz}

# agent1 = Agent(env_name="LunarLander-v2", epsilon_decrement=5e-4 , learning_rate=0.001, 
#                batch_size=bz,n_episodes = 500, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
#                path="", tb_path=path)
# agent1.train()

# torch.save(agent1.target_net.state_dict(), path + "/model.pt")

In [None]:
# bz = 32

bz =  32
path = "/content/drive/MyDrive/LunarLander-v2/bz/" + str(bz)
!mkdir path

%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/LunarLander-v2/bz/{bz}

# agent1 = Agent(env_name="LunarLander-v2", epsilon_decrement=5e-4 , learning_rate=0.001, 
#                batch_size=bz,n_episodes = 500, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
#                path="", tb_path=path)
# agent1.train()

# torch.save(agent1.target_net.state_dict(), path + "/model.pt")

In [None]:
!mkdir /content/drive/MyDrive/LunarLander-v2/ed

In [None]:
# ed = 5e-5

ed =  5e-5
path = "/content/drive/MyDrive/LunarLander-v2/ed/" + str(ed)
!mkdir path

%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/LunarLander-v2/ed/{ed}

# agent1 = Agent(env_name="LunarLander-v2", epsilon_decrement=ed , learning_rate=0.001, 
#                batch_size=128,n_episodes = 500, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
#                path="", tb_path=path)
# agent1.train()

# torch.save(agent1.target_net.state_dict(), path + "/model.pt")

In [None]:
# ed = 5e-6

ed =  5e-6
path = "/content/drive/MyDrive/LunarLander-v2/ed/" + str(ed)
!mkdir pathd

%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/LunarLander-v2/ed/{ed}

# agent1 = Agent(env_name="LunarLander-v2", epsilon_decrement=ed , learning_rate=0.001, 
#                batch_size=128,n_episodes = 500, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
#                path="", tb_path=path)
# agent1.train()

# torch.save(agent1.target_net.state_dict(), path + "/model.pt")

In [None]:
# ed = 5e-3

ed =  5e-3
path = "/content/drive/MyDrive/LunarLander-v2/ed/" + str(ed)
!mkdir path

%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/LunarLander-v2/ed/{ed}

# agent1 = Agent(env_name="LunarLander-v2", epsilon_decrement=ed , learning_rate=0.001, 
#                batch_size=128,n_episodes = 500, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
#                path="", tb_path=path)
# agent1.train()

# torch.save(agent1.target_net.state_dict(), path + "/model.pt")

In [None]:
# ed = 5e-2

ed =  5e-2
path = "/content/drive/MyDrive/LunarLander-v2/ed/" + str(ed)
!mkdir path

%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/LunarLander-v2/ed/{ed}

# agent1 = Agent(env_name="LunarLander-v2", epsilon_decrement=ed , learning_rate=0.001, 
#                batch_size=128,n_episodes = 500, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
#                path="", tb_path=path)
# agent1.train()

# torch.save(agent1.target_net.state_dict(), path + "/model.pt")

In [None]:
# !rm -r /content/drive/MyDrive/LunarLander-v2/ed/0.0005/*

In [None]:
ed = 5e-4

ed =  5e-4
path = "/content/drive/MyDrive/LunarLander-v2/ed/" + str(ed)
!mkdir pathd

%reload_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/LunarLander-v2/ed/{ed}

agent1 = Agent(env_name="LunarLander-v2", epsilon_decrement=ed , learning_rate=0.001, 
               batch_size=128,n_episodes = 500, n_steps = 5000, buffer_size = 100000, hid1_dim=200, hid2_dim=128,  
               path="", tb_path=path)
agent1.train()

torch.save(agent1.target_net.state_dict(), path + "/model.pt")

# Github 

In [17]:
!git clone https://github.com/ashrafhatim/reinforcement-learning-project-.git

Cloning into 'reinforcement-learning-project-'...
remote: Enumerating objects: 28, done.[K
remote: Counting objects: 100% (28/28), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 28 (delta 5), reused 26 (delta 5), pack-reused 0[K
Unpacking objects: 100% (28/28), done.


In [18]:
!python /content/reinforcement-learning-project-/src/discrete/main.py --displayEnv True --n-episodes 3 --env-name "LunarLander-v2" --path "../" --tb-path "../" --printLog True

hey everything is done!
