In [None]:
# @title Install dependencies
!sudo apt-get update > /dev/null 2>&1
!sudo apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!pip install rarfile --quiet
!pip install stable-baselines3[extra] ale-py==0.7.4 --quiet
!pip install box2d-py --quiet
!pip install gym pyvirtualdisplay --quiet

# Imports
import io
import os
import glob
import torch
import base64
import stable_baselines3

import numpy as np
import matplotlib.pyplot as plt

from stable_baselines3 import DQN
from stable_baselines3.common.results_plotter import ts2xy, load_results
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_atari_env

import gym
from gym import spaces
from gym.wrappers import Monitor,RecordVideo

print(gym.__version__)

# @title Plotting/Video functions
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

# Start an invisible 1400x900 virtual display (pyvirtualdisplay, backed by the
# xvfb package installed above) — presumably so environments can render
# frames on a machine without a physical screen.
# NOTE(review): the name `display` is re-bound later in this file by
# `from IPython import display`; keep another reference if stop() is needed.
display = Display(visible=0, size=(1400, 900))
display.start()

"""
Utility functions to enable video recording of gym environment
and displaying it.
To enable video, just do "env = wrap_env(env)""
"""

def show_video():
  """Display the first recording found under ``video/`` inside the notebook.

  Searches ``video/*.mp4``; when at least one file matches, the first match
  is read, base64-encoded and embedded in an HTML ``<video>`` element that
  is handed to IPython's display machinery. When nothing matches, the
  message "Could not find video" is printed instead.

  Returns:
    None.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return

  # Read-only binary mode inside a context manager: the original opened the
  # file read-write ('r+b') and never closed the handle.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))


def wrap_env(env):
  """Wrap *env* in gym's ``Monitor`` writing to ``./video`` and return it.

  ``force=True`` is forwarded to ``Monitor``. The ``./video`` directory is
  the one ``show_video`` searches for ``.mp4`` files.
  """
  # Alternative wrapper left by the author: RecordVideo(env, './video')
  return Monitor(env, './video', force=True)

class LoggingCallback:
    """Optuna study callback that stops the study once its best value plateaus.

    After every trial the study's current best value is compared with the
    best value recorded after the previous trial (stored in the study's user
    attributes under ``"previous_best_value"``). Each trial for which the
    best value moved by less than ``threshold`` is counted; when more than
    ``patience`` such trials have been seen, ``study.stop()`` is called.

    The count is cumulative over the whole study: it is not reset when the
    best value improves again.
    """

    def __init__(self, threshold, trial_number, patience):
        '''
        threshold: int tolerance for increase in reward
        trial_number: int Prune after minimum number of trials
        patience: int patience for the threshold
        '''
        self.threshold = threshold
        self.trial_number = trial_number
        self.patience = patience
        self.cb_list = []  # Trial numbers for which the threshold was reached

    # Annotations are strings so that defining the class does not require
    # `optuna` to be importable at that point.
    def __call__(self, study: "optuna.study.Study",
                 frozen_trial: "optuna.trial.FrozenTrial"):
        """Record the best value and stop the study if it has plateaued.

        study: the running study; its ``best_value``, ``user_attrs``,
            ``set_user_attr`` and ``stop`` are used.
        frozen_trial: the trial that just finished; its ``number`` and
            ``value`` are used.
        """
        try:
            current_best_value = study.best_value
        except ValueError:
            # Optuna raises ValueError while no trial has completed yet
            # (e.g. the first trials failed); nothing to compare against.
            return

        # Read the value stored by the previous invocation BEFORE overwriting
        # it. The original wrote first and read afterwards, so the two values
        # were always equal and every trial counted as "no improvement".
        previous_best_value = study.user_attrs.get("previous_best_value")
        study.set_user_attr("previous_best_value", current_best_value)

        # First invocation, or the minimum number of trials has not passed.
        if previous_best_value is None:
            return
        if frozen_trial.number <= self.trial_number:
            return

        # Only compare when previous and current best values share a sign.
        if previous_best_value * current_best_value < 0:
            return

        # Threshold condition: the best value barely moved.
        if abs(previous_best_value - current_best_value) >= self.threshold:
            return

        self.cb_list.append(frozen_trial.number)
        # Threshold reached more than `patience` times: stop the study.
        if len(self.cb_list) > self.patience:
            print('The study stops now...')
            print('With number', frozen_trial.number, 'and value ', frozen_trial.value)
            print('The previous and current best values are {} and {} respectively'
                  .format(previous_best_value, current_best_value))
            study.stop()

log_dir = "models"
os.makedirs(log_dir,exist_ok=True) 

def objective(trial: "optuna.Trial"):
  """Optuna objective: train one DQN on LunarLander-v2 and score it.

  A monitored ``LunarLander-v2`` environment is created, hyperparameters are
  drawn with ``sample_dqn_params(trial)``, a ``DQN("MlpPolicy", ...)`` model is
  trained and saved as ``dqn_<trial number>.pth`` inside ``log_dir``, and the
  episode rewards logged by the monitor in ``log_dir`` are summed.

  trial: the Optuna trial used to sample hyperparameters and to number the
      saved model file.

  Returns the sum of the logged episode rewards (0 when no episode finished).
  """
  # Create environment
  env = gym.make('LunarLander-v2')
  env = stable_baselines3.common.monitor.Monitor(env, log_dir)

  try:
    # Trial will suggest a set of hyperparameters from the specified range
    hyperparameters = sample_dqn_params(trial)
    # Set verbose to 1 in the hyperparameters to observe training logs.
    model_dqn = DQN("MlpPolicy", env, **hyperparameters)

    # define learning steps
    # NOTE(review): total_timesteps=1 looks like a smoke-test value (the
    # original comment mentions 100000); with a single step no episode can
    # finish, so the returned reward is 0. Confirm before a real search.
    trained_dqn = model_dqn.learn(total_timesteps=1, log_interval=10)

    # save model next to the monitor logs (log_dir instead of a second,
    # hard-coded copy of the directory name)
    trained_dqn.save(os.path.join(log_dir, 'dqn_{}.pth'.format(trial.number)))

    _, episode_rewards = ts2xy(load_results(log_dir), 'episodes')
  finally:
    # Release the environment and the monitor's log file even when training
    # raises (the study is run with catch=(ValueError,)).
    env.close()

  # For the given hyperparameters, determine reward
  return sum(episode_rewards)

#Create a study object and specify the direction as 'maximize',
#since you want to maximize the reward.
#The pruner stops unpromising trials early.
#Use a pruner, else you will get an error related to divergence of the model.
#You can also use a multivariate sampler:
#sampler = optuna.samplers.TPESampler(multivariate=True,seed=42)
# Seeded TPE sampler so that the sequence of suggested hyperparameters is
# repeatable between runs.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    study_name="dqn_study",
    direction='maximize',
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)

# Plateau-based early stop; it only starts checking after trial number 5.
logging_callback = LoggingCallback(threshold=10, patience=30, trial_number=5)

# You can increase n_trials for a better scan of the search space.
# ValueError raised inside the objective marks the trial as failed instead of
# aborting the whole study.
study.optimize(
    objective,
    n_trials=4,
    catch=(ValueError,),
    callbacks=[logging_callback],
)

# Configuration of the neural network: one entry per layer, each entry being
# the number of neurons in that layer. Currently two layers of 64 neurons;
# for three layers of 64 neurons use [64, 64, 64], and so on.
nn_layers = [64, 64]

# Step size with which the gradient descent is carried out.
# Tip: use smaller step sizes for larger networks.
learning_rate = 0.001

## save the models
# model1 and model2 are defined outside this section; each is written to its
# own sub-directory of ./files/.
dir_prefix = "./files/"

log_dir_windy = f"{dir_prefix}DQN_windy/"
os.makedirs(log_dir_windy, exist_ok=True)
model1.save(f"{log_dir_windy}DQN_windy_model.zip")

log_dir_obstacle = f"{dir_prefix}DQN_obstacle/"
os.makedirs(log_dir_obstacle, exist_ok=True)
model2.save(f"{log_dir_obstacle}DQN_obstacle_model.zip")

import gym
from gym import spaces

print('run pytorch model')
import gym
import torch as th
import torch.nn as nn
import numpy as np

import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
from PIL import Image
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

!git clone  https://github.com/Hasanaldhahi3/atchekegroup1lunarlanding.git

import importlib.util
# log_dir_ = dir_prefix + "DQN_Youtube/"
# os.makedirs(log_dir_, exist_ok=True)


def _load_module_from_file(module_name, file_path):
    """Execute the Python file at *file_path* and return it as a module object.

    The module is created from a file-location spec named *module_name*; it
    is not registered in ``sys.modules``.
    """
    module_spec = importlib.util.spec_from_file_location(module_name, file_path)
    loaded = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(loaded)
    return loaded


# foo_1 and foo_3 come from the same source file, executed twice under two
# different module names; foo_2 is the plotting helper module.
foo_1 = _load_module_from_file(
    "DeepQNetwork",
    "/content/atchekegroup1lunarlanding/YoutubeCodeRepository/ReinforcementLearning/DeepQLearning/simple_dqn_torch_2020.py",
)
foo_2 = _load_module_from_file(
    "plotLearning",
    "/content/atchekegroup1lunarlanding/YoutubeCodeRepository/ReinforcementLearning/DeepQLearning/utils.py",
)
foo_3 = _load_module_from_file(
    "Agent",
    "/content/atchekegroup1lunarlanding/YoutubeCodeRepository/ReinforcementLearning/DeepQLearning/simple_dqn_torch_2020.py",
)


dir_prefix = "./files/"
log_dir_obstacle = dir_prefix + "DQN_obstacle/"
os.makedirs(log_dir_obstacle, exist_ok=True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# from YoutubeCodeRepository.ReinforcementLearning.DeepQLearning import simple_dqn_torch_2020

print(f"Is CUDA supported by this system?{torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

# Storing ID of current CUDA device
# cuda_id = torch.cuda.current_device()
# print(f"ID of current CUDA device:{torch.cuda.current_device()}")

# print(f"Name of current CUDA device:{torch.cuda.get_device_name(cuda_id)}")
# import gym

cuda = torch.device('cuda')  # Default CUDA device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
from stable_baselines3 import PPO
from stable_baselines3 import DQN

# model_path = "".format('dqn_lunar')

model_path = log_dir_obstacle + "dqn_0.zip"
model_test = DQN.load(model_path)
print('loaded model')
# for key, value in model_test.get_parameters().items():
#     print(key, value.shape)

env = gym.make("LunarLander-v4").unwrapped

# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

paramshapes = model_test.get_parameters()


def copy_dqn_weights(baselines_model):
    """Build a ``foo_1.DeepQNetwork`` holding the Q-network weights of an SB3 DQN.

    The ``'policy'`` entry of ``baselines_model.get_parameters()`` is read and
    only the entries whose key starts with ``"q_net."`` (the online
    Q-network) are copied; ``"q_net_target."`` entries are skipped. The
    PyTorch network is created with layer sizes taken from the source weight
    shapes, so a model trained with a different hidden width still fits.

    The previous implementation targeted TensorFlow checkpoints: its key
    filter (``"pi"`` / ``"c"``) matches none of the ``q_net.*`` keys, so it
    silently returned a randomly initialised network, and its ``.copy()`` /
    transpose / ``torch.from_numpy`` steps are not valid for weights that are
    already PyTorch tensors in PyTorch layout.

    baselines_model: a loaded Stable-Baselines3 DQN model.

    Returns the populated ``foo_1.DeepQNetwork``.

    Raises:
        ValueError: if no ``q_net.*`` parameters are found, if the source is
            not a three-linear-layer network, or if parameter counts or
            shapes do not line up with the target network.
    """
    policy_state = baselines_model.get_parameters()['policy']

    # Online Q-network only, in state-dict order (weight, bias per layer).
    source_params = [
        (key, value) for key, value in policy_state.items()
        if key.startswith("q_net.")
    ]
    if not source_params:
        raise ValueError(
            "no 'q_net.*' parameters found in the policy; available keys: "
            f"{list(policy_state.keys())}"
        )

    # nn.Linear weights are (out_features, in_features); DeepQNetwork is
    # configured through fc1_dims / fc2_dims plus an output layer, so exactly
    # three weight matrices are expected.
    weight_shapes = [
        tuple(value.shape) for _, value in source_params if len(value.shape) == 2
    ]
    if len(weight_shapes) != 3:
        raise ValueError(
            f"expected 3 linear layers in the source Q-network, found {len(weight_shapes)}"
        )
    (fc1_dims, input_dim), (fc2_dims, _), (n_actions, _) = weight_shapes

    torch_dqn = foo_1.DeepQNetwork(lr=0.001, n_actions=n_actions, input_dims=[input_dim],
                                   fc1_dims=fc1_dims, fc2_dims=fc2_dims)

    target_params = list(torch_dqn.named_parameters())
    if len(target_params) != len(source_params):
        raise ValueError(
            f"parameter count mismatch: source has {len(source_params)}, "
            f"target has {len(target_params)}"
        )

    # Copy in place without recording the copy in autograd; copy_ also moves
    # the data to whatever device / dtype the target parameter lives on.
    with torch.no_grad():
        for (th_key, pytorch_param), (key, value) in zip(target_params, source_params):
            source_tensor = torch.as_tensor(value)
            if source_tensor.shape != pytorch_param.shape:
                raise ValueError(
                    f"shape mismatch copying {key} {tuple(source_tensor.shape)} "
                    f"into {th_key} {tuple(pytorch_param.shape)}"
                )
            pytorch_param.copy_(source_tensor)

    return torch_dqn


# Convert the loaded SB3 model into the plain PyTorch DQN, freeze its
# parameters, attach a new linear layer and create an optimizer and a loss.
dqn_torch_v = copy_dqn_weights(model_test)
ct = 0

# Freeze (and print) the parameters of the first child module only
# (ct counts children starting at 1, so `ct < 2` selects just the first).
for child in dqn_torch_v.children():
    ct += 1
    if ct < 2:
        for param in child.parameters():
            print(param)
            print(ct)
            param.requires_grad = False



# NOTE(review): parameters() is printed without being iterated, so this
# shows the iterator object rather than the parameter values.
print(dqn_torch_v.parameters())




# Freeze every parameter of the network; this supersedes the partial freeze
# done in the loop above.
for param in dqn_torch_v.parameters():
  param.requires_grad = False
num_ftrs = 64  # in_features of the new linear layer created below
num_classes = 4 # number of actions, used as out_features of the new layer
# New linear layer stored under the attribute name `fc`; it is created after
# the freeze, so its parameters keep PyTorch's default requires_grad.
# NOTE(review): the network is built from fc1_dims/fc2_dims in
# copy_dqn_weights — confirm that DeepQNetwork.forward() actually uses an
# attribute called `fc`, otherwise this layer is never applied.
dqn_torch_v.fc = nn.Linear(num_ftrs, num_classes)
dqn_torch_v.to(device)
# The optimizer is given all parameters, including the frozen ones.
optimizer = torch.optim.Adam(dqn_torch_v.parameters(), lr=1e-2)
# NOTE(review): neither `optimizer` nor `loss_fn` is used in the visible
# remainder of the file.
loss_fn = nn.CrossEntropyLoss()


# import gym


# # from YoutubeCodeRepository.ReinforcementLearning.DeepQLearning.utils import plotLearning

# import numpy as np


# def obs_to_torch(obs):
#     # TF: NHWC
#     # PyTorch: NCHW
#     # https://discuss.pytorch.org/t/dimensions-of-an-input-image/19439
#     # obs = np.transpose(obs, (0, 3, 1, 2))
#     # # Normalize
#     # obs = obs / 255.0
#     obs = th.tensor(obs).float()
#     obs = obs.to(device)
#     return obs


# env = gym.make('LunarLander-v4')

# episode_reward = 0
# done = False
# obs = env.reset()
# print(next(dqn_torch_v.parameters()).device)
# while not done:
#     action = th.argmax(dqn_torch_v(obs_to_torch(obs))).item()
#     # action = env.action_space.sample()
#     obs, reward, done, _ = env.step(action)
#     episode_reward += reward

# print(episode_reward)

import gym
# from simple_dqn_torch_2020 import Agent

import numpy as np

if __name__ == '__main__':
    # Train the DQN agent from the cloned repository on LunarLander-v2 for a
    # fixed number of games, then plot the scores and epsilon history.
    env = gym.make('LunarLander-v2')
    agent = foo_3.Agent(
        gamma=0.99,
        epsilon=1.0,
        batch_size=64,
        n_actions=4,
        eps_end=0.01,
        input_dims=[8],
        lr=0.001,
    )
    n_games = 500
    scores = []
    eps_history = []

    for i in range(n_games):
        observation = env.reset()
        score = 0
        done = False
        while not done:
            # Act, store the transition, then run one learning step.
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.store_transition(observation, action, reward,
                                   observation_, done)
            agent.learn()
            score += reward
            observation = observation_

        scores.append(score)
        eps_history.append(agent.epsilon)

        # Running average over (at most) the last 100 games.
        avg_score = np.mean(scores[-100:])

        print('episode ', i, 'score %.2f' % score,
              'average score %.2f' % avg_score,
              'epsilon %.2f' % agent.epsilon)

    # Game numbers 1..n_games for the x axis.
    x = list(range(1, n_games + 1))
    filename = 'lunar_lander.png'
    foo_2.plotLearning(x, scores, eps_history, filename)