In [1]:
import gym
import math
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from collections import namedtuple, deque
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

In [2]:
from pybullet_envs.bullet.kukaGymEnv import KukaGymEnv
from pybullet_envs.bullet.kuka_diverse_object_gym_env import KukaDiverseObjectEnv
import random
import os
from gym import spaces
import time
import pybullet as p
from pybullet_envs.bullet import kuka
import numpy as np
import pybullet_data
import pdb
import distutils.dir_util
import glob
from pkg_resources import parse_version
import gym

current_dir=C:\Users\Kami\anaconda3\envs\robot_grasping\lib\site-packages\pybullet_envs\bullet


In [3]:
# Build the Kuka grasping environment used by the evaluation loop.
# Keyword arguments are grouped by theme; the values themselves are unchanged.
env = KukaDiverseObjectEnv(
    urdfRoot=pybullet_data.getDataPath(),
    # Run mode (presumably: open the GUI and use the held-out test objects --
    # TODO confirm against the pybullet_envs source).
    renders=True,
    isTest=True,
    # Action interface: continuous actions rather than a discrete action set.
    isDiscrete=False,
    actionRepeat=80,
    dv=0.06,
    maxSteps=15,
    removeHeightHack=False,
    isEnableSelfCollision=True,
    # Scene contents and randomisation.
    numObjects=1,
    blockRandom=0.,
    cameraRandom=0,
    # Observation image size.
    width=64,
    height=64,
)

In [5]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('GPU IS AVAILABLE :D' if device.type == "cuda" else 'GPU not available')

GPU IS AVAILABLE :D


In [7]:
class DQN(nn.Module):
    """Q-network that scores an (image observation, continuous action) pair.

    The image goes through three conv -> max-pool -> ReLU stages and a linear
    layer; the action goes through its own linear layer.  The two 32-d
    embeddings are summed and mapped to a single Q-value per sample.

    Args:
        h: Height in pixels of the images the network will receive.
        w: Width in pixels of the images the network will receive.
        action_dim: Length of the action vector (defaults to 3, the value the
            original implementation hard-coded).

    Raises:
        ValueError: If ``h``/``w`` are too small to survive the three
            conv/pool stages.

    For ``h == w == 64`` and ``action_dim == 3`` every parameter has the same
    name and shape as before (``fn1`` takes 7*7*32 inputs), so existing
    checkpoints remain loadable.
    """

    # Kernel sizes of conv1..conv3; every conv is followed by a 2x2 max-pool.
    _KERNELS = (3, 2, 2)

    def __init__(self, h, w, action_dim=3):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(32, 32, kernel_size=2, stride=1)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=2, stride=1)

        # Derive the flattened feature size from the declared input size
        # instead of assuming a 64x64 image.
        feat_h = self._conv_output_size(h)
        feat_w = self._conv_output_size(w)
        if feat_h < 1 or feat_w < 1:
            raise ValueError(
                "input of size %sx%s is too small for the conv/pool stack" % (h, w))
        self._feature_hw = (feat_h, feat_w)

        self.fn1 = nn.Linear(feat_h * feat_w * 32, 32)

        self.fn2 = nn.Linear(action_dim, 32)

        self.fn3 = nn.Linear(32, 32)
        self.fn4 = nn.Linear(32, 1)

    @classmethod
    def _conv_output_size(cls, size):
        """Spatial size left after the three conv(stride 1) + 2x2 max-pool stages."""
        for kernel in cls._KERNELS:
            # Unpadded stride-1 conv shrinks by kernel-1; the pool halves (floor).
            size = (size - kernel + 1) // 2
        return size

    def forward(self, s, a):  # s is the observation, a is the action
        """Return the Q-value of taking action ``a`` in observation ``s``.

        Args:
            s: Image batch whose last two dimensions are the ``h``/``w`` given
                to the constructor (3 channels, as ``conv1`` requires).
            a: Action batch whose last dimension is ``action_dim``.

        Returns:
            Tensor with one Q-value per sample (last dimension of size 1).

        Raises:
            ValueError: If the image size does not match the constructor's
                ``h``/``w``.
        """
        # Move inputs to wherever this module's weights live rather than
        # relying on a notebook-level global.
        target = self.fn4.weight.device
        s = s.to(target)
        a = a.to(target)

        s = F.relu(F.max_pool2d(self.conv1(s), 2))
        s = F.relu(F.max_pool2d(self.conv2(s), 2))
        s = F.relu(F.max_pool2d(self.conv3(s), 2))

        if tuple(s.shape[-2:]) != self._feature_hw:
            # Without this check the reshape below could silently fold
            # pixels into the batch dimension.
            raise ValueError(
                "feature map %s does not match the size this network was built for %s"
                % (tuple(s.shape[-2:]), self._feature_hw))

        s = s.reshape(-1, self.fn1.in_features)
        s = F.relu(self.fn1(s))

        a = F.relu(self.fn2(a))

        # Fuse the image and action embeddings by element-wise addition.
        val = torch.add(s, a)

        val = F.relu(self.fn3(val))

        return self.fn4(val)

In [8]:
# --- Hyper-parameters -------------------------------------------------------
BATCH_SIZE, GAMMA = 128, 0.9
EPS_START, EPS_END, EPS_DECAY = 0.9, 0.05, 200
TARGET_UPDATE = 50
n_actions = 7

# --- Probe the environment for the observation size -------------------------
env.reset()
init_screen = env._get_observation()
screen_height, screen_width, _ = init_screen.shape

# --- Networks: the target net starts as a frozen copy of the policy net ------
policy_net = DQN(screen_height, screen_width).to(device)
target_net = DQN(screen_height, screen_width).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
# memory = ReplayBuffer(100000)

In [9]:
# Restore the trained weights.  map_location=device remaps the stored tensors
# so a checkpoint written on a GPU machine still loads when `device` is the CPU.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
# NOTE(review): strict=False tolerates missing/unexpected keys, so inspect the
# returned key report rather than assuming every weight was restored.
policy_net.load_state_dict(
    torch.load('Random Optim Policy-net DDQN.pt', map_location=device), strict=False)
target_net.load_state_dict(
    torch.load('Random Optim Target-net DDQN.pt', map_location=device), strict=False)

<All keys matched successfully>

In [10]:
# Global counter of select_action() calls; select_action increments it by one
# on every invocation.
steps_done = 0

In [11]:
def select_action(state, n_candidates=16):  # random-candidate action selection
    """Pick the best of ``n_candidates`` randomly sampled actions.

    Candidate actions are drawn from ``env.action_space``, each is scored by
    ``policy_net`` together with the current observation, and the candidate
    with the highest Q-value is returned.

    Args:
        state: Observation tensor with a leading batch dimension of size 1
            (the output of ``get_state``).
        n_candidates: How many random actions to sample and score.

    Returns:
        The highest-scoring sampled action as a 1-D float tensor.

    Side effects:
        Increments the global ``steps_done`` counter.
    """
    global steps_done
    steps_done += 1

    actions = torch.stack([
        torch.as_tensor(env.action_space.sample(), dtype=torch.float32)
        for _ in range(n_candidates)
    ])

    # Repeat the observation along the batch dimension.  The previous
    # `state.tile(16,).reshape(16, 3, 64, 64)` tiled only the last (width)
    # dimension, so after the reshape each "copy" was a different scrambled
    # slice of the image rather than the image itself.
    states = state.expand(n_candidates, -1, -1, -1)

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        q_val = policy_net(states, actions)

    best = int(torch.argmax(q_val))
    return actions[best]

In [12]:
def get_state(state):
    """Convert an environment observation array into a network input tensor.

    The array's last axis is moved to the front (axes (2, 0, 1)), the data is
    cast to float32, and a leading batch dimension of size 1 is added.

    Args:
        state: 3-D NumPy array (presumably height x width x channels --
            confirm against the environment).

    Returns:
        A float32 tensor of shape ``(1, d2, d0, d1)`` for an input of shape
        ``(d0, d1, d2)``.
    """
    channels_first = state.transpose(2, 0, 1)
    return torch.from_numpy(channels_first).to(torch.float32)[None]

In [13]:
def test(num_episodes=10, max_episode_length=15):
    """Roll out the current policy and collect per-episode statistics.

    Each episode resets ``env`` and repeatedly calls ``select_action`` until
    the environment reports ``done`` or ``max_episode_length`` steps were taken.

    Args:
        num_episodes: Number of episodes to run.
        max_episode_length: Upper bound on steps per episode.

    Returns:
        Tuple ``(episode_durations, rewards, success_rate)``:
        * ``episode_durations`` -- steps taken in each episode (one entry per
          episode, including episodes cut off by ``max_episode_length``);
        * ``rewards`` -- the reward of the last step of each episode;
        * ``success_rate`` -- despite the name, the *number* of steps whose
          reward was exactly 1 (name kept for existing callers).
    """
    episode_durations = []
    rewards = []
    success_rate = 0

    for i_episode in range(num_episodes):
        state = get_state(env.reset())

        # Defaults so a zero-length episode still yields well-defined entries
        # instead of reusing values from a previous episode.
        reward = 0
        steps_taken = 0

        for steps_taken in range(1, max_episode_length + 1):
            action = select_action(state)
            next_state, reward, done, _ = env.step(action)

            if reward == 1:
                print('success')
                success_rate += 1

            state = get_state(next_state)

            if done:
                break

        # Record every episode, so durations and rewards stay aligned even
        # when the step cap is hit before the environment signals `done`.
        episode_durations.append(steps_taken)
        rewards.append(reward)

    print('Complete')
    return episode_durations, rewards, success_rate