In [1]:
import gym
import swingUp
import numpy as np
import numpy.random as rnd
import torch as pt
import matplotlib.pyplot as plt
%matplotlib inline

Here is code from the previous assignment for convenience. 

In [2]:
class nnQ(pt.nn.Module):
    """
    Here is a basic neural network with for representing a policy 
    """
    
    def __init__(self,stateDim,numActions,numHiddenUnits,numLayers):
        super().__init__()
        
        InputLayer = [pt.nn.Linear(stateDim+numActions,numHiddenUnits),
                      pt.nn.ReLU()]
        
        HiddenLayers = []
        for _ in range(numLayers-1):
            HiddenLayers.append(pt.nn.Linear(numHiddenUnits,numHiddenUnits))
            HiddenLayers.append(pt.nn.ReLU())
            
        
        OutputLayer = [pt.nn.Linear(numHiddenUnits,1)]
        
        AllLayers = InputLayer + HiddenLayers + OutputLayer
        self.net = pt.nn.Sequential(*AllLayers)
        
        self.numActions = numActions
        
    def forward(self,x,a):
        x = pt.tensor(x,dtype=pt.float32)

        b = pt.nn.functional.one_hot(pt.tensor(a),self.numActions)
        
        c = b.float().detach()
        y = pt.cat([x,c])
        
        return self.net(y)
            
class sarsaAgent:
    def __init__(self,stateDim,numActions,numHiddenUnits,numLayers,
                epsilon=.1,gamma=.9,alpha=.1):
        self.Q = nnQ(stateDim,numActions,numHiddenUnits,numLayers)
        self.gamma = gamma
        self.epsilon = epsilon
        self.alpha = alpha
        self.numActions = numActions
        self.s_last = None
        
    def action(self,x):
        # This is an epsilon greedy selection
        if rnd.rand() < self.epsilon:
            a = rnd.randint(numActions)
        else:
            qBest = -np.inf
            for aTest in range(self.numActions):
                qTest = self.Q(x,aTest).detach().numpy()[0]
                if qTest > qBest:
                    qBest = qTest
                    a = aTest
        return a
    
    def update(self,s,a,r,s_next,done):
        
        # Compute the TD error, if there is enough data
        update = True
        if done:
            Q_cur = self.Q(s,a).detach().numpy()[0]
            delta = r - Q_cur
            self.s_last = None
            Q_diff = self.Q(s,a)
        elif self.s_last is not None:
            Q_next = self.Q(s,a).detach().numpy()[0]
            Q_cur = self.Q(self.s_last,self.a_last).detach().numpy()[0]
            delta = self.r_last + self.gamma * Q_next - Q_cur
            Q_diff = self.Q(self.s_last,self.a_last)
        else:
            update = False
            
        # Update the parameter via the semi-gradient method
        if update:
            self.Q.zero_grad()
            Q_diff.backward()
            for p in self.Q.parameters():
                p.data.add_(self.alpha*delta,p.grad.data)
                
        
            
            
        
        if not done:
            self.s_last = np.copy(s)
            self.a_last = np.copy(a)
            self.r_last = np.copy(r)


The simulation is slightly modified from the previous homework. In particular, the episode lengths are restricted to be at most 500. 

In [None]:
# This is the environment
env = swingUp.SwingUpEnv()

# For simplicity, we only consider forces of -1 and 1
numActions = 2
Actions = np.linspace(-1,1,numActions)

# This is our learning agent
gamma = .95

agent = sarsaAgent(5,numActions,20,2,epsilon=5e-2,gamma=gamma,alpha=1e-4)
maxSteps = 2e5

# This is a helper to deal with the fact that x[2] is actually an angle
x_to_y = lambda x : np.array([x[0],x[1],np.cos(x[2]),np.sin(x[2]),x[3]])

R = []
UpTime = []

step = 0
ep = 0
maxLen = 500
while step < maxSteps:
    ep += 1
    x = env.reset()
    C = 0.
    
    done = False
    t = 1
    while not done:
        t += 1
        step += 1
        y = x_to_y(x)
        a = agent.action(y)
        u = Actions[a:a+1]
        env.render()
        x_next,c,done,info = env.step(u)
        
        max_up_time = info['max_up_time']
        y_next = x_to_y(x_next)

        C += (1./t)*(c-C)
        agent.update(y,a,c,y_next,done)
        x = np.copy(x_next)
        if done:
            break
            
        if step >= maxSteps:
            break
            
        if t > maxLen:
            agent.s_last = None
            break
            
        
        R.append(C)
    UpTime.append(max_up_time)
    #print('t:',ep+1,', R:',C,', L:',t-1,', G:',G,', Q:', Q_est, 'U:', max_up_time)
    print('Episode:',ep,'Total Steps:',step,', Ave. Reward:',C,', Episode Length:',t-1, 'Max Up-Time:', max_up_time)
env.close()

plt.plot(UpTime)

Episode: 1 Total Steps: 38 , Ave. Reward: -20.289310762240213 , Episode Length: 38 Max Up-Time: 0
Episode: 2 Total Steps: 83 , Ave. Reward: -15.6507661661448 , Episode Length: 45 Max Up-Time: 0
Episode: 3 Total Steps: 134 , Ave. Reward: -9.547764235128634 , Episode Length: 51 Max Up-Time: 0
Episode: 4 Total Steps: 634 , Ave. Reward: 0.0 , Episode Length: 500 Max Up-Time: 0
Episode: 5 Total Steps: 1134 , Ave. Reward: 0.0 , Episode Length: 500 Max Up-Time: 0
Episode: 6 Total Steps: 1634 , Ave. Reward: 0.0 , Episode Length: 500 Max Up-Time: 0
Episode: 7 Total Steps: 2134 , Ave. Reward: 0.0 , Episode Length: 500 Max Up-Time: 0
Episode: 8 Total Steps: 2634 , Ave. Reward: 0.0 , Episode Length: 500 Max Up-Time: 0
Episode: 9 Total Steps: 3134 , Ave. Reward: 0.0 , Episode Length: 500 Max Up-Time: 0
Episode: 10 Total Steps: 3634 , Ave. Reward: 0.0 , Episode Length: 500 Max Up-Time: 0
Episode: 11 Total Steps: 4134 , Ave. Reward: 0.0 , Episode Length: 500 Max Up-Time: 0
Episode: 12 Total Steps: 46

Episode: 95 Total Steps: 44712 , Ave. Reward: -2.0380231450620445 , Episode Length: 481 Max Up-Time: 14
Episode: 96 Total Steps: 44962 , Ave. Reward: -4.108400833547603 , Episode Length: 250 Max Up-Time: 37
Episode: 97 Total Steps: 45000 , Ave. Reward: -17.13404181578279 , Episode Length: 38 Max Up-Time: 0
Episode: 98 Total Steps: 45066 , Ave. Reward: -2.9493504842557714 , Episode Length: 66 Max Up-Time: 0
Episode: 99 Total Steps: 45149 , Ave. Reward: -1.9148583649691278 , Episode Length: 83 Max Up-Time: 0
Episode: 100 Total Steps: 45245 , Ave. Reward: -2.1012017088414523 , Episode Length: 96 Max Up-Time: 0
Episode: 101 Total Steps: 45362 , Ave. Reward: -0.7539530479312598 , Episode Length: 117 Max Up-Time: 0
Episode: 102 Total Steps: 45456 , Ave. Reward: -1.7329773605930514 , Episode Length: 94 Max Up-Time: 0
Episode: 103 Total Steps: 45541 , Ave. Reward: -2.4197586217095326 , Episode Length: 85 Max Up-Time: 0
Episode: 104 Total Steps: 45723 , Ave. Reward: -0.8046145127120513 , Episod

# Question 

Implement deep Q-learning as described in the paper here:

https://daiwk.github.io/assets/dqn.pdf

In this paper, we have the states, and so there is no need to do the pre-processing described there.

In my tests on this problem, it works substantially better than the SARSA  implementation 
with the following design choices:
* Use the same Q-network architecture as used  in the SARSA algorithm
* Same step size, discount factor, and learning rate as above
* Mini-batch size of 20
* Update the target network every 100 steps

The deep Q-learning method can be implemented via a modification of the SARSA code above.

You could probably make it work even better with further tuning.


In [None]:
# Implement this code below and test it.