# Imports

In [1]:
import gym
import os
import numpy as np
from random import randint
from sklearn.utils import shuffle
import math
import torch
import torch.nn.functional as F
import torch.autograd as autograd 
from torch import nn
from torch import optim
import random
from collections import deque
from IPython.display import clear_output
import matplotlib.pyplot as plt
import tqdm as tqdm
from statistics import mean, stdev

env = gym.make('MountainCarContinuous-v0')
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Show which accelerator is in use. torch.cuda.current_device() raises on a
# CPU-only machine, so the CUDA query is only made when CUDA was selected.
torch.cuda.get_device_name(torch.cuda.current_device()) if device.type == "cuda" else "cpu"



'GeForce GTX 1050 Ti'

In [2]:
# Summarise the environment's action and observation spaces, one line each.
for label, value in (
    ("Action space:", env.action_space),
    ("Observation matrix:", env.observation_space),
    ("Observation min:", env.observation_space.low),
    ("Observation max:", env.observation_space.high),
):
    print(label, value)

Action space: Box(1,)
Observation matrix: Box(2,)
Observation min: [-1.2  -0.07]
Observation max: [0.6  0.07]


# Functions

In [3]:
class ReLU(nn.Module):
    """ReLU activation whose output is re-centred by subtracting its own mean."""

    def forward(self, inp):
        """Clamp negatives to zero, then subtract the mean of the clamped tensor.

        The mean is a single scalar taken over every element of the clamped
        tensor, so the returned tensor has the same shape as ``inp``.
        """
        rectified = inp.clamp_min(0.)
        return rectified - rectified.mean()

class DeepQ(nn.Module):
    """Fully connected network fed with a goal concatenated to a state.

    Args:
        layers: Layer widths ``[state_dim, hidden_1, ..., output_dim]``. The
            first ``nn.Linear`` takes ``layers[0] + goal_dim`` inputs because
            callers concatenate the goal in front of the state.
        goal_dim: Number of goal features prepended to the state. Defaults to
            1, which is the width that used to be hard-coded.
    """

    def __init__(self, layers, goal_dim=1):
        super(DeepQ, self).__init__()

        # Input layer: goal features + state features -> first hidden width.
        modules = [nn.Linear(layers[0] + goal_dim, layers[1])]

        # Every following layer is preceded by the activation; there is no
        # activation after the final linear layer.
        for fan_in, fan_out in zip(layers[1:-1], layers[2:]):
            modules.append(ReLU())
            modules.append(nn.Linear(fan_in, fan_out))

        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Run ``x`` (cast to float32) through the stack of layers."""
        return self.layers(x.float())
    
    
    
#Class is useful so that I remember the order of appends using shift+tab
class Memory():
    """Bounded FIFO replay buffer of goal-conditioned transitions."""

    def __init__(self, maxMemory):
        # A deque with maxlen drops its oldest entry once it is full.
        self.memory = deque(maxlen = maxMemory)

    def update(self, goal, state, action, reward, next_state, done):
        """Store one transition; fields keep this exact order when sampled."""
        transition = (goal, state, action, reward, next_state, done)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions at random, without replacement.

        Returns an iterator that yields, in order, a tuple of all goals, all
        states, all actions, all rewards, all next states and all done flags.
        """
        batch = random.sample(self.memory, batch_size)
        return zip(*batch)

    def __len__(self):
        """Number of transitions currently held."""
        return len(self.memory)

    def __repr__(self):
        """Textual form of the underlying deque."""
        return repr(self.memory)
    
def epsilon_decay(step, eps_max, eps_min, eps_decay):
    """Exponentially anneal epsilon from ``eps_max`` towards ``eps_min``.

    At ``step == 0`` the result is ``eps_max``; as ``step * eps_decay`` grows
    the exponential factor shrinks towards zero and the result approaches
    ``eps_min``.
    """
    span = eps_max - eps_min
    falloff = math.exp(-1. * step * eps_decay)
    return eps_min + span * falloff

def update_model(model,target_model,opt,memory,discount,batch_size,criterion=None):
    """Run one optimisation step on ``model`` from a replayed minibatch.

    Args:
        model: Online network; called on ``[goal, state]`` rows and expected
            to return one value per row.
        target_model: Network used to evaluate ``[goal, next_state]`` rows for
            the bootstrap target. It is only evaluated, never trained, here.
        opt: Optimiser holding ``model``'s parameters.
        memory: Object whose ``sample(batch_size)`` yields six sequences in
            the order goals, states, actions, rewards, next_states, dones.
        discount: Discount factor applied to the bootstrap term.
        batch_size: Number of transitions to sample.
        criterion: Loss callable ``criterion(input, target)``. When omitted the
            notebook-level ``loss_func`` is used, which keeps existing calls
            working unchanged.

    Returns:
        The scalar loss of this step as a detached tensor (for logging).
    """
    if criterion is None:
        # Backward-compatible default: the loss chosen in the "Set up" cell.
        criterion = loss_func

    # Build the batch on the device the online network lives on, so the
    # tensors and the model can never end up on different devices.
    dev = next(model.parameters()).device

    # Actions are sampled to keep the unpacking aligned but are not needed:
    # the network is not indexed by action.
    goals, states, _actions, rewards, next_states, dones = memory.sample(batch_size)

    def to_tensor(values):
        # Stack through NumPy first; building a tensor straight from a tuple
        # of ndarrays is much slower.
        return torch.tensor(np.asarray(values, dtype=np.float32), dtype=torch.float, device=dev)

    goals       = to_tensor(goals)
    states      = to_tensor(states)
    next_states = to_tensor(next_states)
    rewards     = to_tensor(rewards)
    dones       = to_tensor(dones)

    # Prediction of the online network, one value per transition. Only the
    # trailing unit axis is squeezed so a batch of one keeps its batch axis.
    Qs = model(torch.cat([goals,states], dim=1)).squeeze(-1)

    # The bootstrap target is a constant for this step: computing it without
    # autograd stops gradients flowing into (and accumulating on) target_model.
    with torch.no_grad():
        next_Qs = target_model(torch.cat([goals,next_states], dim=1)).max(dim=1)[0]
        # Terminal transitions (done == 1) contribute only their reward.
        bellman = rewards + (1-dones) * discount * next_Qs

    # PyTorch losses take (input, target): prediction first, target second.
    loss = criterion(Qs, bellman)

    # Clear stale gradients before back-propagating this step's loss.
    opt.zero_grad()
    loss.backward()
    opt.step()

    return loss.detach()


def plot(rewards, text):
    """Redraw the per-episode reward curve in place of the previous output.

    Args:
        rewards: Sequence of episode returns; its length is shown in the title.
        text: Caption placed on the x axis (the run's hyper-parameter summary).

    Returns:
        The matplotlib figure that was drawn. It has already been closed in
        pyplot's registry but the object itself remains usable.
    """
    # wait=True: the old output is only cleared once the new one is ready.
    clear_output(True)
    fig, ax = plt.subplots(figsize=(20,5))
    ax.set_xlabel(text, ha="center")
    ax.set_title(f"Rewards, Batch # {len(rewards)}")
    ax.plot(rewards)
    plt.show()
    # This is called once per episode; without an explicit close every figure
    # stays registered with pyplot and memory grows over a long run.
    plt.close(fig)
    return fig

# Run

In [None]:
def run(layers, batch_size, discount, eps_max, eps_min, eps_decay, update_freq, lr, memory_size, loss_func, noise_scale):
    """Train a goal-conditioned network on the notebook's global ``env``.

    Besides its arguments this function reads the notebook globals ``env``,
    ``device`` and ``num_episodes`` and uses the helpers ``DeepQ``, ``Memory``,
    ``epsilon_decay``, ``update_model`` and ``plot``.

    Args:
        layers: Layer widths passed to ``DeepQ``.
        batch_size: Minibatch size; training starts once the replay memory
            holds more transitions than this.
        discount: Discount factor forwarded to ``update_model``.
        eps_max, eps_min, eps_decay: Epsilon-greedy schedule parameters.
        update_freq: Number of environment steps between target-network syncs.
        lr: Adam learning rate.
        memory_size: Replay capacity (converted with ``int``).
        loss_func: Loss callable; in this function it is only used for the
            plot caption.
        noise_scale: Standard deviation of the Gaussian noise added to
            greedy actions.

    Returns:
        The episode number at which the mean return of the last 100 episodes
        exceeded the solved threshold, or ``None`` if ``num_episodes`` ran out.
    """
    goal = np.array([env.goal_position])
    low = float(env.action_space.low[0])
    high = float(env.action_space.high[0])

    # Solved threshold comes from the environment registration when present;
    # 90 is the registered value for MountainCarContinuous-v0. The previous
    # constant (-110) is below the lowest return an episode can produce here,
    # so training always stopped as soon as 100 episodes had been collected.
    spec = getattr(env, "spec", None)
    solved_threshold = getattr(spec, "reward_threshold", None)
    if solved_threshold is None:
        solved_threshold = 90.0

    # Follow the global device instead of forcing CUDA, so CPU-only machines work.
    model = DeepQ(layers).to(device)
    target_model = DeepQ(layers).to(device)
    opt = optim.Adam(model.parameters(), lr=lr)

    rewards = []
    memory = Memory(int(memory_size))

    # Returns are usually negative, so start below any reachable value.
    max_reward = float("-inf")
    best_episode = 0
    step = 0  # global environment-step counter (drives epsilon and target syncs)

    name = (f"      Layers: {layers}"
            f"      Loss function: {loss_func.__name__}"
            f"      Batch size: {batch_size}"
            f"      Discount: {discount}"
            f"      Epsilon: {eps_max} to {eps_min} decay: {eps_decay}"
            f"      Learning rate: {lr}"
            f"      Update freq: {update_freq}"
            f"      Memory size: {memory_size}")

    for episode in range(1,num_episodes+1):

        state = env.reset()
        episode_rewards = 0
        done = False

        while not done: #Run until the environment reports the episode is over

            #Lets you watch, but trains so much faster if you don't
            #env.render()

            # Network output for the current (goal, state). This is action
            # selection only, so no autograd graph is needed. Both parts are
            # cast to float32 so the concatenation never mixes dtypes.
            with torch.no_grad():
                net_input = torch.cat([
                    torch.tensor(goal, dtype=torch.float, device=device),
                    torch.tensor(state, dtype=torch.float, device=device),
                ]).unsqueeze(0)
                Q = model(net_input)

            epsilon = epsilon_decay(step, eps_max, eps_min, eps_decay)

            # Epsilon-greedy: exploit with probability 1 - epsilon.
            if random.random() > epsilon:
                # Fresh Gaussian perturbation on every move (previously a
                # constant offset); clamp keeps the action within bounds.
                noise = noise_scale * np.random.randn()
                action = [torch.clamp(Q + noise, low, high).item()]
            else:
                action = [np.random.uniform(low, high)] #Random action

            #Take step and update memory
            next_state, reward, done, info = env.step(action)

            memory.update(goal, state, action, reward, next_state, done)
            # Hindsight relabelling: pretend the position actually reached by
            # this step was the goal, so the transition is a terminal success.
            # The goal must therefore be the position *after* the step.
            memory.update([next_state[0]], state, action, 0, next_state, True)

            #Updates the model if you can take a proper batch
            if len(memory) > batch_size:
                update_model(model,target_model,opt,memory,discount,batch_size)

            #Update target model every update_freq steps:
            if step % update_freq == 0:
                target_model.load_state_dict(model.state_dict())

            #Updates
            state = next_state
            episode_rewards += reward
            step += 1

        rewards.append(episode_rewards)
        if episode_rewards > max_reward:
            max_reward = episode_rewards
            best_episode = episode

        plot(rewards, name)
        print(epsilon, max_reward)

        #End condition: mean return over the last 100 episodes beats the threshold
        if len(rewards) >= 100:
            average = mean(rewards[-100:])
            print(average)
            if average > solved_threshold:
                env.close()
                print(f"Finished in {episode} episodes!")
                return episode

# Set up

In [None]:
num_episodes = 1000  # upper bound on the number of training episodes

# Network shape: state features in, four hidden layers of 64 units, one output.
m = env.observation_space.shape[0]
n = 1
layers = [m] + [64] * 4 + [n]

loss_func = F.mse_loss

# Replay and optimisation settings.
batch_size, memory_size = 128, 1e6
discount, lr = 0.99, 1e-3
update_freq = 5  # environment steps between target-network updates

# Epsilon-greedy schedule: start value, floor, and exponential decay rate.
eps_max, eps_min, eps_decay = 1, 1e-3, 1e-4

noise_scale = 0.01  # magnitude of the perturbation added to greedy actions


In [None]:
# Launch training with the hyper-parameters from the set-up cell; the result
# is the episode at which training was declared finished (None otherwise).
results = run(
    layers=layers,
    batch_size=batch_size,
    discount=discount,
    eps_max=eps_max,
    eps_min=eps_min,
    eps_decay=eps_decay,
    update_freq=update_freq,
    lr=lr,
    memory_size=memory_size,
    loss_func=loss_func,
    noise_scale=noise_scale,
)