In [1]:
#%matplotlib inline

import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torch.utils.tensorboard import SummaryWriter
from IPython.display import clear_output, display

import pandas as pd



env = gym.make('CartPole-v0').unwrapped

# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

#plt.ion()

tag_reward = "reward"
tag_loss = "loss"

# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [2]:
## writer = SummaryWriter(comment='__')

In [3]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'done'))

r_buff_header = ['state', 'action', 'next_state', 'reward', 'done']

In [4]:
class ReplayBuffer(object):

    def __init__(self, capacity):
        self.capacity = capacity
        self.header = r_buff_header
        self.buffer = pd.DataFrame(columns=self.header)        
        self.count_i = 0
        self.count_p = 0

    def push(self, df_row):
        self.count_p += 1
        if self.__len__() == self.capacity:
            # Probably exceeded capacity
            #remove a row (probably 1st one) here 
            self.buffer = self.buffer.iloc[1:]
        #add to dataframe here
        self.buffer = pd.concat([self.buffer, df_row])
        
        
    def insert(self, stateV, actonV, next_stateV, rewardV, doneV):
        # Initialise data to lists. 
        self.count_i += 1
        data = [{self.header[0]: stateV, 
                 self.header[1]: actonV, 
                 self.header[2]: next_stateV, 
                 self.header[3]: rewardV, 
                 self.header[4]: doneV}] 
  
        # Creates DataFrame. 
        df = pd.DataFrame(data)
        self.push(df)
        
        #print("len = ", self.__len__(),"\t insert = ", self.count_i, "\t push = ", self.count_p)
        
            
            
    def sample(self, batch_size):
        return self.buffer.sample(batch_size)

    def __len__(self):
        return self.buffer.shape[0]

In [5]:
class DqnAgent(nn.Module):
    
    def __init__(self, n_ip, n_op):
        super(DqnAgent, self).__init__()
        self.fc1 = nn.Linear(n_ip, n_ip*2)
        self.fc2 = nn.Linear(n_ip*2, n_ip)
        self.fc3 = nn.Linear(n_ip, n_op)       
        
    def forward(self,x):        
        x=self.fc1(x)
        x=self.fc2(x)
        x=self.fc3(x)
        return x
        

In [6]:
BATCH_SIZE = 64
GAMMA      = 0.9 # discount factor
EPSILON    = 0.9

CHECK_EVERY = 100
OPTIMIZE_EVERY = 50


STATE_N  = 4
ACTION_N = env.action_space.n

LOG_FOLDER = "log_res/"

# EPS_START  = 0.9
# EPS_END    = 0.05
# EPS_DECAY  = 200
# TARGET_UPDATE = 10

# Since target policy is absolutely greedy version of Explore policy.
# No need of using 2 vfa nets. use 1 network.

## NUM_EPISODES = 3000
NUM_EPISODES = 100



In [7]:
env.reset()
## plt.figure()

eval_count = 0
train_count = 0

In [8]:
qvfa = DqnAgent(STATE_N, ACTION_N).double().to(device)
optimizer = optim.RMSprop(qvfa.parameters())
## memory = ReplayMemory(10000)
buffer = ReplayBuffer(10000)

In [9]:
def select_action(state, ep = 0):    
    sample = random.random()
    state = torch.from_numpy(state).to(device)
    if sample > ep:
        with torch.no_grad():            
            op = qvfa(state)
            #print("op \t" , op)
            values, indices = op.max(0)
            #print("indices \t",indices.item() )
            return indices.item()
    else:
        return env.action_space.sample()


In [10]:
def optimize_model():
    ## clear_output(wait=True)    
    if buffer.__len__() < BATCH_SIZE:
        print("optimizing model Not enough samples in buffer : ",buffer.__len__())
        return
    
    transitions = buffer.sample(BATCH_SIZE) # batch_sized list of transitions
    
    state_batch = transitions[buffer.header[0]].values
    state_batch = torch.from_numpy(np.stack( state_batch, axis=0 )).to(device)
    #print("state \n",state_batch,"\n\n")

    action_batch = torch.tensor(transitions[buffer.header[1]].values.tolist()).view(-1,1).to(device)
    #print("action\n",action_batch,"\n\n")

    next_state_batch = transitions[buffer.header[0]].values
    next_state_batch = torch.from_numpy(np.stack( next_state_batch, axis=0 )).to(device)
    #print("next_state_batch \n",next_state_batch,"\n\n")

    reward_batch = torch.tensor(transitions[buffer.header[3]].values.tolist()).view(-1,1).to(device)
    #print("\n\nreward \n",reward_batch,"\n\n")

    done_batch = torch.tensor(transitions[buffer.header[4]].values.tolist()).view(-1,1).to(device)
    #print("\n\nDone\n",done_batch,"\n\n")

    #print("ravel ",state_batch.ravel())
    qsav = qvfa(state_batch)
    #print("\n\nstate value fn \n",qsav)



    qsa = qvfa(state_batch).gather(1, action_batch)
    #print("\n\nqsa \n",qsa)


    with torch.no_grad():
        qvfa.eval()
        next_state_action_values = qvfa(next_state_batch)
        #print("\n\nnext_state_action_values\n", next_state_action_values)
        max_next_state_values, _indices = next_state_action_values.max(dim=1)
        max_next_state_values = max_next_state_values.view(-1,1)
        #print("\n\nmax_next_state_values\n", max_next_state_values)
        #print("\n\ntorch.tensor(GAMMA)\n", torch.tensor(GAMMA))
        next_state_values = ((max_next_state_values*GAMMA).float()+reward_batch).float()*(1-done_batch).float()
        #print("\n\nnext_state_values\n", next_state_values)
        target = next_state_values.double()
        qvfa.train()


    # implement this 𝛿=𝑄(𝑠,𝑎)−(𝑟+𝛾max𝑎𝑄(𝑠′,𝑎))
    # print("\n\ntarget\n",target)
    # print("\n\nqsa\n",qsa)


    loss = F.smooth_l1_loss(qsa, target)
    # log this loss    
    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in qvfa.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
    ## clear_output(wait=True)
    #print("optimizing model loss = ", loss.item())


    
    

In [11]:
cnt = 0
done = False
## plt.figure()
## img = plt.imshow(env.render(mode='rgb_array')) # only call this once
env.render(mode='rgb_array')
for i_episode in range(NUM_EPISODES):
    state = env.reset()
    done = False
    #print("Explore buffer size = ",buffer.__len__())
    while not done:
        #env.render()
        ## img.set_data(env.render(mode='rgb_array')) # just update the data
        env.render(mode='rgb_array')
        ## display.display(plt.gcf())
        ## display.clear_output(wait=True)
        
        action = select_action(state, ep = EPSILON)
        #print(action,"\t",action.data)
        next_state, reward, done, info = env.step(action)        
        #memory.push(state, action, next_state, reward, done)
        buffer.insert(state, action, next_state, reward, done)
        
        ## cnt+=1
        ## print("cnt\t",cnt)
        state = next_state
        if done:
            # this is for the terminal state
            # no next iteration for terminal state
            next_state = [0, 0, 0, 0]
            buffer.insert(state, action, next_state, reward, done)
            ## cnt+=1
            ## print("cnt\t",cnt)

    
    # Optmiize after every episode
    optimize_model()
    
    if i_episode%CHECK_EVERY == 0:
        qvfa.eval()
        total_reward = 0
        clear_output(wait=True)
        
        for _ in range(100):
            done = False            
            state = env.reset()
            while not done : 
                env.render(mode='rgb_array')
                ## img.set_data(env.render(mode='rgb_array')) # just update the data
                ## display.display(plt.gcf())
                ## display.clear_output(wait=True)
                
                action = select_action(state)
                next_state, reward, done, info = env.step(action)
                buffer.insert(state, action, next_state, reward, done)
                state = next_state
                total_reward += reward
                if done:
                    next_state = [0, 0, 0, 0]
                    buffer.insert(state, action, next_state, reward, done)
        total_reward /= 100
        ## writer.add_scalar(tag_reward, total_reward, eval_count)
        eval_count += 1
        # Log this
        clear_output(wait=True)
        print("total reward = ", total_reward)
        qvfa.train()
        
        
print('Complete')
env.render()
env.close()
plt.ioff()
plt.show() 

total reward =  12.32


AttributeError: 'numpy.ndarray' object has no attribute 'dim'