In [None]:
import math
import random
from collections import deque, namedtuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

from Env import Env

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [None]:
# TensorBoard event writer shared by the training cells below.
writer = SummaryWriter(comment="__")

# Scalar tags under which the per-episode metrics are logged.
tag_reward, tag_loss, tag_ep = "reward", "loss", "epsilon"

# Column names of one replay-buffer transition, in storage order.
r_buff_header = ["state", "action", "next_state", "reward", "done"]

In [None]:
# --- Problem size (taken from the environment class) -------------------------
STATE_N  = Env.DIM * Env.DIM   # number of inputs once the state is flattened
ACTION_N = Env.ACTION_N        # number of discrete actions

# --- Learning ----------------------------------------------------------------
NUM_EPISODES = 100000
BATCH_SIZE   = 128
LEARN_RATE   = 0.001           # learning rate handed to the Adam optimizer
GAMMA        = 0.9             # discount factor

# --- Exploration (epsilon-greedy) --------------------------------------------
EPSILON = 1                    # starting exploration probability
EDECAY  = 0.99                 # factor EPSILON is multiplied by when it decays
EMIN    = 0.025                # lower bound intended for EPSILON
ESUB    = 0.01                 # step of the alternative subtractive schedule

# --- Reward threshold of the alternative (reward-gated) epsilon schedule -----
MINREWARD           = 25
MINREWARD_INCREMENT = 1



In [None]:
class ReplayBuffer(object):
    """Fixed-capacity FIFO store of transitions for experience replay.

    Transitions are kept as plain dict records in a bounded ``deque``, so
    adding one (and evicting the oldest once full) is O(1).  The previous
    implementation rebuilt the whole ``DataFrame`` with ``pd.concat`` on every
    insert, which made each step O(n) and filling the buffer quadratic.
    DataFrames are now only materialised when a batch is requested.

    Args:
        capacity: maximum number of transitions kept; older ones are dropped.
        header: optional column names for the five transition fields, in the
            order (state, action, next_state, reward, done).  Defaults to the
            module-level ``r_buff_header``.
    """

    def __init__(self, capacity, header=None):
        self.capacity = capacity
        self.header = r_buff_header if header is None else list(header)
        # maxlen makes the deque discard the oldest record automatically.
        self._records = deque(maxlen=int(capacity))

    @property
    def buffer(self):
        """All stored transitions, oldest first, as a DataFrame (built on demand)."""
        return pd.DataFrame(list(self._records), columns=self.header)

    def push(self, df_row):
        """Append every row of the DataFrame ``df_row`` as a transition."""
        self._records.extend(df_row.to_dict("records"))

    def insert(self, stateV, actonV, next_stateV, rewardV, doneV):
        """Store one transition given its five fields."""
        values = (stateV, actonV, next_stateV, rewardV, doneV)
        # Append the record directly; no one-row DataFrame is needed.
        self._records.append(dict(zip(self.header, values)))

    def sample(self, batch_size=0):
        """Return transitions as a DataFrame with ``self.header`` columns.

        Args:
            batch_size: number of distinct transitions to draw uniformly at
                random (using the ``random`` module); ``0`` returns the whole
                buffer.

        Raises:
            ValueError: if ``batch_size`` is negative or larger than the
                number of stored transitions.
        """
        if batch_size == 0:
            return self.buffer
        picked = random.sample(self._records, batch_size)
        return pd.DataFrame(picked, columns=self.header)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self._records)

In [None]:
class DqnAgent(nn.Module):
    """Fully connected Q-network: flattened state in, one Q-value per action out.

    Args:
        ip_size: number of input features after flattening a state.  Defaults
            to ``Env.DIM * Env.DIM`` (previously a hard-coded 49 that could
            disagree with the reshape done in ``forward``).
        n_op: number of actions, i.e. the width of the output layer.  Defaults
            to ``Env.ACTION_N``.
    """

    def __init__(self, ip_size=None, n_op=None):
        super(DqnAgent, self).__init__()
        # Resolve the defaults at call time so they always follow the environment.
        if ip_size is None:
            ip_size = Env.DIM * Env.DIM
        if n_op is None:
            n_op = Env.ACTION_N

        # Hidden width (128) and depth are hyperparameters worth tweaking.
        self.fc1 = nn.Linear(ip_size, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 128)
        self.fc4 = nn.Linear(128, 128)
        self.fc5 = nn.Linear(128, n_op)

    def forward(self, x):
        """Return Q-values of shape ``(batch, n_op)`` for the states in ``x``.

        ``x`` may have any shape whose element count is a multiple of the
        network's input size; it is flattened to ``(batch, ip_size)``.
        """
        # Flatten to the size the first layer was built with, so the reshape
        # and the layer can never disagree; reshape also accepts
        # non-contiguous tensors.
        x = x.reshape(-1, self.fc1.in_features)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        # No activation on the output layer: Q-values are unbounded.
        return self.fc5(x)

In [None]:

# Environment the agent interacts with.
env = Env()

# Q-value network (moved to the selected device) and its Adam optimizer.
qvfa = DqnAgent()
qvfa.to(device)
optimizer = optim.Adam(params=qvfa.parameters(), lr=LEARN_RATE)

# Mean-squared error between predicted Q-values and their targets.
criterion = nn.MSELoss()

# Replay memory holding up to one million transitions.
buffer = ReplayBuffer(capacity=1000000)

In [None]:
def select_action(state, ep = 0):
    """Pick an action epsilon-greedily.

    Args:
        state: current state as a numpy array.
        ep: probability of exploring instead of acting greedily.

    Returns:
        With probability ``ep`` whatever ``env.sample_valid_action()``
        returns; otherwise the index of the largest Q-value that ``qvfa``
        predicts for ``state``.
    """
    explore = random.random() < ep
    if explore:
        return env.sample_valid_action()

    # Greedy branch: a single forward pass, no gradients required.
    with torch.no_grad():
        batch = torch.from_numpy(state).unsqueeze(0).float().to(device)
        q_values = qvfa(batch)
        best_action = q_values.max(1)[1]
        return best_action.item()

In [None]:
### Mod this 

def optimize_model(i_episode = 0, batch_size = None):
    """Run one gradient step of Q-learning on a batch from the replay buffer.

    Fixes over the previous version:
      * The TD target is ``r + GAMMA * max_a Q(s', a) * (1 - done)``.  It used
        to be ``(r + GAMMA * max_a Q(s', a)) * (1 - done)``, which zeroed the
        reward of every terminal transition.
      * ``done`` is converted to float before ``1 - done``; subtracting from a
        bool tensor raises on current PyTorch.
      * The batch size follows the module-level ``BATCH_SIZE`` instead of a
        local override of 3.
      * An empty buffer is skipped instead of crashing in ``np.stack``.

    Args:
        i_episode: global step under which the loss is logged.
        batch_size: number of transitions to sample; defaults to ``BATCH_SIZE``.
            If the buffer holds fewer, all available transitions are drawn.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    available = len(buffer)
    if available < batch_size:
        print("optimizing model Not enough samples in buffer : ", available)
    if available == 0:
        # Nothing to learn from yet.
        return

    transitions = buffer.sample(min(batch_size, available))

    state_col, action_col, next_state_col, reward_col, done_col = buffer.header

    state_batch = torch.from_numpy(
        np.stack(transitions[state_col].values, axis=0)).float().to(device)
    next_state_batch = torch.from_numpy(
        np.stack(transitions[next_state_col].values, axis=0)).float().to(device)

    # gather() needs int64 indices shaped (batch, 1).
    action_batch = torch.tensor(
        transitions[action_col].values.tolist(), dtype=torch.long).view(-1, 1).to(device)
    reward_batch = torch.tensor(
        transitions[reward_col].values.tolist(), dtype=torch.float32).view(-1, 1).to(device)
    # 1.0 for terminal transitions, 0.0 otherwise.
    done_batch = torch.tensor(
        transitions[done_col].values.tolist(), dtype=torch.float32).view(-1, 1).to(device)

    # Q(s, a) for the actions that were actually taken.
    qsa = qvfa(state_batch).gather(1, action_batch)

    # target = r + gamma * max_a Q(s', a); the bootstrap term is dropped for
    # terminal transitions, but the reward is always kept.
    was_training = qvfa.training
    with torch.no_grad():
        qvfa.eval()
        max_next_state_values = qvfa(next_state_batch).max(dim=1)[0].view(-1, 1)
        target = reward_batch + GAMMA * max_next_state_values * (1.0 - done_batch)
    # Restore whichever mode the network was in before the target pass.
    qvfa.train(was_training)

    optimizer.zero_grad()
    loss = criterion(qsa, target)
    loss.backward()
    optimizer.step()

    writer.add_scalar(tag_loss, loss.item(), i_episode)
    

In [None]:
### Mod this 

# Main training loop: play one episode, log it, then take one optimisation step.
for i_episode in range(NUM_EPISODES):
    state = env.reset()
    done = False
    total_reward = 0
    while not done:
        action = select_action(state, ep = EPSILON)
        next_state, reward, done = env.step(action)

        if done:
            # Reshape the terminal reward based on env.get_count()
            # (presumably the number of pieces left on the board — confirm
            # against Env).
            on_board = env.get_count()
            if on_board == 1:
                # Game solved: amplify the final reward.
                reward = reward*100
            else:
                # Episode ended without a solution: flip the sign of the
                # final reward so the failure is penalised.
                reward = -reward

        total_reward += reward
        buffer.insert(state, action, next_state, reward, done)
        state = next_state

    writer.add_scalar(tag_reward, total_reward, i_episode)
    writer.add_scalar(tag_ep, EPSILON, i_episode)

    optimize_model(i_episode)

    # Decay exploration every 100 episodes after a short warm-up, but never
    # below EMIN so the agent keeps exploring a little for the whole run.
    if i_episode%100 == 0 and i_episode > BATCH_SIZE:
        EPSILON = max(EMIN, EPSILON * EDECAY)

# Push any buffered events to disk so the last scalars show up in TensorBoard.
writer.flush()
print('Complete')

In [None]:
Idea:
    Don't use a neural network (or any function approximation); use a state-action dictionary (tabular Q-values) instead.
    Give a negative reward for every dead end.
    Use forward-view TD(λ) to update the values.
    