In [1]:
from Env import Env
import math
import random
import numpy as np
import pandas as pd
import pickle
from collections import namedtuple
from collections import defaultdict
from torch.utils.tensorboard import SummaryWriter

In [2]:
# Scalar tags passed to writer.add_scalar() at the end of every episode.
tag_reward = "reward"
tag_se = "states_explored"

# Files written by save_pickle() and read back by load_pickle().
dict_tuple2index_path = "dict_tuple2index.pkl"
dataframe_path = "dataframe.pkl"


In [3]:
# The integers 0 .. Env.ACTION_N - 1.
# NOTE(review): not referenced by any other cell shown here -- confirm it is still needed.
action_range = list(range(Env.ACTION_N))

In [4]:
# Hyperparameters and table-initialisation constants.
# NOTE(review): BATCH_SIZE, EPSILON, ESUB, EMIN, EDECAY, STATE_N, LAMBDA_S,
# LAMBDA_F, MINREWARD and MINREWARD_INCREMENT are not referenced by any live
# code in the cells shown here -- confirm whether they are still needed.
BATCH_SIZE = 128
GAMMA      = 0.9 # discount factor
EPSILON    = 1
ESUB       = 0.01
EMIN       = 0.025
LEARN_RATE = 0.001 # step size applied to the TD error in the training loop
EDECAY = 0.99

STATE_N  = Env.DIM * Env.DIM
ACTION_N = Env.ACTION_N # number of columns in the Q-table / trace frames

NUM_EPISODES = 1000000

# Trace decay: the training loop multiplies visited traces by LAMBDA * GAMMA.
LAMBDA = 0.5   # in between MC and TD(0)
LAMBDA_S = 0.9 # closer to MC
LAMBDA_F = 0.1 # closer to TD(0)

MINREWARD = 25
MINREWARD_INCREMENT = 1
INIT_ROW = [-500]*Env.ACTION_N # all actions invalid to start with
ET_INIT = [0]*Env.ACTION_N # 0 trace to start with
# Initial value get_index() writes for every valid action of a new state.
# NOTE(review): the name is misspelled ("INTI"); get_index() uses this spelling too.
OPTIMISTIC_INTI_VAL = 12

In [5]:
# Environment instance used by select_action() and by the training loop.
env = Env()

In [6]:
def np2tuple(np_array_2d):
    """Flatten an array into a tuple of its elements so it can be used as a dict key."""
    flattened = np_array_2d.ravel()
    return tuple(flattened)

In [7]:
# Maps a flattened board tuple (see np2tuple) to the row label that get_index()
# assigned to it. Missing keys default to -1; because this is a defaultdict, a
# plain `tuple2index[key]` lookup of a missing key also inserts it.
tuple2index = defaultdict(lambda:-1)

In [8]:
# Column labels (0 .. ACTION_N - 1) shared by the Q-table and the trace frames.
action_header = list(range(ACTION_N))

In [9]:
# df: one row of action values per registered state; rows are added by get_index().
df = pd.DataFrame(columns=action_header, dtype='float64')
# ets / etf: only referenced from commented-out code in the cells shown here.
ets = pd.DataFrame(columns=action_header, dtype='float64')
etf = pd.DataFrame(columns=action_header, dtype='float64')
# et: eligibility traces, same columns as df; rows are added alongside df's.
et = pd.DataFrame(columns=action_header, dtype='float64')

In [10]:
def get_index(np_mat):
    """Return the row label of a board state, registering the state if it is new.

    For a state that has not been seen before, a new row is appended to both
    `df` and `et`: every action starts at the INIT_ROW / ET_INIT value, and the
    actions reported valid by a scratch `Env` set to this board are raised to
    OPTIMISTIC_INTI_VAL in `df`.

    Parameters
    ----------
    np_mat : array-like board state accepted by `np2tuple` and `Env.set`.

    Returns
    -------
    int
        The row label of the state in `df` / `et`.
    """
    tup = np2tuple(np_mat)

    # Probe with .get() so that checking for a state never inserts a
    # placeholder entry into the defaultdict.
    known = tuple2index.get(tup, -1)
    if known != -1:
        return known

    ind = len(tuple2index)
    tuple2index[tup] = ind
    df.loc[ind] = INIT_ROW
    et.loc[ind] = ET_INIT
    # ets.loc[ind] = ET_INIT
    # etf.loc[ind] = ET_INIT

    temp_env = Env()
    temp_env.set(board_state = np_mat)
    for action_number in temp_env.get_valid_actions():
        # Single-step .at write: chained `df[col][row] = value` assigns through
        # a temporary and is not guaranteed to reach `df`.
        df.at[ind, action_number] = OPTIMISTIC_INTI_VAL

    return ind

In [11]:
def select_action(np_state, ep = 0):
    """Pick an action for `np_state` with an epsilon-greedy rule.

    Parameters
    ----------
    np_state : board state accepted by `get_index`.
    ep : float, default 0
        Probability of returning `env.sample_valid_action()` instead of the
        greedy action. Note that the sample comes from the global `env`, not
        from `np_state`.

    Returns
    -------
    Either the sampled action or the column label with the largest value in
    the state's row of `df` (first one on ties).
    """
    if random.random() < ep:
        return env.sample_valid_action()

    # Resolve the row through get_index() so an unseen state is registered
    # instead of leaving a -1 placeholder in tuple2index and reading the
    # *last* row of df via iloc[-1]. Label-based .loc matches how get_index()
    # creates the rows.
    return df.loc[get_index(np_state)].idxmax()

In [23]:
def save_pickle():
    """Write the Q-table and the state-to-row mapping to their pickle files.

    `df` goes to `dataframe_path`; `tuple2index` goes to
    `dict_tuple2index_path` as a plain dict, because a defaultdict whose
    factory is a lambda cannot be pickled.
    """
    df.to_pickle(dataframe_path)

    # The with-block closes the file; no explicit close() is needed.
    with open(dict_tuple2index_path, 'wb') as f:
        pickle.dump(dict(tuple2index), f)
    

In [24]:
def load_pickle():
    """Restore the Q-table and state mapping written by `save_pickle`.

    Rebinds the module-level `df` and `tuple2index`, and rebuilds `et` as an
    all-zero frame with the same rows and columns as the loaded `df`, so that
    trace updates for previously saved states find their row.

    NOTE: unpickling executes arbitrary code; only load files this notebook
    wrote itself.
    """
    # Without this declaration the assignments below would only create locals
    # and the loaded data would be thrown away on return.
    global df, et, tuple2index

    # Read both files before rebinding anything, so a failed load leaves the
    # current in-memory state untouched.
    loaded_df = pd.read_pickle(dataframe_path)
    with open(dict_tuple2index_path, "rb") as f:
        loaded_mapping = pickle.load(f)

    df = loaded_df
    tuple2index = defaultdict(lambda:-1, loaded_mapping)
    et = pd.DataFrame(0.0, index=df.index, columns=df.columns)
    

In [19]:
# TensorBoard writer that receives the per-episode scalars from the training loop.
writer = SummaryWriter(comment='__')

In [20]:
### Training loop: tabular SARSA(lambda)-style updates of `df` using the
### eligibility traces in `et`. Actions are chosen greedily (select_action's
### default ep=0).

# Marker that get_index() stores for actions that are not valid in a state;
# such entries are never updated.
invalid_q = INIT_ROW[0]

for i_episode in range(NUM_EPISODES):
    state = env.reset()
    done = False
    total_reward = 0

    # Traces do not carry over between episodes. Assign through .iloc rather
    # than through `.values`, which may be a read-only or detached array.
    if len(et):
        et.iloc[:, :] = 0.0

    while not done:
        index = get_index(state)

        action = select_action(state)
        next_state, reward, done = env.step(action)
        next_index = get_index(next_state)
        # Single-step .at write; chained `et[action][index] += 1` is not
        # guaranteed to write through to `et`.
        et.at[index, action] += 1

        if done:
            on_board = env.get_count()
            if on_board == 1:
                # game success
                reward = reward*100
            else:
                # wrong solution
                # negative reward
                reward = -(on_board*10)
            # A terminal state has no future return. Bootstrapping from its
            # row would pull in the invalid-action marker instead of a value.
            next_q = 0.0
        else:
            next_action = select_action(next_state)
            next_q = df.at[next_index, next_action]

        delta = (reward + (next_q * GAMMA)) - df.at[index, action]

        # Vectorised form of "for every (state, action) with a non-zero trace
        # and a valid action: Q += delta * trace * LEARN_RATE, then
        # trace *= LAMBDA * GAMMA". Relies on df and et having identical
        # rows and columns, which get_index() maintains.
        q_values = df.to_numpy()
        traces = et.to_numpy()
        active = (traces != 0) & (q_values != invalid_q)
        df.iloc[:, :] = np.where(active, q_values + (delta*traces*LEARN_RATE), q_values)
        et.iloc[:, :] = np.where(active, traces * LAMBDA * GAMMA, traces)

        total_reward += reward
        state = next_state

    writer.add_scalar(tag_reward, total_reward, i_episode)
    writer.add_scalar(tag_se, len(tuple2index), i_episode)

    # Checkpoint periodically so an interrupted run can be resumed.
    if i_episode % 50 == 0:
        save_pickle()

print('Complete')

KeyboardInterrupt: 

In [15]:
# Plain-text dump of the whole Q-table.
# NOTE(review): this prints every row; df.head() would keep the output short.
print(df)

         0           1           2           3           4      5   \
0    -500.0 -500.000000 -500.000000 -500.000000 -500.000000 -500.0   
1    -500.0 -500.000000 -500.000000 -500.000000 -500.000000 -500.0   
2    -500.0   11.997823 -500.000000 -500.000000 -500.000000 -500.0   
3    -500.0 -500.000000 -500.000000   11.999636 -500.000000 -500.0   
4    -500.0 -500.000000 -500.000000 -500.000000 -500.000000 -500.0   
5    -500.0 -500.000000 -500.000000 -500.000000 -500.000000 -500.0   
6    -500.0 -500.000000 -500.000000 -500.000000 -500.000000 -500.0   
7    -500.0 -500.000000 -500.000000 -500.000000 -500.000000 -500.0   
8    -500.0 -500.000000 -500.000000 -500.000000 -500.000000 -500.0   
9    -500.0   11.999633 -500.000000 -500.000000 -500.000000 -500.0   
10   -500.0 -500.000000 -500.000000 -500.000000 -500.000000 -500.0   
11   -500.0 -500.000000 -500.000000 -500.000000 -500.000000 -500.0   
12   -500.0 -500.000000 -500.000000 -500.000000 -500.000000 -500.0   
13   -500.0 -500.000