In [6]:
import numpy as np
from time import sleep
from IPython.display import clear_output
import matplotlib.pyplot as plt
import pickle

In [7]:
# Load the previously saved Q-table from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load a Q_Table.pkl that was produced by this notebook.
# The context manager closes the handle even if unpickling fails; `pkl_file`
# stays bound afterwards (as a closed file) for any cell that still refers to it.
with open('Q_Table.pkl', 'rb') as pkl_file:
    Q_table = pickle.load(pkl_file)

In [8]:
# Grid dimensions.
rows = 5
cols = 5
# Special cells: an agent that lands on `a` is moved to `A` (reward 10) and an
# agent that lands on `b` is moved to `B` (reward 5).
a = (0, 1)
A = (4, 1)
b = (0, 3)
B = (2, 3)
# Drawn once, when this cell runs: every gridworld() created without an
# explicit position shares this starting cell until the cell is re-executed.
start_state = (np.random.randint(0, rows), np.random.randint(0, cols))
# Default number of steps performed by one run() call.
n_trails = 100


class gridworld:
    """Grid environment with two teleporting reward cells.

    The agent occupies one (row, col) cell. Each step it picks one of four
    moves; a move that would leave the grid keeps it in place and costs -1.
    Landing on `a` / `b` yields 10 / 5 and relocates the agent to `A` / `B`.

    Attributes:
        n_trails: number of steps executed by run().
        grid: zero-filled (rows, cols) array allocated at construction.
        previous_states: positions at the end of each step, seeded with the
            starting position.
        current_state: current (row, col) position.
        reward: reward produced by the most recent step.
        total_rewards: sum of all rewards handed out so far.
        all_actions, all_rewards, all_states: per-step history; all_states
            holds one-hot grid snapshots.
    """

    # Action code -> (row delta, column delta).
    _MOVES = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}

    def __init__(self, position=start_state, n_trails=n_trails):
        """Create an environment with the agent at `position`.

        Args:
            position: starting (row, col) tuple.
            n_trails: number of steps one run() call performs.
        """
        # duration
        self.n_trails = n_trails

        # creating the grid
        self.grid = np.zeros([rows, cols])

        # states
        # Seed the history with the real starting cell. A hard-coded (0, 0)
        # made the first step's wall-bump check in give_reward() and the
        # printed "old state" compare against a cell the agent never occupied.
        self.previous_states = [position]
        self.current_state = position

        self.reward = 0

        self.total_rewards = 0
        self.all_actions = []
        self.all_rewards = []
        self.all_states = []

    def move(self, action):
        """Return the cell reached by taking `action` from the current cell.

        0 = up
        1 = down
        2 = left
        3 = right

        The environment itself is not modified. If the move would leave the
        grid, the current cell is returned unchanged.

        Raises:
            ValueError: if `action` is not one of 0, 1, 2, 3.
        """
        try:
            d_row, d_col = self._MOVES[action]
        except (KeyError, TypeError):
            raise ValueError(
                'action must be one of 0, 1, 2, 3; got {!r}'.format(action)
            ) from None

        row = self.current_state[0] + d_row
        col = self.current_state[1] + d_col

        if 0 <= row <= rows - 1 and 0 <= col <= cols - 1:
            return (row, col)

        return self.current_state

    def give_reward(self):
        """Score the cell the agent just moved to and apply teleports.

        Returns:
            10 on `a` (agent is moved to `A`), 5 on `b` (agent is moved to
            `B`), -1 if the position equals the last recorded one (the move
            was blocked by the grid edge), otherwise 0.
        """
        if self.current_state == a:
            self.current_state = A
            return 10

        elif self.current_state == b:
            self.current_state = B
            return 5

        elif self.current_state == self.previous_states[-1]:
            return -1
        else:
            return 0

    def _snapshot(self):
        """Return a (rows, cols) array holding 1 at the agent's cell, 0 elsewhere."""
        temp = np.zeros([rows, cols])
        temp[self.current_state] = 1
        return temp

    def show(self):
        """Print a one-hot picture of the grid and record it in all_states."""
        temp = self._snapshot()
        print(temp)
        self.all_states.append(temp)

    def run(self, agent, delay=.5, render=True):
        """Let `agent` act for n_trails steps and return the cumulative reward.

        Each step calls agent.choose(), applies the move, scores it, records
        the step in the history lists, then calls agent.update().

        Args:
            agent: object providing choose() -> action code and update().
            delay: seconds to pause after drawing each step; only used when
                `render` is true.
            render: when true (the default) the cell output is cleared and the
                grid plus a step summary are printed every step. When false
                nothing is printed and no pause happens, but the grid snapshot
                is still appended to all_states.

        Returns:
            self.total_rewards, which keeps accumulating across run() calls on
            the same environment.
        """
        for _ in range(self.n_trails):

            action = agent.choose()
            self.current_state = self.move(action)

            self.reward = self.give_reward()
            self.total_rewards += self.reward

            if render:
                clear_output(wait=True)
                self.show()
                print('old state:{}, action:{}, new state:{}, reward:{}'.format(self.previous_states[-1], action, self.current_state, self.reward))
                if delay > 0:
                    sleep(delay)
            else:
                # Keep the state history complete even when nothing is drawn.
                self.all_states.append(self._snapshot())

            self.previous_states.append(self.current_state)
            self.all_actions.append(action)
            self.all_rewards.append(self.reward)

            agent.update()

        return self.total_rewards

        

In [9]:
class BaseSampliler:
    """Common state shared by every agent that acts on an environment.

    The constructor keeps a reference to the environment and copies the
    environment's position and reward as they are at construction time.
    The tuning parameters are accepted but not stored here.
    """

    def __init__(self, env, table_size=None, learning_rate=None, epsilon=None, discount=None):
        """Bind the agent to `env` and take an initial snapshot of it.

        Args:
            env: object exposing `current_state` and `reward` attributes.
            table_size, learning_rate, epsilon, discount: accepted for
                subclasses; ignored by this base class.
        """
        # Snapshot of the environment at the moment the agent is created.
        self.state, self.reward = env.current_state, env.reward
        # Most recently chosen action code; starts at 0.
        self.choice = 0
        self.env = env

In [10]:
class random(BaseSampliler):
    """Baseline agent that picks every action uniformly at random."""

    def __init__(self, env):
        """Attach the agent to `env`; no tuning parameters are needed."""
        BaseSampliler.__init__(self, env)

    def choose(self):
        """Draw an action code in 0..3, store it in `self.choice`, return it."""
        action = np.random.randint(0, 4)
        self.choice = action
        return action

    def update(self):
        """Do nothing: this agent does not learn from the environment."""
        return None

In [11]:
class q_learn(BaseSampliler):
    """Epsilon-greedy tabular Q-learning agent.

    The agent works on the module-level `Q_table` (it is updated in place, not
    copied). Actions are chosen greedily from the table except with
    probability `epsilon`, which shrinks by `decay` after every update until
    it reaches zero.
    """

    def __init__(self, env, table_size, learning_rate, epsilon, discount):
        """Set up the learning parameters.

        Args:
            env: environment exposing `current_state`, `reward` and `n_trails`.
            table_size: per-dimension size stored in `os_size`; only the
                commented-out random initialisation below would use it.
            learning_rate: step size of the Q-value update.
            epsilon: initial probability of taking a random action.
            discount: weight of the best Q-value of the successor state.
        """
        super().__init__(env, table_size, learning_rate, epsilon, discount)

        self.epsilon = epsilon
        self.start_epsililon = 1
        self.end_epsilon = self.env.n_trails // 2
        # Spread the decay over the first half of the run. For very short runs
        # the span is zero or negative; dividing by it would raise or make
        # epsilon grow, so drop epsilon to zero after the first update instead.
        decay_steps = self.end_epsilon - self.start_epsililon
        self.decay = epsilon / decay_steps if decay_steps > 0 else epsilon

        self.lr = learning_rate
        self.discount = discount

        self.low = np.array((0, 0))
        self.high = np.array((4, 4))
        self.table_size = table_size
        self.os_size = [self.table_size] * len(self.high)

        #self.qtable = np.random.uniform(low=-1, high=0, size=(self.os_size + [4]))
        # Shares the loaded table: updates made here are visible through Q_table.
        # NOTE(review): assumes Q_table is indexable as [row, col, action] - confirm.
        self.qtable = Q_table

    def choose(self):
        """Return an action code: greedy from the table, random with prob. epsilon."""
        if np.random.random() > self.epsilon:
            self.choice = np.argmax(self.qtable[self.state])
        else:
            self.choice = np.random.randint(0, 4)

        return self.choice

    def update(self):
        """Apply one Q-learning update for the step the environment just took.

        Reads the reward and the new position from the environment, then
        updates the entry of the state the action was taken FROM. The entry
        must be addressed through the remembered previous state: once
        `self.state` has been advanced it refers to the successor.
        """
        self.reward = self.env.reward

        previous_state = self.state
        self.state = self.env.current_state

        self.current_q = self.qtable[previous_state + (self.choice, )]
        self.future_q = np.max(self.qtable[self.state])

        self.new_q = (1-self.lr) * self.current_q + self.lr * (self.reward+ self.discount * self.future_q)

        self.qtable[previous_state + (self.choice, )] = self.new_q

        # Never let the exploration rate go below zero.
        self.epsilon = max(0.0, self.epsilon - self.decay)

In [12]:
# Baseline: let a uniformly random agent act on a newly created environment.
en0 = gridworld()
r = random(en0)
# run() returns the environment's cumulative reward, which the cell displays.
en0.run(agent=r)

[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]]
old state:(4, 1), action:3, new state:(4, 2), reward:0


16

In [13]:
# Run the Q-learning agent on its own environment. en0 already carries the
# random agent's total reward and history, so attaching the learner to it
# would mix both agents' results in one total.
en1 = gridworld()
q = q_learn(en1, table_size=25, learning_rate=.01, epsilon=.1, discount=.1)
en1.run(agent=q)

[[0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
old state:(2, 1), action:0, new state:(1, 1), reward:0


256

In [None]:
#Q_table = q.qtable

In [None]:
# Disabled on purpose: the snippet is wrapped in a string literal, so it does
# not execute. Unquoted, it would overwrite Q_Table.pkl with the current Q_table.
'''output = open('Q_Table.pkl', 'wb')
pickle.dump(Q_table, output)
output.close()
'''

In [None]:
# Cumulative reward stored on en0, summed over every run() call made on it.
en0.total_rewards