In [76]:
import gym
import numpy as np
import pandas as pd
import time
import pickle

In [55]:
epsilon = 1.0
min_epsilon = 0.1
max_epsilon = 1.0
decay_rate = 0.01

class QTable:

    def __init__(self):
        self.actions = [0, 1, 2, 3]
        self.total_episodes = 5000
        self.max_steps = 100

        self.lr_rate = 0.5
        self.gamma = 0.9

        self.Q = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def check_state_exist(self, state):
        # Check state exists in Q table. if not exist then add to Q
        if state not in self.Q.index:
            actions = [0 for _ in range(len(self.actions))]
            self.Q = self.Q.append(pd.Series(actions, index=self.Q.columns, name=state))
        
    def choose_action(self, state):
        self.check_state_exist(state)
        
        action=0
        if np.random.uniform() < epsilon:
            action = np.random.choice(self.actions)
        else:
            action = self.Q.loc[state, :].idxmax()
#             action = action.reindex(np.random.permutation(action.index))  # some actions have same value
#             action = action.idxmax()

        return action

    def learn(self, state, state2, reward, action):
        self.check_state_exist(state2)
        predict = self.Q.loc[state, action]
        target = reward + self.gamma * self.Q.loc[state2, :].max()

        self.Q.loc[state, action] += self.lr_rate * (target - predict)


In [85]:
env = gym.make('FrozenLake-v0')

epsilon = 1.0
min_epsilon = 0.1
max_epsilon = 1.0
decay_rate = 0.01

actions = [0, 1, 2, 3]
total_episodes = 10000
max_steps = 100

lr_rate = 0.81
gamma = 0.96

Q = np.zeros((env.observation_space.n, env.action_space.n))
    
def choose_action(state):
    action=0
    if np.random.uniform(0, 1) < epsilon:
        action = env.action_space.sample()
    else:
        action = np.argmax(Q[state, :])

    return action

def learn(state, state2, reward, action):

    predict = Q[state, action]
    target = reward + gamma * np.max(self.Q[state2, :])

    Q[state, action] = Q[state, action] + lr_rate * (target - predict)


In [86]:
# Start
rewards=[]

for episode in range(total_episodes):
    state = env.reset()
    tot_rew = 0
    t = 0
    
    while t < max_steps:
        # env.render()

        action = choose_action(state)  

        state2, reward, done, info = env.step(action)  

        learn(state, state2, reward, action)

        state = state2

        t += 1
        tot_rew+=reward

        if done:
            break
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode) 
    rewards.append(tot_rew)
    
print ("Score over time: ", sum(rewards)/total_episodes)
print(Q)

Score over time:  0.1331
[[1.59844755e-01 1.23072684e-01 1.30413746e-01 1.16834840e-01]
 [2.57183743e-02 4.12584634e-02 1.85656106e-02 9.22117616e-02]
 [8.86780140e-02 4.26631353e-02 5.92691758e-02 8.40638281e-02]
 [1.24582771e-02 1.06169112e-01 1.00952986e-01 1.02281922e-01]
 [3.08780670e-01 1.50045784e-01 6.69555636e-03 4.95605658e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.87802144e-02 1.53976283e-02 1.31417559e-02 1.84268819e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.50627160e-01 6.44913114e-02 8.85241828e-05 5.69597815e-01]
 [8.66379151e-02 8.05619370e-01 1.09604505e-01 2.93750170e-02]
 [4.55046012e-01 1.43432796e-02 3.14263534e-02 6.42849485e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.39114061e-01 5.21447109e-01 9.32388952e-01 1.32950456e-01]
 [6.82841174e-01 8.21729473e-01 9.98351046e-01 2.75084473e-01]
 [0.00000000e+00 0.00000000e+0

In [88]:
map = []
for i in range(16):
    map.append(np.argmax(QT.Q[i, :]))
act = {0:"L", 1:"D", 2:"R", 3:"U"}
for i in range(len(map)):
    print(act[map[i]], end=' ')
    if (i+1)%4==0:
        print('\n')
    
# pd.to_pickle(QT.Q, "frozenLake_qTable.pkl")
with open("frozenLake_qTable.pkl", 'wb') as f:
    pickle.dump(QT.Q, f)

print("DONE !!!")


L U L D 

L L L L 

U D L L 

L R R L 

DONE !!!


In [79]:
rewards

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0