In [3]:
# import libraries
import gym
import collections
from tensorboardX import SummaryWriter

# import global
ENV   = 'FrozenLake-v1'
GAMMA = 0.9
ALPHA = 0.2
NUM_TEST_EP = 20
video_frames = []

In [5]:
# Agent Class
class Agent:
    def __init__(self,env_name):
        self.env = gym.make(env_name)
        self.cur_state = self.env.reset()
        self.q_table = collections.defaultdict(float)

    # take a random single step in the enviornment
    def random_step_env(self):
        action = self.env.action_space.sample()
        old_state = self.cur_state
        new_state, reward, done, _ = self.env.step(action=action)
        if done:
            self.cur_state = self.env.reset()
        else:
            self.cur_state = new_state
        return old_state,action,reward,new_state
    
    # find best action in a given state
    def best_action(self,state):
        best_action,best_action_value = None,None
        for action in range(self.env.action_space.n):
            action_value = self.q_table[(state,action)]
            if best_action is None or best_action_value<action_value:
                best_action = action
                best_action_value = action_value
        return best_action,best_action_value
    
    # update the q_table
    def update_q_table(self,state,action,reward,new_state):
        _, q_star = self.best_action(new_state)
        new_value = reward + GAMMA*q_star
        old_value = self.q_table[(state,action)]
        self.q_table[(state,action)] = old_value * (1-ALPHA) + new_value * ALPHA
    
    # play an episode of the enviornment
    def play_episode(self,env,render=False):
        total_reward = 0.0
        state = env.reset()
        done = False

        while not done:
            action, _ = self.best_action(state)
            new_state, reward, done, _ = env.step(action)
            total_reward += reward
            state = new_state
            if render:
                env.render()
                video_frames.append(env.render(mode='rgb_array'))
        return total_reward

if __name__ == "__main__":
    test_env = gym.make(ENV)
    agent = Agent(ENV)
    writer = SummaryWriter(comment='-q-learning-tabular')

    itr_num = 0
    best_reward = 0.0
    random_step_num = 100

    while True:
        itr_num+=1
        for _ in range(random_step_num):
            old_state,action,reward,new_state = agent.random_step_env()
            agent.update_q_table(old_state,action,reward,new_state)
        
        reward = 0.0
        for _ in range(NUM_TEST_EP):
            reward+=agent.play_episode(test_env)
        reward/=NUM_TEST_EP
        writer.add_scalar("reward",reward,itr_num)
        if reward>best_reward:
            print('Reward updated from '+str(best_reward)+' to '+str(reward))
            best_reward = reward
        if reward>0.9:
            print('Iteration complete at : '+str(itr_num))
            break
    agent.play_episode(test_env,True)
    writer.close()


Reward updated from 0.0 to 0.1
Reward updated from 0.1 to 0.3
Reward updated from 0.3 to 0.45
Reward updated from 0.45 to 0.6
Reward updated from 0.6 to 0.75
Reward updated from 0.75 to 0.85
Reward updated from 0.85 to 0.95
Iteration complete at : 296


In [6]:
# Save the Episode as video
import numpy as np
import cv2
print(len(video_frames))
arr = np.asarray(video_frames)
size = (256,256)
out = cv2.VideoWriter('project_brown.mp4',cv2.VideoWriter_fourcc(*'DIVX'),15,size)
for i in range(len(video_frames)):
    rgb_img = cv2.cvtColor(arr[i],cv2.COLOR_RGB2BGR)
    out.write(rgb_img)
out.release()


50


OpenCV: FFMPEG: tag 0x58564944/'DIVX' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'
