In [1]:
import gym
import sys
import numpy as np

from gym import error, spaces, utils
from copy import deepcopy

import matplotlib.pyplot as plt

from tqdm import tqdm

from collections import defaultdict




In [2]:
class Q_learning_agent(object):
    """Tabular agent that acts epsilon-greedily and learns with Q-learning.

    The action values live in ``self.Q``, a zero-initialised array of shape
    ``(obs_n, act_n)`` indexed as ``Q[state, action]``.

    Args:
        obs_n: number of rows (states) in the Q-table.
        act_n: number of columns (actions) in the Q-table.
        learning_rate: step size applied to each temporal-difference error.
        gamma: discount factor applied to the bootstrapped next-state value.
        e_greed: probability of taking a uniformly random action in ``sample``.
    """

    def __init__(self, obs_n, act_n = 4, learning_rate=0.01, gamma=0.9, e_greed=0.1):
        self.act_n = act_n
        self.epsilon = e_greed
        self.gamma = gamma
        self.lr = learning_rate
        self.Q = np.zeros((obs_n, act_n))

        # Announce the configuration once at construction time.
        pieces = [
            "Q_LearningAgent with state number = ", str(obs_n),
            ", learning rate = ", str(learning_rate),
            ", gamma = ", str(gamma),
            " created.",
        ]
        print("".join(pieces))

    def sample(self, S):
        """Return an epsilon-greedy action for state ``S``.

        With probability ``1 - epsilon`` the greedy action from ``predict`` is
        used; otherwise an action index is drawn uniformly from ``act_n``.
        """
        if np.random.uniform(0, 1) < (1.0 - self.epsilon):
            return self.predict(S)
        return np.random.choice(self.act_n)

    def predict(self, S):
        """Return a greedy action for state ``S``, breaking ties at random."""
        row = self.Q[S]
        top = np.max(row)
        # Every action whose value equals the maximum is an equally good pick.
        candidates = np.nonzero(row == top)[0]
        return np.random.choice(candidates)

    def learn(self, S, action, reward, S_prime, action_prime, done):
        """Apply one Q-learning update to ``Q[S, action]``.

        The target is ``reward`` alone when ``done`` is truthy, otherwise
        ``reward + gamma * max(Q[S_prime])``. ``action_prime`` is accepted but
        not used by the update.
        """
        current = self.Q[S, action]
        target = reward if done else reward + self.gamma * np.max(self.Q[S_prime])
        td_error = target - current
        self.Q[S, action] += self.lr * td_error

In [3]:
def run_episode(env, agent):
    """Play one episode, letting the agent learn after every step.

    Args:
        env: object exposing ``reset()`` (returns the first state) and
            ``step(action)`` (returns ``(state, reward, done, info)``).
        agent: object exposing ``sample(state)`` and
            ``learn(S, action, reward, S_prime, action_prime, done)``.

    Returns:
        The sum of the rewards received during the episode.
    """
    state = env.reset()
    chosen = agent.sample(state)

    episode_return = 0
    finished = False
    while not finished:
        next_state, reward, finished, _ = env.step(chosen)
        # The follow-up action is drawn before the update and reused next step.
        upcoming = agent.sample(next_state)

        agent.learn(state, chosen, reward, next_state, upcoming, finished)

        episode_return += reward
        state, chosen = next_state, upcoming

    return episode_return

In [4]:
def main(env_name="CliffWalking-v0", n_runs=20, n_episodes=500):
    """Train tabular Q-learning agents and collect their episode rewards.

    The agent stores its values in a table indexed by an integer state, so the
    environment must have ``Discrete`` observation and action spaces. A
    continuous-observation environment such as ``MountainCar-v0`` exposes a
    ``Box`` space with no ``.n`` attribute and cannot be used without first
    discretising its observations.

    Args:
        env_name: id passed to ``gym.make``.
        n_runs: number of independent training runs.
        n_episodes: number of episodes played in each run.

    Returns:
        A list with one entry per run; each entry is the list of total rewards
        of that run's episodes, in order.

    Raises:
        ValueError: if the environment's observation or action space is not
            ``Discrete``.
    """
    env = gym.make(env_name)
    try:
        obs_space = env.observation_space
        act_space = env.action_space
        if not (isinstance(obs_space, gym.spaces.Discrete)
                and isinstance(act_space, gym.spaces.Discrete)):
            raise ValueError(
                "Tabular Q-learning needs Discrete observation and action spaces, "
                "but " + str(env_name) + " has observation space " + str(obs_space)
                + " and action space " + str(act_space) + "."
            )

        q_reward_history = []
        for _ in range(n_runs):
            # A fresh agent per run keeps the runs independent; reusing one
            # agent would let later runs start from an already-trained table.
            q_agent = Q_learning_agent(obs_n=obs_space.n, act_n=act_space.n,
                learning_rate=0.5, gamma=0.9, e_greed=0.1)

            q_reward_list = [run_episode(env, q_agent) for _ in range(n_episodes)]
            q_reward_history.append(q_reward_list)
    finally:
        # Release the environment even when training fails part-way through.
        env.close()

    return q_reward_history

In [None]:
# Run the experiment; main() returns a list holding one list of per-episode
# total rewards for each outer run.
reward_history = main()