# Q-learning

# エージェントを動かす
0: 左へおす\
1: 何もしない\
2: 右へおす
```
action = 0  # 0, 1, 2 のいずれかの整数
env.step(action)
```

In [1]:
import gym
from gym import wrappers
import numpy as np
# Build the MountainCar environment and wrap it in a Monitor that writes its
# results under ./gym-results/QL (force=True is passed to the Monitor).
env = wrappers.Monitor(
    gym.make('MountainCar-v0'),
    './gym-results/QL',
    force=True,
)

In [2]:
# Reset the environment to obtain the initial observation, then display it.
observation = env.reset()
print(observation)

[-0.53406627  0.        ]


In [3]:
GAMMA = 0.99  # discount factor applied to future rewards
ALPHA = 0.2  # learning rate
DIVISOR = 40  # number of bins each observation dimension is split into


# Q-learning logic
class QL_Brain:
    """Tabular Q-learning over a discretised (position, velocity) state.

    The Q-table has shape ``(DIVISOR, DIVISOR, num_actions)`` and is indexed
    as ``q_table[position_bin][velocity_bin][action]``.
    """

    def __init__(self, env, num_states, num_actions):
        """Store the environment and allocate a zero-initialised Q-table.

        Args:
            env: Object exposing ``observation_space.low`` and
                ``observation_space.high`` (indexable per dimension).
            num_states: Number of observation dimensions (stored only).
            num_actions: Number of discrete actions; size of the last
                Q-table axis.
        """
        self.env = env
        self.num_states = num_states
        self.num_actions = num_actions
        # Q-table: car position x car velocity x action
        self.q_table = np.zeros((DIVISOR, DIVISOR, num_actions))

    def get_status(self, observation):
        """Convert a continuous observation into discrete bin indices.

        Args:
            observation: Sequence whose first two entries are the car
                position and the car velocity.

        Returns:
            Tuple ``(position_bin, velocity_bin)``, each clamped to
            ``[0, DIVISOR - 1]``.
        """
        env_low = self.env.observation_space.low
        env_high = self.env.observation_space.high
        bin_width = (env_high - env_low) / DIVISOR
        position = int((observation[0] - env_low[0]) / bin_width[0])
        velocity = int((observation[1] - env_low[1]) / bin_width[1])
        # An observation sitting exactly on the upper bound maps to index
        # DIVISOR, which is outside the table; one below the lower bound
        # would give a negative index that silently wraps around. Clamp both.
        position = min(max(position, 0), DIVISOR - 1)
        velocity = min(max(velocity, 0), DIVISOR - 1)
        return position, velocity

    def update_q_table(self, action, observation, next_observation, reward):
        """Apply one Q-learning update for the given transition.

        Q(s, a) <- Q(s, a) + ALPHA * (reward + GAMMA * max_a' Q(s', a') - Q(s, a))

        Args:
            action: Index of the action that was taken.
            observation: Observation before the action.
            next_observation: Observation after the action.
            reward: Reward received for the transition.

        Returns:
            The (mutated in place) Q-table.
        """
        # Highest action value available from the state reached by the action
        next_position, next_velocity = self.get_status(next_observation)
        next_max_q_value = self.q_table[next_position, next_velocity].max()
        # Current action value of the state the action was taken from
        position, velocity = self.get_status(observation)
        q_value = self.q_table[position, velocity, action]
        # Move the estimate towards the bootstrapped target
        td_target = reward + GAMMA * next_max_q_value
        self.q_table[position, velocity, action] = (
            q_value + ALPHA * (td_target - q_value)
        )

        return self.q_table

    def decide_action(self, observation):
        """Choose an action with an epsilon-greedy policy.

        Args:
            observation: Current observation.

        Returns:
            Index of the chosen action.
        """
        epsilon = 0.002
        # With probability 1 - epsilon (99.8%) exploit the best known action
        if np.random.uniform(0, 1) > epsilon:
            position, velocity = self.get_status(observation)
            action = np.argmax(self.q_table[position, velocity])
        else:
            # With probability epsilon (0.2%) explore uniformly at random
            action = np.random.choice(self.num_actions)
        return action

In [4]:
# Agent
class Agent:
    """Thin facade that forwards every request to a ``QL_Brain``."""

    def __init__(self, env, num_states, num_actions):
        """Create the underlying brain from the given environment and sizes."""
        brain = QL_Brain(env, num_states, num_actions)
        self.brain = brain

    def update_q_function(self, action, observation, next_observation, reward):
        """Forward one transition to the brain's Q-table update."""
        transition = (action, observation, next_observation, reward)
        self.brain.update_q_table(*transition)

    def get_action(self, observation):
        """Return the action the brain selects for ``observation``."""
        return self.brain.decide_action(observation)

In [5]:
NUM_EPISODES = 10000  # number of training episodes
MAX_STEPS = 200  # maximum number of steps per episode


class Environment:
    """Owns the MountainCar environment and the agent, and runs training."""

    def __init__(self):
        """Create the monitored environment, the agent, and the reward log."""
        self.env = gym.make('MountainCar-v0')
        # Wrap the environment created just above. Passing a module-level
        # `env` here would discard it and reuse an unrelated instance.
        self.env = wrappers.Monitor(self.env, './gym-results/QL', force=True)  # saves recordings
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.n
        self.rewards = []  # total reward of each finished episode, in order
        self.agent = Agent(self.env, self.num_states, self.num_actions)

    def run(self, num_episodes=None, max_steps=None):
        """Train the agent, appending each episode's total reward to ``self.rewards``.

        Args:
            num_episodes: Number of episodes to run; defaults to the
                module-level ``NUM_EPISODES`` at call time.
            max_steps: Step budget per episode; defaults to the module-level
                ``MAX_STEPS`` at call time.
                NOTE(review): a budget that ends an episode before the
                environment reports ``done`` may not be accepted by the
                Monitor wrapper on the next reset - confirm before lowering.
        """
        if num_episodes is None:
            num_episodes = NUM_EPISODES
        if max_steps is None:
            max_steps = MAX_STEPS

        for episode in range(num_episodes):
            total_reward = 0
            observation = self.env.reset()  # start a fresh episode
            for _ in range(max_steps):
                # Pick an action with the agent's epsilon-greedy policy
                action = self.agent.get_action(observation)
                # Apply it and read back the next observation, the reward
                # and the end-of-episode flag; the info dict is not used.
                next_observation, reward, done, _info = self.env.step(action)

                # Update the Q-table with this transition
                self.agent.update_q_function(action, observation, next_observation, reward)
                total_reward += reward
                observation = next_observation
                if done:
                    # The environment signalled the end of the episode
                    break

            # Record every episode, including ones that used up the step
            # budget without `done`, so the log has one entry per episode.
            if episode % 100 == 0:
                print('episode: {}, total_reward: {}'.format(episode, total_reward))
            self.rewards.append(total_reward)

In [6]:
mountaincar_env = Environment()
mountaincar_env.run()

episode: 0, total_reward: -200.0
episode: 100, total_reward: -200.0
episode: 200, total_reward: -200.0
episode: 300, total_reward: -200.0
episode: 400, total_reward: -200.0
episode: 500, total_reward: -200.0
episode: 600, total_reward: -200.0
episode: 700, total_reward: -200.0
episode: 800, total_reward: -200.0
episode: 900, total_reward: -200.0
episode: 1000, total_reward: -200.0
episode: 1100, total_reward: -200.0
episode: 1200, total_reward: -200.0
episode: 1300, total_reward: -200.0
episode: 1400, total_reward: -200.0
episode: 1500, total_reward: -200.0
episode: 1600, total_reward: -200.0
episode: 1700, total_reward: -192.0
episode: 1800, total_reward: -200.0
episode: 1900, total_reward: -200.0
episode: 2000, total_reward: -200.0
episode: 2100, total_reward: -200.0
episode: 2200, total_reward: -200.0
episode: 2300, total_reward: -200.0
episode: 2400, total_reward: -200.0
episode: 2500, total_reward: -200.0
episode: 2600, total_reward: -199.0
episode: 2700, total_reward: -198.0
epis

KeyboardInterrupt: 

In [None]:
import os

import matplotlib.pyplot as plt

# Plot the total reward of every recorded episode and save the figure.
rewards = mountaincar_env.rewards
plt.figure(figsize=(10,6))
plt.title('Q-Learning: Reward Transition')
# Size the x axis from the rewards actually recorded: if training was
# interrupted there are fewer than NUM_EPISODES entries, and plotting against
# range(NUM_EPISODES) would fail on mismatched lengths.
x = list(range(len(rewards)))
plt.plot(x, rewards)
plt.ylabel('total_rewards')
plt.xlabel('episodes')
plt.grid()
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('./fig/QL', exist_ok=True)
plt.savefig('./fig/QL/QL_RT.png')
plt.show()