In [1]:
import gym 
import random

In [2]:
env = gym.make('SpaceInvaders-v0')
height, width, channels = env.observation_space.shape
actions = env.action_space.n

In [3]:
env.unwrapped.get_action_meanings()

['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']

In [25]:
height

210

In [26]:
width

160

In [27]:
channels

3

In [72]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Convolution2D
from tensorflow.keras.optimizers import Adam

In [73]:
def build_model(height, width, channels, actions):
    model = Sequential()
    model.add(Convolution2D(32, (8,8), strides=(4,4), activation='relu', input_shape=(3,height, width, channels)))
    model.add(Convolution2D(64, (4,4), strides=(2,2), activation='relu'))
    model.add(Convolution2D(64, (3,3), activation='relu'))
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

In [74]:
model = build_model(height, width, channels, actions)

In [75]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_15 (Conv2D)           (None, 3, 51, 39, 32)     6176      
_________________________________________________________________
conv2d_16 (Conv2D)           (None, 3, 24, 18, 64)     32832     
_________________________________________________________________
conv2d_17 (Conv2D)           (None, 3, 22, 16, 64)     36928     
_________________________________________________________________
flatten_5 (Flatten)          (None, 67584)             0         
_________________________________________________________________
dense_21 (Dense)             (None, 512)               34603520  
_________________________________________________________________
dense_22 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_23 (Dense)             (None, 6)                

In [76]:
from rl.agents import DQNAgent, SARSAAgent
from rl.memory import SequentialMemory
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy, BoltzmannQPolicy

In [77]:
total_episodes = 3000
total_test_episodes = 10
max_steps = 10000
learning_rate = 0.01
Gamma = 0.9
epsilon = 1.0
min_epsilon = 0.1

In [20]:
def build_agent(model, actions):
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=epsilon, value_min=min_epsilon, value_test=.2, nb_steps=max_steps)
    memory = SequentialMemory(limit=1000, window_length=3)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                  enable_dueling_network=True, dueling_type='avg', 
                   nb_actions=actions, nb_steps_warmup=100,
                   gamma=Gamma
                  )
    return dqn

In [42]:
del model

In [21]:
dqn = build_agent(model, actions)
dqn.compile(Adam(lr=learning_rate))

In [22]:
dqn.fit(env, nb_steps=total_episodes, visualize=False, verbose=2)

Training for 3000 steps ...
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
  701/3000: episode: 1, duration: 168.648s, episode steps: 701, steps per second:   4, episode reward: 210.000, mean reward:  0.300 [ 0.000, 30.000], mean action: 2.466 [0.000, 5.000],  loss: 7082852.646556, mean_q: 188.187017, mean_eps: 0.963955
 1118/3000: episode: 2, duration: 124.704s, episode steps: 417, steps per second:   3, episode reward: 125.000, mean reward:  0.300 [ 0.000, 25.000], mean action: 2.595 [0.000, 5.000],  loss: 1.726446, mean_q: 4.220435, mean_eps: 0.918190
 1949/3000: episode: 3, duration: 252.506s, episode steps: 831, steps per second:   3, episode reward: 135.000, mean reward:  0.162 [ 0.000, 25.000], mean action: 2.566 [0.000, 5.000],  loss: 1.266827, mean_q: 4.011137, mean_eps: 0.862030
 2789/3000: episode: 4, duration: 247.854s, episode steps: 840, steps per second:   3, episode reward: 200.000, mean reward:  0.23

<tensorflow.python.keras.callbacks.History at 0x160be1ccf48>

In [23]:
scores = dqn.test(env, nb_episodes=total_test_episodes, visualize=True)
print(np.mean(scores.history['episode_reward']))

Testing for 10 episodes ...
Episode 1: reward: 210.000, steps: 1013
Episode 2: reward: 110.000, steps: 802
Episode 3: reward: 395.000, steps: 1356
Episode 4: reward: 135.000, steps: 828
Episode 5: reward: 430.000, steps: 1004
Episode 6: reward: 275.000, steps: 1261
Episode 7: reward: 80.000, steps: 689
Episode 8: reward: 75.000, steps: 412
Episode 9: reward: 140.000, steps: 762
Episode 10: reward: 115.000, steps: 687
196.5


In [24]:
dqn.save_weights('Save/3000/dqn.h5f')

In [None]:
dqn.load_weights('Save/3000/dqn.h5f')

### 1. Establish a baseline performance. How well did your Deep Q-learning do on your problem?

total_episodes = 3000
total_test_episodes = 10
max_steps = 10000
learning_rate = 0.01
Gamma = 0.9
epsilon = 1.0
min_epsilon = 0.1

Scored average 196.5 point in the test

### 2. What are the states, the actions, and the size of the Q-table?

The action is one of noop,fire,right,left,rightfire,leftfire.Corresponding numbers 0-5.  
The state is the picture information under the current frame.height 210,width 160, channel 3  
The q-table is simulated by a neural network consisting of three convolution layers and two dense layers.

### 3. What are the rewards? Why did you choose them? 

reward is score of game(3 life), Because the score is the most important thing in a atari game

### 4. How did you choose alpha and gamma in the Bellman equation? Try at least one additional value for alpha and gamma. How did it change the baseline performance?

0.01 and 0.9  

In [34]:
learning_rate = 0.1
Gamma = 0.99

In [35]:
def build_agent(model, actions):
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=epsilon, value_min=min_epsilon, value_test=.2, nb_steps=max_steps)
    memory = SequentialMemory(limit=1000, window_length=3)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                  enable_dueling_network=True, dueling_type='avg', 
                   nb_actions=actions, nb_steps_warmup=100,
                   gamma=Gamma
                  )
    return dqn

In [36]:
dqn = build_agent(model, actions)
dqn.compile(Adam(lr=learning_rate))

In [37]:
dqn.fit(env, nb_steps=total_episodes, visualize=False, verbose=2)

Training for 3000 steps ...
  397/3000: episode: 1, duration: 80.559s, episode steps: 397, steps per second:   5, episode reward: 85.000, mean reward:  0.214 [ 0.000, 25.000], mean action: 2.529 [0.000, 5.000],  loss: 33114898048163196.000000, mean_q: 19349493.085983, mean_eps: 0.977635
 1204/3000: episode: 2, duration: 232.927s, episode steps: 807, steps per second:   3, episode reward: 95.000, mean reward:  0.118 [ 0.000, 20.000], mean action: 2.507 [0.000, 5.000],  loss: 4.172706, mean_q: 7.387519, mean_eps: 0.928000
 1833/3000: episode: 3, duration: 186.733s, episode steps: 629, steps per second:   3, episode reward: 110.000, mean reward:  0.175 [ 0.000, 30.000], mean action: 2.552 [0.000, 5.000],  loss: 4.560092, mean_q: 7.495776, mean_eps: 0.863380
 2570/3000: episode: 4, duration: 220.170s, episode steps: 737, steps per second:   3, episode reward: 140.000, mean reward:  0.190 [ 0.000, 30.000], mean action: 2.550 [0.000, 5.000],  loss: 6.195071, mean_q: 7.153993, mean_eps: 0.801

<tensorflow.python.keras.callbacks.History at 0x160cbaac488>

In [38]:
scores = dqn.test(env, nb_episodes=total_test_episodes, visualize=True)
print(np.mean(scores.history['episode_reward']))

Testing for 10 episodes ...
Episode 1: reward: 270.000, steps: 721
Episode 2: reward: 270.000, steps: 709
Episode 3: reward: 270.000, steps: 721
Episode 4: reward: 270.000, steps: 724
Episode 5: reward: 270.000, steps: 713
Episode 6: reward: 270.000, steps: 709
Episode 7: reward: 270.000, steps: 718
Episode 8: reward: 270.000, steps: 730
Episode 9: reward: 270.000, steps: 722
Episode 10: reward: 270.000, steps: 721
270.0


Obviously, the larger the value of alpha, the less the previous learned results are retained. The larger the value of gamma, the more long-term future benefits we take into account the value generated by the current behavior.In this experiment, this resulted in the agent being satisfied with a score of 270

In [39]:
dqn.save_weights('Save/differ/dqn.h5f')

### 5. Try a policy other than e-greedy. How did it change the baseline performance?

In [65]:
def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=1000, window_length=3)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                  enable_dueling_network=True, dueling_type='avg', 
                   nb_actions=actions, nb_steps_warmup=100,
                   gamma=Gamma
                  )
    return dqn

In [66]:
dqn = build_agent(model, actions)
dqn.compile(Adam(lr=learning_rate))

In [67]:
dqn.fit(env, nb_steps=total_episodes, visualize=False, verbose=2)

Training for 3000 steps ...
  781/3000: episode: 1, duration: 245.810s, episode steps: 781, steps per second:   3, episode reward: 210.000, mean reward:  0.269 [ 0.000, 30.000], mean action: 2.744 [0.000, 5.000],  loss: 720102.249747, mean_q: -10.475616
 1334/3000: episode: 2, duration: 214.154s, episode steps: 553, steps per second:   3, episode reward: 155.000, mean reward:  0.280 [ 0.000, 25.000], mean action: 2.826 [0.000, 5.000],  loss: 4.833099, mean_q: 7.209023
 2053/3000: episode: 3, duration: 283.262s, episode steps: 719, steps per second:   3, episode reward: 180.000, mean reward:  0.250 [ 0.000, 30.000], mean action: 2.723 [0.000, 5.000],  loss: 5.458886, mean_q: 7.381583
 2969/3000: episode: 4, duration: 354.171s, episode steps: 916, steps per second:   3, episode reward: 490.000, mean reward:  0.535 [ 0.000, 200.000], mean action: 2.440 [0.000, 5.000],  loss: 13.912703, mean_q: 7.651207
done, took 1109.288 seconds


<tensorflow.python.keras.callbacks.History at 0x160cd0e5788>

In [68]:
scores = dqn.test(env, nb_episodes=total_test_episodes, visualize=True)
print(np.mean(scores.history['episode_reward']))

Testing for 10 episodes ...
Episode 1: reward: 285.000, steps: 947
Episode 2: reward: 285.000, steps: 968
Episode 3: reward: 285.000, steps: 975
Episode 4: reward: 285.000, steps: 958
Episode 5: reward: 285.000, steps: 983
Episode 6: reward: 285.000, steps: 959
Episode 7: reward: 285.000, steps: 974
Episode 8: reward: 285.000, steps: 962
Episode 9: reward: 285.000, steps: 983
Episode 10: reward: 285.000, steps: 979
285.0


In [69]:
dqn.save_weights('Save/Boltzmann/dqn.h5f')

Boltzmann policy selects an action stochastically with a probability generated by soft-maxing Q values   
In this experiment, this resulted in the agent being satisfied with 285 points and failing to obtain the global optimal solution

### 6. How did you choose your decay rate and starting epsilon? Try at least one additional value for epsilon and the decay rate. How did it change the baseline performance? What is the value of epsilon when if you reach the max steps per episode?

epsilon = 1.0
min_epsilon = 0.1

In [51]:
epsilon = 0.9
min_epsilon = 0.01

In [52]:
def build_agent(model, actions):
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=epsilon, value_min=min_epsilon, value_test=.2, nb_steps=max_steps)
    memory = SequentialMemory(limit=1000, window_length=3)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                  enable_dueling_network=True, dueling_type='avg', 
                   nb_actions=actions, nb_steps_warmup=100,
                   gamma=Gamma
                  )
    return dqn

In [53]:
dqn = build_agent(model, actions)
dqn.compile(Adam(lr=learning_rate))

In [54]:
dqn.fit(env, nb_steps=total_episodes, visualize=False, verbose=2)

Training for 3000 steps ...
  650/3000: episode: 1, duration: 167.275s, episode steps: 650, steps per second:   4, episode reward: 50.000, mean reward:  0.077 [ 0.000, 20.000], mean action: 2.562 [0.000, 5.000],  loss: 24046628.195463, mean_q: 344.595063, mean_eps: 0.866625
 1285/3000: episode: 2, duration: 215.114s, episode steps: 635, steps per second:   3, episode reward: 120.000, mean reward:  0.189 [ 0.000, 30.000], mean action: 2.567 [0.000, 5.000],  loss: 1.648278, mean_q: 8.818841, mean_eps: 0.813937
 2249/3000: episode: 3, duration: 326.490s, episode steps: 964, steps per second:   3, episode reward: 315.000, mean reward:  0.327 [ 0.000, 30.000], mean action: 2.526 [0.000, 5.000],  loss: 2.441885, mean_q: 7.963550, mean_eps: 0.742782
 2780/3000: episode: 4, duration: 180.468s, episode steps: 531, steps per second:   3, episode reward: 120.000, mean reward:  0.226 [ 0.000, 25.000], mean action: 2.426 [0.000, 5.000],  loss: 3.138482, mean_q: 7.990991, mean_eps: 0.676254
done, to

<tensorflow.python.keras.callbacks.History at 0x160cbe47d88>

In [55]:
scores = dqn.test(env, nb_episodes=total_test_episodes, visualize=True)
print(np.mean(scores.history['episode_reward']))

Testing for 10 episodes ...
Episode 1: reward: 15.000, steps: 441
Episode 2: reward: 60.000, steps: 677
Episode 3: reward: 400.000, steps: 1062
Episode 4: reward: 35.000, steps: 477
Episode 5: reward: 140.000, steps: 1140
Episode 6: reward: 225.000, steps: 665
Episode 7: reward: 25.000, steps: 676
Episode 8: reward: 125.000, steps: 901
Episode 9: reward: 20.000, steps: 371
Episode 10: reward: 65.000, steps: 429
111.0


In [57]:
dqn.save_weights('Save/eps/dqn.h5f')

The larger the epsilon, the more the agent focuses on exploration. In this experiment, the changed epsilon performance is not ideal

### 7. What is the average number of steps taken per episode? 

About 697 steps

### 8. Does Q-learning use value-based or policy-based iteration? 

Q-learning use value-based iteration, The goal is to get a complete and reliable q table, so that the agent can do the action with the largest reward in current state.  
policy-based algorithm directly models policy, gives the state, and obtains the action. It can output an action-dimensional discrete distribution from state, representing the probability of selecting each action, and agent can select the action according to this probability distribution.

### 9. Could you use SARSA for this problem? 

In [78]:
def build_Sagent(model, actions):
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=epsilon, value_min=min_epsilon, value_test=.2, nb_steps=max_steps)
    saras = SARSAAgent(model=model, nb_actions=actions, policy=policy, test_policy=policy, 
                       gamma=Gamma, nb_steps_warmup=100, train_interval=1)
    return saras

In [79]:
saras = build_agent(model, actions)
saras.compile(Adam(lr=learning_rate))

In [80]:
saras.fit(env, nb_steps=total_episodes, visualize=False, verbose=2)

Training for 3000 steps ...
  821/3000: episode: 1, duration: 306.750s, episode steps: 821, steps per second:   3, episode reward: 225.000, mean reward:  0.274 [ 0.000, 30.000], mean action: 2.319 [0.000, 5.000],  loss: 754688.322928, mean_q: 48.860606
 1658/3000: episode: 2, duration: 447.685s, episode steps: 837, steps per second:   2, episode reward: 210.000, mean reward:  0.251 [ 0.000, 30.000], mean action: 2.387 [0.000, 5.000],  loss: 1.199248, mean_q: 0.656727
 2407/3000: episode: 3, duration: 398.170s, episode steps: 749, steps per second:   2, episode reward: 120.000, mean reward:  0.160 [ 0.000, 30.000], mean action: 2.479 [0.000, 5.000],  loss: 1.600340, mean_q: 1.552044
done, took 1466.318 seconds


<tensorflow.python.keras.callbacks.History at 0x160cd399bc8>

In [81]:
scores = saras.test(env, nb_episodes=total_test_episodes, visualize=True)
print(np.mean(scores.history['episode_reward']))

Testing for 10 episodes ...
Episode 1: reward: 80.000, steps: 403
Episode 2: reward: 105.000, steps: 663
Episode 3: reward: 155.000, steps: 785
Episode 4: reward: 105.000, steps: 815
Episode 5: reward: 75.000, steps: 559
Episode 6: reward: 155.000, steps: 827
Episode 7: reward: 70.000, steps: 666
Episode 8: reward: 550.000, steps: 1064
Episode 9: reward: 65.000, steps: 666
Episode 10: reward: 105.000, steps: 563
146.5


In [82]:
saras.save_weights('Save/saras/saras.h5f')

Yes, I can. In this experiment, because of the small number of steps, there is no obvious gap between q-learning and saras

### 10. What is meant by the expected lifetime value in the Bellman equation?

The state has its duration, and the expected reward of the action corresponding to each state is different

### 11. When would SARSA likely do better than Q-learning? 

saras is more conservative and safe, so it is more suitable for some practical problems where failure will lead to loss.

### 12. How does SARSA differ from Q-learning?

SARSA is an on-policy method, Q-Learning is an off-policy method, the learning of sarsa is conservative and robust, and each episode and each step of each episode will perform episilon-greedy exploration; q-learning tends to To use the accumulation of experience to learn the optimal strategy

### 13. Explain the Q-learning algorithm.

Maintain a q table , which records all combinations of states and actions. First, we randomly initialize a Q table, and then arbitrarily initialize a state s. The agent uses the ε-greedy algorithm to select the action a corresponding to the state s according to the Q table. Because the agent makes an action, it will get a reward r from the environment. When the change occurs, the agent observes a new state s ′, according to the row of the Q table in the state s ′, query the Q table, obtain the maximum value, and then update the Q table according to the formula.

### 14. Explain the SARSA algorithm.

An Episode starts with a random selection of the first state s1. And select action a1 in the state based on the ε-greedy strategy. After the first step is executed, observe the next state s2, and immediately get the immediate reward r2 of s2. At this point, again based on the ε-greedy policy, action a2 is selected in state s2. After getting a2, update the Q function

### 15. What code is yours and what have you adapted?

### And 16. Did I explain my code clearly?

My code based on following references. I changed some of them to make it more in line with this assignment, rewrote and added some about experiment and question.  

<b>References:<b>  
[1] : https://www.youtube.com/watch?v=Mut_u40Sqz4   
[2] : https://en.wikipedia.org/wiki/Bellman_equation   
[3] : https://zhuanlan.zhihu.com/p/41840804/   

### 17. Did I explain my licensing clearly?

Copyright <2022> Peichen Han

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.