In [None]:
Assignment2

YanXuan
001563047

In [22]:
# Step 1. import dependencies
# import gym, a reinforcement learning experimental environment containing geometric motion
import gym
#import OpenCV2
import cv2
from collections import deque,namedtuple
import numpy as np
from numpy import clip
import random
# Automated Hyperparameter Tuning with Keras Tuner
from keras.models import Sequential
from keras.layers import Conv2D,Flatten,Dense
from tensorflow.keras.optimizers import Adam
#import warnings
import warnings
warnings.filterwarnings('ignore')

In [15]:
# Step 2. Build Buffer and Deep-Q Learning Network
# Lightweight record describing one convolution layer; fields are read by name
# (filter count, kernel size, stride) when the network is assembled.
conv = namedtuple('Conv', ['filter', 'kernel', 'stride'])

# Step 2.1. Build Buffer
# Saving the training data in a Buffer and then random sampling is a good way to get the data close to IID
# The parts of the entire training process that involve Buffer are high IO operations
class Buffer:
    """Fixed-capacity replay buffer of (state, action, reward, next_state, terminal) tuples.

    New transitions are pushed on the left; once `size` items are stored the
    oldest transition (right end) is discarded automatically.
    """

    def __init__(self,size):
        """Create an empty buffer holding at most `size` transitions."""
        self.size = size
        # maxlen lets the deque evict the oldest entry itself, which also keeps
        # add() safe when the capacity is 0 (nothing is stored, nothing raises).
        self.buffer = deque(maxlen=size)

    @staticmethod
    def _stack_frames(frames):
        """Stack the first four 2-D frames along a new last axis -> (H, W, 4)."""
        return np.stack((frames[0],frames[1],frames[2],frames[3]),axis=2)

    # Add one transition to the Buffer
    def add(self,s,a,r,s2,t):
        """Store a transition.

        s, s2 -- sequences of at least four frames (current / next state)
        a     -- action taken
        r     -- reward received
        t     -- terminal flag of the transition
        """
        # Stacking creates new arrays, so later mutation of the caller's
        # frame lists does not affect what is stored here.
        s = self._stack_frames(s)
        s2 = self._stack_frames(s2)
        self.buffer.appendleft((s,a,r,s2,t))

    # Draw a random mini-batch from the Buffer
    def sample(self,batch_size):
        """Return `batch_size` distinct transitions chosen uniformly at random.

        Raises ValueError (from random.sample) if fewer than `batch_size`
        transitions are stored.
        """
        return random.sample(self.buffer,batch_size)

In [16]:
# Step 2.2 Initialize the Deep Q Networks network, which is a combination of deep learning and Q learning
class DQN:
    """Deep Q-Network wrapper holding an online model and a target model.

    The online model (`self.model`) is trained on mini-batches drawn from the
    replay buffer; the target model (`self.target_model`) supplies the
    next-state values and is refreshed through `copy_network`.
    """

    def __init__(self,buff,batch_size=32,min_buff=10000,gamma=0.99,learning_rate=2.5e-4):
        """Store the hyper-parameters and build both networks.

        buff          -- replay buffer exposing `.buffer` and `.sample(n)`
        batch_size    -- number of transitions per training step
        min_buff      -- training is skipped until the buffer holds this many items
        gamma         -- discount factor applied to next-state values
        learning_rate -- forwarded to `create_network`
        """
        self.buffer = buff
        self.gamma = gamma
        self.min_buffer = min_buff
        self.batch_size = batch_size
        # Target network is created first, then the online network
        self.target_model = create_network(learning_rate)
        self.model = create_network(learning_rate)
        # Start with both networks holding identical weights
        self.copy_network()

    def train(self):
        """Run one training step on a random mini-batch (no-op while the buffer is too small)."""
        if len(self.buffer.buffer) < self.min_buffer:
            return

        # Transpose the list of transitions into one array per field
        batch = self.buffer.sample(self.batch_size)
        states,actions,rewards,next_states,terminal = (np.array(column) for column in zip(*batch))

        # Best next-state value according to the target network
        next_q = self.target_model.predict(next_states)
        best_next = np.max(next_q,axis=1)

        # Start from the online predictions and overwrite only the taken actions
        targets = self.model.predict(states)
        not_terminal = np.invert(terminal)
        updated = rewards + self.gamma*best_next*not_terminal
        targets[range(self.batch_size), actions] = updated

        self.model.train_on_batch(states, targets)

    def copy_network(self):
        """Copy the online model's weights into the target model, layer by layer."""
        for target_layer,source_layer in zip(self.target_model.layers,self.model.layers):
            target_layer.set_weights(source_layer.get_weights())

    def predict(self,x):
        """Return the online model's output for a single state.

        x -- sequence of at least four frames; the first four are stacked on
             the last axis and wrapped in a batch of size one.
        """
        stacked = np.stack([x[0],x[1],x[2],x[3]],axis=2)
        single_batch = np.expand_dims(stacked,axis=0)
        return self.model.predict(single_batch)

In [17]:
# Step 2.3 define network's learning rate etc.
def create_network(learning_rate,conv_info=[conv(32,8,4),conv(64,4,2),conv(64,3,1)],dense_info=[512],input_size=(80,80,4),n_actions=6):
    """Build and compile the convolutional Q-network.

    learning_rate -- step size given to the Adam optimizer
    conv_info     -- sequence of `conv` records (filter, kernel, stride), one per
                     convolution layer; the default list is only read, never mutated
    dense_info    -- sizes of the hidden dense layers that follow the flatten step
    input_size    -- shape of one input sample, declared on the first conv layer
    n_actions     -- number of linear output units (one value per action);
                     defaults to 6, the value previously hard-coded

    Returns the compiled Keras `Sequential` model (MSE loss, Adam optimizer).
    """
    model = Sequential()
    # Stack the ReLU convolution layers; only the first one declares the input shape
    for i,cl in enumerate(conv_info):
        first_layer_kwargs = {"input_shape": input_size} if i==0 else {}
        model.add(Conv2D(cl.filter,cl.kernel,padding="same",strides=cl.stride,activation="relu",**first_layer_kwargs))
    model.add(Flatten())
    # Hidden fully connected layers
    for dl in dense_info:
        model.add(Dense(dl,activation="relu"))
    # Linear output layer: one unit per action
    model.add(Dense(n_actions))
    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated
    # and is rejected by current Keras releases.
    adam = Adam(learning_rate=learning_rate)
    # Mean squared error between predicted and target values
    model.compile(loss='mse',optimizer=adam)
    return model

In [23]:
# Step 3. Create Pong-v4 Agent from gym

# Step 3.1 Load environment
# Create the Pong-v4 environment used by the random-play check in the next cell
env = gym.make('Pong-v4')

# Unpack the observation shape; the printed output below shows (210, 160, 3)
height, width, channels = env.observation_space.shape
# Number of discrete actions available in this environment
actions = env.action_space.n

# Show the observation shape and the names of all actions in this game
print("Shape of the game space: ",env.observation_space.shape)
print("Actions you can do: ",env.unwrapped.get_action_meanings())

Shape of the game space:  (210, 160, 3)
Actions you can do:  ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']


In [24]:
# Step 3.2 Test in the environment with a purely random policy

EPISODES = 10
scores, scores_clipped = [], []

# Play EPISODES complete games, picking every action uniformly at random
for episode in range(1, EPISODES + 1):
    # Every game starts from a freshly reset environment
    state = env.reset()
    score, score_clipped = 0, 0
    done = False

    while not done:
        action = random.choice(range(env.action_space.n))
        n_state, reward, done, info = env.step(action)
        # Accumulate the raw return and the return with rewards clipped to [-1, 1]
        score = score + reward
        score_clipped = score_clipped + clip(reward, -1.0, 1.0)

    scores.append(score)
    scores_clipped.append(score_clipped)
    print(f"in episode {episode}, clipped reward is {score_clipped}, and reward is {score}")

# Average both kinds of return over all episodes and report them
avg, avg_clipped = np.mean(scores), np.mean(scores_clipped)
print(f"clipped score is {avg_clipped}, and average reward is {avg} ")

# Release the environment once the check is finished
env.close()

in episode 1, clipped reward is -20.0, and reward is -20.0
in episode 2, clipped reward is -20.0, and reward is -20.0
in episode 3, clipped reward is -19.0, and reward is -19.0
in episode 4, clipped reward is -21.0, and reward is -21.0
in episode 5, clipped reward is -20.0, and reward is -20.0
in episode 6, clipped reward is -20.0, and reward is -20.0
in episode 7, clipped reward is -19.0, and reward is -19.0
in episode 8, clipped reward is -21.0, and reward is -21.0
in episode 9, clipped reward is -21.0, and reward is -21.0
in episode 10, clipped reward is -20.0, and reward is -20.0
clipped score is -20.1, and average reward is -20.1 


In [11]:
# Step 3.3 Create Pong-v4 Agent
class Pong:
    """Agent that plays Pong-v4 and trains a DQN from its own experience."""

    def __init__(self):
        """Create the environment, the replay buffer, the DQN and the schedule counters."""
        self.env = gym.make('Pong-v4')
        # Exploration starts fully random and is reduced by eps_step every step
        self.epsilon = 1
        self.eps_step = 0.0000009
        self.buffer = Buffer(50000)
        self.dqn = DQN(self.buffer)
        # Global step counter and the period (in steps) between target-network syncs
        self.itr = 0
        self.copy_period = 40000

    def sample_action(self,s):
        """Epsilon-greedy choice: a random action with probability epsilon, else the argmax of the DQN output."""
        explore = random.random() < self.epsilon
        if explore:
            return self.env.action_space.sample()
        q_values = self.dqn.predict(s)[0]
        return np.argmax(q_values)

    def play_one_episode(self):
        """Play one full episode, storing transitions and training along the way.

        Returns the sum of the rewards received during the episode.
        """
        frame = self.env.reset()
        state = []
        update_state(state,frame)
        prv_state = []
        total_reward = 0
        done = False

        while not done:
            # Until four frames are available the network cannot be queried
            have_full_state = len(state) >= 4
            action = self.sample_action(state) if have_full_state else self.env.action_space.sample()

            # Remember the newest frame seen before acting, keeping at most four
            prv_state.append(state[-1])
            if len(prv_state) > 4:
                prv_state.pop(0)

            frame, reward, done, _ = self.env.step(action)
            update_state(state,frame)

            # Only complete four-frame states on both sides form a transition
            if len(state) == 4 and len(prv_state) == 4:
                self.buffer.add(prv_state,action,reward,state,done)
            total_reward += reward

            # Train every 4th step, decay epsilon every step (floor 0.1),
            # and sync the target network every copy_period steps
            self.itr += 1
            if self.itr % 4 == 0:
                self.dqn.train()
            self.epsilon = max(0.1,self.epsilon-self.eps_step)
            if self.itr % self.copy_period == 0:
                self.dqn.copy_network()

        return total_reward

In [12]:
# Step 4. Define some help functions

# Crop, grey-scale, shrink and rescale one raw frame
def downsample(observation):
    """Turn a raw frame into an 80x80 single-channel array divided by 255.

    The first 30 rows are dropped before the colour conversion.
    NOTE(review): gym presumably delivers RGB frames, in which case
    COLOR_RGB2GRAY would be the matching flag; the conversion is left as-is
    because already-trained weights depend on this exact preprocessing - confirm.
    """
    cropped = observation[30:,:,:]
    grey = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
    small = cv2.resize(grey, (80,80), interpolation = cv2.INTER_AREA)
    # presumably maps 8-bit pixel values into [0, 1] - TODO confirm input dtype
    return small/255.0

# Push the newest frame into the rolling state
def update_state(state,observation):
    """Append the downsampled `observation` to `state` in place, keeping at most four frames."""
    state.append(downsample(observation))
    too_long = len(state) > 4
    if too_long:
        # Drop the oldest frame
        state.pop(0)

# Greedy action from a trained model
def sample_action(model,s):
    """Return the index of the largest model output for the state `s`.

    s -- sequence of at least four frames; the first four are stacked on the
         last axis and passed to `model.predict` as a batch of one.
    """
    stacked = np.stack([s[0],s[1],s[2],s[3]],axis=2)
    batch = np.array([stacked])
    q_values = model.predict(batch)
    return np.argmax(q_values[0])

In [None]:
# Step 5. Train the model

# Build the Pong agent (environment, replay buffer and DQN)
p = Pong()

# Train for up to 50000 episodes
for i in range(50000):
    # Play one episode (training happens inside) and report its total reward
    total_reward = p.play_one_episode()
    print("episode total reward:",total_reward)

    # Every 200 episodes overwrite the single checkpoint file "model-0.h5".
    # The previous `.format(i)` call had no placeholder to fill, so the name was
    # always "model-0.h5"; it is written literally here because the evaluation
    # cell loads exactly this file and expects the most recent weights in it.
    if i%200 == 0:
        print("Saving the model")
        p.dqn.model.save("model-0.h5")

In [36]:
# Step 6. Run the environment and see the training result
from keras.models import load_model

# Create a rendering environment so the game can be watched
env = gym.make('Pong-v4',render_mode='human')
# Load the weights saved by the training cell
model = load_model('model-0.h5')

done = False
state = []
score = 0

# try/finally guarantees the environment (and its render window) is released
# even if the episode is interrupted or a step raises.
try:
    observation = env.reset()
    update_state(state,observation)

    while not done:
        # Act randomly until four frames are available, then act greedily
        if len(state) < 4:
            action = env.action_space.sample()
        else:
            action = sample_action(model,state)
        observation, reward, done, _ = env.step(action)
        score+=reward
        update_state(state,observation)
finally:
    env.close()



In [40]:
# Print the reward accumulated over the evaluation game; the recorded run
# reported a positive score, which the author reads as winning the game
print(f"final score is {score}")

final score is 4


In [None]:
Questions and Answers

1. Establish a baseline performance. How well did your Deep Q-learning do on your problem? (5 Points)
A:  The baseline configuration: batch_size = 32, total_episodes = 50000, gamma = 0.99, learning_rate = 2.5e-4, max_steps = 1/9e-7 (epsilon decays by 9e-7 per step), 
    epsilon = 1.0, max_epsilon = 1.0, min_epsilon = 0.1
    How well did your Deep Q-learning do on your problem: Before training, the scores of the random agent were all negative, but after DQN training 
    the obtained game score became positive, indicating that the trained DQN agent won the game.

2. What are the states, the actions, and the size of the Q-table? (5 Points)
A:  Shape of the game space:  (210, 160, 3)
    Actions you can do:  ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']
    States: len(['paddle1_pos','paddle2_pos','ball-pos','ball_direction'])==4
    Size of q-table: States * Actions = 4*6 = 24

3. What are the rewards? Why did you choose them? (5 Points)
A:  It is stipulated here that the reward is the score of each round of the players on both sides. According to the rules, 
    when an opposing player misses the ball or hits the ball out of bounds, the opponent gets a point. 
    The condition to win the game is for one team to score 21 points first. We want to get as high a score as possible in a game.

4. How did you choose alpha and gamma in the Bellman equation? Try at least one additional value for alpha and gamma. 
   How did it change the baseline performance?  (5 Points)
A:  Alpha is the learning-rate, which equals 2.5e-4 , and gamma is 0.99, all defined in DQN.
    Because the model training takes too long, the theory is discussed below.
    The larger the value of Alpha, the more the model is inclined to the action of high reward, that is, 
    the greater the impact of high reward on the action. Gamma is the discount factor. 
    The discount factor essentially determines how much the reinforcement learning agent cares about the distant future 
    relative to recent rewards. Gamma ranges from 0 to 1 . 
    The closer Gamma is to 0, the more the agent tends to only consider the present reward; the closer Gamma is to 1, 
    the more the agent tends to increase the weight in the future, that is, it is willing to delay reward.

5. Try a policy other than e-greedy. How did it change the baseline performance? (5 Points)
A:  Epsilon-Greedy is a simple method to balance exploration and exploitation by choosing between exploration and exploitation randomly.
    The epsilon-greedy, where epsilon refers to the probability of choosing to explore, 
    exploits most of the time with a small chance of exploring.
    
6. How did you choose your decay rate and starting epsilon? Try at least one additional value for epsilon and the decay rate. 
   How did it change the baseline performance? What is the value of epsilon when if you reach the max steps per episode? (5 Points)
A:  The learning rate, or alpha value, is a tuning parameter in the optimization algorithm that determines the step size 
    for each iteration while moving towards the minimum value of the loss function; 
    it represents the size of the leap taken while searching for the optimal strategy. For basic Q-learning, 
    it represents the degree to which the Q value is updated at each step. A higher alpha value means a larger Q value update. 
    While the agent is learning, gradually converge the learning rate to stabilize the model to arrive at the final optimal policy.
    Epsilon is used when choosing a specific action based on the Q value we already have. 
    If epsilon is equal to zero, the highest value of all q values for each state is always selected. 
    It is easy to fall into the trap of local optima.
    
7. What is the average number of steps taken per episode? (5 Points)
A:  The paddles move continuously, and the average time taken per step is 39.87 ms/step.
    
8. Does Q-learning use value-based or policy-based iteration? (5 Points)
A:  Q-learning is a value-based learning algorithm that estimates a value function using a greedy policy obtained 
    from the last policy refinement. 
    Q-learning is an off-policy learner, meaning that it learns the value of the optimal policy independently 
    of the agent's behavior. It finds an optimal policy, taking into account the exploration inherent in the policy.
    
9. Could you use SARSA for this problem? (5 Points)
A:  Yes, but its training results are likely to be worse than DQN. SARSA is an on-policy learning algorithm 
    in which the agent interacts with the environment and updates the policy based on the actions it actually takes.
    The most important difference between the two is how Q is updated after each operation. 
    SARSA fully follows the ε-greedy policy using Q, while Q-learning uses the maximum Q across all possible moves in the next step.
    This makes it look like following a greedy policy with ε=0, i.e. no exploration in this part.
    
10. What is meant by the expected lifetime value in the Bellman equation?(5 Points)
A:  The meaning of H(x, a, v) is that at the current state x , choose the lifetime value associated with action a , 
    continue using the reward function v , attributing the value to the state. 
    That is, the function is an abstract representation of the value that the Bellman equation maximizes. 
    The sets x and a are arbitrary.
    
11. When would SARSA likely do better than Q-learning? (5 Points)
A:  Q-learning can learn a better deterministic (greedy) strategy than SARSA. However, if the learned strategy is executed 
    while the agent is still exploring, the path that Q-learning actually follows is worse than SARSA's. 
    The reason lies in the action-selection strategy: the strategy that is finally executed is an epsilon-greedy strategy, 
    not a deterministic greedy one, so actions are selected randomly according to probabilities instead of always selecting 
    the action with the highest value, and every action selection carries some exploration. SARSA's final strategy is 
    safer because it takes a more conservative detour, while the strategy learned by Q-learning is riskier. 
    Therefore, when the final strategy is a random (exploratory) strategy, i.e. the agent always keeps exploring, 
    Q-learning is more likely to perform poorly and SARSA is likely to do better.

12. How does SARSA differ from Q-learning? (5 Points)  
A:  Q-Learning:  Q(s,a)←Q(s,a)+α[r+γmaxQ(s',a′)−Q(s,a)]
    SARSA:       Q(s,a)←Q(s,a)+α[r+γQ(s',a')−Q(s,a)] 
    The update equation for SARSA depends on the current state, current action, reward obtained, next state and next action. 
    This observation led to the name of the learning technique: SARSA stands 
    for State Action Reward State Action, which symbolizes the tuple (s, a, r, s’, a’).
   1.Q-learning is more prone to convergence problems when training neural networks because it has higher per-sample variance than SARSA.
   2.Q-learning ignores the penalty of exploratory actions, while SARSA considers the possible penalty of exploratory actions 
    because it is close to convergence. So if there is a risk of a large number of negative rewards close to the optimal path, 
    Q-learning will tend to trigger that reward while exploring, 
    while SARSA will tend to avoid dangerous optimal paths and only learn to use them once the exploration parameters are reduced.
   3.Q-learning learns the optimal policy directly, while SARSA learns a near-optimal policy while exploring. 
    Learning an optimal policy with SARSA requires deciding a policy to attenuate ε in ε-greedy action selection, 
    which is a fine-tuned process.
    
13. Explain the Q-learning algorithm. (5 Points)  
A:  Q-Learning:  Q(s,a)←Q(s,a)+α[r+γmaxQ(s',a′)−Q(s,a)]
    Q-learning is a model-free reinforcement learning algorithm. 
    It is a value-based learning algorithm, where the value-based algorithm updates the value function 
    according to equations (specifically the Bellman equation). It is an off-policy learner that learns the value of the optimal policy 
    independently of the agent's behavior. Q stands for quality and indicates how useful a given action is in obtaining future rewards.
    Q(s,a) is the expected value of executing a in state s and then following the optimal policy.
    Q-learning uses temporal differences (TD) to estimate the value of Q(s,a). 
    Temporal-difference learning means that the agent learns from the environment through episodes, with no prior knowledge of the environment.
    The agent maintains a Q[S, A] table, where S is the set of states and A is the set of actions.
    Q[s,a] represents its current estimate of Q(s,a).
                                         
14. Explain the SARSA algorithm. (5 Points)  
A:  SARSA:       Q(s,a)←Q(s,a)+α[r+γQ(s',a')−Q(s,a)] 
    The full name of Sarsa is state-action-reward-state-action. The purpose is to learn the value Q of a specific action 
    in a specific state, and finally build and optimize a Q table, with state as row and action as column. 
    The rewards obtained from interacting with the environment are used to update the Q table. 
    In order to better explore the environment in the training, Sarsa adopts the ε-greedy method to train, 
    and there is a certain probability to randomly select the action output.
    The process is: initialize the Q-values table, Q(s, a), then observe the current state, s, and choose an action, a, 
    for that state based on one of the action selection policies. Take the action, and observe the reward, r, 
    as well as the new state, s'. Choose the next action, a', for the new state based on the same action selection policy. 
    Update the Q-value for the state-action pair (s, a) using the observed reward and the Q-value of the next pair (s', a'). 
    Set the state to the new state and the action to the chosen next action, and repeat the process until a terminal state is reached.
                                         
15. What code is yours and what have you adapted? (5 Points)
A:  1. Optimized the presentation of the raw and clipped scores
    2. Modified the model outputs and stored the best records
    3. Divided and organized the code to clarify the structure
                                         
16. Did I explain my code clearly? (10 Points)
A:  Yes. I completed a small Atari-like game agent based on DQN and OpenCV, realized the training and 
    simulation of playing a table tennis game, and got a positive score, indicating that the model won after training.

17. Did I explain my licensing clearly? (5 Points)
A:  Yes.
                                         
18. Professionalism (10 Points)
A:  In this experiment, I strictly followed the naming convention and encapsulated the functions in classes for ease of use. 
    When controlling variables, I vary one variable and keep the others unchanged to get correct experimental results.
    This experiment takes a long time to train, so it is hoped that there will be a better platform for model training. 
    This experiment made me better understand the creation of DQN network and the training steps of the model, 
    which made my understanding of deep learning more profound.

In [None]:
References:
1. A Beginners Guide to Q-Learning, Chathurangi Shyalika https://towardsdatascience.com/a-beginners-guide-to-q-learning-c3e2a30a653c.
2. Human-level control through deep reinforcement learning https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf.
3. Practical Reinforcement Learning — 02 Getting started with Q-learning, Shreyas Gite
   https://towardsdatascience.com/practical-reinforcement-learning-02-getting-started-with-q-learning-582f63e4acd9.
4. RL Agent for Atari Game Pong https://github.com/amirhossein-hkh/pong-dqn.
5. Gym Retro, https://openai.com/blog/gym-retro/.
6. Deep Reinforcement Learning for Atari Games Python Tutorial | AI Plays Space Invaders, https://www.youtube.com/watch?v=hCeJeq8U0lo.

In [None]:
LICENSE:

MIT License

Copyright (c) 2022 Yan Xuan

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.