### **Reward Shaping Deliverable**

__From experimentation it is observed that the `MountainCar-v0` environment only offers a positive reward when the final goal is achieved; for every other scenario only negative reward values are assigned. Hence, we use the technique of reward shaping to add additional reward that supplements the original goal reward.__ 

__In this deliverable we focus on using neural networks and reward shaping techniques, and we measure our agent's performance in terms of the net total of rewards gained.__


### **Rendering Component Declaration**

In [1]:
# Run the commands below only once per runtime.
# `apt-get update` refreshes the package index so that the xvfb package can be located.
!sudo apt-get update  > /dev/null 2>&1
# Install the virtual framebuffer (xvfb) and the X11 utilities needed for headless rendering.
!apt-get install -y xvfb x11-utils > /dev/null 2>&1
# The pinned versions matter: this rendering approach needs a gym version that is compatible with the other libraries.
!pip install gym==0.17.* pyvirtualdisplay==0.2.* PyOpenGL==3.1.* PyOpenGL-accelerate==3.1.*  > /dev/null 2>&1

In [2]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
# Raise gym's log threshold to level 40 so that only errors are reported.
gymlogger.set_level(40)

In [3]:
import math
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay

In [4]:
import random
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [5]:
# Create and start a virtual (headless) display so that rendered frames have
# a screen to be drawn on.
from pyvirtualdisplay import Display
# visible=0 keeps the display hidden; size is the virtual screen resolution.
display = Display(visible=0, size=(1366, 768))
display.start()

<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1366x768x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1366x768x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [6]:
def show_video():
    """Display a recorded episode video inline in the notebook.

    Searches the ``video`` directory for ``.mp4`` files, base64-encodes the
    first match and shows it as an HTML ``<video>`` element. When no file is
    found, prints ``"Could not find video"`` instead.

    Returns:
        None.
    """
    mp4list = glob.glob('video/*.mp4')
    if not mp4list:
        print("Could not find video")
        return

    # Only the first match is shown; glob does not guarantee any ordering.
    # The file is opened read-only inside a ``with`` block so the handle is
    # always closed, even if reading fails.
    with open(mp4list[0], 'rb') as video_file:
        video = video_file.read()

    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 256px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    
# Note: this wrap_env function differs from the usual one in that it records
# and displays frames only after the model-training part has finished.
def wrap_env(env):
    """Wrap ``env`` in a ``Monitor`` that saves episode videos under ``./video``.

    Recording is enabled only for episode ids greater than 10000, so earlier
    episodes are not written to disk.

    Args:
        env: The gym environment to wrap.

    Returns:
        The ``Monitor``-wrapped environment.
    """
    def should_record(episode_id):
        # Only episodes numbered above 10000 are recorded as mp4 files.
        return episode_id > 10000

    # The directory is given explicitly; the original note says that otherwise
    # the videos would end up in the 'content/' directory.
    return Monitor(env, './video', video_callable=should_record, force=True)

In [7]:
# Create the monitored MountainCar environment and reset it before first use.
env = wrap_env(gym.make('MountainCar-v0'))
env.reset()
# Upper bound on the number of steps taken per game by the loops below.
goal_steps = 200
# Minimum shaped score a random game must reach to be kept as training data
# (see model_data_preparation, where steps with position > -0.22 score +1).
score_requirement = -198
# Number of random games played while collecting training data (10K).
intial_games = 10000

In [8]:
def play_a_random_game_first():
    """Play one game with random actions and print every transition.

    Takes at most ``goal_steps`` steps in the global ``env``, printing the
    action, observation, reward, done flag and info of each step. Stops early
    when the environment reports ``done`` and always resets the environment
    afterwards.
    """
    for tick in range(goal_steps):
        # env.render()
        sampled_action = env.action_space.sample()
        obs, step_reward, finished, extra = env.step(sampled_action)
        print("Step {}:".format(tick))
        print("action: {}".format(sampled_action))
        print("observation: {}".format(obs))
        print("reward: {}".format(step_reward))
        print("done: {}".format(finished))
        print("info: {}".format(extra))
        if finished:
            break
    # Leave the environment freshly reset for whatever runs next.
    env.reset()
# Run one full random episode to inspect what the environment returns.
play_a_random_game_first()

### **Model Data Preparation with redefined reward function and model training**

In [10]:
def model_data_preparation():
    """Collect (observation, one-hot action) pairs from random games.

    Plays ``intial_games`` games of at most ``goal_steps`` random actions in
    the global ``env``. The reward of a step is replaced by +1 whenever the
    resulting position (``observation[0]``) is greater than -0.22; otherwise
    the environment's own reward is used. Games whose shaped score reaches
    ``score_requirement`` contribute all of their recorded pairs to the
    training set. The accepted scores are printed before returning.

    Returns:
        A list of ``[observation, one_hot_action]`` entries.
    """
    # One-hot encodings of the three actions, keyed by action id.
    one_hot_by_action = {0: (1, 0, 0), 1: (0, 1, 0), 2: (0, 0, 1)}

    training_data = []
    accepted_scores = []
    for _game in range(intial_games):
        shaped_score = 0
        transitions = []
        last_state = []  # position/velocity observed before the current action
        for _step in range(goal_steps):
            chosen = random.randrange(0, 3)
            state, reward, done, info = env.step(chosen)
            # Pair the state the action was taken from with that action, so
            # the model learns to map a state to the action to take next.
            # The first step has no stored state yet and is therefore skipped.
            if len(last_state) > 0:
                transitions.append([last_state, chosen])
            last_state = state

            # Relatively dense reward function: +1 past the position threshold.
            shaped_score += 1 if state[0] > -.22 else reward
            if done:
                break

        if shaped_score >= score_requirement:
            accepted_scores.append(shaped_score)
            training_data.extend(
                [seen, list(one_hot_by_action[taken])]
                for seen, taken in transitions
            )

        env.reset()

    print(accepted_scores)

    return training_data

In [11]:
# Play the random games and keep the accepted (observation, action) pairs
# as the training set; the accepted scores are printed as a side effect.
training_data = model_data_preparation()

[-184.0, -180.0, -170.0, -188.0, -182.0, -188.0, -186.0, -184.0, -198.0, -182.0, -174.0, -188.0, -188.0, -196.0, -186.0, -190.0, -184.0, -174.0, -180.0, -168.0, -196.0, -164.0, -166.0, -182.0, -188.0, -190.0, -190.0, -164.0, -182.0, -190.0, -180.0, -196.0, -178.0, -194.0, -178.0, -176.0, -190.0, -192.0, -186.0, -174.0, -190.0, -190.0, -188.0, -190.0, -178.0, -190.0, -190.0, -194.0, -192.0, -184.0, -156.0, -140.0, -180.0, -188.0, -192.0, -186.0, -178.0, -188.0, -168.0, -194.0, -186.0, -182.0, -190.0, -192.0, -182.0, -164.0, -190.0, -172.0, -168.0, -180.0, -176.0, -192.0, -192.0, -178.0, -188.0, -166.0, -184.0, -190.0, -196.0, -184.0, -170.0, -190.0, -182.0, -174.0, -186.0, -184.0, -190.0, -184.0, -170.0, -178.0, -172.0, -176.0]


In [12]:
# Defining the model structure for training.
def build_model(input_size, output_size):
    """Build and compile the fully connected network used as the policy.

    Args:
        input_size: Number of input features.
        output_size: Number of output units.

    Returns:
        A compiled ``Sequential`` model with two ReLU hidden layers (128 and
        52 units) and a linear output layer, using MSE loss and Adam.
    """
    layers = [
        Dense(128, input_dim=input_size, activation='relu'),
        Dense(52, activation='relu'),
        Dense(output_size, activation='linear'),
    ]
    network = Sequential(layers)
    network.compile(loss='mse', optimizer=Adam())
    return network

In [13]:
# function for training the model.
def train_model(training_data):
    """Fit a fresh model on the collected (observation, one-hot action) pairs.

    Args:
        training_data: Sequence of ``[observation, one_hot_action]`` entries.

    Returns:
        The model returned by ``build_model`` after 5 epochs of fitting.

    Raises:
        ValueError: If ``training_data`` is empty, since the input and output
            sizes are derived from its first entry.
    """
    if len(training_data) == 0:
        raise ValueError(
            "training_data is empty; no episodes were accepted for training"
        )

    # Feature and label widths are taken from the first sample.
    X = np.array([sample[0] for sample in training_data]).reshape(-1, len(training_data[0][0]))
    y = np.array([sample[1] for sample in training_data]).reshape(-1, len(training_data[0][1]))
    model = build_model(input_size=len(X[0]), output_size=len(y[0]))
    model.fit(X, y, epochs=5)
    return model

In [14]:
# Train the network on the collected data; the result is used for prediction below.
trained_model = train_model(training_data)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### **Model prediction and agent performance analysis**

In [15]:
# Play evaluation games with the trained model and report the environment's
# own (unshaped) scores together with how often each action was chosen.
scores = []
choices = []
for each_game in range(10):
    score = 0
    prev_obs = []
    print("playing game no: " + str(each_game+1))
    for step_index in range(goal_steps):
        # env.render()
        if len(prev_obs) == 0:
            # No observation is held on the first step of a game, so act
            # randomly. There are three actions (0, 1, 2) and randrange
            # excludes its stop value, so the stop must be 3 -- the same range
            # the training-data collection samples from.
            action = random.randrange(0, 3)
        else:
            # Pick the action with the highest predicted output for the
            # most recent observation.
            action = np.argmax(trained_model.predict(prev_obs.reshape(-1, len(prev_obs)))[0])

        choices.append(action)
        new_observation, reward, done, info = env.step(action)
        prev_obs = new_observation
        score += reward
        if done:
            break
    # Reset after every game so the next one starts from a fresh episode.
    env.reset()
    scores.append(score)

print(scores)
print('Average Score:',sum(scores)/len(scores))
print('choice 1:{}  choice 0:{} choice 2:{}'.format(choices.count(1)/len(choices),choices.count(0)/len(choices),choices.count(2)/len(choices)))

[-170.0, -167.0, -162.0, -172.0, -160.0, -88.0, -157.0, -88.0, -158.0, -163.0, -129.0, -168.0, -157.0, -161.0, -157.0, -157.0, -165.0, -159.0, -171.0, -166.0, -91.0, -200.0, -167.0, -167.0, -200.0, -160.0, -175.0, -163.0, -177.0, -158.0, -200.0, -164.0, -167.0, -158.0, -163.0, -163.0, -165.0, -92.0, -157.0, -160.0, -167.0, -165.0, -165.0, -88.0, -170.0, -158.0, -88.0, -173.0, -161.0, -96.0, -171.0, -163.0, -161.0, -178.0, -179.0, -165.0, -89.0, -158.0, -164.0, -167.0, -88.0, -91.0, -166.0, -89.0, -161.0, -169.0, -88.0, -89.0, -128.0, -95.0, -166.0, -157.0, -200.0, -174.0, -88.0, -170.0, -91.0, -163.0, -168.0, -169.0, -176.0, -93.0, -89.0, -157.0, -200.0, -94.0, -162.0, -200.0, -167.0, -200.0, -181.0, -160.0, -89.0, -162.0, -171.0, -169.0, -158.0, -162.0, -160.0, -177.0]
Average Score: -151.85
choice 1:0.10536713862364175  choice 0:0.4418834376028976 choice 2:0.45274942377346067


### **Agent output and result downloads**

In [16]:
# Show the recorded video of the agent's performance inline in the notebook.
show_video()

In [17]:
# !zip -r /content/file.zip /content/video
# from google.colab import files
# files.download("/content/file.zip")