In [1]:
import gym
import tensorflow as tf
from collections import deque
import numpy as np
from random import sample, randint
from tensorflow import keras
import matplotlib.pyplot as plt
from keras.utils.np_utils import to_categorical
import os
import glob
import tensorflow_probability as tfp
from tensorflow.keras.layers import Dense
from scipy.signal import savgol_filter

# Notebook-wide matplotlib defaults: wide figures and a larger font.
plt.rcParams.update({'figure.figsize': (15, 7), 'font.size': 14})

In [2]:
'''
convenience functions
'''

def smooth(y, window, poly=1):
    '''
    Smooth a signal with a Savitzky-Golay filter.

    y: vector to be smoothed
    window: size of the smoothing window (the filter's window length)
    poly: order of the polynomial fitted inside each window (default 1)
    returns: the filtered values produced by scipy.signal.savgol_filter
    '''
    smoothed = savgol_filter(y, window_length=window, polyorder=poly)
    return smoothed

def save_data(data, ep_num, ep, lr, gamma):
    '''
    Write the scores of all runs so far to a CSV file, one row per run.

    data: list with one sequence of episode scores per run; the last entry is
        the run currently in progress and may be shorter than ep_num
    ep_num: number of episodes per run (= number of columns in the file)
    ep: zero-based index of the episode that has just finished
    lr, gamma: learning rate and discount factor, only used in the file name

    The file name additionally uses the module-level `runs` setting.
    `data` itself is left untouched; padding happens on a copy. When the last
    episode of a run has just finished, the smoothed scores are plotted too.
    '''
    rows = list(data)  # shallow copy so the caller's list is not modified

    # float dtype so that NaN padding also works for integer scores
    current = np.asarray(rows[-1], dtype = float)
    missing = ep_num - len(current)
    if missing > 0:
        # pad the unfinished run with NaN so every row has ep_num columns
        current = np.pad(current, (0, missing), mode = 'constant', constant_values = np.nan)
    rows[-1] = current

    fname = "reinforce_v1_lr"+"{:.4f}".format(lr)[-4:]+"_g"+"{:.4f}".format(gamma)[-4:]+"_runs{:1d}_eps".format(runs)+str(ep_num)+".csv"
    np.savetxt(fname, rows, delimiter = "," )
    print("data saved in file {}".format(fname))

    # `ep` counts from 0, so the final episode of a run has index ep_num - 1
    if ep + 1 == ep_num:
        plot_smoothed_scores(rows)
        
def plot_smoothed_scores(data, save = 1, fname = "demo.pdf"):
    '''
    Plot one smoothed score curve per run plus the unsmoothed, NaN-aware
    mean across runs.

    data: sequence of per-run score vectors (one entry per run)
    save: when truthy, the figure is also written to `fname`
    fname: path handed to plt.savefig

    The smoothing window is read from the module-level `smoothen_over`.
    '''
    run_no = 0
    for run_scores in data:
        run_no += 1
        curve = smooth(run_scores, smoothen_over)
        plt.plot(curve, label = "run "+str(run_no))

    # element-wise mean over runs, ignoring NaN entries
    mean_curve = np.nanmean(data, axis = 0)
    plt.plot(mean_curve, label = "average over "+str(len(data))+" runs")

    plt.xlabel("Episodes")
    plt.ylabel("Rewards")
    plt.legend()
    if save:
        plt.savefig(fname)
    plt.show()

# Remember the directory the notebook was started from; "/content" is used
# below as the marker for a Google Colab session.
cwd = os.getcwd()
if cwd == "/content":
    # `drive` only exists when this branch ran, i.e. when cwd is "/content"
    from google.colab import drive
print("cwd is :", cwd)
def printdocs(pname):
    '''
    Print the docstring of an object.

    pname: either the object itself, or a string expression naming it
        (e.g. "np.pad"); strings are evaluated in this function's scope
    '''
    # NOTE(review): eval executes whatever expression it is given - only pass
    # trusted strings. Unlike exec it is at least limited to expressions.
    target = eval(pname) if isinstance(pname, str) else pname
    print(target.__doc__)

if cwd == "/content":
    drive.mount('/content/drive')
    !ls /content/drive/MyDrive/data/RLA3_data
    %cd /content/drive/MyDrive/data/RLA3_data
    from Helper import argmax, softmax
else:
    print("cwd is :", cwd)

print("argmax docs :", argmax.__doc__)
print("softmax docs :", softmax.__doc__)

In [3]:
class policy_network(keras.Model):
    '''
    Feed-forward policy: flattened observation -> three hidden layers of
    24 ReLU units (He-uniform initialised) -> softmax over the actions.
    '''
    def __init__(self, n_states, n_actions, ):
        '''
        n_states: shape of a single observation (given to Flatten as input_shape)
        n_actions: number of discrete actions, i.e. size of the softmax output
        '''
        super().__init__()
        self.n_actions = n_actions

        def hidden():
            # every hidden layer gets its own layer and initializer instance
            return keras.layers.Dense(24, kernel_initializer = tf.keras.initializers.HeUniform(seed=None), activation="relu")

        self.ip = keras.layers.Flatten(input_shape = n_states)
        self.l1 = hidden()
        self.l2 = hidden()
        self.l3 = hidden()
        self.op = keras.layers.Dense(n_actions, activation="softmax")

    def call(self, state, ):
        '''Return the action probabilities for a batch of states.'''
        features = state
        for layer in (self.ip, self.l1, self.l2, self.l3):
            features = layer(features)
        return self.op(features)

In [4]:
class agent():
    '''
    REINFORCE (Monte-Carlo policy gradient) agent built around `policy_network`.

    Transitions of one episode are collected with `remember`; `learn` then
    performs a single gradient step on them and clears the buffers.
    '''
    def __init__(self, n_actions, n_states, lr = 0.003, gamma = 0.99,  ):
        '''
        n_actions: number of discrete actions
        n_states: shape of a single observation, forwarded to policy_network
        lr: learning rate of the Adam optimizer
        gamma: discount factor used for the returns
        '''
        self.lr = lr
        self.gamma = gamma
        self.n_actions = n_actions
        self.n_states = n_states
        # per-episode buffers, filled by remember() and emptied by learn()
        self.states = []
        self.actions = []
        self.rewards = []
        self.pi = policy_network(n_states = self.n_states, n_actions = n_actions)
        # compile is only used to attach the optimizer; the loss is built in learn()
        self.pi.compile(optimizer = keras.optimizers.Adam(learning_rate = self.lr))

    def get_act(self, state, ):
        '''Sample an action for `state` from the current policy and return it as a numpy scalar.'''
        state_t = tf.convert_to_tensor([state], dtype = tf.float32)  # add batch dimension
        probs = self.pi(state_t)
        cat_probs = tfp.distributions.Categorical(probs = probs)
        action = cat_probs.sample()
        return action.numpy()[0]

    def remember(self, state, action, reward, ):
        '''Append one transition of the running episode to the buffers.'''
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)

    def _discounted_returns(self):
        '''
        Return G_t = sum_k gamma**k * r_(t+k) for every stored step as a
        float64 array (float dtype on purpose, so that integer rewards are
        not truncated).
        '''
        rewards_arr = np.asarray(self.rewards, dtype = np.float64)
        returns = np.zeros_like(rewards_arr)
        running = 0.0
        # walk backwards: G_t = r_t + gamma * G_(t+1), linear in episode length
        for step in range(len(rewards_arr) - 1, -1, -1):
            running = rewards_arr[step] + self.gamma * running
            returns[step] = running
        return returns

    def learn(self):
        '''
        Do one policy-gradient update on the stored episode using the loss
        -sum_t G_t * log pi(a_t | s_t), then clear the episode buffers.
        Does nothing when no transition has been stored.
        '''
        if not self.rewards:
            return

        returns_t = tf.convert_to_tensor(self._discounted_returns(), dtype = tf.float32)
        # stack the episode into one batch so a single forward pass is enough
        states_t = tf.convert_to_tensor(np.asarray(self.states, dtype = np.float32))
        actions_t = tf.convert_to_tensor(self.actions, dtype = tf.int32)

        with tf.GradientTape() as tape:
            probabilities = self.pi(states_t)
            action_probabilities = tfp.distributions.Categorical(probs = probabilities)
            log_probabilities = action_probabilities.log_prob(actions_t)
            loss = -tf.reduce_sum(returns_t * log_probabilities)

        grad = tape.gradient(loss, self.pi.trainable_variables)
        self.pi.optimizer.apply_gradients(zip(grad, self.pi.trainable_variables))

        self.states = []
        self.actions = []
        self.rewards = []


In [5]:
# Experiment settings - short "testing" configuration.
# The longer configuration kept from earlier used ep_num = 2000 and
# save_data_cadence = 200, with all other values identical.
runs = 8                    # number of independent training runs
ep_num = 300                # episodes per run
save_data_cadence = 100     # scores are saved every this many episodes
smoothen_over = 11          # window used when smoothing the score curves

learnrate = 0.001           # learning rate handed to the agent
gam = 0.99                  # discount factor handed to the agent

if ep_num < save_data_cadence:
    print("WARNING : data will not get saved if save cadence is more than ep_num")

In [None]:
if __name__ == "__main__":
    
    # one entry (list of episode scores) per run
    score_stack = []
    # run loop
    for run_num in range(runs):
        
        print("\n\n\n\nbeginning run number {}".format(run_num))

        # initialise environment
        env = gym.make('CartPole-v1')

        try:
            # initialize agent (a fresh, untrained policy for every run)
            agent_007 = agent(lr = learnrate, gamma = gam, n_actions = env.action_space.n, n_states = env.observation_space.shape)

            # list to maintain score
            score_hist = []
            score_stack.append(score_hist)

            # episode loop
            for ep in range(ep_num):
                done = False
                score = 0
                s = env.reset()

                # step loop
                while not done:
                    a = agent_007.get_act(s)
                    s_next,reward,done,_ = env.step(a)
                    agent_007.remember(s,a,reward)
                    s = s_next
                    score += reward
#                     env.render()

                # append score to score list
                score_hist.append(score)

                # make agent learn
                agent_007.learn()
                avg_score = np.mean(score_hist[-100:])
                print('episode: ', ep,'score: %.1f' % score,
                    ', average score %.1f' % avg_score)

                if (ep+1) % save_data_cadence == 0:
                    score_stack[-1] = score_hist
                    save_data(score_stack, ep_num, ep, learnrate, gam)

            # Re-attach the complete score list of this run: the entry may have
            # been replaced by a padded copy during a periodic save, which would
            # otherwise hide the episodes played after the last save.
            score_stack[-1] = score_hist
        finally:
            # release the environment even if the run is interrupted
            env.close()

plot_smoothed_scores(score_stack)





beginning run number 0
episode:  0 score: 32.0 , average score 32.0
episode:  1 score: 13.0 , average score 22.5
episode:  2 score: 33.0 , average score 26.0
episode:  3 score: 37.0 , average score 28.8
episode:  4 score: 25.0 , average score 28.0
episode:  5 score: 24.0 , average score 27.3
episode:  6 score: 37.0 , average score 28.7
episode:  7 score: 23.0 , average score 28.0
episode:  8 score: 14.0 , average score 26.4
episode:  9 score: 41.0 , average score 27.9
episode:  10 score: 28.0 , average score 27.9
episode:  11 score: 38.0 , average score 28.8
episode:  12 score: 30.0 , average score 28.8
episode:  13 score: 18.0 , average score 28.1
episode:  14 score: 25.0 , average score 27.9
episode:  15 score: 52.0 , average score 29.4
episode:  16 score: 29.0 , average score 29.4
episode:  17 score: 29.0 , average score 29.3
episode:  18 score: 18.0 , average score 28.7
episode:  19 score: 13.0 , average score 27.9
episode:  20 score: 24.0 , average score 27.8
episode:  21 scor

episode:  175 score: 28.0 , average score 58.6
episode:  176 score: 65.0 , average score 58.8
episode:  177 score: 90.0 , average score 59.4
episode:  178 score: 189.0 , average score 60.9
episode:  179 score: 72.0 , average score 61.1
episode:  180 score: 47.0 , average score 61.1
episode:  181 score: 197.0 , average score 62.2
episode:  182 score: 111.0 , average score 63.0
episode:  183 score: 45.0 , average score 63.2
episode:  184 score: 70.0 , average score 63.4
episode:  185 score: 106.0 , average score 63.7
episode:  186 score: 71.0 , average score 64.0
episode:  187 score: 86.0 , average score 64.8
episode:  188 score: 79.0 , average score 65.3
episode:  189 score: 53.0 , average score 65.5
episode:  190 score: 112.0 , average score 66.3
episode:  191 score: 36.0 , average score 66.6
episode:  192 score: 70.0 , average score 66.9
episode:  193 score: 81.0 , average score 67.2
episode:  194 score: 50.0 , average score 67.2
episode:  195 score: 68.0 , average score 67.5
episode:

episode:  46 score: 41.0 , average score 28.5
episode:  47 score: 47.0 , average score 28.9
episode:  48 score: 20.0 , average score 28.7
episode:  49 score: 40.0 , average score 28.9
episode:  50 score: 31.0 , average score 29.0
episode:  51 score: 57.0 , average score 29.5
episode:  52 score: 48.0 , average score 29.9
episode:  53 score: 25.0 , average score 29.8
episode:  54 score: 36.0 , average score 29.9
episode:  55 score: 32.0 , average score 29.9
episode:  56 score: 95.0 , average score 31.1
episode:  57 score: 76.0 , average score 31.8
episode:  58 score: 98.0 , average score 33.0
episode:  59 score: 71.0 , average score 33.6
episode:  60 score: 79.0 , average score 34.3
episode:  61 score: 82.0 , average score 35.1
episode:  62 score: 83.0 , average score 35.9
episode:  63 score: 47.0 , average score 36.0
episode:  64 score: 139.0 , average score 37.6
episode:  65 score: 55.0 , average score 37.9
episode:  66 score: 32.0 , average score 37.8
episode:  67 score: 60.0 , averag