In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import gym

In [None]:
# Shared CartPole-v0 environment. cartpole.run_simulations reads this
# module-level `env` directly (env.reset() / env.step()), so it must be
# created before any simulation is run.
env = gym.make('CartPole-v0')

In [None]:
class cartpole:
    """Logistic-policy agent for a CartPole environment.

    The policy moves right (action 1) with probability
    ``sigmoid(observation . theta)`` and left (action 0) otherwise.

    Attributes are stored exactly as passed to the constructor:
      theta       -- policy weight vector (dotted with an observation)
      gamma       -- decay factor applied to future rewards
      train_param -- step size applied to the accumulated update
    """

    def __init__(self, theta, gamma, train_param):
        self.theta = theta
        self.gamma = gamma
        self.train_param = train_param

    def decaysum(self, rewards):
        """Return sum_k gamma**k * rewards[k] for a 1-D reward array.

        An empty array yields 0.0.
        """
        length = rewards.shape[0]
        # Float exponents keep the weights floating point even when gamma
        # is a Python int.
        weights = self.gamma ** np.arange(length, dtype=float)
        return np.dot(rewards, weights)

    def action_simple(self, observation):
        """Move left (0) if observation[2] is negative, else right (1)."""
        return 0 if observation[2] < 0 else 1

    def action_sigmoid(self, observation, theta):
        """Sample an action: right (1) with probability sigmoid, else left (0).

        The probability is ``1 / (1 + exp(-observation . theta))``; one
        uniform random number is drawn from NumPy's global generator.
        """
        prob_right = 1 / (1 + np.exp(-np.dot(observation, theta)))
        return 1 if np.random.rand() < prob_right else 0

    def update_theta(self, theta, actions, states, fwd_rewards):
        """Return the accumulated update direction for theta over one episode.

        For every recorded step the contribution is
        ``sign * (1 - sigmoid(state . theta)) * state`` where ``sign`` is
        +1 for action 0, -1 for action 1 and 0 for any other value.

        Parameters:
          theta       -- weight vector used to evaluate the sigmoid
          actions     -- 1-D array of actions, one per step
          states      -- 2-D array, one observation per row
          fwd_rewards -- accepted for interface compatibility

        Returns a 1-D array with one entry per state feature.
        """
        # NOTE(review): fwd_rewards is not used in the update. A
        # reward-weighted policy gradient would presumably scale each step
        # by it -- confirm the intended update rule before changing this.
        actions = np.asarray(actions)
        num_steps = actions.shape[0]
        states = np.asarray(states, dtype=float)[:num_steps]

        sign = (actions == 0).astype(float) - (actions == 1).astype(float)
        # Evaluate the sigmoid on each recorded state. The previous code
        # referenced an undefined name `observation` here.
        prob_right = 1 / (1 + np.exp(-np.dot(states, theta)))
        return np.dot(sign * (1 - prob_right), states)

    def run_simulations(self, num_episodes, episode_length):
        """Run simulations and find direction in which to update theta.

        Plays ``num_episodes`` episodes of at most ``episode_length`` steps
        on the module-level ``env`` using the sigmoid policy, sums the
        per-episode updates, and prints: the mean number of steps survived,
        the summed update, and ``theta + train_param * update``.
        ``self.theta`` itself is not modified and nothing is returned.
        """
        theta = self.theta
        train_param = self.train_param
        update_all = np.zeros(4)

        # Shows the number of steps until the pole falls. If the
        # pole does not fall, we want it to give the episode_length
        success_array = episode_length * np.ones(num_episodes)

        for episode in range(num_episodes):
            # Relies on env.reset() returning the observation and
            # env.step() returning a 4-tuple.
            observation = env.reset()

            # Keep log of the states, actions, and rewards. Rows after an
            # early termination stay zero and so contribute nothing.
            states = np.zeros(shape=(episode_length, 4))
            actions = np.zeros(shape=(episode_length))
            rewards = np.zeros(shape=(episode_length))

            # Run each episode and record relevant data
            for step in range(episode_length):
                states[step] = observation
                action = self.action_sigmoid(observation, theta)
                observation, reward, done, info = env.step(action)
                actions[step] = action
                rewards[step] = reward

                # Finish the loop if pole has tilted too far
                if done:
                    success_array[episode] = step + 1
                    break

            # Total (decaying) future reward from each step onwards
            fwd_rewards = np.array(
                [self.decaysum(rewards[step:]) for step in range(episode_length)]
            )

            update = self.update_theta(theta, actions, states, fwd_rewards)
            update_all = update_all + update

        print(np.mean(success_array))
        print(update_all)
        print(theta + train_param * update_all)