# Neural Network for estimating policy trained using policy_gradient method
-------------------------------------------------------------------------------------------------------------------
This notebook solves the cart-pole task using a neural-network policy that is trained with the policy-gradient method.

In [1]:
import tensorflow as tf
import gym
import numpy as np
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters


Define a function that plays one episode of the cart-pole game and returns the visited states together with their corresponding returns. Note that, for playing the game, we use the neural network itself to choose the actions :). Yes! It's like Inception: the neural network plays the game in order to get better at it.

#### Please don't mind the long function names :( I am rather used to them ...

In [2]:
def get_environment():
    """Create and return a new instance of the 'CartPole-v0' gym environment."""
    return gym.make('CartPole-v0')

In [3]:
def play_one_cart_pole_episode(model, dis_gamma=0.1):
    """
        play a single episode of the cart-pole game in order to generate the learning data
        @args:
            model: neural network object used to predict the action; model.predict(obs)
                   must yield one probability per available action
            dis_gamma: discount factor (gamma) applied to future rewards while
                       computing the returns
        @returns:
            experience => (states, returns): tuple of two equally long lists, where
                          returns[t] is the discounted return (**Not reward) obtained
                          from states[t] onwards; the final state has a return of 0
    """
    # obtain the cart_pole environment
    env = get_environment()

    try:
        # reset environment to obtain the first set of observations
        obs = env.reset()

        # initialize the states and rewards lists
        states = [obs]
        rewards = []  # note that initial state has no reward corresponding to it

        # play the game until it ends
        done = False
        while not done:
            # flatten the prediction so that a (1, n_actions) batch output is
            # accepted as well as a plain 1-D probability vector
            action_probs = np.ravel(model.predict(obs))

            # sample an action index according to the predicted probabilities
            action = np.random.choice(len(action_probs), p=action_probs)

            # take the action on the environment
            obs, reward, done, _ = env.step(action)

            # append the state and reward to appropriate lists
            states.append(obs)
            rewards.append(reward)
    finally:
        # always release the environment's resources, even if the episode fails
        env.close()

    # now that we have the rewards, calculate the returns from them
    # Note that the return for a state is the sum of the rewards received from
    # that state onwards, discounted by the discount factor gamma:
    # G(t) = r + (gamma * G(t + 1))

    # initialize the returns list **Note that the last state has a return of 0
    returns = [0]

    # calculate the returns in reverse order since it is efficient to do so
    for reward in reversed(rewards):
        returns.append(reward + (dis_gamma * returns[-1]))

    # the list was built back-to-front; put it in chronological order so that
    # returns[t] lines up with states[t]
    returns.reverse()

    # return the calculated lists
    return states, returns