In [1]:
import tensorflow as tf
import numpy as np
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
sessConfig=tf.ConfigProto(gpu_options=gpu_options)

In [5]:
npV = np.arange(5
               )
npV

array([0, 1, 2, 3, 4])

In [98]:

class RLUtilsTF():

    def createRowVec( self, n:int, initialVals=0., dtype=None, name=None ):
        """[summary]
        
        Arguments:
            n {int} -- [description]
        
        Keyword Arguments:
            initialVals {[type]} -- [description] (default: {0.})
            dtype {[type]} -- [description] (default: {None})
            name {[type]} -- [description] (default: {None})
        
        Returns:
            [type] -- Row vector with a initial value 0.
        """


        if dtype is None:
            return tf.Variable(tf.fill( [n, 1], initialVals ), name=name )
        else:
            return tf.Variable(
                        tf.cast( tf.fill( [n, 1], initialVals ), name=name , dtype=dtype )
            )
    

    def createRowVecfromNumpyVector( self, npV:np.ndarray, dtype=None, name=None ):

        if dtype is not None:
            npV = npV.astype( dtype )
        
        return tf.Variable( tf.reshape(npV, [-1,1]), name=name )

    def createRowVecfromNumpyMatrix( self, npV, dtype=None, name=None ):

        if dtype is not None:
            npV = npV.astype( dtype )
            
        return tf.Variable( npV, name=name  )
    
    def createRandomProbabiltyMatrix( self, n, dtype=tf.float32 ):

        with tf.name_scope( 'rlUtilsP' ):

            vec = tf.get_variable("temp_vec", shape=[n], initializer=tf.zeros_initializer(), dtype=dtype )
            
            ta = tf.TensorArray( dtype=dtype, size=n )

            loopVars = ( 0, ta )

            condition = lambda i, _: i < n

            def body( i, ta ): 
                randomizeVec = vec.assign( tf.random_uniform( shape=[n], seed=39, dtype=dtype ) )
                softmaxVec = tf.divide( randomizeVec, tf.reduce_sum( randomizeVec ) )
                with tf.control_dependencies([randomizeVec, softmaxVec]):
                    return ( i + 1, ta.write(i, softmaxVec ) ) 
            

            _, ta_final = tf.while_loop( condition, body, loopVars )

            return ta_final.stack()


    

In [99]:
rlUtils = RLUtilsTF()


In [102]:
with tf.variable_scope( 'test3', reuse=False):
    b1 = rlUtils.createRowVec(5, dtype=tf.int64)
    b2 = tf.random_uniform( [5], seed=39 )
    a = [1,2]
    b3 = rlUtils.createRandomProbabiltyMatrix(5, dtype=tf.float64)

with tf.Session( config=sessConfig ) as sess:
    sess.run(tf.global_variables_initializer())
    b1_out, b2_out, b3_out = sess.run([b1,b2,b3])
    print( b1_out )
    print(b2_out)
    print(b3_out)


[[0]
 [0]
 [0]
 [0]
 [0]]
[0.9911251  0.5737276  0.6360707  0.15947068 0.9381956 ]
[[3.94010980e-01 3.75632064e-02 2.14421871e-01 2.61124659e-02
  3.27891477e-01]
 [2.73176148e-01 3.49314804e-01 2.75400302e-01 9.70524975e-02
  5.05624875e-03]
 [4.86278559e-01 1.30606528e-01 1.47819623e-01 7.76311423e-03
  2.27532176e-01]
 [1.77969489e-01 1.44348042e-01 3.15334965e-01 3.30558712e-01
  3.17887921e-02]
 [4.29984280e-04 5.59505891e-01 1.68737561e-01 1.94013282e-01
  7.73132826e-02]]


In [104]:
with tf.variable_scope( 'test4', reuse = False ):
    V = rlUtils.createRowVec(5, name='V', dtype=tf.float64)
    R = tf.random_normal([5,1], mean = 0, stddev=1.0, seed=39, name='reward', dtype=tf.float64 )
    P = rlUtils.createRandomProbabiltyMatrix( 5, dtype=tf.float64 )
    gamma = 0.5

In [105]:

class BellmanTF():

    def oneShotMRPEvaluation(self, V, P, R, gamma):
        """Don't call it more than once over the same set of variables. Evaluate the operator whenever needed.
        It is only good for small MRP's with discount < 1, finite reward set, finite state set, 
        static transition probabilities. Caculates all the values of states.
        
        Arguments:
            V {np.variable} -- matrix of shape [n,1] where n is the number of states
            P {np.variable} -- matrix of [n,n] where each row is s and each column is s'. static transition probabilities
            R {np.variable} -- same as the shape of V. finite reward set
            gamma{ same type as P}
        
        Returns:
            
        """

        dnom = tf.subtract(tf.ones_like(P), tf.multiply(P, gamma) )
        invDnom = tf.matrix_inverse(dnom)
        return V.assign(tf.matmul(invDnom, R))

In [106]:
bellman = BellmanTF()
updateV = bellman.oneShotMRPEvaluation( V, P, R, gamma )

In [107]:
with tf.variable_scope( 'test', reuse=True):
    b2 = tf.subtract(tf.ones_like(P), tf.multiply(P, gamma) )
    b3 = tf.matrix_inverse(b2)
    b1 = updateV

with tf.Session( config=sessConfig ) as sess:
    sess.run(tf.global_variables_initializer())
    b1_out, b2_out, b3_out = sess.run([b1,b2,b3])
    print( b1_out )
    print(b2_out)
    print(b3_out)

[[-2.98639307]
 [ 2.87398087]
 [ 2.88728901]
 [-6.10398221]
 [ 3.86017701]]
[[0.80299451 0.9812184  0.89278906 0.98694377 0.83605426]
 [0.86341193 0.8253426  0.86229985 0.95147375 0.99747188]
 [0.75686072 0.93469674 0.92609019 0.99611844 0.88623391]
 [0.91101526 0.92782598 0.84233252 0.83472064 0.9841056 ]
 [0.99978501 0.72024705 0.91563122 0.90299336 0.96134336]]
[[  6.60906436  -0.10416289  -8.35139734  -0.39950945   2.46822754]
 [  2.52949158  -1.73726967  -1.28072452   3.06895584  -2.358231  ]
 [ -9.64247958 -11.33521341  14.20757932   2.41269913   4.57963677]
 [  7.30171541   8.44721377  -7.29896434  -6.68708212  -1.5406605 ]
 [ -6.44301274   4.27162799   2.96881759   2.09942185  -2.67463247]]


In [None]:
with tf.Session( config=sessConfig ) as sess:
    sess.run(tf.global_variables_initializer())
    V_out, V_0, R_out, P_out = sess.run( [updateV, V, R, P] )
    print(V_out)
    print(V_0)
    print(R_out)
    print(P_out)