<a href="https://colab.research.google.com/github/Yogesh7920/Reinforcement-Learning/blob/master/RL_Temporal_Difference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [None]:
import numpy as np
from copy import deepcopy

## Reward Matrix

In [None]:
# Side length of the square grid world.
grid_size = 5

# Integer reward table, zero everywhere except the top-right cell.
reward = np.zeros((grid_size, grid_size), dtype=int)
reward[0, -1] = 1

In [None]:
reward

array([[0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

## Policy Matrix

In [None]:
# Fixed policy: 'R' in every column but the last, 'U' in the last column.
# The top-right cell is then overwritten with 'S', an action for which
# take_action leaves the position unchanged.
policy = [['R'] * (grid_size - 1) + ['U'] for _ in range(grid_size)]
policy[0][-1] = 'S'

In [None]:
# Show the policy grid as a plain ndarray; NumPy no longer recommends np.matrix.
np.array(policy)

matrix([['R', 'R', 'R', 'R', 'S'],
        ['R', 'R', 'R', 'R', 'U'],
        ['R', 'R', 'R', 'R', 'U'],
        ['R', 'R', 'R', 'R', 'U'],
        ['R', 'R', 'R', 'R', 'U']], dtype='<U1')

In [None]:
def take_action(x, y, action):
    """Return the grid position reached from (x, y) by taking `action`.

    x is the row index and y the column index, as used to index the grids.
    'R' moves one column to the right and 'U' moves one row up (row index
    decreases). Any other action (e.g. 'S') leaves the position unchanged.
    No bounds checking is performed.
    """
    if action == 'R':
        return x, y + 1
    if action == 'U':
        return x - 1, y
    return x, y

## Config

In [None]:
# Starting cell: bottom-left corner of the grid (last row, first column).
cur_x = grid_size - 1
cur_y = 0

alpha = 0.9  # step size applied to the TD error
gamma = 0.9  # discount factor applied to the successor state's value
e = 1e-5     # stop iterating once the largest per-sweep change drops below this

# Temporal Difference

In [None]:
def temporal_diff(i, j):
    """Return the TD(0)-updated value estimate for grid cell (i, j).

    Follows the global `policy` to find the successor cell, then moves the
    current estimate in the global table `V` towards the one-step target
    reward[i][j] + gamma * V[successor] by a fraction `alpha`.
    `V` itself is not modified; the caller stores the returned value.
    """
    succ_i, succ_j = take_action(i, j, policy[i][j])
    current = V[i][j]
    td_target = reward[i][j] + gamma * V[succ_i][succ_j]
    return current + alpha * (td_target - current)

In [None]:
# Tabular value estimates, initialised to zero.
V = np.zeros((grid_size, grid_size))
change = float('inf')

while True:
    # Synchronous sweep: temporal_diff reads the old table V while the
    # results are written into a separate copy.
    new_V = V.copy()
    for i, j in np.ndindex(grid_size, grid_size):
        new_V[i][j] = temporal_diff(i, j)

    # Largest absolute change of any cell during this sweep.
    change = np.abs(V - new_V).max()
    if change < e:
        break
    V = new_V

In [None]:
np.round(V, 5)

array([[6.56089, 7.28989, 8.09989, 8.99989, 9.99989],
       [5.90479, 6.56089, 7.28989, 8.09989, 8.99989],
       [5.3143 , 5.90479, 6.56089, 7.28989, 8.09989],
       [4.78286, 5.3143 , 5.90479, 6.56089, 7.28989],
       [4.30456, 4.78286, 5.3143 , 5.90479, 6.56089]])

# Temporal Difference with Function Approximation

In [None]:
def features(x, y):
    """Return the hand-crafted feature vector for cell (x, y) as a (9, 1) array.

    The nine entries are, in order:
    1, x, y, |x - y|, x*y, x^2, y^2, x^2 + y^2, (x + y)^2.
    """
    x_sq, y_sq = x**2, y**2
    raw = [1, x, y, abs(x - y), x * y, x_sq, y_sq, x_sq + y_sq, (x + y)**2]
    return np.array(raw).reshape(9, 1)

def features_norm(x, y):
    """Return features(x, y) scaled to unit Euclidean (L2) norm, shape (9, 1)."""
    raw = features(x, y)
    scale = np.linalg.norm(raw)
    return raw / scale

In [None]:
def temporal_diff_fun(i, j, weights):
    """Return the TD error at cell (i, j) under a linear value function.

    The value of a cell is the dot product of `weights` with its normalised
    feature vector. The successor cell is obtained by following the global
    `policy`. The result is
    reward[i, j] + gamma * value(successor) - value(current).
    """
    succ_i, succ_j = take_action(i, j, policy[i][j])
    value_here = np.dot(features_norm(i, j).flatten(), weights)
    value_next = np.dot(features_norm(succ_i, succ_j).flatten(), weights)
    return reward[i, j] + (gamma * value_next - value_here)



In [None]:
# Hyper-parameters for the function-approximation run. These rebind the
# module-level names assigned in the earlier config cell.
#   alpha: step size, gamma: discount factor, e: convergence threshold
alpha, gamma, e = 0.9, 0.95, 1e-5

In [None]:
# Draw the initial weights from a seeded generator so that a fresh
# Restart & Run All reproduces the same learned weight vector.
SEED = 0
rng = np.random.default_rng(SEED)
weights = rng.standard_normal(9)  # one weight per entry of features(x, y)
change = float('inf')

while True:
    # Snapshot taken before the sweep, used only for the convergence check.
    old_weights = weights.copy()

    # One sweep over every cell. Weights are updated in place, so later
    # cells in the same sweep already see the updated weights.
    for i in range(grid_size):
        for j in range(grid_size):
            td_error = temporal_diff_fun(i, j, weights)  # scalar TD error
            feats = features_norm(i, j).flatten()
            # Semi-gradient TD(0) step for a linear value function.
            weights += alpha * (td_error * feats)

    # Stop once no weight moved by more than `e` over a full sweep.
    change = np.max(np.abs(weights - old_weights))
    if change < e:
        break



In [None]:
weights

array([ 7.8831539 ,  0.34978433, -4.53027617, -0.1849415 , -0.3811357 ,
        0.21082104,  6.83780154,  5.50629884,  4.75281316])

In [None]:
# Rebuild the value table from the learned weights: each cell's value is the
# dot product of `weights` with that cell's normalised feature vector.
# The dot product has shape (1,), so its single element is extracted.
V = np.array([
    [np.dot(weights, features_norm(i, j)).flatten()[0] for j in range(grid_size)]
    for i in range(grid_size)
])

In [None]:
np.round(V, 5)

array([[7.88315, 8.27309, 8.85322, 9.11936, 9.27756],
       [7.55991, 7.92192, 8.30126, 8.65294, 8.89935],
       [6.63492, 7.00538, 7.53419, 7.9925 , 8.34583],
       [6.33909, 6.6405 , 7.0719 , 7.50458, 7.87082],
       [6.22083, 6.46656, 6.80763, 7.17258, 7.51446]])