In [43]:
import numpy as np
import time
import random
# Grid definition: 16 states numbered 0..15 (a 4x4 board, row by row).

# Integer codes for the four actions
up, down, left, right = 0, 1, 2, 3

# Number of states and number of actions
noS = 4 * 4
noA = 4

# Iterable over every state index
S = range(noS)

# Reward constants
step_reward = -1     # charged for every ordinary step
wall_reward = -10
goal_reward = 0


def terminal_state(s):
    """Return True when `s` is the terminal state (only state 15)."""
    return s == 15

def update_P(wall):
    """Build the deterministic transition table for the 4x4 grid.

    Parameters
    ----------
    wall : container of int
        State indices that cannot be entered (tested with ``in``).

    Returns
    -------
    dict
        ``P[s][a] = (next_state, probability, reward)`` for every state ``s``
        in ``S`` and every action ``a`` in (up, down, left, right).

    Rules, applied in this order for each non-terminal state and action:
      * the move enters a terminal state -> go there, reward ``goal_reward``;
      * the move enters a wall cell      -> stay put, reward ``wall_reward``;
      * anything else (including bumping into the board edge, which leaves
        the state unchanged)             -> reward ``step_reward``.
    Terminal states are absorbing: every action keeps the state with reward 0.0.
    """
    P = dict()  # transition table

    for s in S:
        if terminal_state(s):
            P[s] = {a: (s, 1.0, 0.0) for a in (up, down, right, left)}
            continue

        # Candidate successor for each action; a move that would leave the
        # 4x4 board keeps the agent in its current cell.
        moves = (
            (up, s if s < 4 else s - 4),
            (down, s if 16 - s <= 4 else s + 4),
            (right, s if (s + 1) % 4 == 0 else s + 1),
            (left, s if s % 4 == 0 else s - 1),
        )

        P[s] = dict()
        for a, next_s in moves:
            if terminal_state(next_s):
                P[s][a] = (next_s, 1.0, goal_reward)
            elif next_s in wall:
                # Use the shared constant rather than a literal -10 so that
                # changing wall_reward actually changes the environment.
                P[s][a] = (s, 1.0, wall_reward)
            else:
                P[s][a] = (next_s, 1.0, step_reward)

    return P
        
#wall=[5,7,9]
#P=update_P(wall)
        
print 'No. of states in grid: ', noS
print 'No. of action options in each state:', noA

Action_Index=dict()
Action_Index[0]='up'
Action_Index[1]='down'
Action_Index[2]='left'
Action_Index[3]='right'
Action_Index[5]='terminal states (stay)'
Action_Index[7]='wall'

print 'Index for actions:'
for k,v in Action_Index.items():
    print k,":",v

No. of states in grid:  16
No. of action options in each state: 4
Index for actions:
0 : up
1 : down
2 : left
3 : right
5 : terminal states (stay)
7 : wall


In [34]:
def e_greedy(e,Q_s):
    """Choose an action index with an epsilon-greedy rule.

    Parameters
    ----------
    e : float
        Exploration probability in [0, 1].
    Q_s : sequence of float
        Action values of the current state, one entry per action.

    Returns
    -------
    int
        With probability ``e`` a uniformly random index into ``Q_s``;
        otherwise the index of the largest entry (first one on ties).
    """
    # Compare against a continuous uniform draw so that any epsilon is
    # honoured exactly; drawing an integer in 1..10 rounded epsilon down to
    # a multiple of 0.1 (e.g. 0.05 never explored).
    if random.random() < e:
        return random.randrange(len(Q_s))
    return np.argmax(Q_s)
    

In [45]:

def q_learning(P,no_episodes,no_steps,alpha,discount,epsilon):
    """Tabular Q-learning on the transition table ``P``.

    Each episode starts in a uniformly random state and runs until state 15
    is reached or ``no_steps`` moves have been made. Actions come from
    ``e_greedy`` with exploration rate ``epsilon``; ``alpha`` is the learning
    rate and ``discount`` the discount factor.

    Prints the number of episodes that ended outside a terminal state and
    returns the ``(noS, noA)`` array of learned action values.
    """
    Q = np.zeros((noS, noA))
    fails = 0

    for _ in range(no_episodes):
        # Random starting point (may be any state, including wall cells).
        state = random.randrange(noS)

        for _step in range(no_steps):
            if state == 15:
                break
            action = e_greedy(epsilon, Q[state])
            next_state, _prob, reward = P[state][action]
            # Bootstrapped target uses the best action value of the successor.
            td_error = reward + discount * np.max(Q[next_state]) - Q[state][action]
            Q[state][action] = Q[state][action] + alpha * td_error
            state = next_state

        if not terminal_state(state):
            fails += 1

    print(fails)
    return Q

#Testing
wall=[4,6]
P=update_P(wall)

fixedObstacles=True
result_q=q_learning(P,2000,200,0.1,0.7,0.1)
print result_q

policy=np.zeros((noS))
bestQ=np.zeros((noS))

for s in range(noS):
    if terminal_state(s):
        policy[s]=5
        bestQ[s]=5
    elif s in wall and fixedObstacles==True:
        policy[s]=7
        bestQ[s]=7
    else:
        policy[s]=np.argmax(result_q[s])
        bestQ[s]=max(result_q[s])

print 'Best Policy with Q Learning'
print policy.reshape(4,4)
    
print 'Corresponding Values for Q Learning'
print bestQ.reshape(4,4)

print 'Index for actions:'
for k,v in Action_Index.items():
    print k,":",v

0
[[-2.81964814 -4.55797742 -2.8091894  -2.77308975]
 [-2.64572124 -2.53299882 -2.6910316  -2.53299871]
 [-2.38056985 -7.80508427 -2.31210118 -2.19      ]
 [-1.99134258 -1.7        -2.28084574 -2.02723707]
 [-2.58990513 -2.52926196 -5.21737441 -2.52896851]
 [-2.57258985 -2.19       -6.92142885 -8.46064548]
 [-2.03997537 -1.68286113 -1.93953083 -1.68166331]
 [-1.9224856  -1.         -9.24408012 -1.5577243 ]
 [-7.34279169 -2.18997721 -2.32021694 -2.18997733]
 [-2.33425528 -1.7        -2.3086754  -1.7       ]
 [-9.51878055 -1.         -1.95863069 -1.        ]
 [-1.53606311  0.         -1.5458317  -0.93538918]
 [-2.03123803 -1.93922286 -1.96160406 -1.7       ]
 [-1.997358   -1.62565254 -1.92147159 -1.        ]
 [-1.50205309 -0.90152291 -1.60772148  0.        ]
 [ 0.          0.          0.          0.        ]]
Best Policy with Q Learning
[[ 3.  3.  3.  1.]
 [ 7.  1.  7.  1.]
 [ 1.  3.  3.  1.]
 [ 3.  3.  3.  5.]]
Corresponding Values for Q Learning
[[-2.77308975 -2.53299871 -2.19       -1

In [17]:
#Value Iteration

def value_iteration(P,discount,threshold,wall_states=None):
    """Solve the grid MDP described by ``P`` with in-place value iteration.

    Parameters
    ----------
    P : dict
        ``P[s][a] = (next_state, probability, reward)``.
    discount : float
        Discount factor applied to the successor's value.
    threshold : float
        Sweeps stop once the largest value change in a sweep is below this.
    wall_states : sequence of int, optional
        States whose value is overwritten with 13 in the returned grid.
        When omitted, the module-level ``wall`` is used, which is what the
        function always did before this parameter existed.

    Returns
    -------
    (policy, value)
        ``policy`` is a ``(noS, noA)`` array with a single 1 per non-terminal
        row marking the greedy action found in the final sweep (terminal rows
        stay all zero); ``value`` is the state-value array reshaped to 4x4.
    """
    if wall_states is None:
        # Backward-compatible fallback to the notebook-level wall list.
        wall_states = wall

    value = np.zeros((noS,))

    while True:
        new_policy = np.zeros([noS, noA])
        change = 0
        for s in S:
            if terminal_state(s):
                continue  # terminal value stays at 0

            previous = value[s]
            action_values = np.zeros(noA)
            for a in range(noA):
                next_state, probability, reward = P[s][a]
                action_values[a] = probability * (reward + discount * value[next_state])

            # Values are updated in place, so later states in the same sweep
            # already see this state's new value.
            value[s] = np.amax(action_values)
            new_policy[s][np.argmax(action_values)] = 1

            change = max(change, np.abs(previous - value[s]))

        if change < threshold:
            break

    # Overwrite wall cells with the constant 13 (apparently a display marker).
    value[wall_states] = 13
    return new_policy, value.reshape(4, 4)

start=time.clock()
discount=0.9
threshold=0.0001
best_policy,corr_value=value_iteration(P,discount,threshold)
end=time.clock()

show_best_policy=np.zeros(noS,)
for s,p_s in enumerate(best_policy):
    if terminal_state(s):
        show_best_policy[s]=5
    elif s in wall:
        show_best_policy[s]=7
    else:
        show_best_policy[s]=np.argmax(p_s)
        
print wall    
print 'Best policy with Value Iteration is'
print show_best_policy.reshape(4,4)
print 'Corresponding Value Function is'
print corr_value.reshape(4,4)
print 'Time taken'
print end-start

[0 8 5]
Best policy with Value Iteration is
[[ 7.  3.  1.  2.]
 [ 1.  7.  1.  1.]
 [ 7.  1.  1.  1.]
 [ 3.  3.  3.  5.]]
Corresponding Value Function is
[[ 13.     -3.439  -2.71   -3.439]
 [ -3.439  13.     -1.9    -1.   ]
 [ 13.     -1.9    -1.      0.   ]
 [ -1.9    -1.      0.      0.   ]]
Time taken
0.00294861117618


In [109]:
import random

def td0(P,policy,no_episodes,alpha,discount,no_steps=200):
    """Estimate state values of a fixed policy with TD(0).

    Parameters
    ----------
    P : dict
        ``P[s][a] = (next_state, probability, reward)``.
    policy : sequence
        ``policy[s]`` is the action taken in state ``s``. Entries that are
        not a key of ``P[s]`` (for example the display codes 5 and 7 that the
        notebook writes into its policy arrays) are treated as "no move
        available" and end the episode.
    no_episodes : int
        Number of episodes; each starts in a uniformly random state.
    alpha : float
        Learning rate.
    discount : float
        Discount factor.
    no_steps : int, optional
        Maximum number of moves per episode (default 200).

    Returns
    -------
    numpy.ndarray
        Array of length ``noS`` with the estimated state values.
    """
    value = np.zeros((noS,))

    for _ in range(no_episodes):
        state = random.randrange(noS)
        for _step in range(no_steps):
            # Use the shared terminal test so this agrees with the transition
            # table; state 0 is an ordinary state there and must be learned.
            if terminal_state(state):
                break
            action = policy[state]
            if action not in P[state]:
                # Policy holds a marker instead of a real action: stop here
                # rather than fail with a KeyError.
                break
            next_state, _prob, reward = P[state][action]
            value[state] = value[state] + alpha * (reward + discount * value[next_state] - value[state])
            state = next_state
    return value

temp = np.random.normal(7,3,3)
global wall
wall = temp.astype(int)
update_P(wall)
print wall
test_policy=policy
print policy.reshape(4,4)
result_value=td0(P,test_policy,1000,0.1,0.7)
    
print 'values'
print result_value.reshape(4,4)

[3 5 4]
[[ 5.  2.  2.  3.]
 [ 0.  0.  1.  1.]
 [ 0.  1.  1.  1.]
 [ 2.  3.  3.  5.]]
values
[[ 0.          9.99999999  5.98515291 -3.33333333]
 [ 9.99996013  5.97952064  3.16225205  5.97399671]
 [ 5.95677926  3.19708384  5.99987169  9.99987296]
 [-3.33333333  5.99999614 10.          0.        ]]
