# Comparison of Solutions for Markov Decision Processes

In this project, we will be comparing a variety of classic dynamic programming and linear programming methods. From the dynamic programming realm, the solution methods are:
-  Value Iteration
-  Policy Iteration

and for linear programming methods:
-  First order methods
-  Interior point methods
-  Simplex methods

## Introduction

Markov Decision Processes (MDPs) provide a mathematical formulation for stochastic decision making

In [15]:
import numpy as np
import math as m
import numpy.linalg as la
from scipy.optimize import linprog
import numpy.random as rn
import matplotlib.pyplot as plt

Below is an example of a candidate Grid World that will be solved using the above methods.

## Generating the MDP

In [6]:
# This is the grid world generator with 5 possible actions for the agent i.e. (left,right,up,down,loop).
# Inputs:
# Row: Number of rows of the grid world
# Col: Number of columns of the grid world
# Prob: Probability of taking the desired action
# Output:
# P_0: Transition matrix with dimensions 5 x (Row x Col) x (Row x Col)  

# When an action is chosen by the agent, it is performed with probability Prob and the remaining (1-Prob) probability
# is distributed among other actions. This property is included to introduce stochasticity.
# Output matrix is formed in a way that 0th row of the matrix represent transitions from the  bottom left corner 
# of the grid world. Similarly, the last row is for the upper right corner ((Row x Col)th grid).

def Grid_world(Row,Col,Prob):
    State=Row*Col
    Actions=5
    np.random.seed(0)
    prob=Prob
    P_0=np.zeros((Actions,State,State))
    #action left
    for i in range(P_0.shape[1]):
            if i%Col==0:
                if i!=0 and i!=State-Col:
                    P_0[0,i,i]=prob+(1-prob)/4
                    P_0[0,i,i+1]=(1-prob)/4
                    P_0[0,i,i+Col]=(1-prob)/4
                    P_0[0,i,i-Col]=(1-prob)/4
                if i==0:
                    P_0[0,i,i]=prob+(1-prob)/4+(1-prob)/4
                    P_0[0,i,i+1]=(1-prob)/4
                    P_0[0,i,i+Col]=(1-prob)/4
                if i==State-Col:
                    P_0[0,i,i]=prob+(1-prob)/4+(1-prob)/4
                    P_0[0,i,i+1]=(1-prob)/4
                    P_0[0,i,i-Col]=(1-prob)/4
            elif i%Col==Col-1:
                if i!=Col-1 and i!=State-1:
                    P_0[0,i,i]=(1-prob)/4+(1-prob)/4
                    P_0[0,i,i-1]=prob
                    P_0[0,i,i+Col]=(1-prob)/4
                    P_0[0,i,i-Col]=(1-prob)/4
                if i==Col-1:
                    P_0[0,i,i]=(1-prob)/4+(1-prob)/4+(1-prob)/4
                    P_0[0,i,i-1]=prob
                    P_0[0,i,i+Col]=(1-prob)/4
                if i==State-1:
                    P_0[0,i,i]=(1-prob)/4+(1-prob)/4+(1-prob)/4
                    P_0[0,i,i-1]=prob
                    P_0[0,i,i-Col]=(1-prob)/4
            elif i>0 and i<Col-1:
                P_0[0,i,i]=(1-prob)/4+(1-prob)/4
                P_0[0,i,i-1]=prob
                P_0[0,i,i+1]=(1-prob)/4
                P_0[0,i,i+Col]=(1-prob)/4
            elif i>(Row-1)*Col and i<State-1:
                P_0[0,i,i]=(1-prob)/4+(1-prob)/4
                P_0[0,i,i-1]=prob
                P_0[0,i,i+1]=(1-prob)/4
                P_0[0,i,i-Col]=(1-prob)/4
            else:
                P_0[0,i,i]=(1-prob)/4
                P_0[0,i,i-1]=prob
                P_0[0,i,i+1]=(1-prob)/4
                P_0[0,i,i-Col]=(1-prob)/4
                P_0[0,i,i+Col]=(1-prob)/4
    # action right
    for i in range(P_0.shape[1]):
            if i%Col==0:
                if i!=0 and i!=State-Col:
                    P_0[1,i,i]=(1-prob)/4+(1-prob)/4
                    P_0[1,i,i+1]=prob
                    P_0[1,i,i+Col]=(1-prob)/4
                    P_0[1,i,i-Col]=(1-prob)/4
                if i==0:
                    P_0[1,i,i]=(1-prob)/4+(1-prob)/4+(1-prob)/4
                    P_0[1,i,i+1]=prob
                    P_0[1,i,i+Col]=(1-prob)/4
                if i==State-Col:
                    P_0[1,i,i]=(1-prob)/4+(1-prob)/4+(1-prob)/4
                    P_0[1,i,i+1]=prob
                    P_0[1,i,i-Col]=(1-prob)/4
            elif i%Col==Col-1:
                if i!=Col-1 and i!=State-1:
                    P_0[1,i,i]=prob+(1-prob)/4
                    P_0[1,i,i-1]=(1-prob)/4
                    P_0[1,i,i+Col]=(1-prob)/4
                    P_0[1,i,i-Col]=(1-prob)/4
                if i==Col-1:
                    P_0[1,i,i]=prob+(1-prob)/4+(1-prob)/4
                    P_0[1,i,i-1]=(1-prob)/4
                    P_0[1,i,i+Col]=(1-prob)/4
                if i==State-1:
                    P_0[1,i,i]=prob+(1-prob)/4+(1-prob)/4
                    P_0[1,i,i-1]=(1-prob)/4
                    P_0[1,i,i-Col]=(1-prob)/4
            elif i>0 and i<Col-1:
                P_0[1,i,i]=(1-prob)/4+(1-prob)/4
                P_0[1,i,i-1]=(1-prob)/4
                P_0[1,i,i+1]=prob
                P_0[1,i,i+Col]=(1-prob)/4
            elif i>(Row-1)*Col and i<State-1:
                P_0[1,i,i]=(1-prob)/4+(1-prob)/4
                P_0[1,i,i-1]=(1-prob)/4
                P_0[1,i,i+1]=prob
                P_0[1,i,i-Col]=(1-prob)/4
            else:
                P_0[1,i,i]=(1-prob)/4
                P_0[1,i,i-1]=(1-prob)/4
                P_0[1,i,i+1]=prob
                P_0[1,i,i-Col]=(1-prob)/4
                P_0[1,i,i+Col]=(1-prob)/4
    # action up
    for i in range(P_0.shape[1]):
            if i%Col==0:
                if i!=0 and i!=State-Col:
                    P_0[2,i,i]=(1-prob)/4+(1-prob)/4
                    P_0[2,i,i+1]=(1-prob)/4
                    P_0[2,i,i+Col]=prob
                    P_0[2,i,i-Col]=(1-prob)/4
                if i==0:
                    P_0[2,i,i]=(1-prob)/4+(1-prob)/4+(1-prob)/4
                    P_0[2,i,i+1]=(1-prob)/4
                    P_0[2,i,i+Col]=prob
                if i==State-Col:
                    P_0[2,i,i]=prob+(1-prob)/4+(1-prob)/4
                    P_0[2,i,i+1]=(1-prob)/4
                    P_0[2,i,i-Col]=(1-prob)/4
            elif i%Col==Col-1:
                if i!=Col-1 and i!=State-1:
                    P_0[2,i,i]=(1-prob)/4+(1-prob)/4
                    P_0[2,i,i-1]=(1-prob)/4
                    P_0[2,i,i+Col]=prob
                    P_0[2,i,i-Col]=(1-prob)/4
                if i==Col-1:
                    P_0[2,i,i]=(1-prob)/4+(1-prob)/4+(1-prob)/4
                    P_0[2,i,i-1]=(1-prob)/4
                    P_0[2,i,i+Col]=prob
                if i==State-1:
                    P_0[2,i,i]=prob+(1-prob)/4+(1-prob)/4
                    P_0[2,i,i-1]=(1-prob)/4
                    P_0[2,i,i-Col]=(1-prob)/4
            elif i>0 and i<Col-1:
                P_0[2,i,i]=(1-prob)/4+(1-prob)/4
                P_0[2,i,i-1]=(1-prob)/4
                P_0[2,i,i+1]=(1-prob)/4
                P_0[2,i,i+Col]=prob
            elif i>(Row-1)*Col and i<State-1:
                P_0[2,i,i]=prob+(1-prob)/4
                P_0[2,i,i-1]=(1-prob)/4
                P_0[2,i,i+1]=(1-prob)/4
                P_0[2,i,i-Col]=(1-prob)/4
            else:
                P_0[2,i,i]=(1-prob)/4
                P_0[2,i,i-1]=(1-prob)/4
                P_0[2,i,i+1]=(1-prob)/4
                P_0[2,i,i-Col]=(1-prob)/4
                P_0[2,i,i+Col]=prob
    # action down
    for i in range(P_0.shape[1]):
            if i%Col==0:
                if i!=0 and i!=State-Col:
                    P_0[3,i,i]=(1-prob)/4+(1-prob)/4
                    P_0[3,i,i+1]=(1-prob)/4
                    P_0[3,i,i+Col]=(1-prob)/4
                    P_0[3,i,i-Col]=prob
                if i==0:
                    P_0[3,i,i]=prob+(1-prob)/4+(1-prob)/4
                    P_0[3,i,i+1]=(1-prob)/4
                    P_0[3,i,i+Col]=(1-prob)/4
                if i==State-Col:
                    P_0[3,i,i]=(1-prob)/4+(1-prob)/4+(1-prob)/4
                    P_0[3,i,i+1]=(1-prob)/4
                    P_0[3,i,i-Col]=prob
            elif i%Col==Col-1:
                if i!=Col-1 and i!=State-1:
                    P_0[3,i,i]=(1-prob)/4+(1-prob)/4
                    P_0[3,i,i-1]=(1-prob)/4
                    P_0[3,i,i+Col]=(1-prob)/4
                    P_0[3,i,i-Col]=prob
                if i==Col-1:
                    P_0[3,i,i]=prob+(1-prob)/4+(1-prob)/4
                    P_0[3,i,i-1]=(1-prob)/4
                    P_0[3,i,i+Col]=(1-prob)/4
                if i==State-1:
                    P_0[3,i,i]=(1-prob)/4+(1-prob)/4+(1-prob)/4
                    P_0[3,i,i-1]=(1-prob)/4
                    P_0[3,i,i-Col]=prob
            elif i>0 and i<Col-1:
                P_0[3,i,i]=prob+(1-prob)/4
                P_0[3,i,i-1]=(1-prob)/4
                P_0[3,i,i+1]=(1-prob)/4
                P_0[3,i,i+Col]=(1-prob)/4
            elif i>(Row-1)*Col and i<State-1:
                P_0[3,i,i]=(1-prob)/4+(1-prob)/4
                P_0[3,i,i-1]=(1-prob)/4
                P_0[3,i,i+1]=(1-prob)/4
                P_0[3,i,i-Col]=prob
            else:
                P_0[3,i,i]=(1-prob)/4
                P_0[3,i,i-1]=(1-prob)/4
                P_0[3,i,i+1]=(1-prob)/4
                P_0[3,i,i-Col]=prob
                P_0[3,i,i+Col]=(1-prob)/4
    # action loop
    for i in range(P_0.shape[1]):
            if i%Col==0:
                if i!=0 and i!=State-Col:
                    P_0[4,i,i]=prob+(1-prob)/4
                    P_0[4,i,i+1]=(1-prob)/4
                    P_0[4,i,i+Col]=(1-prob)/4
                    P_0[4,i,i-Col]=(1-prob)/4
                if i==0:
                    P_0[4,i,i]=prob+(1-prob)/4+(1-prob)/4
                    P_0[4,i,i+1]=(1-prob)/4
                    P_0[4,i,i+Col]=(1-prob)/4
                if i==State-Col:
                    P_0[4,i,i]=prob+(1-prob)/4+(1-prob)/4
                    P_0[4,i,i+1]=(1-prob)/4
                    P_0[4,i,i-Col]=(1-prob)/4
            elif i%Col==Col-1:
                if i!=Col-1 and i!=State-1:
                    P_0[4,i,i]=prob+(1-prob)/4
                    P_0[4,i,i-1]=(1-prob)/4
                    P_0[4,i,i+Col]=(1-prob)/4
                    P_0[4,i,i-Col]=(1-prob)/4
                if i==Col-1:
                    P_0[4,i,i]=prob+(1-prob)/4+(1-prob)/4
                    P_0[4,i,i-1]=(1-prob)/4
                    P_0[4,i,i+Col]=(1-prob)/4
                if i==State-1:
                    P_0[4,i,i]=prob+(1-prob)/4+(1-prob)/4
                    P_0[4,i,i-1]=(1-prob)/4
                    P_0[4,i,i-Col]=(1-prob)/4
            elif i>0 and i<Col-1:
                P_0[4,i,i]=prob+(1-prob)/4
                P_0[4,i,i-1]=(1-prob)/4
                P_0[4,i,i+1]=(1-prob)/4
                P_0[4,i,i+Col]=(1-prob)/4
            elif i>(Row-1)*Col and i<State-1:
                P_0[4,i,i]=prob+(1-prob)/4
                P_0[4,i,i-1]=(1-prob)/4
                P_0[4,i,i+1]=(1-prob)/4
                P_0[4,i,i-Col]=(1-prob)/4
            else:
                P_0[4,i,i]=prob
                P_0[4,i,i-1]=(1-prob)/4
                P_0[4,i,i+1]=(1-prob)/4
                P_0[4,i,i-Col]=(1-prob)/4
                P_0[4,i,i+Col]=(1-prob)/4
    return P_0

## Value Iteration

## Policy Iteration

## Simplex Method

In [23]:
Rows=3
Columns=5
States=Rows*Columns
P_0=Grid_world(Rows,Columns,0.8)
A=np.zeros((5*States,States)) # 5 is number of actions !
b=np.zeros(5*States)
c=np.ones(States)
# Constraints  Ax<=b 
for a in range(5):
    A[a*States:(a+1)*States,:]=np.eye(States)-P_0[a,:,:]
for a in range(5):
    b[(a+1)*(States-1)]=1
print(A.shape[0])
print(b.shape)
res=linprog(c, A, b, A_eq=None, b_eq=None, bounds=None, method='simplex', callback=None, options={'disp': False, 'bland': False, 'tol': 1e-12, 'maxiter': 1000})
print(res)

Simplex(P_0)

75
(75,)
     fun: -0.0
 message: 'Optimization terminated successfully.'
     nit: 0
   slack: array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.])
  status: 0
 success: True
       x: array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.])


ValueError: Invalid input for linprog with method = 'simplex'.  The number of rows in A_ub must be equal to the number of values in b_ub

## First Order Methods

### Gradient Ascent

### Accelerated Gradient Ascent

## Interior Point Methods