In [45]:
from gurobipy import *
import numpy as np

'''
Modelling an example that was used in the AAAI-17 paper:
x' = min(1, max(0, 0.2x + 0.7a))
    *in the paper this was x' = max(1, min(0, 0.2x + 0.7a)) but x' would always be 1? 
z' = Normal(x, a), a > 0 
r' = if(x < z') then x'
     else 1 - x' 
NOTES:
    - x is a state variable
    - a is an action variable
    - z is an intermediate variable 
    - r is the reward
'''

#Set up problem as one step planning -> what gives the best reward RIGHT NOW (not optimal) 

# Initial state
x = 0
totReward = 0.0

# Iteration set up
horizon = 5
i = 0

# Create a new optimization problem for every time step in the horizon
while i < horizon:

    # Create model
    model = Model("AAAI17-MILP")
    model.setParam('OutputFlag', 0)

    # Create variables
    xprime = model.addVar(lb = 0, ub = 1, name = "x'")
    action = model.addVar(lb = 0, ub = GRB.INFINITY, name = "a")
    zprime = model.addVar(lb = -1*GRB.INFINITY, ub = GRB.INFINITY, name = "z'")
    reward = model.addVar(lb = -1*GRB.INFINITY, ub = GRB.INFINITY, name = "r")
    xinter = model.addVar(lb = 0, ub = GRB.INFINITY, name = "xinter") # To evaluate inner max for x' transition
    # Binary variables for big M constraints
    bxinter = model.addVar(vtype=GRB.BINARY, name = "bxinter")
    bxprime = model.addVar(vtype = GRB.BINARY, name = "bxprime")
    breward = model.addVar(vtype=GRB.BINARY, name = "breward")
    model.update()

    zp = LinExpr()
    n = 10000
    navg = 0
    j = 0 
    
    # Determinize zprime  
    while j < n:
        norm = np.random.standard_normal()
        zp += x + action*norm
        navg += norm
        j += 1
    navg = (1/n)*navg
    zp = (1/n)*zp 
    
    # Set objective
    model.setObjective(reward)
    
    # Set constraints
    M= 1000
    
    # xinter constraints 
    # If bxinter = 1, 0.2x + 0.7a > 0, so xinter = 0.2x + 0.7a
    model.addConstr(0.2*x + 0.7*action >=  -1*M*(1 - bxinter))
    model.addConstr(xinter <= 0.2*x + 0.7*action + M*(1 - bxinter))
    model.addConstr(xinter >= 0.2*x + 0.7*action - M*(1 - bxinter))
    # If bxinter = 0, 0.2x + 0.7a < 0, so xinter = 0
    model.addConstr(0.2*x + 0.7*action <= M*bxinter)
    model.addConstr(xinter <= M*bxinter)
    model.addConstr(xinter >= -1*M*bxinter)
        
    # xprime constraints
    # If bxprime = 1, xinter < 1, so xprime = xinter
    model.addConstr(xinter <= 1 + M*(1 - bxprime))
    model.addConstr(xprime <= xinter + M*(1 - bxprime))
    model.addConstr(xprime >= xinter - M*(1 - bxprime))
    # If bxprime = 0, xinter > 1, so xprime = 1
    model.addConstr(xinter >= 1 - M*bxprime)
    model.addConstr(xprime <= 1 + M*bxprime)
    model.addConstr(xprime >= 1 - M*bxprime)

    # zprime constraints -> setting it equal to summation formulated above
    model.addConstr(zprime <= zp)
    model.addConstr(zprime >= zp)
    
    # reward constraints
    # If breward = 1, x < zprime and so reward = xprime
    model.addConstr(zprime >= x + 10e-6 - M*(1 - breward))
    model.addConstr(reward <= xprime + M*(1 - breward))
    model.addConstr(reward >= xprime - M*(1 - breward))
    # If breward = 0, x >= zprime and so reward = 1 - x'
    model.addConstr(zprime <= x + M*breward)
    model.addConstr(reward <= 1 - xprime + M*breward)
    model.addConstr(reward >= 1 - xprime - M*breward)
    
    # Optimize model
    model.optimize()
    
    #Print Results
    print("Gurobi Results for Iteration " + str(i) + ":")
    print("x(" + str(i) + "): " + str(x))
    for v in model.getVars():
        print('Gurobi Result: %s(%d): %g' % (v.varName, i, v.x))
        if v.varName == "x'":
            print('\tCheck Result: %s(%d): %g' % (v.varName, i, min(1.0, max(0.0, 0.2*x + 0.7*action.x))))
        elif v.varName == "z'":
            print('\tCheck Result: %s(%d): %g' % (v.varName, i, x + action.x*navg))
        elif v.varName == "xinter":
            print('\tCheck Result: %s(%d): %g' % (v.varName, i, max(0.0, 0.2*x + 0.7*action.x)))
        elif v.varName == "r":
            if x < zprime.x:
                print('\tCheck Result: %s(%d): %g' % (v.varName, i, xprime.x))
            else:
                print('\tCheck Result: %s(%d): %g' % (v.varName, i, 1.0 - xprime.x))
    print('Reward(%d): %g\n' % (i, model.objVal))      
    totReward += model.objVal

    # Set state transition
    x = xprime.x
    i += 1
    
print("Total Reward: " + str(totReward))


Gurobi Results for Iteration 0:
x(0): 0
Gurobi Result: x'(0): 0.00167635
	Check Result: x'(0): 0.00167635
Gurobi Result: a(0): 0.00239478
Gurobi Result: z'(0): 1e-05
	Check Result: z'(0): 1e-05
Gurobi Result: r(0): 0.00167635
	Check Result: r(0): 0.00167635
Gurobi Result: xinter(0): 0.00167635
	Check Result: xinter(0): 0.00167635
Gurobi Result: bxinter(0): 1
Gurobi Result: bxprime(0): 1
Gurobi Result: breward(0): 1
Reward(0): 0.00167635

Gurobi Results for Iteration 1:
x(1): 0.0016763485898905195
Gurobi Result: x'(1): 1
	Check Result: x'(1): 1
Gurobi Result: a(1): 1.42809
Gurobi Result: z'(1): -0.016455
	Check Result: z'(1): -0.016455
Gurobi Result: r(1): -4.44089e-16
	Check Result: r(1): 0
Gurobi Result: xinter(1): 1
	Check Result: xinter(1): 1
Gurobi Result: bxinter(1): 1
Gurobi Result: bxprime(1): 0
Gurobi Result: breward(1): 0
Reward(1): -4.44089e-16

Gurobi Results for Iteration 2:
x(2): 1.0
Gurobi Result: x'(2): 1
	Check Result: x'(2): 1
Gurobi Result: a(2): 1.14286
Gurobi Result