## Description

This notebook explores a further question: if we home in on the Forward Looking Proposer and the Probabilistic Responder, can we find an equilibrium approach? What if we complicate these strategies further and make them level-k aware of their opponents' strategies?

In [12]:
import numpy as np
from scipy.optimize import minimize_scalar
from scipy.special import expit

### Player 2
---
This code shows how player 2 would optimize their $\alpha$ value, i.e. how high- or low-variance their accept/reject decision should be in order to maximize their utility (minimize their loss).

In [13]:
# Module-level state for Player 2's scalar objective: written by optimize_alpha
# and read by neg_utility_p2 (which only receives alpha as an argument).
_x = None
_u_bad_2 = None

# Responder Player (P2: Democrat)
def p_accept(x, alpha, u_bad_2):
    """Probability that the responder (Player 2) accepts an offer.

    The responder follows a logistic choice rule on the gap between the
    utility of accepting (``-x``) and the utility of rejecting (``u_bad_2``).

    Args:
        x: Proposer's demand; accepting it is worth ``-x`` to the responder.
        alpha: Slope of the logistic curve. Larger values make the decision
            closer to a deterministic step at ``-x == u_bad_2``; smaller values
            make it closer to a coin flip.
        u_bad_2: Responder's utility when the offer is rejected.

    Returns:
        Acceptance probability in ``[0, 1]``.
    """
    utility_diff = -x - u_bad_2
    # expit(z) is 1 / (1 + exp(-z)) evaluated without overflowing for very
    # negative z, which the hand-written form does (exp(-z) -> inf plus a
    # RuntimeWarning) once alpha * utility_diff drops below roughly -709.
    return expit(alpha * utility_diff)

def utility_p2(x, alpha, u_bad_2):
    """Player 2's single-round expected utility for offer ``x``.

    Weighs the payoff of accepting (``-x``) and of rejecting (``u_bad_2``)
    by the acceptance probability returned by ``p_accept``.
    """
    accept_prob = p_accept(x, alpha, u_bad_2)
    reject_prob = 1 - accept_prob
    return -x * accept_prob + u_bad_2 * reject_prob

def neg_utility_p2(alpha):
    """Negated Player 2 utility as a function of ``alpha`` alone.

    Reads the offer and the rejection utility from the module-level ``_x``
    and ``_u_bad_2``; the sign is flipped so that a minimizer maximizes
    Player 2's utility.
    """
    payoff = utility_p2(_x, alpha, _u_bad_2)
    return -payoff

def optimize_alpha(x, u_bad_2):
    """Find the ``alpha`` that maximizes Player 2's one-shot utility for offer ``x``.

    Args:
        x: Proposer's current demand.
        u_bad_2: Player 2's utility when the offer is rejected.

    Returns:
        The maximizing ``alpha`` found by a bounded scalar search on
        ``[0.01, 10]``.
    """
    global _x, _u_bad_2
    # Still published so that neg_utility_p2 keeps working for any caller
    # that evaluates it directly after this function has run.
    _x = x
    _u_bad_2 = u_bad_2

    def neg_utility(alpha):
        # Closes over this call's arguments instead of reading module state,
        # so another optimizer overwriting the shared globals mid-search
        # cannot change what is being optimized here.
        return -utility_p2(x, alpha, u_bad_2)

    result = minimize_scalar(neg_utility, bounds=(0.01, 10), method='bounded')
    return result.x


### Player 1

In [14]:
def expected_utility_p1_fixed(x, alpha, depth, u_bad_1, u_resp, p, u_bad_2):
    """Player 1's expected utility over ``depth + 1`` rounds with a fixed ``p``.

    Each round pays ``x`` if the offer is accepted and ``u_bad_1`` otherwise.
    After every round except the last, the continuation is ``u_resp`` with
    probability ``p`` and the recursively computed value of the remaining
    rounds with probability ``1 - p``.
    """
    accept_prob = p_accept(x, alpha, u_bad_2)
    reject_prob = 1 - accept_prob

    # Final round: nothing follows, only the immediate payoff counts.
    if depth == 0:
        return x * accept_prob + u_bad_1 * reject_prob

    remaining = expected_utility_p1_fixed(x, alpha, depth - 1, u_bad_1, u_resp, p, u_bad_2)
    continuation = (1 - p) * remaining + p * u_resp

    return accept_prob * (x + continuation) + reject_prob * (u_bad_1 + continuation)

In [15]:
# Module-level state for Player 1's scalar objectives: written by optimize_x /
# optimize_x_dynamic and read by neg_expected_utility_p1 /
# neg_expected_utility_p1_dynamic (which only receive x as an argument).
_alpha = None
_u_bad_1 = None
_u_resp = None
_p = None
# NOTE: same name as the global declared in the Player 2 cell; both players'
# optimizers write to it.
_u_bad_2 = None
_depth = None
_p_bump = None

def neg_expected_utility_p1(x):
    """Negated fixed-``p`` Player 1 utility as a function of ``x`` alone.

    All other parameters come from the module-level ``_alpha``, ``_depth``,
    ``_u_bad_1``, ``_u_resp``, ``_p`` and ``_u_bad_2``.
    """
    payoff = expected_utility_p1_fixed(x, _alpha, _depth, _u_bad_1, _u_resp, _p, _u_bad_2)
    return -payoff

def optimize_x(alpha, depth, u_bad_1, u_resp, p, u_bad_2):
    """Find the offer ``x`` that maximizes Player 1's fixed-``p`` expected utility.

    Args:
        alpha: Responder's logistic slope.
        depth: Recursion depth passed to ``expected_utility_p1_fixed``.
        u_bad_1: Player 1's utility when an offer is rejected.
        u_resp: Player 1's utility when not the proposer.
        p: Probability of control switching.
        u_bad_2: Player 2's utility when an offer is rejected.

    Returns:
        ``(x_star, utility)``: the maximizing offer on ``[0, 100]`` and the
        expected utility it achieves.
    """
    global _alpha, _depth, _u_bad_1, _u_resp, _p, _u_bad_2
    # Still published so that neg_expected_utility_p1 keeps working for any
    # caller that evaluates it directly after this function has run.
    _alpha = alpha
    _depth = depth
    _u_bad_1 = u_bad_1
    _u_resp = u_resp
    _p = p
    _u_bad_2 = u_bad_2

    def neg_utility(x):
        # Closes over this call's arguments instead of reading module state,
        # so the search cannot be affected by another optimizer overwriting
        # the shared globals while it runs.
        return -expected_utility_p1_fixed(x, alpha, depth, u_bad_1, u_resp, p, u_bad_2)

    result = minimize_scalar(neg_utility, bounds=(0, 100), method='bounded')
    return result.x, -result.fun

In [16]:
def expected_utility_p1_dynamic(x, alpha, depth, u_bad_1, u_resp, p, p_bump, u_bad_2):
    """Player 1's expected utility when a rejection raises the switch probability.

    Same structure as the fixed-``p`` case, except that after a rejected
    offer the continuation is weighted with ``min(p + p_bump, 1.0)`` instead
    of ``p``.

    Args:
        x: Proposer's demand.
        alpha: Responder's logistic slope.
        depth: Number of further rounds after the current one.
        u_bad_1: Player 1's utility when an offer is rejected.
        u_resp: Player 1's utility when not the proposer.
        p: Baseline probability of control switching.
        p_bump: Increase in the switch probability after a rejection.
        u_bad_2: Player 2's utility when an offer is rejected.

    Returns:
        Expected utility for Player 1.
    """
    p_acc = p_accept(x, alpha, u_bad_2)

    if depth == 0:
        return x * p_acc + u_bad_1 * (1 - p_acc)

    # Both branches continue into the same sub-game (identical arguments), so
    # evaluate it once instead of twice per level; this keeps the recursion
    # linear in depth rather than exponential.
    # NOTE(review): the sub-game is evaluated with the un-bumped p, so a
    # rejection only affects the very next transition — confirm this is the
    # intended model.
    future = expected_utility_p1_dynamic(x, alpha, depth - 1, u_bad_1, u_resp, p, p_bump, u_bad_2)

    # Accept branch uses original p
    continuation_accept = (1 - p) * future + p * u_resp

    # Reject branch uses bumped p
    bumped_p = min(p + p_bump, 1.0)
    continuation_reject = (1 - bumped_p) * future + bumped_p * u_resp

    utility_if_accepted = x + continuation_accept
    utility_if_rejected = u_bad_1 + continuation_reject

    return p_acc * utility_if_accepted + (1 - p_acc) * utility_if_rejected

def neg_expected_utility_p1_dynamic(x):
    """Negated dynamic-``p`` Player 1 utility as a function of ``x`` alone.

    All other parameters come from the module-level ``_alpha``, ``_depth``,
    ``_u_bad_1``, ``_u_resp``, ``_p``, ``_p_bump`` and ``_u_bad_2``.
    """
    payoff = expected_utility_p1_dynamic(
        x, _alpha, _depth, _u_bad_1, _u_resp, _p, _p_bump, _u_bad_2
    )
    return -payoff

def optimize_x_dynamic(alpha, depth, u_bad_1, u_resp, p, p_bump, u_bad_2):
    """Find the offer ``x`` that maximizes Player 1's dynamic-``p`` expected utility.

    Args:
        alpha: Responder's logistic slope.
        depth: Recursion depth passed to ``expected_utility_p1_dynamic``.
        u_bad_1: Player 1's utility when an offer is rejected.
        u_resp: Player 1's utility when not the proposer.
        p: Baseline probability of control switching.
        p_bump: Increase in the switch probability after a rejection.
        u_bad_2: Player 2's utility when an offer is rejected.

    Returns:
        ``(x_star, utility)``: the maximizing offer on ``[0, 100]`` and the
        expected utility it achieves.
    """
    global _alpha, _depth, _u_bad_1, _u_resp, _p, _p_bump, _u_bad_2
    # Still published so that neg_expected_utility_p1_dynamic keeps working
    # for any caller that evaluates it directly after this function has run.
    _alpha = alpha
    _depth = depth
    _u_bad_1 = u_bad_1
    _u_resp = u_resp
    _p = p
    _p_bump = p_bump
    _u_bad_2 = u_bad_2

    def neg_utility(x):
        # Closes over this call's arguments instead of reading module state.
        # This function is invoked from inside another optimizer's objective
        # (optimize_alpha_level_k), so it must not depend on globals that a
        # different call could overwrite.
        return -expected_utility_p1_dynamic(x, alpha, depth, u_bad_1, u_resp, p, p_bump, u_bad_2)

    result = minimize_scalar(neg_utility, bounds=(0, 100), method='bounded')
    return result.x, -result.fun


### If the players are not aware of each other's strategies

Then we quickly reach an equilibrium: without awareness of the proposer's strategy, the responder wants to get as close to utilitarian as they can. They are not aware that if they were more willing to take risks and reject some deals, the proposer would update its strategy to protect itself.

In [17]:
# Simulation constants (also used by the later simulation cells)
depth = 3          # recursion depth (4 rounds total)
u_bad_1 = -75      # Player 1's bad outcome
u_bad_2 = -75      # Player 2's bad outcome
u_resp = -25       # Player 1's utility when not proposer
p = 0.25           # probability of control switching
n_steps = 20       # number of updates in trajectory

# Initial strategies
x = 50.0
alpha = 0.1
trajectory = [(x, alpha)]

# Alternating best responses: on even steps Player 1 re-optimizes x against
# the current alpha, on odd steps Player 2 re-optimizes alpha against the
# current x. The state after every step is recorded.
for step in range(n_steps):
    if step % 2 == 0:
        x_val, _ = optimize_x(alpha, depth, u_bad_1, u_resp, p, u_bad_2)
        x = x_val
    else:
        alpha_val = optimize_alpha(x, u_bad_2)
        alpha = alpha_val

    trajectory.append((x, alpha))

trajectory

[(50.0, 0.1),
 (50.530503530698184, 0.1),
 (50.530503530698184, 9.999994793574977),
 (74.26923306063837, 9.999994793574977),
 (74.26923306063837, 9.999993236301638),
 (74.26923296253761, 9.999993236301638),
 (74.26923296253761, 9.999993236299266),
 (74.26923296253777, 9.999993236299266),
 (74.26923296253777, 9.999993236299266),
 (74.26923296253777, 9.999993236299266),
 (74.26923296253777, 9.999993236299266),
 (74.26923296253777, 9.999993236299266),
 (74.26923296253777, 9.999993236299266),
 (74.26923296253777, 9.999993236299266),
 (74.26923296253777, 9.999993236299266),
 (74.26923296253777, 9.999993236299266),
 (74.26923296253777, 9.999993236299266),
 (74.26923296253777, 9.999993236299266),
 (74.26923296253777, 9.999993236299266),
 (74.26923296253777, 9.999993236299266),
 (74.26923296253777, 9.999993236299266)]

### Now Add Rejection Flip Bump

This too makes no difference. As long as the responder doesn't know that the proposer will respond to its strategies, the proposer has the advantage here, as the responder is short-term risk averse since they lack the ability to consider the future.

In [18]:
# Add new constant
p_bump = 0.1  # how much to increase p after a rejection

# Updated alternating update loop using the dynamic expected utility.
# Relies on depth, u_bad_1, u_bad_2, u_resp, p and n_steps from the earlier
# simulation-constants cell; x, alpha and trajectory are reset here.
x = 50.0
alpha = 0.1
trajectory = [(x, alpha)]

for step in range(n_steps):
    if step % 2 == 0:
        # Player 1 updates x using dynamic utility with p_bump
        x_val, _ = optimize_x_dynamic(alpha, depth, u_bad_1, u_resp, p, p_bump, u_bad_2)
        x = x_val
    else:
        # Player 2 updates alpha as before (one-shot utility against the current x)
        alpha_val = optimize_alpha(x, u_bad_2)
        alpha = alpha_val

    # Record the state after every single-player update
    trajectory.append((x, alpha))

# Last expression: display the full (x, alpha) trajectory
trajectory


[(50.0, 0.1),
 (49.931342707538995, 0.1),
 (49.931342707538995, 9.999994793574977),
 (74.26094295543456, 9.999994793574977),
 (74.26094295543456, 9.99999303871964),
 (74.26094284317034, 9.99999303871964),
 (74.26094284317034, 9.999993038717003),
 (74.26094284275251, 9.999993038717003),
 (74.26094284275251, 9.99999303871699),
 (74.26094284275251, 9.99999303871699),
 (74.26094284275251, 9.99999303871699),
 (74.26094284275251, 9.99999303871699),
 (74.26094284275251, 9.99999303871699),
 (74.26094284275251, 9.99999303871699),
 (74.26094284275251, 9.99999303871699),
 (74.26094284275251, 9.99999303871699),
 (74.26094284275251, 9.99999303871699),
 (74.26094284275251, 9.99999303871699),
 (74.26094284275251, 9.99999303871699),
 (74.26094284275251, 9.99999303871699),
 (74.26094284275251, 9.99999303871699)]

Based on this realization, the responder also needs to have a recursive long-term utility.

In [19]:
def expected_utility_p2_recursive(x, alpha, depth, u_bad_2):
    """Player 2's expected utility accumulated over ``depth + 1`` rounds.

    Every round pays ``-x`` if the offer is accepted and ``u_bad_2``
    otherwise; the value of the remaining rounds is added regardless of the
    outcome. No role-switch probability is applied here.
    """
    accept_prob = p_accept(x, alpha, u_bad_2)
    reject_prob = 1 - accept_prob

    # Final round: only the immediate payoff counts.
    if depth == 0:
        return -x * accept_prob + u_bad_2 * reject_prob

    future = expected_utility_p2_recursive(x, alpha, depth - 1, u_bad_2)

    return accept_prob * (-x + future) + reject_prob * (u_bad_2 + future)


### Player 2 chooses alpha, assuming Player 1 best-responds to it

In [20]:
def optimize_alpha_level_k(depth, u_bad_1, u_resp, p, p_bump, u_bad_2):
    """Pick Player 2's ``alpha`` anticipating Player 1's best response.

    For each candidate ``alpha`` the proposer's optimal offer is computed
    with ``optimize_x_dynamic``; Player 2's recursive utility at that offer
    is then maximized over ``alpha`` in ``[0.01, 10]``.
    """
    def responder_loss(alpha):
        # Offer Player 1 would make if it knew this alpha
        best_offer = optimize_x_dynamic(alpha, depth, u_bad_1, u_resp, p, p_bump, u_bad_2)[0]
        payoff = expected_utility_p2_recursive(best_offer, alpha, depth, u_bad_2)
        return -payoff

    search = minimize_scalar(responder_loss, bounds=(0.01, 10), method='bounded')
    return search.x


### Now re-run simulation with player 2 aware of player 1's response

Suddenly, player 2 wants to be as high variance as possible to make player 1 give lower offers

In [25]:
# Reset strategies; constants (depth, u_bad_1, u_resp, p, p_bump, u_bad_2,
# n_steps) come from the earlier simulation cells.
x = 50.0
alpha = 0.1
trajectory = [(x, alpha)]

for step in range(n_steps):
    if step % 2 == 0:
        # Player 1 best-responds to the current alpha
        x_val, _ = optimize_x_dynamic(alpha, depth, u_bad_1, u_resp, p, p_bump, u_bad_2)
        x = x_val
    else:
        # Player 2 picks alpha anticipating Player 1's best response.
        # NOTE(review): this call does not take the current x, so it returns
        # the same alpha on every odd step.
        alpha_val = optimize_alpha_level_k(depth, u_bad_1, u_resp, p, p_bump, u_bad_2)
        alpha = alpha_val

    # Record the state after every single-player update
    trajectory.append((x, alpha))

# Last expression: display the full (x, alpha) trajectory
trajectory


[(50.0, 0.1),
 (49.931342707538995, 0.1),
 (49.931342707538995, 0.05046804023833471),
 (42.400440611120096, 0.05046804023833471),
 (42.400440611120096, 0.05046804023833471),
 (42.400440611120096, 0.05046804023833471),
 (42.400440611120096, 0.05046804023833471),
 (42.400440611120096, 0.05046804023833471),
 (42.400440611120096, 0.05046804023833471),
 (42.400440611120096, 0.05046804023833471),
 (42.400440611120096, 0.05046804023833471),
 (42.400440611120096, 0.05046804023833471),
 (42.400440611120096, 0.05046804023833471),
 (42.400440611120096, 0.05046804023833471),
 (42.400440611120096, 0.05046804023833471),
 (42.400440611120096, 0.05046804023833471),
 (42.400440611120096, 0.05046804023833471),
 (42.400440611120096, 0.05046804023833471),
 (42.400440611120096, 0.05046804023833471),
 (42.400440611120096, 0.05046804023833471),
 (42.400440611120096, 0.05046804023833471)]

### But this is perhaps unfair to player 1: if player 1 knew player 2 knew how player 1 would respond...

In [26]:
# Best alpha for a forward-looking Player 2 when the offer x is held fixed
def optimize_alpha_level_k_given_x(x, depth, u_bad_2):
    """Maximize Player 2's recursive utility over ``alpha`` for a fixed ``x``.

    Returns the ``alpha`` found by a bounded scalar search on ``[0.01, 10]``.
    """
    def responder_loss(alpha):
        payoff = expected_utility_p2_recursive(x, alpha, depth, u_bad_2)
        return -payoff

    search = minimize_scalar(responder_loss, bounds=(0.01, 10), method='bounded')
    return search.x


In [27]:
# Objective for a level-k Player 1 that expects Player 2 to answer x with alpha*(x)
def player1_levelk_objective(x, depth, u_bad_1, u_resp, p, p_bump, u_bad_2):
    """Negated Player 1 utility at ``x`` under Player 2's anticipated reply.

    Player 2's reply is obtained from ``optimize_alpha_level_k_given_x``; the
    sign is flipped so the value can be handed to a minimizer.
    """
    anticipated_alpha = optimize_alpha_level_k_given_x(x, depth, u_bad_2)
    payoff = expected_utility_p1_dynamic(
        x, anticipated_alpha, depth, u_bad_1, u_resp, p, p_bump, u_bad_2
    )
    return -payoff

# Bounded search for the level-k Player 1's offer
def optimize_x_level_k(depth, u_bad_1, u_resp, p, p_bump, u_bad_2):
    """Find the offer maximizing Player 1's utility under Player 2's anticipated reply.

    Returns ``(x_star, utility)`` from a bounded scalar search on ``[0, 100]``.
    """
    search = minimize_scalar(
        lambda x: player1_levelk_objective(x, depth, u_bad_1, u_resp, p, p_bump, u_bad_2),
        bounds=(0, 100),
        method='bounded',
    )
    best_offer, best_value = search.x, -search.fun
    return best_offer, best_value


### So now we have both players as level-k thinkers

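We have a fascinating (and bad) result: the equilibrium here is for the proposer to take a strong line, offering just below the decision line, and for the responder to take an incredibly high-variance approach. This result doesn't really make sense to me at all. One possible explanation: neither level-k optimizer takes the other player's current strategy as input, so each value is computed against an *anticipated* best response ($x \approx 74.26$ assumes the responder answers with $\alpha^*(x)$, while $\alpha \approx 0.05$ assumes the proposer answers with $x^*(\alpha)$), and the resulting pair need not be mutually consistent.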

In [28]:
# Reset strategies; constants (depth, u_bad_1, u_resp, p, p_bump, u_bad_2,
# n_steps) come from the earlier simulation cells.
x = 50.0
alpha = 0.1
trajectory = [(x, alpha)]

# NOTE(review): neither update below takes the other player's current
# strategy as an argument, so x and alpha are each fixed after their first
# update and the remaining steps repeat the same values.
for step in range(n_steps):
    if step % 2 == 0:
        # Player 1 chooses x knowing Player 2 will respond with alpha*(x)
        x_val, _ = optimize_x_level_k(depth, u_bad_1, u_resp, p, p_bump, u_bad_2)
        x = x_val
    else:
        # Player 2 chooses alpha knowing Player 1 will respond with x*(alpha)
        alpha_val = optimize_alpha_level_k(depth, u_bad_1, u_resp, p, p_bump, u_bad_2)
        alpha = alpha_val

    # Record the state after every single-player update
    trajectory.append((x, alpha))

# Last expression: display the full (x, alpha) trajectory
trajectory


[(50.0, 0.1),
 (74.26094297539323, 0.1),
 (74.26094297539323, 0.05046804023833471),
 (74.26094297539323, 0.05046804023833471),
 (74.26094297539323, 0.05046804023833471),
 (74.26094297539323, 0.05046804023833471),
 (74.26094297539323, 0.05046804023833471),
 (74.26094297539323, 0.05046804023833471),
 (74.26094297539323, 0.05046804023833471),
 (74.26094297539323, 0.05046804023833471),
 (74.26094297539323, 0.05046804023833471),
 (74.26094297539323, 0.05046804023833471),
 (74.26094297539323, 0.05046804023833471),
 (74.26094297539323, 0.05046804023833471),
 (74.26094297539323, 0.05046804023833471),
 (74.26094297539323, 0.05046804023833471),
 (74.26094297539323, 0.05046804023833471),
 (74.26094297539323, 0.05046804023833471),
 (74.26094297539323, 0.05046804023833471),
 (74.26094297539323, 0.05046804023833471),
 (74.26094297539323, 0.05046804023833471)]