In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Agent parameters
# agent which can choose any investment weight from 0% to 100% (in 1% steps)
# in the risky asset and a consumption level from (0.01, 0.1, 0.5, 1),
# which action_step scales by 10 and by dt before applying it to the wealth

class PogChamp:
    """Tabular agent for a consumption / portfolio-allocation problem.

    The agent holds a discretised time axis (``state``), a grid of
    (risky-asset weight, consumption level) pairs (``actions``) and a
    Q-table with one entry per state-action pair.
    """

    def __init__(self):
        """Build the state grid, the action grid and the zero-initialised tables."""
        # statespace vector
        # time buckets expressed as fractions 0.00 .. 0.99 of the episode
        self.state = [x/(100) for x in range(100)]
        # actionspace vector
        # 1st: pi, the percentage invested in risky asset
        # 2nd: consumption percentages of wealth
        self.actions = [[(x)/100,y]
                        for x in range(101)
                        for y in [0.01,0.1,0.5,1]]
        # q-table to store state-action pair values
        # optimal strategy will choose the action which has the greatest
        # q value in the table at the given state
        # table is 2 dimensional, storing all state x action pairs
        self.q_table = np.zeros((len(self.state),len(self.actions)))
        # number of times each state-action pair has been visited; the
        # training loop increments this, so it must exist on a fresh agent
        self.q_table_visits = np.zeros(self.q_table.shape, dtype=int)

    def action_step(self,wealth, action, t,episode_consumption):
        """Apply the action with index ``action`` and compute its reward.

        Besides its arguments this method reads the notebook-level globals
        ``dt``, ``gamma``, ``rho`` and ``total_days`` and logs the choice into
        the notebook-level ``pi_count`` dict and ``c_count`` list.

        Parameters
        ----------
        wealth : current wealth.
        action : index into ``self.actions``.
        t : index of the current timestep.
        episode_consumption : list that receives the amount consumed this step.

        Returns
        -------
        (pi, reward, consumption, episode_consumption) where ``consumption``
        is the chosen consumption level after scaling by 10.
        """
        # action taken
        pi,consumption = self.actions[action]
        # log actions (the unscaled consumption level is what gets logged)
        pi_count[pi]+=1
        c_count.append(consumption)
        # scale consumption
        consumption*=10
        consumed = wealth*consumption*dt
        episode_consumption.append(consumed)
        # collect reward for actions, scaled by rho**(-t*dt)
        reward = self.get_reward(consumed,gamma)*pow(rho,-t*dt)
        # utility of wealth, added on the last timestep of the episode
        # (use self rather than a global agent so any instance works)
        if (t+1)== total_days: reward+= self.get_reward(wealth,gamma)
        return pi, reward,consumption,episode_consumption

    # formula for calculating how the wealth process develops every timestep
    def get_dW(self,W, pi, mu, r, c,dt, sigma):
        """Return one random increment of the wealth process.

        ``dW = W*(pi*(mu-r) + r - c)*dt + pi*sigma*W*dZ`` where ``dZ`` is a
        normal draw with standard deviation ``sqrt(dt)``.
        """
        dZ = np.random.normal()*(np.sqrt(dt))
        drift = W*(pi*(mu-r)+r-c)
        dw = drift*dt + pi*sigma*W*dZ
        return dw

    # calculates the reward per unit of consumption according to CRRA formula
    def get_reward(self,u, gamma,t = 0):
        """Return the CRRA utility of ``u`` for risk-aversion ``gamma``.

        Uses ``log(u)`` when ``gamma == 1`` and ``u**(1-gamma)/(1-gamma)``
        otherwise. ``t`` is not used; it is kept so existing calls still work.
        """
        if gamma == 1:
            return np.log(u)
        return (u**(1-gamma))/(1-gamma)

In [None]:
# Training parameter settings

num_episodes = 50000   # number of episodes (trials)
rho = 1.05             # discount base: rewards are scaled by rho**(-t*dt)
alpha = 0.01           # learning rate



# Parameters for the wealth process

T = 5                  # total lifespan over which the portfolio is used
dt=1/252               # timestep
# total number of timesteps (days) in the lifespan; rounded rather than
# truncated so a ratio that lands just below an integer is not cut short
total_days = int(round(T/dt))
sigma = 0.2            # volatility
mu = 0.1               # drift
r= 0.02                # riskless rate
gamma = 4              # investor gamma (risk aversion in the CRRA reward)

In [None]:
# RL training
# Monte Carlo control: play an episode with an epsilon-greedy policy, then move
# every visited Q(s, a) towards the return collected from that visit onwards.

# activate agent
agent = PogChamp()
# The loop records a visit count per (state, action); create the table here if
# the agent does not already carry one so this cell runs on a fresh kernel.
if not hasattr(agent, 'q_table_visits'):
    agent.q_table_visits = np.zeros(agent.q_table.shape, dtype=int)

# how often each risky-asset weight is chosen (updated inside action_step)
pi_count = {(x/100):0 for x in range(101)}
total_consumed  = []
final_wealth =[]
ep_rewards = []

n_states = len(agent.state)
n_actions = len(agent.actions)

for episode in range(num_episodes):
    # new state parameters
    wealth = 50000
    # clear history
    state_action_history = []
    rewards_history = []
    c_count = []              # read by action_step; holds this episode's choices
    episode_consumption = []

    # probability of acting greedily; rises from 0.5 towards 1 over the episodes
    prob_exploit = 1-0.5*np.exp(-0.0005*episode)

    for t in range(total_days):
        # a new decision is taken every 10th timestep; in between the last
        # pi is kept and nothing is consumed
        if t%10 == 0:
            # current state: index of the time bucket the step falls into
            state = (n_states*t)//total_days
            # exploit
            if np.random.random() < prob_exploit:
                action = np.argmax(agent.q_table[state])
            # explore
            else:
                action = np.random.choice(n_actions)

            # take action and observe the reward from the environment
            # NOTE(review): action_step adds the utility of terminal wealth only
            # when t+1 == total_days, and it is only called here when
            # t % 10 == 0, so with total_days = 1260 that branch looks
            # unreachable - confirm whether the terminal reward is intended.
            pi, reward,consumption,episode_consumption = agent.action_step(wealth, action, t,episode_consumption)
            # record visit
            agent.q_table_visits[state,action] += 1
            state_action_history.append([state,action])
            # record reward
            rewards_history.append(reward)
        else: consumption = 0

        dw = agent.get_dW(wealth, pi, mu, r, consumption, dt, sigma)
        wealth += dw

    # return-to-go for decision i = sum of rewards from decision i to the end.
    # The rewards must be reversed BEFORE the cumulative sum; reversing after
    # it would give the sum of the first n-i rewards instead.
    returns_to_go = np.cumsum(rewards_history[::-1])[::-1]
    G = returns_to_go[0]
    # sequential on purpose: the same (state, action) can occur more than once
    for (s, a), ret in zip(state_action_history, returns_to_go):
        agent.q_table[s,a] += alpha*(ret - agent.q_table[s,a])

    # record episode results
    ep_rewards.append(G)
    total_consumed.append(sum(episode_consumption))
    final_wealth.append(wealth)

    if (episode+1) % 1000 == 0:
        template = "culumative reward: {:.8f} at episode {}"
        print(template.format(G, episode+1))

# mean episode reward over a trailing window of up to 300 episodes,
# one value per episode (the last episode included)
mean_rewards = [np.mean(ep_rewards[max(0, n-300):n])
           for n in range(1, len(ep_rewards)+1)]

In [None]:
# Plot results

# trailing-mean episode reward over the course of training
plt.figure(figsize=(12,8))
plt.plot(mean_rewards)
plt.title('Total Return')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.savefig('Agent Total Return Linear.png')
plt.show()


# how often each risky-asset weight was chosen, in the order of the pi grid
picount = list(pi_count.values())
plt.figure(figsize=(12,8))
plt.plot(picount)
plt.title('pi frequency')
plt.xlabel('pi')
plt.ylabel('frequency')
plt.savefig('Pi Allocation Histogram MC.png')
plt.show()

# c_count is reset at the start of every episode, so this is the consumption
# path of the last training episode only
x = np.linspace(0,1,len(c_count))
plt.figure(figsize=(12,8))
plt.plot(x,np.array(c_count)/10)
plt.title('Sample Consumption Trajectory')
plt.xlabel('Percentage of Time Elapsed')
plt.ylabel('Proportion of Consumption per Timestep')
plt.savefig('Sample Consumption Trajectory.png')
plt.show()

# Axes3D is imported for its side effect: older Matplotlib versions need it to
# register the '3d' projection
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
# x: time bucket as a fraction, y: action index; both have shape
# (number of actions, number of states), hence the transpose of z below
(x, y) = np.meshgrid(np.arange(agent.q_table.shape[0])/100, np.arange(agent.q_table.shape[1]))
z = abs(agent.q_table)
print(x.shape,y.shape,z.shape)



fig = plt.figure(figsize=(20,20))
# Figure.gca() no longer accepts keyword arguments (removed in Matplotlib 3.6);
# add_subplot creates the 3D axes on old and new versions alike
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel('Percent of Time')
ax.set_ylabel('Pi and Consumption')
ax.set_zlabel('Total Return')
surf = ax.plot_surface(x, y, z.T, cmap=cm.coolwarm,
                       linewidth=0, antialiased=False)
plt.savefig('Agent Q-Table.png')
plt.show()

# averages over all training episodes
tally = "Total money consumed: {:.2f}\n\
Total money leftover: {:.2f}"

MoneyC = np.mean(total_consumed)
MoneyL = np.mean(final_wealth)
print(tally.format(MoneyC,MoneyL))
 