In [1]:
import math
import gym
from gym import spaces, logger
from gym.utils import seeding
import numpy as np

In [26]:
class PSEnv(gym.Env):
    """Portfolio-selection environment driven by externally supplied prices.

    Observation:
        Type: Box(m) where m is the number of assets.
        Num	Observation                                                   Min         Max
        i	Previous closing price = current opening price of asset i    -Inf         Inf

    Actions:
        Description: A portfolio vector for one trading period.
        Type: Box(m) where m is the number of assets, each entry in [0, 1].
        Num	Action
        i = number from 1...m	Proportion of capital to invest in the ith asset

    Reward:
        sum over i of a_(t, i) * y_(t, i), where a_t is the portfolio vector
        passed to ``step`` and y_t = price_t / price_(t-1) is the vector of
        price relatives for the period that just closed.

    Starting State:
        The price vector passed to ``reset(initial_price)``. Without one the
        state is ``None`` and must be seeded before the first ``step``.

    Episode Termination:
        Never signalled by the environment: the price vector for every
        trading period is fed in from outside, so the caller decides when the
        episode is over.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, n_assets):
        """Create the observation and action spaces for ``n_assets`` assets."""
        super().__init__()
        self.n_assets = n_assets

        high = np.full(n_assets, np.finfo(np.float32).max, dtype=np.float32)
        self.observation_space = spaces.Box(-high, high, dtype=np.float32)

        # Bounds are built as float32 so they match the declared dtype.
        self.action_space = spaces.Box(np.zeros(n_assets, dtype=np.float32),
                                       np.ones(n_assets, dtype=np.float32),
                                       dtype=np.float32)

        # Price vector of the previous trading period (None until seeded).
        self.state = None

    def step(self, action, t_price):
        """Advance one trading period.

        Args:
            action: Portfolio vector (one weight per asset).
            t_price: Price vector observed at the end of this period.

        Returns:
            ``(observation, reward, done, info)`` in the order required by
            ``gym.Env.step``. ``observation`` is a copy of the new state
            (``t_price``), ``reward`` is the dot product of ``action`` with
            the price relatives, ``done`` is always ``False`` and ``info`` is
            an empty dict.

        Raises:
            RuntimeError: If no previous price is available, i.e. the
                environment has not been seeded via ``reset(initial_price)``
                (or by assigning ``state`` directly).
        """
        if self.state is None:
            raise RuntimeError(
                "PSEnv has no previous price; call reset(initial_price) "
                "before step()."
            )

        t_minus_1_price = np.asarray(self.state, dtype=np.float64)
        t_price = np.asarray(t_price, dtype=np.float64)
        t_returns = t_price / t_minus_1_price

        reward = np.dot(np.asarray(action).T, t_returns)

        # The price just observed becomes the "previous" price of the next
        # period; without this update every step would reuse the same state.
        self.state = t_price

        # Each trading period's vector is fed in from outside, so the last
        # trading period is never detected here.
        done = False

        return np.array(self.state), reward, done, {}

    def reset(self, initial_price=None):
        """Reset the environment and return the initial observation.

        Args:
            initial_price: Optional price vector used as the "previous
                price" for the first ``step``. When omitted the state is
                cleared to ``None``.

        Returns:
            ``np.array`` of the new state; with no ``initial_price`` this is
            ``np.array(None)``.
        """
        if initial_price is None:
            self.state = None
        else:
            self.state = np.asarray(initial_price, dtype=np.float64)
        return np.array(self.state)

In [None]:
def ucb_bandit_portfolio(stock_returns):
    """Build a single-asset-per-period portfolio with the UCB rule.

    During the first ``n_assets`` periods each asset is selected once, in
    order. Afterwards the asset with the largest upper bound
    ``mean_return + sqrt(2 * log(t) / times_selected)`` is selected; ties go
    to the lowest asset index.

    Args:
        stock_returns: 2-D array-like (e.g. a pandas DataFrame or NumPy
            array) of shape ``(n_time, n_assets)``; entry ``[t, i]`` is the
            return of asset ``i`` in period ``t``.

    Returns:
        ``np.ndarray`` of shape ``(n_assets, n_time)``; column ``t`` holds a
        1 in the row of the asset selected in period ``t`` and 0 elsewhere.

    Raises:
        ValueError: If ``stock_returns`` is not two-dimensional.
    """
    returns = np.asarray(stock_returns, dtype=float)
    if returns.ndim != 2:
        raise ValueError(
            "stock_returns must be 2-D (n_time, n_assets), got shape "
            f"{returns.shape}"
        )

    n_time, n_assets = returns.shape  # n_time >> n_assets in our case
    portfolio = np.zeros((n_assets, n_time))  # the output portfolio
    if n_assets == 0:
        return portfolio

    mean_return = np.zeros(n_assets)  # empirical mean return per asset
    num_selected = np.zeros(n_assets, dtype=int)

    for t in range(n_time):
        if t < n_assets:
            # Initialisation: pull every arm once so each count is >= 1
            # before it is used as a divisor below.
            chosen = t
        else:
            exploration = np.sqrt(2 * np.log(t) / num_selected)
            # argmax (rather than comparing against an initial bound of 0)
            # also picks correctly when every upper bound is negative.
            chosen = int(np.argmax(mean_return + exploration))

        portfolio[chosen, t] = 1

        # Pull the chosen arm and fold its return into the running mean.
        pulls = num_selected[chosen]
        mean_return[chosen] = (
            mean_return[chosen] * pulls + returns[t, chosen]
        ) / (pulls + 1)
        num_selected[chosen] = pulls + 1

    return portfolio