# Read data from file

In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import cma
from scipy.optimize import minimize
from scipy import stats

# NOTE(review): STATES and ACTIONS are not referenced by any code visible in
# this file; presumably the sizes of the environment's state and action
# spaces - confirm before relying on them.
STATES = 18
ACTIONS = 4
# Discount factor applied as GAMMA ** timestep in get_PDIS_estimate.
GAMMA = 0.95
# Performance threshold `c` used in the constraint term of lfn.
BENCHMARK_PERFORMANCE = 1.41537
# Padding value appended by parse_file to short episodes; get_PDIS_estimate
# stops an episode at the first state equal to this value.
DUMMY = 0

def parse_file(filename, max_episodes=1000):
    """Parse an episode file into per-episode lists padded to equal length.

    Layout as read by this function: the first line holds the number of
    episodes; every episode starts with a line giving its number of
    timesteps, followed by one ``state,action,reward,pi_b`` line per timestep.

    Args:
        filename: Path of the file to read.
        max_episodes: Upper bound on the number of episodes loaded. Defaults
            to 1000, the count that used to be hard-coded. Pass ``None`` to
            load every episode declared in the header.

    Returns:
        A dict with the keys ``'states'``, ``'actions'``, ``'rewards'`` and
        ``'pi_b'``. Each value is a list with one entry per episode, and every
        episode is padded with ``DUMMY`` up to the longest episode loaded.
        States are stored as one-element lists holding the raw state plus one.

    Raises:
        ValueError: If a count or a field cannot be converted to a number
            (this includes hitting end-of-file where a count is expected).
        IndexError: If a timestep line has fewer than four comma-separated
            fields.
    """
    # The context manager guarantees the handle is closed, also on parse errors.
    with open(filename, 'r') as episode_file:
        declared_episodes = int(episode_file.readline())
        # Honour the header instead of assuming 1000 episodes: files with
        # fewer episodes used to fail on the first missing episode.
        no_of_episodes = declared_episodes
        if max_episodes is not None:
            no_of_episodes = min(declared_episodes, max_episodes)
        print(int(no_of_episodes))

        states = [[] for _ in range(no_of_episodes)]
        actions = [[] for _ in range(no_of_episodes)]
        rewards = [[] for _ in range(no_of_episodes)]
        pi_b = [[] for _ in range(no_of_episodes)]

        horizon_length = 0
        for episode in range(no_of_episodes):
            timesteps = int(episode_file.readline())
            horizon_length = max(horizon_length, timesteps)
            for _ in range(timesteps):
                cur_list = episode_file.readline().rstrip('\n').split(',')
                # States are shifted by one; presumably so that a raw state 0
                # cannot be confused with the DUMMY padding value.
                states[episode].append([int(cur_list[0]) + 1])
                actions[episode].append(int(cur_list[1]))
                rewards[episode].append(int(cur_list[2]))
                pi_b[episode].append(float(cur_list[3]))

    # padding the data with dummy values in order to make all episodes of same length
    for episode in range(no_of_episodes):
        for _ in range(len(states[episode]), horizon_length):
            states[episode].append([DUMMY])
            actions[episode].append(DUMMY)
            rewards[episode].append(DUMMY)
            pi_b[episode].append(DUMMY)

    dataset = {'states': states, 'actions': actions, 'rewards': rewards, 'pi_b': pi_b}
    return dataset

# Feed forward neural network for mapping states to actions

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Feed-forward policy network mapping state features to action probabilities.

    Four fully connected layers (input -> 128 -> 64 -> 32 -> output) with ReLU
    after the first three and a softmax over the last one.
    """

    def __init__(self, states, actions):
        """Create the four linear layers.

        Args:
            states: Number of input features per timestep.
            actions: Number of outputs, one probability per action.
        """
        super().__init__()
        self.fc1 = nn.Linear(states, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, actions)

    def forward(self, x):
        """Return action probabilities for every (episode, timestep) entry.

        Args:
            x: Tensor whose first two dimensions are treated as episodes and
                timesteps; any further dimensions are flattened into the
                feature vector fed to the first layer.

        Returns:
            Tensor of shape (episodes, timesteps, actions) whose last
            dimension sums to one.
        """
        n_episodes = x.shape[0]
        n_steps = x.shape[1]

        # Collapse episodes and timesteps into one batch axis for the layers.
        hidden = x.view(-1, self.num_flat_features(x))
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        probabilities = F.softmax(self.fc4(hidden), dim=1)

        # Restore the (episodes, timesteps, actions) layout.
        return probabilities.view(n_episodes, n_steps, -1)

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` after the first two."""
        feature_count = 1
        for dim in x.shape[2:]:
            feature_count = feature_count * dim
        return feature_count


# One input feature per timestep (parse_file stores each state as a
# one-element list) and four outputs, one probability per action.
net = Net(1, 4)
# Show the layer structure of the freshly initialised network.
print(net)

Net(
  (fc1): Linear(in_features=1, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=32, bias=True)
  (fc4): Linear(in_features=32, out_features=4, bias=True)
)


# Reading data from file and splitting into test and train 

In [4]:
if __name__ == '__main__':
    # Load the episodes and pull the four parallel per-episode lists out of
    # the returned dict.
    dataset = parse_file('data.csv')
    states, actions, rewards, pi_b = (
        dataset[key] for key in ('states', 'actions', 'rewards', 'pi_b')
    )

    # Split whole episodes 50/50 into train and test; the fixed seed keeps
    # the split reproducible.
    (states_train, states_test,
     actions_train, actions_test,
     rewards_train, rewards_test,
     pi_b_train, pi_b_test) = train_test_split(
        states, actions, rewards, pi_b, test_size=0.5, random_state=42)

    # Held-out half, bundled in the same layout parse_file returns.
    dataset_test = dict(
        states=states_test,
        actions=actions_test,
        rewards=rewards_test,
        pi_b=pi_b_test,
    )
    dataset_test_size = len(states_test)

1000


# Converting states, rewards, actions and pi_b into tensors to work with autograd

In [5]:
# Turn the four training lists into tensors in one pass; the names are the
# ones the DataLoader cell below expects.
tensor_states, tensor_actions, tensor_rewards, tensor_pib = (
    torch.tensor(column)
    for column in (states_train, actions_train, rewards_train, pi_b_train)
)

# Function to calculate the high-confidence lower bound on the mean estimate using a one-sided Student's t-test

In [6]:
def lower_bound(is_estimates, factor=2, delta=0.01):
    """Return a one-sided Student-t lower bound on the mean of the estimates.

    Computes ``mean - factor * (std / sqrt(n)) * t_{1 - delta, n - 1}``.

    Args:
        is_estimates: 1-D tensor of importance-sampling estimates, one per
            episode.
        factor: Multiplier applied to the confidence margin.
        delta: Significance level; the Student-t quantile is taken at
            ``1 - delta`` with ``n - 1`` degrees of freedom.

    Returns:
        Scalar tensor holding the lower bound.
    """
    sample_count = is_estimates.shape[0]
    t_quantile = stats.t.ppf(1 - delta, sample_count - 1)
    standard_error = torch.std(is_estimates) / np.sqrt(sample_count)
    # Keep the tensor on the left of the NumPy scalar so the product stays a
    # torch tensor.
    margin = factor * standard_error * t_quantile
    return torch.mean(is_estimates) - margin

# Converting data into batches using dataloader and randomsampler

In [7]:
from torch.utils.data import DataLoader, Dataset, TensorDataset, RandomSampler
# Number of episodes per mini-batch.
bs = 64
# Each dataset item is one whole episode: (states, actions, rewards, pi_b).
# NOTE: this rebinds `dataset`, which earlier held the dict from parse_file.
dataset = TensorDataset(tensor_states, tensor_actions, tensor_rewards, tensor_pib)
# Draw the episodes in random order.
sampler = RandomSampler(dataset)
dataloader = DataLoader(dataset, sampler=sampler, batch_size=bs)

# Function that returns the per-decision importance sampling (PDIS) estimate of every episode in the dataset

In [8]:
def get_PDIS_estimate(dataset, out):
    """Compute the per-decision importance sampling estimate of each episode.

    For every episode the estimate is the sum over timesteps of
    ``GAMMA ** t * reward_t * prod_{j <= t} out[a_j] / pi_b_j``. An episode
    ends at the first timestep whose state equals ``DUMMY``.

    Args:
        dataset: Dict with tensors under ``'states'``, ``'actions'``,
            ``'rewards'`` and ``'pi_b'``, indexed by episode and then by
            timestep.
        out: Action probabilities indexed as ``out[episode][timestep][action]``.

    Returns:
        1-D tensor with one estimate per episode.
    """
    states, actions = dataset['states'], dataset['actions']
    rewards, pi_b = dataset['rewards'], dataset['pi_b']

    is_estimates = torch.zeros(states.shape[0])

    for ep, ep_states in enumerate(states):
        # Running product of probability ratios and running discounted return.
        weight = torch.tensor(1).float()
        discounted_return = torch.tensor(0).float()
        for t, state in enumerate(ep_states):
            if state[0] == DUMMY:
                # The rest of this episode is padding.
                break
            chosen = actions[ep][t]
            weight *= out[ep][t][chosen] / pi_b[ep][t]
            discounted_return += (GAMMA ** t) * (rewards[ep][t] * weight)
        is_estimates[ep] = discounted_return
    return is_estimates

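# Training of the neural network with the loss
$$-\mathrm{PDIS}(\theta) + \lambda \, \bigl(c - \rho^-(\theta)\bigr)$$
where $c$ is the benchmark performance and $\rho^-$ is the high-confidence lower bound on the PDIS estimates.
Minimization w.r.t. the policy parameters $\theta$ and
maximization w.r.t. the Lagrange multiplier $\lambda$.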

In [9]:
# NOTE(review): horizon_length and batch_size are not read by any code
# visible in this file (the DataLoader uses `bs`) - confirm before removing.
horizon_length = 186
batch_size = 64
# Adam optimizer for the minimization w.r.t. the policy parameters
optimizer = torch.optim.Adam(net.parameters(), lr = 1e-4)

# Lagrange multiplier; a leaf tensor with requires_grad so autograd can
# differentiate the objective with respect to it. It is not registered with
# the optimizer and is updated by hand in the training loop.
l = torch.tensor(0.0, requires_grad=True)

# value of the full Lagrangian objective at the current theta (via `out`) and lambda
def lfn(out, l, dataset):
    """Evaluate ``-mean(PDIS) + l * (BENCHMARK_PERFORMANCE - lower_bound(PDIS))``.

    Args:
        out: Action probabilities produced by the policy network for the
            batch.
        l: Lagrange multiplier weighting the constraint term.
        dataset: Batch dict passed on to ``get_PDIS_estimate``.

    Returns:
        Scalar tensor with the objective value.
    """
    pdis_per_episode = get_PDIS_estimate(dataset, out)
    negative_performance = -torch.mean(pdis_per_episode)
    # Positive while the lower bound is still below the benchmark.
    constraint_gap = BENCHMARK_PERFORMANCE - lower_bound(pdis_per_episode)
    return negative_performance + l * constraint_gap

# training loop: gradient descent on the policy parameters (theta) alternated
# with projected gradient ascent on the Lagrange multiplier (lambda)
for epoch in range(5):
    epoch_loss = 0
    c = 0
    for states, actions, rewards, pi_b in dataloader:
        optimizer.zero_grad()
        # `l` is not managed by the optimizer, so its gradient must be reset
        # by hand; otherwise backward() would keep accumulating into it.
        l.grad = None

        # forward pass to get the action probabilities from the states
        out = net(states.float())

        # calculate the objective function
        mean_estimate = lfn(out, l, {'states': states, 'actions': actions, 'rewards': rewards, 'pi_b': pi_b})

        # A single backward pass yields both gradients: d/d(theta) lands in
        # the network parameters and d/d(lambda) in l.grad. This replaces the
        # former second pass (retain_graph=True + autograd.grad), which also
        # computed an unused gradient w.r.t. `out`.
        mean_estimate.backward()

        # gradient descent w.r.t. theta since the objective is minimized
        optimizer.step()
        epoch_loss += mean_estimate.item()
        c += 1

        # gradient ascent w.r.t. lambda since the objective is maximized
        dl = l.grad
        with torch.no_grad():
            l += 1e-5 * dl
            # The multiplier of an inequality constraint must stay
            # non-negative; a negative lambda would reward violating it.
            l.clamp_(min=0)
    print("epoch loss----->", epoch_loss / c)

epoch loss-----> -2.1430511474609375
epoch loss-----> -2.370282858610153
epoch loss-----> -2.690591514110565
epoch loss-----> -3.010955587029457
epoch loss-----> -3.2746819853782654


# Analyzing the learned model 

In [13]:
import torch
# One row per shifted state index 1..17; each row is a one-element feature
# vector, so the network sees 17 "episodes" of a single timestep.
test_states = torch.tensor([[state] for state in range(1, 18)])
# Action probabilities of the learned policy for each probed state.
pi_e = net(test_states.float())

In [11]:
# Display the per-state action probabilities computed above.
print(pi_e)

tensor([[[0.2408, 0.2459, 0.2227, 0.2907]],

        [[0.2331, 0.2475, 0.2220, 0.2975]],

        [[0.2253, 0.2472, 0.2196, 0.3079]],

        [[0.2174, 0.2451, 0.2171, 0.3205]],

        [[0.2075, 0.2439, 0.2136, 0.3350]],

        [[0.1979, 0.2424, 0.2093, 0.3505]],

        [[0.1884, 0.2407, 0.2049, 0.3660]],

        [[0.1791, 0.2388, 0.2003, 0.3817]],

        [[0.1701, 0.2367, 0.1957, 0.3975]],

        [[0.1613, 0.2344, 0.1910, 0.4133]],

        [[0.1528, 0.2318, 0.1860, 0.4294]],

        [[0.1446, 0.2288, 0.1808, 0.4458]],

        [[0.1366, 0.2257, 0.1754, 0.4623]],

        [[0.1289, 0.2224, 0.1698, 0.4789]],

        [[0.1215, 0.2188, 0.1643, 0.4954]],

        [[0.1144, 0.2150, 0.1587, 0.5119]],

        [[0.1076, 0.2110, 0.1531, 0.5283]]], grad_fn=<ViewBackward>)


In [14]:
from torch.utils.data import DataLoader, Dataset, TensorDataset, RandomSampler

# Rebind the tensor names to the held-out half of the data.
tensor_states, tensor_actions, tensor_rewards, tensor_pib = (
    torch.tensor(column)
    for column in (states_test, actions_test, rewards_test, pi_b_test)
)

# Same batching setup as for training: whole episodes, drawn in random order.
bs = 64
dataset = TensorDataset(tensor_states, tensor_actions, tensor_rewards, tensor_pib)
sampler = RandomSampler(dataset)
dataloader = DataLoader(dataset, batch_size=bs, sampler=sampler)


# Testing the learned model on test data

In [15]:
c = 0
net.eval()
# No gradients are needed for evaluation, so autograd is switched off for
# the whole pass over the test batches.
with torch.no_grad():
    for states, actions, rewards, pi_b in dataloader:
        out = net(states.float())
        batch = {'states': states, 'actions': actions, 'rewards': rewards, 'pi_b': pi_b}
        # Objective value of this test batch under the learned theta and lambda.
        mean_estimate = lfn(out, l, batch)
        print(mean_estimate)

tensor(-1.2031)
tensor(-3.5412)
tensor(-1.7411)
tensor(-1.4903)
tensor(-2.5011)
tensor(-2.2867)
tensor(-2.3453)
tensor(-2.9590)


In [None]:
# lfn = lambda x, l: x**2 / 2 + l * (x - 1)
# x = torch.tensor(-0.1, requires_grad=True)
# l = torch.tensor(0.0, requires_grad=True)

# xs = []
# ls = []
# for i in range(200):
#     f = lfn(x, l)
#     dx, dl = torch.autograd.grad(f, [x, l])
#     with torch.no_grad():
#         x -= 1e-1 * dx
#         l += 1e-1 * dl
#     xs.append(x.item())
#     ls.append(l.item())

# %matplotlib inline
# from matplotlib import pyplot
# # pyplot.plot(xs)
# pyplot.plot(ls)
# x.item()