In [118]:
from tqdm import tqdm
#from .autonotebook import tqdm as notebook_tqdm
import numpy as np
import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from statistics import mean
import math
th.manual_seed(0)

# from sklearn import preprocessing

<torch._C.Generator at 0x7dc032291ab0>

In [131]:
##dataset class
from torch.utils.data.dataset import Dataset, random_split

class ExpertDataSet(Dataset):

    def __init__(self, expert_observations, expert_actions):
        self.observations = expert_observations
        self.actions = expert_actions

    def __getitem__(self, idx):
        # return (self.observations[index], self.actions[index])
        normalized_observations = 2 * ((self.observations[idx] - self.observations.min()) / (self.observations.max() - self.observations.min())) - 1
        # normalized_actions = 2 * ((self.actions[idx] - self.actions.min()) / (self.actions.max() - self.actions.min())) - 1
        normalized_data = (normalized_observations,self.actions[idx])
        return normalized_data


    def __len__(self):
        return len(self.observations)

In [133]:
expert_observations = np.load('./expert-observations.npy', allow_pickle=True)
expert_actions = np.load('./expert-actions.npy', allow_pickle=True)



print(len(expert_actions))
print(len(expert_observations))


print(expert_observations[0].shape)
print(expert_actions[0].shape)

expert_dataset = ExpertDataSet(expert_observations, expert_actions)

#split in 80% training and 20%test
batch_size = 64
train_prop = 0.8
train_size = int(train_prop * len(expert_dataset))
test_size = len(expert_dataset) - train_size
train_expert_dataset, test_expert_dataset = random_split(expert_dataset, [train_size, test_size])

train_loader = th.utils.data.DataLoader(  dataset=train_expert_dataset, batch_size=batch_size, shuffle=False)
test_loader = th.utils.data.DataLoader(  dataset=test_expert_dataset, batch_size=batch_size, shuffle=False)

# for s,a in train_loader
#     obs_max = expert_observations[i].max()
#     obs_min = expert_observations[i].min()
#     act_max = expert_actions[i].max()
#     act_min = expert_actions[i].min()

#     print(f'expert_actions({i}) -> {expert_actions[i]}')
#     print(f'expert_observations({i}) -> {expert_observations[i]}')

#     if(math.isinf(obs_max) or math.isinf(obs_min)):
#         print('colpevole: ')
#         print(expert_observations[i])
#     if(math.isinf(act_max) or math.isinf(act_min)):
#         print('colpevole: ')
#         print(expert_actions[i])



print(train_loader.dataset.__getitem__(0)[0].shape[0])
print(train_loader.dataset.__getitem__(0)[1].shape[0])



4000
4000
(196,)
(36,)
196
36


In [150]:
###### Define student agent

no_cuda = False
torch_available = th.cuda.is_available()
print(torch_available)
use_cuda = not no_cuda and th.cuda.is_available()
print('use_cuda: ', use_cuda)

device = th.device("cuda" if use_cuda else "cpu")

class StudentAgent:
    def __init__(self, env, train_loader, test_loader, learning_rate):
        self.env = env

        



        self.train_loader = train_loader
        self.test_loader = test_loader

        n_inputs = train_loader.dataset.__getitem__(0)[0].shape[0] # dimension of observation space (of state = 196)
        n_outputs = train_loader.dataset.__getitem__(0)[1].shape[0] # dimension of actions space (of outptu = 36)

        #simple layer
        self.policy = nn.Sequential(
            # nn.BatchNorm1d(n_inputs),
            nn.Linear(n_inputs, 16), 
            # nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.Linear(16, n_outputs),
            # nn.BatchNorm1d(n_outputs)
            # nn.Softmax(dim=-1)
            )

        print("policy net: ", self.policy)

        self.loss_criterion = nn.MSELoss()

        self.optimizer =  optim.Adam(self.policy.parameters(), lr=learning_rate)

        self.num_eval_episodes = 10

    def train(self, num_epochs):
        self.policy.train()
        self.policy.to(device)

        epoch_loss = 0
        counter = 0 #only for statistics

        for epoch in range(num_epochs):
            print(f'epoch -> {epoch}')
            for batch_idx, (data, target) in enumerate(train_loader):
                obs, expert_action = data.to(device), target.to(device)
                self.optimizer.zero_grad()
                obs = obs.float()


                student_action = self.policy(obs)
                expert_action = expert_action.float() 
                print(f'obs -> {obs}')
                print(f'obs max -> {obs.max()}')
                print(f'obs min -> {obs.min()}')

                print(f'student_action -> {student_action}')
                print(f'expert_action -> {expert_action}')
                print(f'expert_action None? -> {expert_action.any()==np.nan}')
                print(f'tipo expert -> {type(expert_action)}')

                
                print(f'tipo st -> {type(student_action)}')
                # print(f'expert_action None? -> {}')

                if th.isnan(student_action).any():
                    print('errooreeeeeeeeee')
                    break

                # print(f'expert_action min -> {expert_action.min()}')

                # print(f'expert_action -> {expert_action.(expert_action.max())}')
                # print(f'expert_action -> {expert_action.(expert_action.min())}')

                
                result = (expert_action == expert_action.max()).nonzero(as_tuple=True)
                # print('max and min')

                
                # print('valori strani')
                # print(expert_action[34])
                # print(expert_action[22])
                
                # print('result',result)

                loss = self.loss_criterion(student_action, expert_action)
                loss.backward()
                self.optimizer.step()
                epoch_loss += loss.item()
                counter += 1
                
                # print("Student actions: {}".format(student_action.shape))
                # print("Expert actions: {}".format(expert_action.shape))
            
            
            # compute accuracy
            train_acc = self.compute_accuracy(self.train_loader)
            test_acc = self.compute_accuracy(self.test_loader)
            print("Epoch {}:\ttrain accuracy: {}\ttest accuracy: {}".format(epoch, train_acc, test_acc))
            print("Epoch Loss: {}".format(epoch_loss/counter))
            epoch_loss = 0
            counter = 0

    def compute_accuracy(self, loader):
        total = 0
        correct = 0

        self.policy.eval()
        test_loss = 0

        with th.no_grad():
            for data, target in loader:

                obs, expert_action = data.to(device), target.to(device)
                obs = obs.float()

                student_action = self.policy(obs)

                total += student_action.size()[0]

                # print("Student actions: {}".format(student_action.shape))
                # print("Expert actions: {}".format(expert_action.shape))

                correct += sum(sum(student_action==expert_action)).item()

        accuracy = 100. * correct/(float)(total)

        return accuracy


False
use_cuda:  False


In [143]:
np.nan

nan

In [151]:
student = StudentAgent(None, train_loader, test_loader, 0.01)
student.train(1)

policy net:  Sequential(
  (0): Linear(in_features=196, out_features=16, bias=True)
  (1): ReLU()
  (2): Linear(in_features=16, out_features=36, bias=True)
)
epoch -> 0
obs -> tensor([[0.0762, 0.0762, 0.0762,  ..., 0.0762, 0.0762, 0.0762],
        [0.0762, 0.0762, 0.0762,  ..., 0.0762, 0.0762, 0.0762],
        [0.0762, 0.0762, 0.0762,  ..., 0.0762, 0.0762, 0.0762],
        ...,
        [0.0762, 0.0762, 0.0762,  ..., 0.0762, 0.0762, 0.0762],
        [0.0762, 0.0762, 0.0762,  ..., 0.0762, 0.0762, 0.0762],
        [0.0762, 0.0762, 0.0762,  ..., 0.0762, 0.0762, 0.0762]])
obs max -> 0.7713021636009216
obs min -> -0.681028425693512
student_action -> tensor([[ 0.1686, -0.1412,  0.2023,  ..., -0.1030, -0.1058,  0.0433],
        [ 0.1686, -0.1412,  0.2023,  ..., -0.1030, -0.1058,  0.0433],
        [ 0.1686, -0.1412,  0.2023,  ..., -0.1030, -0.1058,  0.0433],
        ...,
        [ 0.1686, -0.1412,  0.2023,  ..., -0.1030, -0.1058,  0.0433],
        [ 0.1686, -0.1412,  0.2023,  ..., -0.1030, -0.1

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

In [124]:
for epoch in range(1):
            for batch_idx, (data, target) in enumerate(train_loader):
                obs, expert_action = data.to(device), target.to(device)
                # self.optimizer.zero_grad()

                print(f'input -> {obs} ')
                print(obs.min())
                print(obs.max())
                obs = obs.float()
                student_action = student.policy(obs)
                # print(f'student_action -> {student_action}')
                expert_action = expert_action.float() 
                print(f'output -> {student_action}')
                # print(f'expert_action -> {expert_action.(expert_action.max())}')
                # print(f'expert_action -> {expert_action.(expert_action.min())}')

                
                result = (expert_action == expert_action.max()).nonzero(as_tuple=True)
                # print('max and min')
                # print(f'expert_action -> {expert_action.max()}')
                # print(f'expert_action -> {expert_action.min()}')

                
                # print('valori strani')
                # print(expert_action[34])
                # print(expert_action[22])
                
                # print('result',result)

                # loss = self.loss_criterion(student_action, expert_action)
                # loss.backward()
                # self.optimizer.step()
                # epoch_loss += loss.item()
                # counter += 1
                
                # print("Student actions: {}".format(student_action.shape))
                # print("Expert actions: {}".format(expert_action.shape))
            


input -> tensor([[0.0762, 0.0762, 0.0762,  ..., 0.0762, 0.0762, 0.0762],
        [0.0762, 0.0762, 0.0762,  ..., 0.0762, 0.0762, 0.0762],
        [0.0762, 0.0762, 0.0762,  ..., 0.0762, 0.0762, 0.0762],
        ...,
        [0.0762, 0.0762, 0.0762,  ..., 0.0762, 0.0762, 0.0762],
        [0.0762, 0.0762, 0.0762,  ..., 0.0762, 0.0762, 0.0762],
        [0.0762, 0.0762, 0.0762,  ..., 0.0762, 0.0762, 0.0762]],
       dtype=torch.float64) 
tensor(-0.6708, dtype=torch.float64)
tensor(0.3682, dtype=torch.float64)
output -> tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]],
       grad_fn=<NativeBatchNormBackward0>)
input -> tensor([[ 0.0762,  0.0762,  0.0762,  ...,  0.0762,  0.0762,  0.0762],
        [ 0.0762,  0.0762,  0.0762,  ...,  0.0762,  0.0762,  0.0762],