In [76]:
from tqdm import tqdm
#from .autonotebook import tqdm as notebook_tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from statistics import mean
import math
torch.manual_seed(42)

# from sklearn import preprocessing

<torch._C.Generator at 0x708895c4f710>

In [77]:
##dataset class
from torch.utils.data.dataset import Dataset, random_split

class ExpertDataSet(Dataset):

    def __init__(self, expert_observations, expert_actions):
        self.observations = torch.from_numpy(expert_observations)
        self.actions = self.preprocess_data(torch.from_numpy(expert_actions))

    def __getitem__(self, idx):
        # return (self.observations[index], self.actions[index])
        normalized_observations = 2 * ((self.observations[idx] - self.observations.min()) / (self.observations.max() - self.observations.min())) - 1
        # normalized_actions = 2 * ((self.actions[idx] - self.actions.min()) / (self.actions.max() - self.actions.min())) - 1
        normalized_data = (normalized_observations, self.actions[idx])
        return normalized_data


    def __len__(self):
        return len(self.observations)
    
    def preprocess_data(self, data, clip_value=1e38):
        # Clip values to a maximum and minimum range
        data = torch.clamp(data, min=-clip_value, max=clip_value)
        
        # Convert to float
        return data.float()

In [79]:
expert_observations = np.load('./expert-observations.npy', allow_pickle=True)
expert_actions = np.load('./expert-actions.npy', allow_pickle=True)

count_discarded_numpy = 0
count_discarded = 0


new_exp_action = expert_actions

list_of_index_to_drop = []
for i, a in enumerate(expert_actions):
  if (a > 1e2).any() or (a > 1e2).any():
  # if not np.isfinite(a).all(): 
    list_of_index_to_drop.append(i)
    print(i)
    print(a)
    count_discarded_numpy+=1
    # break


print("Expert actions len: {}".format(len(expert_actions)))
print("Expert observations len: {}".format(len(expert_observations)))

expert_dataset = ExpertDataSet(expert_observations, expert_actions)



for i in range(len(expert_dataset)):
  a = expert_dataset.__getitem__(i)[1]
  # print(a.max())
  # print(a.min())
  if (a > 1e2).any() or (a < -1e2).any() :
  # if not torch.isfinite(a).any():
    count_discarded += 1
    print(a)


print("Discarded data")
print("Discarded form np: {}".format(count_discarded_numpy))
print("Discarded form torch: {}".format(count_discarded))

#split in 80% training and 20%test
batch_size = 64
train_prop = 0.8
train_size = int(train_prop * len(expert_dataset))
test_size = len(expert_dataset) - train_size
train_expert_dataset, test_expert_dataset = random_split(expert_dataset, [train_size, test_size])

train_loader = torch.utils.data.DataLoader(  dataset=train_expert_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(  dataset=test_expert_dataset, batch_size=batch_size, shuffle=True)


print("Shapes:")
print(train_loader.dataset.__getitem__(0)[1].shape)
print(train_loader.dataset.__getitem__(0)[0].shape)



Expert actions len: 8000
Expert observations len: 8000
Discarded data
Discarded form np: 0
Discarded form torch: 0
Shapes:
torch.Size([36])
torch.Size([196])


In [93]:
###### Define student agent

no_cuda = False
torch_available = torch.cuda.is_available()
print(torch_available)
use_cuda = not no_cuda and torch.cuda.is_available()
print('use_cuda: ', use_cuda)

device = torch.device("cuda" if use_cuda else "cpu")

class StudentAgent:
    def __init__(self, train_loader, test_loader, learning_rate, threshold):
        self.train_loader = train_loader
        self.test_loader = test_loader

        n_inputs = train_loader.dataset.__getitem__(0)[0].shape[0] # dimension of observation space (of state = 196)
        n_outputs = train_loader.dataset.__getitem__(0)[1].shape[0] # dimension of actions space (of outptu = 36)

        #simple layer
        self.policy = nn.Sequential(
            # nn.BatchNorm1d(n_inputs),
            nn.Linear(n_inputs, 16), 
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.Linear(16, n_outputs),
            # nn.BatchNorm1d(n_outputs)
            # nn.Softmax(dim=-1)
            )

        print("policy net: ", self.policy)

        # self.loss_criterion = nn.HuberLoss() #nn.MSELoss()
        self.loss_criterion = nn.MSELoss()

        self.optimizer =  optim.Adam(self.policy.parameters(), lr=learning_rate)
        self.num_eval_episodes = 5
        self.accuracy_threshold = threshold

    def print_gradients(self):
        for name, param in self.policy.named_parameters():
            if param.requires_grad:
                if torch.isnan(param.grad).any(): 
                    return 1#break
                # print(f"Gradient of {name}: {param.grad}")
        return 0

    def train(self, num_epochs):
        self.policy.train()
        self.policy.to(device)
        
        loss = 0
        epoch_loss = 0
        unused_val = 0

        for epoch in range(num_epochs):
            for batch_idx, (data, target) in enumerate(self.train_loader):
                obs, expert_action = data.to(device), target.to(device)

                self.optimizer.zero_grad()
                obs = obs.float()

                student_action = self.policy(obs)
                expert_action = expert_action.float()

                loss = self.loss_criterion(student_action, expert_action)
                # loss.register_hook(lambda grad: print(grad))
                loss.backward()
                # print("Loss: {}".format(loss.item()))
                
                
                if not loss.item() == torch.inf: 
                    epoch_loss += loss.item()
                    self.optimizer.step()

                else:
                    unused_val += 1
                    # print("### BATCH {} ###".format(batch_idx))
                    # print(f'obs -> {obs}')
                    print("\n______________________________________________________________________________")
                    print(f'expert_action -> {expert_action}')
                    print("\n______________________________________________________________________________")
                    print(f'student_action -> {student_action}')
                    print("\n______________________________________________________________________________")
                    return expert_action,student_action

                res = self.print_gradients()
                
                # print("###############################################################################\n")

                if torch.isnan(student_action).any(): 
                    print('e successo')
                    break
                if res == 1: 
                    print("\n______________________________________________________________________________")
                    print(student_action.shape)
                    for i, ea in enumerate(expert_action):
                        if not np.isfinite(ea).all():
                            print(i+64)
                            print(f'expert_action -> {ea}')

                    print("\n______________________________________________________________________________")
                    print(f'Max expert_action -> {expert_action.max()}')
                    print(f'Min expert_action -> {expert_action.min()}')
                    print(f'Max student_action -> {student_action.max()}')
                    print(f'Min student_action -> {student_action.min()}')
                    break
                
                # print("Student actions: {}".format(student_action.shape))
                # print("Expert actions: {}".format(expert_action.shape))
            
            
            # compute accuracy
            train_acc = self.compute_accuracy(self.train_loader)
            test_acc = self.compute_accuracy(self.test_loader)
            print("Epoch {}:\ttrain accuracy: {}\ttest accuracy: {}".format(epoch, train_acc, test_acc))
            print("Train Loss: {}".format(epoch_loss/(batch_idx+1)))
            self.validation(test_loader, num_epochs=self.num_eval_episodes)
            print("Unused Loss: {}".format(unused_val))
            epoch_loss = 0
            unused_val = 0
            # if train_acc >80. and test_acc>80.: return
            print("###############################################################################\n")


    def validation(self, loader, num_epochs):
        self.policy.eval()
        epoch_loss = 0
        for epoch in range(num_epochs):
            for batch_idx, (data, target) in enumerate(loader):
                obs, expert_action = data.to(device), target.to(device)
                obs = obs.float()
                student_action = self.policy(obs)
                loss = self.loss_criterion(student_action, expert_action)
                epoch_loss += loss.item()

        print("Validation Loss: {}".format(epoch_loss/(batch_idx+1)))



    def compute_accuracy(self, loader):
        total = 0
        correct = 0
        self.policy.eval()
        test_loss = 0

        with torch.no_grad():
            for data, target in loader:
                obs, expert_action = data.to(device), target.to(device)
                obs = obs.float()
                student_action = self.policy(obs)

                total += student_action.size(0)

                # print("Total: {}".format(total))
                # print("\n______________________________________________________________________________")
                # print(f'expert_action -> {expert_action}')
                # print("\n______________________________________________________________________________")
                # print(f'student_action -> {student_action}')
                # print("\n______________________________________________________________________________")
                # similarity = math.isclose(student_action.any(), expert_action.any(), rel_tol=1e-5)

                # difference = (expert_action - student_action).abs()
                similarity= torch.abs(student_action - expert_action)
                similarity = torch.sum(similarity,dim=1)/36
                manual_mse = torch.mean(torch.square(student_action) - torch.square(expert_action),dim=0)

                correct  += (similarity < self.accuracy_threshold).sum().item()
                # print(f'similarity -> {similarity.shape}') # 64
                # print(f'similarity -> {similarity.mean()}')
                # print(f'similarity -> {similarity.shape}')

                
                # print(f'mse -> {manual_mse}')
                # print(f'shape -> {manual_mse.shape}')


                # print(f'correct -> {correct}')
                # correct += int(similarity/total)
                # correct += sum(student_action==expert_action).item()
                

        # print(f'correct -> {correct}')
        accuracy = 100. * correct/(total)

        return accuracy

False
use_cuda:  False


In [94]:
student = StudentAgent(train_loader, test_loader, learning_rate=1e-3, threshold=1e-3)
prova = student.train(50)

policy net:  Sequential(
  (0): Linear(in_features=196, out_features=16, bias=True)
  (1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
  (3): Linear(in_features=16, out_features=36, bias=True)
)
Epoch 0:	train accuracy: 0.0	test accuracy: 0.0
Train Loss: 0.0876106252335012
Validation Loss: 0.2442743768915534
Unused Loss: 0
###############################################################################

Epoch 1:	train accuracy: 0.0	test accuracy: 0.0
Train Loss: 0.04688288516364992
Validation Loss: 0.1775653869472444
Unused Loss: 0
###############################################################################

Epoch 2:	train accuracy: 0.0	test accuracy: 0.0
Train Loss: 0.029434132815804332
Validation Loss: 0.09692502556368708
Unused Loss: 0
###############################################################################

Epoch 3:	train accuracy: 0.0	test accuracy: 0.0
Train Loss: 0.017892009168863295
Validation Loss: 0.07321122071705759
