## Imports 

In [10]:
import os
import sys
import argparse
from tqdm import tqdm
from os import path
from git import Repo
from os.path import exists
from os import mkdir, remove, rename
# from .autonotebook import tqdm as notebook_tqdm

import math
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
torch.manual_seed(42)

# set up train device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device: {}".format(device))

# setup project root dir
COLAB = 'google.colab' in sys.modules
if COLAB:
  root_dir = '/content'
  %mkdir ./data/
else:
  repo = Repo(".", search_parent_directories=True)
  root_dir = repo.git.rev_parse("--show-toplevel")
print("root: {}".format(root_dir))


device: cuda
root: /home/bitfra/Desktop/Francesco/rl_project/acrobatic-agents


## Data

In [11]:
# Dataset class
from torch.utils.data.dataset import Dataset, random_split

class ExpertDataSet(Dataset):

    def __init__(self, expert_observations, expert_actions):
        self.observations = torch.from_numpy(expert_observations)
        self.actions = self.preprocess_data(torch.from_numpy(expert_actions))

    def __getitem__(self, idx):
        # return (self.observations[index], self.actions[index])
        normalized_observations = 2 * ((self.observations[idx] - self.observations.min()) / (self.observations.max() - self.observations.min())) - 1
        # normalized_actions = 2 * ((self.actions[idx] - self.actions.min()) / (self.actions.max() - self.actions.min())) - 1
        normalized_data = (normalized_observations, self.actions[idx])
        return normalized_data


    def __len__(self):
        return len(self.observations)
    
    def preprocess_data(self, data, clip_value=1e38):
        # Clip values to a maximum and minimum range
        data = torch.clamp(data, min=-clip_value, max=clip_value)
        
        # Convert to float
        return data.float()

In [12]:
# Import data and build dataloader

expert_observations = np.load(root_dir+'/data/expert-observations.npy', allow_pickle=True)
expert_actions = np.load(root_dir+'/data/expert-actions.npy', allow_pickle=True)

count_discarded_numpy = 0
count_discarded = 0

new_exp_action = expert_actions

list_of_index_to_drop = []
for i, a in enumerate(expert_actions):
  if (a > 1e2).any() or (a > 1e2).any():
  # if not np.isfinite(a).all(): 
    list_of_index_to_drop.append(i)
    print(i)
    print(a)
    count_discarded_numpy+=1
    # break


print("Expert actions len: {}".format(len(expert_actions)))
print("Expert observations len: {}".format(len(expert_observations)))

expert_dataset = ExpertDataSet(expert_observations, expert_actions)



for i in range(len(expert_dataset)):
  a = expert_dataset.__getitem__(i)[1]
  # print(a.max())
  # print(a.min())
  if (a > 1e2).any() or (a < -1e2).any() :
  # if not torch.isfinite(a).any():
    count_discarded += 1
    print(a)


print("Discarded data")
print("Discarded form np: {}".format(count_discarded_numpy))
print("Discarded form torch: {}".format(count_discarded))

#split in 80% training and 20%test
batch_size = 64
train_prop = 0.8
train_size = int(train_prop * len(expert_dataset))
test_size = len(expert_dataset) - train_size
train_expert_dataset, test_expert_dataset = random_split(expert_dataset, [train_size, test_size])

train_loader = torch.utils.data.DataLoader(  dataset=train_expert_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(  dataset=test_expert_dataset, batch_size=batch_size, shuffle=True)


print("Shapes:")
print(train_loader.dataset.__getitem__(0)[1].shape)
print(train_loader.dataset.__getitem__(0)[0].shape)



Expert actions len: 8000
Expert observations len: 8000


Discarded data
Discarded form np: 0
Discarded form torch: 0
Shapes:
torch.Size([36])
torch.Size([196])


## Model

In [29]:
# Policy Agent

class BCAgent(nn.Module):

  def __init__(self, obs_space, action_space) -> None:
    super(BCAgent, self).__init__()

    self.name = 'Behavioral-Cloning-Agent'
    self.device = device

    self.n_inputs = obs_space
    self.n_outputs = action_space

    # Policy Network
    self.fc1 = nn.Linear(self.n_inputs,16)
    self.bn1 = nn.BatchNorm1d(16)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(16, self.n_outputs)

  def forward(self, x):
    out = self.fc1(x)
    out = self.bn1(out)
    out = self.relu(out)
    out = self.fc2(out)
    return out
  
  def load_parameters(self, src):
    # if exists(dir+self.name.lower()+'.pt'): 
    if exists(src):
        print("Loading model "+self.name+" state parameters")
        print("From :{}".format(src))
        self.load_state_dict(torch.load(src, map_location=self.device))
        return self
    else:
        print("Error no model "+self.name.lower()+" found!")
        exit(1)


## Training

In [30]:
# Train functions

def train(
        policy,
        train_epochs,
        eval_epochs,
        train_loader, 
        test_loader,
        optimizer,
        loss_criterion,
        scheduler,
        thrashold
    ):

    policy.train()
    policy.to(device)
    
    loss = 0
    epoch_loss = 0
    unused_val = 0

    
    with tqdm(total=train_epochs, leave=True) as pbar:
        for epoch in range(train_epochs):
        
            for batch_idx, (data, target) in enumerate(train_loader):

                obs, expert_action = data.to(device), target.to(device)
                obs = obs.float()

                optimizer.zero_grad()

                student_action = policy(obs)
                expert_action = expert_action.float()

                loss = loss_criterion(student_action, expert_action)
                # loss.register_hook(lambda grad: print(grad))
                loss.backward()
                # print("Loss: {}".format(loss.item()))

                if not loss.item() == torch.inf: 
                    epoch_loss += loss.item()
                    optimizer.step()
                    

                else:
                    unused_val += 1
                    print("### BATCH {} ###".format(batch_idx))
                    print(f'obs -> {obs}')
                    print("\n______________________________________________________________________________")
                    print(f'expert_action -> {expert_action}')
                    print("\n______________________________________________________________________________")
                    print(f'student_action -> {student_action}')
                    print("\n______________________________________________________________________________")
                    return expert_action,student_action

                res = print_gradients(policy)
                
                if torch.isnan(student_action).any(): 
                    print('e successo')
                    break

                if res == 1: 
                    print("\n______________________________________________________________________________")
                    print(student_action.shape)
                    for i, ea in enumerate(expert_action):
                        if not np.isfinite(ea).all():
                            print(i+64)
                            print(f'expert_action -> {ea}')

                    print("\n______________________________________________________________________________")
                    print(f'Max expert_action -> {expert_action.max()}')
                    print(f'Min expert_action -> {expert_action.min()}')
                    print(f'Max student_action -> {student_action.max()}')
                    print(f'Min student_action -> {student_action.min()}')
                    break
                
            # deactivate scheduler
            if epoch % 50 == 0 and epoch < thrashold :
                scheduler.step()
            
            # compute accuracy
            # print("Epoch {}".format(epoch))
            # print("Train Loss: {}".format(epoch_loss/(batch_idx+1)))
            # validation(test_loader,policy,loss_criterion,num_epochs=eval_epochs)
            # print("Unused Loss: {}".format(unused_val))
            t_loss = epoch_loss/(batch_idx+1)
            v_loss = validation(test_loader,policy,loss_criterion,num_epochs=eval_epochs)
            epoch_loss = 0
            unused_val = 0
            pbar.set_postfix(train=t_loss, validation=v_loss)
            pbar.update(1)
            
        
        print("Train Loss: {}".format(t_loss))
        print("Validation Loss: {}".format(v_loss))
        print("###############################################################################\n")


def validation(loader, policy,loss_criterion, num_epochs):
    policy.eval()
    epoch_loss = 0
    for epoch in range(num_epochs):
        for batch_idx, (data, target) in enumerate(loader):
            obs, expert_action = data.to(device), target.to(device)
            obs = obs.float()
            student_action = policy(obs)
            loss = loss_criterion(student_action, expert_action)
            epoch_loss += loss.item()

    return epoch_loss/(batch_idx+1)

def print_gradients(policy):
    for name, param in policy.named_parameters():
        if param.requires_grad:
            if torch.isnan(param.grad).any(): 
                return 1#break
            # print(f"Gradient of {name}: {param.grad}")
    return 0

In [31]:
# Train module

obs_space = train_loader.dataset.__getitem__(0)[0].shape[0]
action_space = train_loader.dataset.__getitem__(0)[1].shape[0]
policy = BCAgent(obs_space, action_space)
loss_criterion = nn.MSELoss()
# Create a learning rate scheduler
step_size = 50
gamma = 0.1
# scheduler = 
optimizer =  optim.Adam(policy.parameters(), lr=1e-2)
scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)
eval_epochs = 5

train(policy, 
      train_epochs=20, 
      eval_epochs=5, 
      train_loader=train_loader, 
      test_loader=test_loader,
      optimizer=optimizer,
      loss_criterion=loss_criterion,
      scheduler=scheduler,
      thrashold = 100
    )

100%|██████████| 20/20 [00:51<00:00,  2.58s/it, train=0.0051, validation=0.0233] 

Train Loss: 0.005100689987884834
Validation Loss: 0.02331682918826118
###############################################################################






## Saving

In [32]:
# Save model

dest = root_dir+'/checkpoints/'
save_name = policy.name.lower()+'.pt'

if not exists(dest): 
  mkdir(dest)

else: 
    if exists(dest+save_name):
        rename(dest+save_name, dest+save_name+'.bk')

torch.save(policy.state_dict(), dest+save_name)

#qui finisce 

In [33]:
print(policy.device)

cuda


In [36]:
for i in range(100,110):
  obs, action  = train_loader.dataset.__getitem__(i)

  print(f'real action: {action}')

  obs = obs.float().unsqueeze(0).to(device)

  our_action = policy(obs).squeeze().detach().cpu().numpy()

  print(f'our action: {our_action}')

real action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
our action: [-0.00348195  0.00419287 -0.00257483  0.00043887  0.00120174 -0.00855614
  0.00243651 -0.00230825  0.00232608  0.00091796 -0.00292316  0.00018568
  0.00852696  0.0067345   0.00056387 -0.00586368  0.00131345  0.00893152
  0.00015079  0.00402855 -0.00314843 -0.00057269 -0.00933423  0.00017852
  0.00105132 -0.00175206 -0.01157005 -0.00417152  0.0014846   0.00283458
  0.0029884  -0.00030958  0.00520914  0.00751644  0.00582008 -0.00287467]
real action: tensor([-0.7530,  0.0591, -0.0638,  0.4054, -0.3077,  0.2075,  0.0923,  0.5357,
         1.3419, -0.3775,  0.1903,  0.6258, -2.6502, -0.4104, -0.0113, -0.1208,
         0.0889,  0.8460, -0.2452, -0.2553,  0.0944,  0.7016,  1.6317,  0.2099,
         0.1358,  0.3904, -3.0713,  0.0482, -0.1579,  0.0431,  0.6645,  0.1123,
         0.3630, -0.0584,  0.1462,  1.2696

# Loading 

In [37]:
src = root_dir+'/checkpoints/'+policy.name.lower()+'.pt'
policy.load_parameters(src)
print(root_dir+'/checkpoints/'+policy.name.lower()+'.pt')

Loading model Behavioral-Cloning-Agent state parameters
From :/home/bitfra/Desktop/Francesco/rl_project/acrobatic-agents/checkpoints/behavioral-cloning-agent.pt
/home/bitfra/Desktop/Francesco/rl_project/acrobatic-agents/checkpoints/behavioral-cloning-agent.pt
