## Imports 

In [3]:
import os
import sys
import argparse
from tqdm import tqdm
from os import path
from git import Repo
from os.path import exists
from os import mkdir, remove, rename
# from .autonotebook import tqdm as notebook_tqdm

import math
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data.dataset import Dataset, random_split

torch.manual_seed(42)

# set up train device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device: {}".format(device))

# setup project root dir
COLAB = 'google.colab' in sys.modules
if COLAB:
  root_dir = '/content'
  %mkdir ./data/
else:
  repo = Repo(".", search_parent_directories=True)
  root_dir = repo.git.rev_parse("--show-toplevel")
  sys.path.insert(0, root_dir+'/models/')
print("root: {}".format(root_dir))



device: cpu
root: /home/leeoos/Projects/master/airo-rl/acrobatic-agents


In [12]:
# Import data and build dataloader
dataset_len = 4000
expert_observations = np.load(root_dir+'/data/expert-observations-'+str(dataset_len)+'.npy', allow_pickle=True)
expert_actions = np.load(root_dir+'/data/expert-actions-'+str(dataset_len)+'.npy', allow_pickle=True)

# BCO FC1

## Data

In [8]:
# Dataset class

class ExpertDataSet(Dataset):

    def __init__(self, expert_observations, expert_actions):
        self.observations = torch.from_numpy(expert_observations).float()
        self.actions = self.__preprocess__(torch.from_numpy(expert_actions))
        

    def __getitem__(self, idx):
        # return (self.observations[index], self.actions[index])
        normalized_observations = 2 * ((self.observations[idx] - self.observations.min()) / (self.observations.max() - self.observations.min())) - 1
        # normalized_actions = 2 * ((self.actions[idx] - self.actions.min()) / (self.actions.max() - self.actions.min())) - 1
        normalized_data = (normalized_observations, self.actions[idx])
        return normalized_data


    def __len__(self):
        return len(self.observations)
    
    
    def __preprocess__(self, data, clip_value=1e38):
        # Clip values to a maximum and minimum range
        data = torch.clamp(data, min=-clip_value, max=clip_value)
        
        # Convert to float
        return data.float()
    
    def __min__max__(self):
        return self.observations.min(), self.observations.max() 

In [16]:
# Make Datasets 

count_discarded_numpy = 0
count_discarded = 0

new_exp_action = expert_actions

list_of_index_to_drop = []
for i, a in enumerate(expert_actions):
  if (a > 1e2).any() or (a > 1e2).any():
  # if not np.isfinite(a).all(): 
    list_of_index_to_drop.append(i)
    # print(i)
    # print(a)
    count_discarded_numpy+=1
    # break


print("Expert actions len: {}".format(len(expert_actions)))
print("Expert observations len: {}".format(len(expert_observations)))

expert_dataset = ExpertDataSet(expert_observations, expert_actions)

for i in range(len(expert_dataset)):
  a = expert_dataset.__getitem__(i)[1]
  # print(a.max())
  # print(a.min())
  if (a > 1e2).any() or (a < -1e2).any() :
  # if not torch.isfinite(a).any():
    count_discarded += 1
    # print(a)

print("Discarded data:")
print("Discarded form np: {}".format(count_discarded_numpy))
print("Discarded form torch: {}".format(count_discarded))

min_val, max_val = expert_dataset.__min__max__()

print("Statistics: ")
print("Observations min: {}".format(min_val))
print("Observations max: {}".format(max_val))


Expert actions len: 4000
Expert observations len: 4000
Discarded data:
Discarded form np: 1876
Discarded form torch: 1876
Statistics: 
Observations min: -32.20038604736328
Observations max: 35.84436798095703


In [10]:
# Data Loaders

batch_size = 32
train_prop = 0.7
train_size = int(train_prop * len(expert_dataset))
test_size = int(0.2 * len(expert_dataset))
val_size = int(0.1 * len(expert_dataset))
train_expert_dataset, test_expert_dataset,val_expert_dataset = random_split(expert_dataset, [train_size, test_size,val_size])

train_loader = torch.utils.data.DataLoader(dataset=train_expert_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_expert_dataset, batch_size=batch_size, shuffle=False)
val_loader = torch.utils.data.DataLoader( dataset=val_expert_dataset, batch_size=batch_size, shuffle=False)


print("Shapes:")
print("actions shape: {}".format(train_loader.dataset.__getitem__(0)[1].shape))
print("observations shape: {}".format(train_loader.dataset.__getitem__(0)[0].shape))


Shapes:
actions shape: torch.Size([36])
observations shape: torch.Size([197])


## Model

In [11]:
# Policy Agent
import bco_agents as bco

obs_space = train_loader.dataset.__getitem__(0)[0].shape[0]
action_space = train_loader.dataset.__getitem__(0)[1].shape[0]
policy = bco.BCOAgentFC(obs_space, action_space, h_size=obs_space*2).to(device)

print("Policy net: {}".format(policy))

Policy net: BCOAgentFC(
  (fc1): Linear(in_features=197, out_features=394, bias=True)
  (bn1): BatchNorm1d(394, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU()
  (fc2): Linear(in_features=394, out_features=36, bias=True)
)


## Training

In [6]:
# Train functions

def train(
        policy,
        train_epochs,
        train_loader, 
        val_loader,
        optimizer,
        loss_criterion,
        scheduler,
        thrashold
    ):

    policy.train()
    loss = 0
    epoch_loss = 0
    unused_val = 0

    with tqdm(total=train_epochs, leave=True) as pbar:
        for epoch in range(train_epochs):
            for batch_idx, (data, target) in enumerate(train_loader):

                obs, expert_action = data.to(device), target.to(device)
                obs = obs.float()

                optimizer.zero_grad()

                student_action = policy(obs)
                expert_action = expert_action.float()

                loss = loss_criterion(student_action, expert_action)
                # loss.register_hook(lambda grad: print(grad))
                loss.backward()
                # print("Loss: {}".format(loss.item()))

                if not loss.item() == torch.inf: 
                    epoch_loss += loss.item()
                    optimizer.step()
                    

                else:
                    unused_val += 1
                    print("### BATCH {} ###".format(batch_idx))
                    print(f'obs -> {obs}')
                    print("\n______________________________________________________________________________")
                    print(f'expert_action -> {expert_action}')
                    print("\n______________________________________________________________________________")
                    print(f'student_action -> {student_action}')
                    print("\n______________________________________________________________________________")
                    return expert_action,student_action

                res = print_gradients(policy)
                
                if torch.isnan(student_action).any(): 
                    print('e successo')
                    break

                if res == 1: 
                    print("\n______________________________________________________________________________")
                    print(student_action.shape)
                    for i, ea in enumerate(expert_action):
                        if not np.isfinite(ea).all():
                            print(i+64)
                            print(f'expert_action -> {ea}')

                    print("\n______________________________________________________________________________")
                    print(f'Max expert_action -> {expert_action.max()}')
                    print(f'Min expert_action -> {expert_action.min()}')
                    print(f'Max student_action -> {student_action.max()}')
                    print(f'Min student_action -> {student_action.min()}')
                    break
                
            # deactivate scheduler
            # if epoch % 50 == 0 and epoch < thrashold :
            #     scheduler.step()
            
            # compute accuracy
            # print("Epoch {}".format(epoch))
            # print("Train Loss: {}".format(epoch_loss/(batch_idx+1)))
            # validation(test_loader,policy,loss_criterion,num_epochs=eval_epochs)
            # print("Unused Loss: {}".format(unused_val))
            t_loss = epoch_loss/(batch_idx+1)
            v_loss = validation(val_loader, policy, loss_criterion)
            epoch_loss = 0
            unused_val = 0
            pbar.set_postfix(train=t_loss, validation=v_loss)
            pbar.update(1)
            
        
        print("###############################################################################\n")
        print("Train Loss: {}".format(t_loss))
        print("Validation Loss: {}".format(v_loss))
        print("###############################################################################\n")


def validation(loader, policy,loss_criterion):
    policy.eval()
    epoch_loss = 0
    for batch_idx, (data, target) in enumerate(loader):
        obs, expert_action = data.to(device), target.to(device)
        obs = obs.float()
        student_action = policy(obs)
        loss = loss_criterion(student_action, expert_action)
        epoch_loss += loss.item()

    return epoch_loss/(batch_idx+1)

def print_gradients(policy):
    for name, param in policy.named_parameters():
        if param.requires_grad:
            if torch.isnan(param.grad).any(): 
                return 1#break
            # print(f"Gradient of {name}: {param.grad}")
    return 0

In [13]:
# Train module

loss_criterion = nn.MSELoss()
# Create a learning rate scheduler
step_size = 50
gamma = 0.1
optimizer =  optim.Adam(policy.parameters(), lr=1e-3)
optimizer =  optim.SGD(policy.parameters(), lr=1e-3,momentum=0.9)
scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)
eval_epochs = 5

train(policy, 
      train_epochs=200, 
      train_loader=train_loader, 
      val_loader=val_loader,
      optimizer=optimizer,
      loss_criterion=loss_criterion,
      scheduler=scheduler,
      thrashold = 100
    )

100%|██████████| 200/200 [02:48<00:00,  1.19it/s, train=0.00277, validation=0.00528]

###############################################################################

Train Loss: 0.0027715489849104856
Validation Loss: 0.005281741102551582
###############################################################################






## Testing

In [14]:
def test(policy, test_loader,loss_criterion):
    policy.eval()   
    loss = 0
    epoch_loss = 0
    
    for batch_idx, (data, target) in enumerate(test_loader):

        obs, expert_action = data.to(device), target.to(device)
        obs = obs.float()

        student_action = policy(obs)
        expert_action = expert_action.float()

        loss = loss_criterion(student_action, expert_action)

        if not loss.item() == torch.inf: 
            epoch_loss += loss.item()
        
        if torch.isnan(student_action).any(): 
            print('e successo')
            break
        
    t_loss = epoch_loss/(batch_idx+1)
    epoch_loss = 0
            
    
    print("Test Loss: {}".format(t_loss))

In [15]:
test(policy=policy, test_loader=test_loader, loss_criterion=loss_criterion)

Test Loss: 0.003524899054929464


In [16]:
for i in range(80,110):
  print(f'real action: {action}')
  obs, action  = train_loader.dataset.__getitem__(i)
  obs = obs.float().unsqueeze(0).to(device)
  our_action = policy(obs).squeeze().detach().cpu().numpy()
  print(f'our action: {our_action}')

real action: tensor([-0.2640, -0.0243,  0.0218,  0.4367,  0.4632,  0.2836,  0.0659,  0.3830,
         1.0745, -0.0309,  0.0526,  0.4187, -1.4656, -0.9716,  0.0219,  0.0418,
         0.2906,  0.5058,  0.0244, -0.0727,  0.1426,  0.6817,  0.5032, -0.0425,
         0.1321,  0.5010, -0.8631, -2.2014, -0.1835, -0.1698,  0.5500,  0.7496,
         0.0946,  0.0629,  0.2329, -0.5155])
our action: [-0.27507994  0.05222445 -0.01102364  0.45286563  0.38199672  0.23051387
  0.16658112  0.47308296  0.9855     -0.05561681  0.03316684  0.46120366
 -1.4810281  -0.7969774  -0.00516352  0.03130021  0.24241611  0.50514567
 -0.07279885  0.04402629  0.19728291  0.6124793   0.4097048  -0.1410407
  0.11935482  0.5775593  -0.79885334 -2.0341215  -0.16610786 -0.04422004
  0.5788413   0.8363042   0.10946438 -0.10726436  0.2894362  -0.372266  ]
real action: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]

In [None]:
# not normalized observations / manual normalization
stop_at = 5
policy.eval()
for obs in expert_observations:
  print(obs)
  inputs = torch.from_numpy(obs[:196]).float().unsqueeze(0)
  inputs = 2 * ((inputs - min_val) / (max_val - min_val)) - 1
  print(inputs)
  output = policy(inputs).squeeze().detach().cpu().numpy()
  # print(f'our action: {our_action}')

In [None]:
# Normalized observations
for i in range(80,110):
  obs = train_loader.dataset.__getitem__(i)[0]
  obs = obs.float().unsqueeze(0).to(device)
  print(obs)


## Saving

In [17]:
# Save model

# save_parameters()
version = 5
policy.save_parameters(root_dir+'/checkpoints/', version)

## Loading 

In [37]:
src = root_dir+'/checkpoints/'+policy.name.lower()+'.pt'
policy.load_parameters(src)
print(root_dir+'/checkpoints/'+policy.name.lower()+'.pt')

Loading model Behavioral-Cloning-Agent state parameters
From :/home/bitfra/Desktop/Francesco/rl_project/acrobatic-agents/checkpoints/behavioral-cloning-agent.pt
/home/bitfra/Desktop/Francesco/rl_project/acrobatic-agents/checkpoints/behavioral-cloning-agent.pt


# BCO CNN

## Data 

In [None]:
# Dataset class

class ExpertDataSetCNN(Dataset):

    def __init__(self, expert_observations, expert_actions):
        self.observations = torch.from_numpy(expert_observations).float()
        self.actions = self.__preprocess__(torch.from_numpy(expert_actions))
        

    def __getitem__(self, idx):
        # return (self.observations[index], self.actions[index])
        # normalized_observations = 2 * ((self.observations[idx] - self.observations.min()) / (self.observations.max() - self.observations.min())) - 1
        # normalized_actions = 2 * ((self.actions[idx] - self.actions.min()) / (self.actions.max() - self.actions.min())) - 1
        # normalized_data = (normalized_observations, self.actions[idx])
        normalized_data = (self.observations[idx], self.actions[idx])
        return normalized_data


    def __len__(self):
        return len(self.observations)
    
    
    def __preprocess__(self, data, clip_value=1e38):
        # Clip values to a maximum and minimum range
        data = torch.clamp(data, min=-clip_value, max=clip_value)
        
        # Convert to float
        return data.float()
    
    def __min__max__(self):
        return self.observations.min(), self.observations.max() 

## Model

In [None]:
###### Define student agent


torch.manual_seed(42)

from os.path import join, exists
from os import mkdir, remove, rename
# from os import mkdir, unlink, listdir, getpid, remove


class BCOAgentCNN(nn.Module):

  def __init__(self, obs_space, 
               action_space,
               h_size=16,
               device='cpu'
              ) -> None:
    
    super(BCOAgentCNN, self).__init__()

    self.name = 'bco-fc'
    self.device = device

    self.n_inputs = obs_space
    self.n_outputs = action_space

    # Policy Network
    self.fc1 = nn.Linear(self.n_inputs, h_size) #16
    self.bn1 = nn.BatchNorm1d(h_size) #16
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(h_size, self.n_outputs) #16

  def forward(self, x):
    out = self.fc1(x)
    out = self.bn1(out)
    out = self.relu(out)
    out = self.fc2(out)
    return out
  
  def load_parameters(self, src, version):
    src = src+self.name.lower()+'-'+str(version)+'.pt'
    if exists(src):
        print("Loading model "+self.name.lower()+'-'+str(version)+" state parameters")
        print("From :{}".format(src))
        self.load_state_dict(torch.load(src, map_location=self.device))
        return self
    else:
        print("Error no model "+self.name.lower()+'-'+str(version)+'.pt'+" found!")
        exit(1)

  
  def save_parameters(self, dest, version):
    save_name = self.name.lower()+'-'+str(version)+'.pt'

    if not exists(dest): 
      mkdir(dest)
    else: 
        if exists(dest+save_name):
          rename(dest+save_name, dest+save_name+'.bk')

    torch.save(self.state_dict(), dest+save_name)




## Training

## Testing

## Saving