## Imports 

In [1]:
import os
import sys
import pickle
import joblib
import argparse
from tqdm import tqdm
from os import path
from git import Repo
from os.path import exists
from os import mkdir, remove, rename
# from .autonotebook import tqdm as notebook_tqdm

import math
import random
import numpy as np
from sklearn.preprocessing import RobustScaler

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data.dataset import Dataset, random_split

torch.manual_seed(42)

# set up train device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device: {}".format(device))

# setup project root dir
COLAB = 'google.colab' in sys.modules
if COLAB:
  root_dir = '/content'
  %mkdir ./data/
else:
  repo = Repo(".", search_parent_directories=True)
  root_dir = repo.git.rev_parse("--show-toplevel")
  sys.path.insert(0, root_dir+'/models/')
print("root: {}".format(root_dir))



device: cuda
root: /home/bitfra/Desktop/Francesco/rl_project/acrobatic-agents


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Import data and build dataloader
dataset_len = 6000
raw_expert_observations = np.load(root_dir+'/data/expert-observations-'+str(dataset_len)+'.npy', allow_pickle=True)
expert_actions = np.load(root_dir+'/data/expert-actions-'+str(dataset_len)+'.npy', allow_pickle=True)

In [3]:
# Robust scaling

# Create a robust scaler object
scaler = RobustScaler()

# Fit the scaler to your data
scaler.fit(raw_expert_observations)
expert_observations = scaler.transform(raw_expert_observations)

# This will save the scaler to a file named 'scaler.joblib'
joblib.dump(scaler, root_dir+'/data/scaler-'+str(dataset_len)+'.joblib')  

# with open(root_dir+'/data/scaler-'+str(dataset_len)+'.joblib', 'wb') as file:
#     pickle.dump(scaler, file)


['/home/bitfra/Desktop/Francesco/rl_project/acrobatic-agents/data/scaler-6000.joblib']

# BCO FC1

## Data

In [4]:
# Dataset class

class ExpertDataSet(Dataset):

    def __init__(self, expert_observations, expert_actions):
        self.observations = torch.from_numpy(expert_observations).float()
        self.actions = self.__preprocess__(torch.from_numpy(expert_actions))
        

    def __getitem__(self, idx):
        # return (self.observations[index], self.actions[index])
        # normalized_observations = 2 * ((self.observations[idx] - self.observations.min()) / (self.observations.max() - self.observations.min())) - 1
        # normalized_actions = 2 * ((self.actions[idx] - self.actions.min()) / (self.actions.max() - self.actions.min())) - 1
        # normalized_data = (normalized_observations, self.actions[idx])
        # return normalized_data
        return self.observations[idx], self.actions[idx]


    def __len__(self):
        return len(self.observations)
    
    
    def __preprocess__(self, data, clip_value=1e38):
        # Clip values to a maximum and minimum range
        data = torch.clamp(data, min=-clip_value, max=clip_value)
        
        # Convert to float
        return data.float()
    
    def __min__max__(self):
        return self.observations.min(), self.observations.max() 

In [5]:
# Make Datasets 

count_discarded_numpy = 0
count_discarded = 0

new_exp_action = expert_actions

list_of_index_to_drop = []
for i, a in enumerate(expert_actions):
  if (a > 1e2).any() or (a > 1e2).any():
  # if not np.isfinite(a).all(): 
    list_of_index_to_drop.append(i)
    # print(i)
    # print(a)
    count_discarded_numpy+=1
    # break


print("Expert actions len: {}".format(len(expert_actions)))
print("Expert observations len: {}".format(len(expert_observations)))

expert_dataset = ExpertDataSet(expert_observations, expert_actions)

for i in range(len(expert_dataset)):
  a = expert_dataset.__getitem__(i)[1]
  # print(a.max())
  # print(a.min())
  if (a > 1e2).any() or (a < -1e2).any() :
  # if not torch.isfinite(a).any():
    count_discarded += 1
    # print(a)

print("Discarded data:")
print("Discarded form np: {}".format(count_discarded_numpy))
print("Discarded form torch: {}".format(count_discarded))

min_val, max_val = expert_dataset.__min__max__()

print("Statistics: ")
print("Observations min: {}".format(min_val))
print("Observations max: {}".format(max_val))


Expert actions len: 6000
Expert observations len: 6000
Discarded data:
Discarded form np: 0
Discarded form torch: 0
Statistics: 
Observations min: -43.44462203979492
Observations max: 25.609783172607422


In [6]:
# check correct scaling
idx = random.randint(0, dataset_len)
print(raw_expert_observations[idx])
print(expert_dataset.__getitem__(idx)[0])

[ 7.15714286e-01  7.30014086e-01  2.83791423e-02  9.70363617e-03
  6.32491708e-02  6.87499523e-01  4.34871405e-01 -3.10895652e-01
 -4.91503000e-01  1.44381285e-01  5.32424450e-02  3.21142405e-01
  6.91232085e-01  4.16582197e-01 -3.28087777e-01 -4.90933746e-01
  2.79643804e-01  7.12199211e-02  5.62593341e-01  6.54284537e-01
  4.10000980e-01 -2.69350946e-01 -5.75552821e-01 -4.09491658e-02
 -1.86350763e-01  1.66778445e-01  8.14477742e-01 -2.45841160e-01
 -4.70354915e-01  2.34423488e-01 -8.50124955e-02 -5.13072416e-01
  2.12468147e-01  7.96494067e-01  7.79825747e-02 -5.24966955e-01
 -2.89699256e-01 -1.44465506e-01 -7.05847956e-01  1.63728476e-01
  8.51464689e-01 -2.05010585e-02 -5.23900747e-01  1.07551673e-02
 -1.79333091e-02 -6.94095492e-02  4.56959963e-01  9.39763069e-01
  1.56709135e-01 -2.14659110e-01 -2.14963004e-01 -5.86979389e-02
 -3.02378953e-01  3.79739106e-01  9.64010119e-01  1.06653832e-01
 -2.43436322e-01 -6.94104703e-03 -5.33422828e-02 -4.38151509e-01
  3.50697815e-01  9.64010

In [26]:
# Data Loaders

batch_size = 128
train_prop = 0.7
train_size = int(train_prop * len(expert_dataset))
test_size = int(0.2 * len(expert_dataset))
val_size = int(0.1 * len(expert_dataset))
train_expert_dataset, test_expert_dataset,val_expert_dataset = random_split(expert_dataset, [train_size, test_size,val_size])

train_loader = torch.utils.data.DataLoader(dataset=train_expert_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_expert_dataset, batch_size=batch_size, shuffle=False)
val_loader = torch.utils.data.DataLoader( dataset=val_expert_dataset, batch_size=batch_size, shuffle=False)


print("Shapes:")
print("actions shape: {}".format(train_loader.dataset.__getitem__(0)[1].shape))
print("observations shape: {}".format(train_loader.dataset.__getitem__(0)[0].shape))


Shapes:
actions shape: torch.Size([36])
observations shape: torch.Size([197])


## Model

In [43]:
# Policy Agent
import bco_agents as bco

obs_space = train_loader.dataset.__getitem__(0)[0].shape[0]
action_space = train_loader.dataset.__getitem__(0)[1].shape[0]
policy = bco.BCOAgentFC(obs_space, action_space, h_size=obs_space*2).to(device)

print("Policy net: {}".format(policy))

Policy net: BCOAgentFC(
  (fc1): Linear(in_features=197, out_features=394, bias=True)
  (bn1): BatchNorm1d(394, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU()
  (fc2): Linear(in_features=394, out_features=36, bias=True)
)


In [27]:
# Policy Agent

import bco_cnn as bco_cnn
obs_space = train_loader.dataset.__getitem__(0)[0].shape[0]
action_space = train_loader.dataset.__getitem__(0)[1].shape[0]
policy = bco_cnn.BCO_cnn(obs_space, action_space).to(device) # fix add hidden layer size

print("Policy net: {}".format(policy))

Policy net: BCO_cnn(
  (conv1): Conv1d(1, 36, kernel_size=(5,), stride=(2,), padding=(1,))
  (conv2): Conv1d(36, 36, kernel_size=(3,), stride=(2,), padding=(2,))
  (fc1): Linear(in_features=1800, out_features=72, bias=True)
  (fc2): Linear(in_features=72, out_features=36, bias=True)
  (LRelu): LeakyReLU(negative_slope=0.01)
)


## Training

In [28]:
# Train functions

def train(
        policy,
        train_epochs,
        train_loader, 
        val_loader,
        optimizer,
        loss_criterion,
        scheduler,
        thrashold
    ):

    policy.train()
    loss = 0
    epoch_loss = 0
    unused_val = 0

    with tqdm(total=train_epochs, leave=True) as pbar:
        for epoch in range(train_epochs):
            for batch_idx, (data, target) in enumerate(train_loader):

                obs, expert_action = data.to(device), target.to(device)
                obs = obs.float()

                if policy.name == 'bco-cnn':
                    obs = obs.unsqueeze(1)
                

                optimizer.zero_grad()

                student_action = policy(obs)
                expert_action = expert_action.float()

                loss = loss_criterion(student_action, expert_action)
                # loss.register_hook(lambda grad: print(grad))
                loss.backward()
                # print("Loss: {}".format(loss.item()))

                if not loss.item() == torch.inf: 
                    epoch_loss += loss.item()
                    optimizer.step()
                    

                else:
                    unused_val += 1
                    print("### BATCH {} ###".format(batch_idx))
                    print(f'obs -> {obs}')
                    print("\n______________________________________________________________________________")
                    print(f'expert_action -> {expert_action}')
                    print("\n______________________________________________________________________________")
                    print(f'student_action -> {student_action}')
                    print("\n______________________________________________________________________________")
                    return expert_action,student_action

                res = print_gradients(policy)
                
                if torch.isnan(student_action).any(): 
                    print('e successo')
                    break

                if res == 1: 
                    print("\n______________________________________________________________________________")
                    print(student_action.shape)
                    for i, ea in enumerate(expert_action):
                        if not np.isfinite(ea).all():
                            print(i+64)
                            print(f'expert_action -> {ea}')

                    print("\n______________________________________________________________________________")
                    print(f'Max expert_action -> {expert_action.max()}')
                    print(f'Min expert_action -> {expert_action.min()}')
                    print(f'Max student_action -> {student_action.max()}')
                    print(f'Min student_action -> {student_action.min()}')
                    break
                
            # deactivate scheduler
            # if epoch % 50 == 0 and epoch < thrashold :
            #     scheduler.step()
            
            # compute accuracy
            # print("Epoch {}".format(epoch))
            # print("Train Loss: {}".format(epoch_loss/(batch_idx+1)))
            # validation(test_loader,policy,loss_criterion,num_epochs=eval_epochs)
            # print("Unused Loss: {}".format(unused_val))
            t_loss = epoch_loss/(batch_idx+1)
            v_loss = validation(val_loader, policy, loss_criterion)
            epoch_loss = 0
            unused_val = 0
            pbar.set_postfix(train=t_loss, validation=v_loss)
            pbar.update(1)
            
        
        print("###############################################################################\n")
        print("Train Loss: {}".format(t_loss))
        print("Validation Loss: {}".format(v_loss))
        print("###############################################################################\n")


def validation(loader, policy,loss_criterion):
    policy.eval()
    epoch_loss = 0
    for batch_idx, (data, target) in enumerate(loader):
        obs, expert_action = data.to(device), target.to(device)
        obs = obs.float()
        if policy.name == 'bco-cnn':
            obs = obs.unsqueeze(1)
        student_action = policy(obs)
        loss = loss_criterion(student_action, expert_action)
        epoch_loss += loss.item()

    return epoch_loss/(batch_idx+1)

def print_gradients(policy):
    for name, param in policy.named_parameters():
        if param.requires_grad:
            if torch.isnan(param.grad).any(): 
                return 1#break
            # print(f"Gradient of {name}: {param.grad}")
    return 0

In [38]:
# Train module

loss_criterion = nn.MSELoss()
# Create a learning rate scheduler
step_size = 80
gamma = 0.3
optimizer =  optim.Adam(policy.parameters(), lr=1e-3)
optimizer =  optim.SGD(policy.parameters(), lr=1e-3,momentum=0.9)
scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)
eval_epochs = 5

train(policy, 
      train_epochs=2500, 
      train_loader=train_loader, 
      val_loader=val_loader,
      optimizer=optimizer,
      loss_criterion=loss_criterion,
      scheduler=scheduler,
      thrashold = 100
    )

100%|██████████| 2500/2500 [02:48<00:00, 14.81it/s, train=0.00162, validation=0.00214]

###############################################################################

Train Loss: 0.0016199027921891575
Validation Loss: 0.0021418428281322122
###############################################################################






## Testing

In [39]:
def test(policy, test_loader,loss_criterion):
    policy.eval()   
    loss = 0
    epoch_loss = 0
    
    for batch_idx, (data, target) in enumerate(test_loader):

        obs, expert_action = data.to(device), target.to(device)
        obs = obs.float()
        
        if policy.name == 'bco-cnn':
            obs = obs.unsqueeze(1)

        student_action = policy(obs)
        expert_action = expert_action.float()

        loss = loss_criterion(student_action, expert_action)

        if not loss.item() == torch.inf: 
            epoch_loss += loss.item()
        
        if torch.isnan(student_action).any(): 
            print('e successo')
            break
        
    t_loss = epoch_loss/(batch_idx+1)
    epoch_loss = 0
            
    
    print("Test Loss: {}".format(t_loss))

In [41]:
test(policy=policy, test_loader=test_loader, loss_criterion=loss_criterion)

Test Loss: 0.0024583286023698745


In [37]:
for i in range(80,110):
  obs, action  = train_loader.dataset.__getitem__(i)
  print(f'real action: {action}')
  obs = obs.float().unsqueeze(0).to(device)
  if policy.name == 'bco-cnn':
    obs = obs.unsqueeze(1)
  our_action = policy(obs).squeeze().detach().cpu().numpy()
  print(f'our action: {our_action}')

real action: tensor([-0.3407,  0.0575, -0.0633,  0.4559, -0.3676, -0.0539,  0.2109,  0.5210,
         0.4833,  0.0087, -0.4612,  0.5249, -0.2025, -0.4031,  0.0756,  0.1666,
         0.4129,  0.5274, -0.0903,  0.2039,  0.2079,  0.3886,  0.7169,  0.0659,
        -0.0135,  0.5697, -0.4891, -0.1242, -0.0457,  0.0159,  0.1940,  0.4291,
         0.1285, -0.2849,  0.3103,  0.1618])
our action: [-0.24804996  0.05207529 -0.10330974  0.40238184 -0.41371584 -0.07148968
  0.20436227  0.54680246  0.45014328 -0.01753886 -0.46559608  0.52920425
 -0.17799583 -0.47082734  0.08399461  0.17068592  0.41985917  0.5196824
 -0.10476074  0.21129487  0.21357316  0.39600885  0.7914258   0.05508422
 -0.01567679  0.5620211  -0.45917904 -0.08325265 -0.05579152 -0.01537449
  0.17497064  0.37968275  0.15011017 -0.29302117  0.29794413  0.166696  ]
real action: tensor([-0.2683,  0.0535, -0.0365,  0.5007,  0.4285,  0.3690,  0.0562,  0.4159,
         0.9843, -0.0570, -0.0247,  0.5013, -2.1507, -0.2433,  0.0571,  0.0461,

In [24]:
# not normalized observations / manual normalization
stop_at = 5
policy.eval()
for obs in expert_observations:
  print(obs)
  inputs = torch.from_numpy(obs[:196]).float().unsqueeze(0)
  inputs = 2 * ((inputs - min_val) / (max_val - min_val)) - 1
  print(inputs)
  output = policy(inputs).squeeze().detach().cpu().numpy()
  # print(f'our action: {our_action}')

[ 9.82396526e-01  4.79439053e-02 -3.29766128e-01  1.14085569e-01
  2.36817154e-01  8.11988831e-02  1.35434379e-01  8.29208982e-03
  2.63381942e-01 -2.66817681e-01  1.86539290e-01  2.50136268e-01
  2.17640414e-01  7.82742881e-02  1.41631790e-01  9.05683892e-02
 -2.18686888e-01  2.80836242e-01  1.50088629e-01  2.26539031e-01
  2.10000061e-02  1.10441638e-01  6.75695093e-03 -5.96549286e-01
 -2.76583367e-01 -3.37180025e-01  2.91286171e-01  3.16685388e-01
  2.83847125e-01 -4.73767985e-01 -6.22494435e-01 -3.01553190e-01
 -3.78355010e-01  9.01128626e-02  1.62170801e-03  1.97763178e-01
 -1.95664289e-01 -6.12505681e-01 -3.12059346e-01 -2.59155353e-01
  4.75003078e-02  3.73337600e-02 -1.62453313e-01 -5.33968921e-01
 -1.49672800e-01  5.13163261e-02 -1.07207912e-01  3.42944809e-01
  1.87042694e-01 -2.01743415e-02  5.07439864e-02  2.70421739e-02
 -1.31430210e-01 -3.04506248e-01  2.34738853e-01  1.02305800e-01
 -5.80268976e-02  1.16168090e-01  6.30838677e-02 -2.33515969e-01
 -2.91447493e-01  2.34738

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument weight in method wrapper___slow_conv2d_forward)

In [None]:
# Normalized observations
for i in range(80,110):
  obs = train_loader.dataset.__getitem__(i)[0]
  obs = obs.float().unsqueeze(0).to(device)
  print(obs)


## Saving

In [42]:
# Save model

# save_parameters()
version = 3
policy.save_parameters(root_dir+'/checkpoints/', version)

## Loading 

In [37]:
src = root_dir+'/checkpoints/'+policy.name.lower()+'.pt'
policy.load_parameters(src)
print(root_dir+'/checkpoints/'+policy.name.lower()+'.pt')

Loading model Behavioral-Cloning-Agent state parameters
From :/home/bitfra/Desktop/Francesco/rl_project/acrobatic-agents/checkpoints/behavioral-cloning-agent.pt
/home/bitfra/Desktop/Francesco/rl_project/acrobatic-agents/checkpoints/behavioral-cloning-agent.pt


# BCO CNN

## Data 

In [None]:
# Dataset class

class ExpertDataSetCNN(Dataset):

    def __init__(self, expert_observations, expert_actions):
        self.observations = torch.from_numpy(expert_observations).float()
        self.actions = self.__preprocess__(torch.from_numpy(expert_actions))
        

    def __getitem__(self, idx):
        # return (self.observations[index], self.actions[index])
        # normalized_observations = 2 * ((self.observations[idx] - self.observations.min()) / (self.observations.max() - self.observations.min())) - 1
        # normalized_actions = 2 * ((self.actions[idx] - self.actions.min()) / (self.actions.max() - self.actions.min())) - 1
        # normalized_data = (normalized_observations, self.actions[idx])
        normalized_data = (self.observations[idx], self.actions[idx])
        return normalized_data


    def __len__(self):
        return len(self.observations)
    
    
    def __preprocess__(self, data, clip_value=1e38):
        # Clip values to a maximum and minimum range
        data = torch.clamp(data, min=-clip_value, max=clip_value)
        
        # Convert to float
        return data.float()
    
    def __min__max__(self):
        return self.observations.min(), self.observations.max() 

## Model

In [None]:
###### Define student agent


torch.manual_seed(42)

from os.path import join, exists
from os import mkdir, remove, rename
# from os import mkdir, unlink, listdir, getpid, remove


class BCOAgentCNN(nn.Module):

  def __init__(self, obs_space, 
               action_space,
               h_size=16,
               device='cpu'
              ) -> None:
    
    super(BCOAgentCNN, self).__init__()

    self.name = 'bco-fc'
    self.device = device

    self.n_inputs = obs_space
    self.n_outputs = action_space

    # Policy Network
    self.fc1 = nn.Linear(self.n_inputs, h_size) #16
    self.bn1 = nn.BatchNorm1d(h_size) #16
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(h_size, self.n_outputs) #16

  def forward(self, x):
    out = self.fc1(x)
    out = self.bn1(out)
    out = self.relu(out)
    out = self.fc2(out)
    return out
  
  def load_parameters(self, src, version):
    src = src+self.name.lower()+'-'+str(version)+'.pt'
    if exists(src):
        print("Loading model "+self.name.lower()+'-'+str(version)+" state parameters")
        print("From :{}".format(src))
        self.load_state_dict(torch.load(src, map_location=self.device))
        return self
    else:
        print("Error no model "+self.name.lower()+'-'+str(version)+'.pt'+" found!")
        exit(1)

  
  def save_parameters(self, dest, version):
    save_name = self.name.lower()+'-'+str(version)+'.pt'

    if not exists(dest): 
      mkdir(dest)
    else: 
        if exists(dest+save_name):
          rename(dest+save_name, dest+save_name+'.bk')

    torch.save(self.state_dict(), dest+save_name)




## Training

## Testing

## Saving