## Imports 

In [2]:
import os
import sys
import pickle
import joblib
import argparse
from tqdm import tqdm
from os import path
from git import Repo
from os.path import exists
from os import mkdir, remove, rename
# from .autonotebook import tqdm as notebook_tqdm

import math
import random
import numpy as np
from sklearn.preprocessing import RobustScaler

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data.dataset import Dataset, random_split

# Seed every RNG this notebook draws from so a Restart & Run All is repeatable:
# torch feeds random_split and the shuffling DataLoader, `random` picks the
# sample index in the scaling sanity check, numpy is seeded for completeness.
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# set up train device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device: {}".format(device))

# setup project root dir
COLAB = 'google.colab' in sys.modules
if COLAB:
  root_dir = '/content'
  # Plain-Python, idempotent replacement for the `%mkdir ./data/` magic:
  # re-running the cell no longer complains when the directory already exists.
  os.makedirs('./data/', exist_ok=True)
else:
  # Locate the enclosing git checkout and make its models/ folder importable
  # (the policy cells import modules by bare name).
  repo = Repo(".", search_parent_directories=True)
  root_dir = repo.git.rev_parse("--show-toplevel")
  sys.path.insert(0, root_dir+'/models/')
print("root: {}".format(root_dir))



device: cuda
root: /home/bitfra/Desktop/Francesco/rl_project/acrobatic-agents


  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Import data and build dataloader
dataset_len = 6000
# NOTE: allow_pickle=True lets np.load unpickle arbitrary objects; only point
# this at trusted files.
raw_expert_observations, expert_actions = (
    np.load(root_dir+stem+str(dataset_len)+'.npy', allow_pickle=True)
    for stem in ('/data/expert-observations-', '/data/expert-actions-')
)

In [4]:
# Experimental import: one observation/action file pair per motion
# (backflip = "bf", spinkick = "sk").
dataset_len = 6000
# NOTE: allow_pickle=True lets np.load unpickle arbitrary objects; only point
# this at trusted files.
(raw_expert_observations_bf,
 raw_expert_observations_sk,
 expert_actions_bf,
 expert_actions_sk) = (
    np.load(root_dir+stem+str(dataset_len)+'.npy', allow_pickle=True)
    for stem in (
        '/data/expert-observations-backflip-',
        '/data/expert-observations-spinkick-',
        '/data/expert-actions-backflip-',
        '/data/expert-actions-spinkick-',
    )
)





In [12]:
# Stack the two motions row-wise (backflip rows first, then spinkick) so the
# rest of the notebook sees a single observation array and a single action array.
raw_expert_observations = np.concatenate(
    (raw_expert_observations_bf, raw_expert_observations_sk)
)
expert_actions = np.concatenate((expert_actions_bf, expert_actions_sk))

print(raw_expert_observations.shape)
print(expert_actions.shape)




(12000, 197)
(12000, 36)


In [15]:
# Robust scaling

# Fit a robust scaler on the raw observations and apply it to the same array.
# NOTE(review): the scaler is fitted before the train/test/val split, so the
# held-out splits contribute to its statistics -- confirm this is intended.
scaler = RobustScaler().fit(raw_expert_observations)
expert_observations = scaler.transform(raw_expert_observations)

# Persist the fitted scaler next to the data.
# The non-"mixed" file name ('/data/scaler-<dataset_len>.joblib') was marked
# "TO FIX" in an earlier revision and is intentionally not written here.
joblib.dump(scaler, root_dir+'/data/scaler-mixed-'+str(dataset_len)+'.joblib')


['/home/bitfra/Desktop/Francesco/rl_project/acrobatic-agents/data/scaler-mixed-6000.joblib']

## Data

In [16]:
# Dataset class

class ExpertDataSet(Dataset):
    """Paired (observation, action) samples held as float32 tensors.

    Both inputs are numpy arrays indexed by sample along the first axis.
    Actions are clamped to ``[-clip_value, clip_value]`` before the float32
    cast (see ``__preprocess__``); observations are only cast.
    """

    def __init__(self, expert_observations, expert_actions):
        obs_tensor = torch.from_numpy(expert_observations)
        act_tensor = torch.from_numpy(expert_actions)
        self.observations = obs_tensor.float()
        self.actions = self.__preprocess__(act_tensor)

    def __len__(self):
        """Number of samples (length of the observation tensor)."""
        return len(self.observations)

    def __getitem__(self, idx):
        """Return the ``(observation, action)`` pair stored at ``idx``."""
        sample = (self.observations[idx], self.actions[idx])
        return sample

    def __preprocess__(self, data, clip_value=1e38):
        """Clamp ``data`` to ``[-clip_value, clip_value]`` and cast it to float32."""
        clipped = data.clamp(min=-clip_value, max=clip_value)
        return clipped.float()

    def __min__max__(self):
        """Return the global ``(min, max)`` over all stored observations."""
        lowest = self.observations.min()
        highest = self.observations.max()
        return lowest, highest

In [17]:
# Make Datasets 

count_discarded_numpy = 0
count_discarded = 0

new_exp_action = expert_actions

# First pass (numpy): record rows whose action vector has any component outside
# [-1e2, 1e2]. The lower bound used to be a repeated `a > 1e2` test, so large
# negative actions were never flagged here.
# NOTE: the indices are only collected; nothing in this cell removes the rows.
list_of_index_to_drop = []
for i, a in enumerate(expert_actions):
  if (a > 1e2).any() or (a < -1e2).any():
    list_of_index_to_drop.append(i)
    count_discarded_numpy+=1


print("Expert actions len: {}".format(len(expert_actions)))
print("Expert observations len: {}".format(len(expert_observations)))

expert_dataset = ExpertDataSet(expert_observations, expert_actions)

# Second pass (torch): same range check on the tensors the dataset will serve,
# i.e. after the dataset's own clamp and float32 cast.
for i in range(len(expert_dataset)):
  a = expert_dataset[i][1]
  if (a > 1e2).any() or (a < -1e2).any():
    count_discarded += 1

print("Discarded data:")
print("Discarded form np: {}".format(count_discarded_numpy))
print("Discarded form torch: {}".format(count_discarded))

min_val, max_val = expert_dataset.__min__max__()

print("Statistics: ")
print("Observations min: {}".format(min_val))
print("Observations max: {}".format(max_val))


Expert actions len: 12000
Expert observations len: 12000
Discarded data:
Discarded form np: 0
Discarded form torch: 0
Statistics: 
Observations min: -17.38599395751953
Observations max: 16.746784210205078


In [18]:
# check correct scaling
# Draw the index from the real row count. random.randint(0, dataset_len) is
# inclusive on both ends (out of range when the array has exactly dataset_len
# rows), and dataset_len no longer matches the concatenated array.
idx = random.randrange(len(raw_expert_observations))

# Raw sample and raw global range ...
print(raw_expert_observations[idx])
print(raw_expert_observations.min())
print(raw_expert_observations.max())
# ... versus the same sample as served by the dataset, and the scaled range.
print(expert_dataset[idx][0])
print(expert_observations.min())
print(expert_observations.max())


[ 4.20761905e-01  1.12140572e+00 -6.59840107e-02  2.22270489e-02
 -7.21347332e-03  8.11084747e-01 -4.19022441e-02 -3.01007889e-02
  5.82648814e-01 -3.32576752e-01  1.20199919e-01 -4.05043960e-02
  8.27900231e-01 -4.47045639e-02 -5.45648597e-02  5.56421995e-01
 -5.68579435e-01  2.60748029e-01 -7.73310065e-02  8.84111285e-01
 -2.72704419e-02 -8.98118466e-02  4.57752526e-01  1.86670065e-01
  6.60079718e-02  1.30200803e-01  5.84970474e-01 -6.89322799e-02
 -8.54199752e-02  8.03592801e-01  4.89519835e-01 -3.91112566e-02
  1.89493179e-01  9.53917086e-01 -2.43966398e-03 -1.09737299e-01
  2.79274046e-01  6.47435904e-01 -2.32939541e-01  2.14161873e-01
  9.88884389e-01  2.89996155e-02 -1.29863515e-01  6.63489923e-02
 -4.39795732e-01  1.48507237e-01  2.57130802e-01  4.74809378e-01
 -3.07420641e-01 -5.18453419e-01  6.41291380e-01 -4.23146725e-01
  2.49666572e-01  4.49146688e-01  7.15128630e-02 -5.32391131e-01
 -2.82597035e-01  7.94722915e-01 -4.49162960e-01  3.64999413e-01
  5.22138059e-01  7.15128

In [19]:
# Data Loaders

batch_size = 128
train_prop = 0.7
train_size = int(train_prop * len(expert_dataset))
test_size = int(0.2 * len(expert_dataset))
# The validation split takes whatever is left. Three independently truncated
# int() sizes need not add up to len(expert_dataset), and random_split raises
# when the lengths do not sum to the dataset length.
val_size = len(expert_dataset) - train_size - test_size
train_expert_dataset, test_expert_dataset,val_expert_dataset = random_split(expert_dataset, [train_size, test_size,val_size])

# Only the training loader is shuffled; test/validation keep a fixed order.
train_loader = torch.utils.data.DataLoader(dataset=train_expert_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_expert_dataset, batch_size=batch_size, shuffle=False)
val_loader = torch.utils.data.DataLoader( dataset=val_expert_dataset, batch_size=batch_size, shuffle=False)


print("Shapes:")
print("actions shape: {}".format(train_loader.dataset[0][1].shape))
print("observations shape: {}".format(train_loader.dataset[0][0].shape))


Shapes:
actions shape: torch.Size([36])
observations shape: torch.Size([197])


# BCO

## Model

In [43]:
# Policy Agent (fully connected variant)
import bco_agents as bco

# A single training sample is enough to read off the input/output widths:
# element 0 is the observation, element 1 the action.
obs_space, action_space = (t.shape[0] for t in train_loader.dataset[0])
policy = bco.BCOAgentFC(obs_space, action_space, h_size=obs_space*2).to(device)

print("Policy net: {}".format(policy))

Policy net: BCOAgentFC(
  (fc1): Linear(in_features=197, out_features=394, bias=True)
  (bn1): BatchNorm1d(394, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU()
  (fc2): Linear(in_features=394, out_features=36, bias=True)
)


In [20]:
# Policy Agent (convolutional variant)

import bco_cnn as bco_cnn

# A single training sample is enough to read off the input/output widths:
# element 0 is the observation, element 1 the action.
obs_space, action_space = (t.shape[0] for t in train_loader.dataset[0])
# TODO (carried over from the original cell): add a hidden-layer-size argument.
policy = bco_cnn.BCO_cnn(obs_space, action_space).to(device)

print("Policy net: {}".format(policy))

Policy net: BCO_cnn(
  (conv1): Conv1d(1, 36, kernel_size=(5,), stride=(2,), padding=(1,))
  (conv2): Conv1d(36, 36, kernel_size=(3,), stride=(2,), padding=(2,))
  (fc1): Linear(in_features=1800, out_features=72, bias=True)
  (fc2): Linear(in_features=72, out_features=36, bias=True)
  (LRelu): LeakyReLU(negative_slope=0.01)
)


In [12]:
# Policy Agent (encoder variant)

import encoder as en

# A single training sample is enough to read off the input/output widths:
# element 0 is the observation, element 1 the action.
obs_space, action_space = (t.shape[0] for t in train_loader.dataset[0])
# TODO (carried over from the original cell): add a hidden-layer-size argument.
# NOTE(review): the meaning of the third positional argument (50) is defined in
# encoder.py, not here -- confirm before changing it.
policy = en.Encoder(obs_space, action_space, 50).to(device)

print("Policy net: {}".format(policy))

Policy net: Encoder(
  (conv1): Conv1d(1, 36, kernel_size=(5,), stride=(2,), padding=(1,))
  (conv2): Conv1d(36, 36, kernel_size=(3,), stride=(2,), padding=(2,))
  (fc1): Linear(in_features=1800, out_features=72, bias=True)
  (fc2): Linear(in_features=72, out_features=36, bias=True)
  (LRelu): LeakyReLU(negative_slope=0.01)
)


## Training

In [21]:
# Train functions

def train(
        policy,
        train_epochs,
        train_loader, 
        val_loader,
        optimizer,
        loss_criterion,
        scheduler,
        thrashold
    ):
    """Fit ``policy`` to the expert actions by supervised regression.

    For every epoch, each batch of ``train_loader`` is pushed through the
    policy, ``loss_criterion`` is back-propagated and ``optimizer`` is stepped;
    the epoch then ends with a pass over ``val_loader``. Progress (mean train
    and validation loss per epoch) is shown on a tqdm bar and the last values
    are printed at the end.

    Args:
        policy: module exposing a ``name`` attribute; when it equals
            ``'bco-cnn'`` a channel dimension is inserted into the observations.
        train_epochs: number of epochs to run.
        train_loader: iterable of ``(observations, expert_actions)`` batches.
        val_loader: loader handed to ``validation`` after every epoch.
        optimizer: optimizer already bound to ``policy``'s parameters.
        loss_criterion: callable ``(prediction, target) -> scalar loss tensor``.
        scheduler: accepted for interface compatibility; currently never
            stepped (see the disabled block below).
        thrashold: epoch limit used only by the disabled scheduler block.

    Returns:
        ``None`` on a normal run. If a batch produces an infinite loss, the
        offending tensors are printed and ``(expert_action, student_action)``
        for that batch is returned immediately.
    """
    epoch_loss = 0
    # Defined up front so the final report cannot hit a NameError when
    # train_epochs is 0.
    t_loss = None
    v_loss = None

    with tqdm(total=train_epochs, leave=True) as pbar:
        for epoch in range(train_epochs):
            # validation() switches the policy to eval mode at the end of every
            # epoch; without re-enabling train mode here, every epoch after the
            # first would run with eval-mode layers (e.g. frozen BatchNorm
            # statistics).
            policy.train()

            for batch_idx, (data, target) in enumerate(train_loader):

                obs, expert_action = data.to(device), target.to(device)
                obs = obs.float()

                if policy.name == 'bco-cnn':
                    obs = obs.unsqueeze(1)
                

                optimizer.zero_grad()

                student_action = policy(obs)
                expert_action = expert_action.float()

                loss = loss_criterion(student_action, expert_action)
                loss.backward()

                if not loss.item() == torch.inf: 
                    epoch_loss += loss.item()
                    optimizer.step()

                else:
                    # Infinite loss: dump the batch and hand it back to the
                    # caller for inspection instead of continuing.
                    print("### BATCH {} ###".format(batch_idx))
                    print(f'obs -> {obs}')
                    print("\n______________________________________________________________________________")
                    print(f'expert_action -> {expert_action}')
                    print("\n______________________________________________________________________________")
                    print(f'student_action -> {student_action}')
                    print("\n______________________________________________________________________________")
                    return expert_action,student_action

                res = print_gradients(policy)
                
                # NaN predictions: abandon the rest of this epoch's batches.
                if torch.isnan(student_action).any(): 
                    print('e successo')
                    break

                # NaN gradients: print diagnostics, then abandon the rest of
                # this epoch's batches.
                if res == 1: 
                    print("\n______________________________________________________________________________")
                    print(student_action.shape)
                    for i, ea in enumerate(expert_action):
                        # torch.isfinite works on the tensor's own device;
                        # np.isfinite cannot consume a CUDA tensor.
                        if not torch.isfinite(ea).all():
                            # NOTE(review): the +64 offset looks like a leftover
                            # from an older batch size -- confirm.
                            print(i+64)
                            print(f'expert_action -> {ea}')

                    print("\n______________________________________________________________________________")
                    print(f'Max expert_action -> {expert_action.max()}')
                    print(f'Min expert_action -> {expert_action.min()}')
                    print(f'Max student_action -> {student_action.max()}')
                    print(f'Min student_action -> {student_action.min()}')
                    break
                
            # deactivate scheduler
            # if epoch % 50 == 0 and epoch < thrashold :
            #     scheduler.step()
            
            # Mean training loss over the batches seen this epoch, then a
            # validation pass (which leaves the policy in eval mode).
            t_loss = epoch_loss/(batch_idx+1)
            v_loss = validation(val_loader, policy, loss_criterion)
            epoch_loss = 0
            pbar.set_postfix(train=t_loss, validation=v_loss)
            pbar.update(1)
            
        
        print("###############################################################################\n")
        print("Train Loss: {}".format(t_loss))
        print("Validation Loss: {}".format(v_loss))
        print("###############################################################################\n")


def validation(loader, policy,loss_criterion):
    """Return the mean per-batch loss of ``policy`` over ``loader``.

    The policy is switched to eval mode and left there; callers that go on
    training must call ``policy.train()`` again themselves.

    Args:
        loader: iterable of ``(observations, expert_actions)`` batches.
        policy: module exposing a ``name`` attribute; when it equals
            ``'bco-cnn'`` a channel dimension is inserted into the observations.
        loss_criterion: callable ``(prediction, target) -> scalar loss tensor``.

    Returns:
        float: sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: if ``loader`` yields no batches.
    """
    policy.eval()
    epoch_loss = 0
    num_batches = 0
    # Evaluation only: skip autograd bookkeeping.
    with torch.no_grad():
        for data, target in loader:
            obs, expert_action = data.to(device), target.to(device)
            obs = obs.float()
            # Same target cast as in train()/test().
            expert_action = expert_action.float()
            if policy.name == 'bco-cnn':
                obs = obs.unsqueeze(1)
            student_action = policy(obs)
            loss = loss_criterion(student_action, expert_action)
            epoch_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("validation loader produced no batches")

    return epoch_loss/num_batches

def print_gradients(policy):
    """Report whether any trainable parameter of ``policy`` has a NaN gradient.

    Despite the name nothing is printed; the function only inspects gradients.
    Parameters that do not require gradients, or whose ``.grad`` is still
    ``None`` (no backward pass has reached them yet), are skipped.

    Returns:
        int: ``1`` if at least one gradient contains a NaN, otherwise ``0``.
    """
    for name, param in policy.named_parameters():
        # torch.isnan(None) raises, so parameters without a gradient are
        # treated as "no NaN".
        if not param.requires_grad or param.grad is None:
            continue
        if torch.isnan(param.grad).any():
            return 1
    return 0

In [22]:
# Train module

loss_criterion = nn.MSELoss()
# Create a learning rate scheduler
step_size = 80
gamma = 0.3
# SGD with momentum is the optimizer this run uses. An Adam instance used to be
# created on the line above and was immediately overwritten, so it never took
# part in training.
optimizer =  optim.SGD(policy.parameters(), lr=1e-3,momentum=0.9)
# NOTE: train() currently has its scheduler.step() call commented out, so the
# learning rate stays at its initial value for the whole run.
scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)
eval_epochs = 5

train(policy, 
      train_epochs=2000, 
      train_loader=train_loader, 
      val_loader=val_loader,
      optimizer=optimizer,
      loss_criterion=loss_criterion,
      scheduler=scheduler,
      thrashold = 100
    )

100%|██████████| 2000/2000 [04:11<00:00,  7.94it/s, train=0.0027, validation=0.00309] 

###############################################################################

Train Loss: 0.0026980733576541147
Validation Loss: 0.0030944646801799535
###############################################################################






## Testing

In [23]:
def test(policy, test_loader,loss_criterion):
    """Print the mean per-batch loss of ``policy`` over ``test_loader``.

    The policy is switched to eval mode and left there. Batches whose loss is
    infinite are not added to the total but still count in the divisor. If the
    policy outputs a NaN, a marker line is printed and the remaining batches
    are skipped.

    Args:
        policy: module exposing a ``name`` attribute; when it equals
            ``'bco-cnn'`` a channel dimension is inserted into the observations.
        test_loader: iterable of ``(observations, expert_actions)`` batches.
        loss_criterion: callable ``(prediction, target) -> scalar loss tensor``.

    Returns:
        None. The result is reported as a ``Test Loss: ...`` line.
    """
    policy.eval()   
    epoch_loss = 0

    # Evaluation only: skip autograd bookkeeping.
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader):

            obs, expert_action = data.to(device), target.to(device)
            obs = obs.float()

            if policy.name == 'bco-cnn':
                obs = obs.unsqueeze(1)

            student_action = policy(obs)
            expert_action = expert_action.float()

            loss = loss_criterion(student_action, expert_action)

            if not loss.item() == torch.inf: 
                epoch_loss += loss.item()

            if torch.isnan(student_action).any(): 
                print('e successo')
                break

    # batch_idx is the index of the last batch visited, hence the +1.
    t_loss = epoch_loss/(batch_idx+1)

    print("Test Loss: {}".format(t_loss))

In [24]:
# Evaluate the trained policy on the held-out test split; test() prints the
# mean per-batch loss as a "Test Loss: ..." line.
test(policy=policy, test_loader=test_loader, loss_criterion=loss_criterion)

Test Loss: 0.003253985674267537


In [25]:
# Side-by-side print of expert vs. predicted actions for test samples 90..119.
# Eval mode is set explicitly instead of relying on an earlier cell having left
# the policy in it: this loop feeds single-sample batches, which eval-mode
# normalisation layers handle and train-mode BatchNorm does not.
policy.eval()
with torch.no_grad():
  for i in range(90,120):
    obs, action  = test_loader.dataset[i]
    print(f'real action: {action}')
    # Add the batch dimension (and the channel dimension for the CNN policy).
    obs = obs.float().unsqueeze(0).to(device)
    if policy.name == 'bco-cnn':
      obs = obs.unsqueeze(1)
    our_action = policy(obs).squeeze().detach().cpu().numpy()
    print(f'our action: {our_action}')

real action: tensor([ 0.2605, -0.2056,  0.2495,  0.1158, -0.8094,  0.0308, -0.2528,  0.3413,
         0.9663, -0.4150, -0.3949, -0.0515,  0.1127,  1.9902,  0.1184, -0.1615,
         0.2445,  2.2917, -0.1314,  0.3295,  0.3565,  0.8631,  1.9305,  0.0823,
        -0.0088,  0.4450, -1.8799,  1.1222, -0.1319,  0.0794,  0.2365,  2.2830,
         0.3647,  0.2719,  0.0164,  1.4682])
our action: [ 0.28638247 -0.20730853  0.23678762  0.11430443 -0.8035834   0.03262676
 -0.2535242   0.34375685  0.967809   -0.4108919  -0.42101482 -0.07376168
  0.10055835  1.9549214   0.12107766 -0.15038149  0.24831706  2.3007522
 -0.13969702  0.29258308  0.34443384  0.84840006  1.8617657   0.09289967
 -0.01601898  0.45429277 -1.858087    1.0812523  -0.1316182   0.08266126
  0.23931949  2.2913105   0.36350787  0.2863686  -0.00837706  1.449285  ]
real action: tensor([-0.2712,  0.0754, -0.1316,  0.0786, -0.9637,  0.0178, -0.1477,  0.5466,
         1.1832, -0.0513, -0.1278,  0.3739, -1.8643, -1.2683,  0.0126, -0.0520,

In [None]:
# not normalized observations / manual normalization
# Feed the first `stop_at` observations through the policy after a manual
# min-max rescale to [-1, 1] (min_val / max_val come from the dataset cell).
stop_at = 5
policy.eval()
with torch.no_grad():
  # stop_at used to be defined but ignored, so the loop printed every row.
  for obs in expert_observations[:stop_at]:
    print(obs)
    # NOTE(review): only the first 196 features are kept, although the recorded
    # shapes elsewhere in this notebook show 197 -- confirm this is intended.
    inputs = torch.from_numpy(obs[:196]).float().unsqueeze(0)
    inputs = 2 * ((inputs - min_val) / (max_val - min_val)) - 1
    print(inputs)
    # The policy lives on `device`; the inputs have to follow it.
    inputs = inputs.to(device)
    # Same channel dimension the other cells add for the CNN policy.
    if policy.name == 'bco-cnn':
      inputs = inputs.unsqueeze(1)
    output = policy(inputs).squeeze().detach().cpu().numpy()
    # print(f'our action: {our_action}')

In [None]:
# Normalized observations
# Print training samples 80..109 as (1, n_features) float tensors on `device`.
for i in range(80,110):
  obs = train_loader.dataset[i][0][None].float().to(device)
  print(obs)


## Saving

In [26]:
# Save model

# Tag handed to the policy's own save_parameters() helper together with the
# checkpoint directory.
# NOTE(review): how `version` ends up in the file name is decided inside
# save_parameters(), which is defined in the model module, not here.
version = 'mixed-1'

policy.save_parameters(root_dir+'/checkpoints/', version)




## Loading 

In [37]:
# Checkpoint path is derived from the policy's lower-cased name.
# NOTE(review): the save cell passes a `version` tag while this path has none;
# confirm both refer to the same file.
src = root_dir+'/checkpoints/'+policy.name.lower()+'.pt'
policy.load_parameters(src)
print(src)

Loading model Behavioral-Cloning-Agent state parameters
From :/home/bitfra/Desktop/Francesco/rl_project/acrobatic-agents/checkpoints/behavioral-cloning-agent.pt
/home/bitfra/Desktop/Francesco/rl_project/acrobatic-agents/checkpoints/behavioral-cloning-agent.pt


# ENCODER

## Model

## Training

## Testing

## Saving