In [1]:
import os 

import azureml
from azureml.core.model import Model, InferenceConfig
from azureml.core import Workspace, Datastore, Experiment, Run, Environment, ScriptRunConfig

from azureml.core.compute import ComputeTarget, AmlCompute, AksCompute, ComputeTarget
from azureml.train.dnn import PyTorch
from azureml.widgets import RunDetails

from azureml.core.webservice import Webservice, AksWebservice, AciWebservice
from azureml.core.dataset import Dataset
from azureml.core.resource_configuration import ResourceConfiguration
from azureml.core.conda_dependencies import CondaDependencies 

# Report which version of the core Azure ML SDK is installed in this kernel.
sdk_version = azureml.core.VERSION
print("Azure ML SDK Version: ", sdk_version)

Azure ML SDK Version:  1.33.0


In [None]:
# FIRST TIME SETUP
# Point this at your local SPEED data directory after downloading it from
# https://kelvins.esa.int/satellite-pose-estimation-challenge/data/
# Set the SPEED_DATA_PATH environment variable to override the default below
# without editing the notebook (the default is the original author's local path).
DATA_PATH = os.environ.get("SPEED_DATA_PATH", "/home/salem/Documents/DLR/Challenge/speed")

In [None]:
from azureml.core import Workspace

# FIRST TIME SETUP: define these two variables after creating your Azure account.
tenant_id = "f77a90dc-721b-4ae2-8371-7629ac13542d"  # <your_tenant_id>
subscription_id = "e50a493c-c8fe-4da7-8361-e99f6349f177"  # <your_subscription_id>


# Re-enable the commented lines below if you want to switch to another subscription.
# from azureml.core.authentication import InteractiveLoginAuthentication
# forced_interactive_auth = InteractiveLoginAuthentication(tenant_id=tenant_id, force=True)


# Create the workspace using the specified parameters.
# exist_ok=True is passed so that re-running this cell against an already
# created workspace is intended to succeed instead of failing.
ws = Workspace.create(name="VisionLab",
                      subscription_id=subscription_id,
                      resource_group="satellite_pose_estimation",
                      location="eastus",
                      create_resource_group=True,
                      exist_ok=True)
ws.get_details()



In [None]:
# Re-attach to the workspace through Workspace.from_config() and show its
# identifying fields, one per line.
ws = Workspace.from_config()
for workspace_field in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_field)

In [4]:
from azureml.core.compute_target import ComputeTargetException

cluster_name = "OptimusPrime"

# Reuse the cluster when it already exists; only provision a new one when the
# lookup itself reports that the target is missing. Any other failure
# (authentication, network, interrupt) is left to propagate instead of
# silently triggering a cluster creation.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    print('Creating a new compute target...')
    # min_nodes=0 is requested together with max_nodes=5 on STANDARD_NC6 VMs.
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',
                                                           min_nodes=0,
                                                           max_nodes=5)

    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

    # Block until provisioning finishes (or the 20 minute timeout is reached).
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

Found existing compute target


In [6]:
# Default datastore of the workspace; the training images are read from it.
ds = Datastore.get_default(ws)
print(f"Datastore: {ds.name}")

# FIRST TIME SETUP:
# Run the upload below once, the first time you put the dataset on Azure storage.
# speed_data = ds.upload(src_dir = DATA_PATH ,target_path = "speed", show_progress = True)

# Experiment that the submitted runs are attached to.
exp = Experiment(name='Setup_trial', workspace=ws)
print(f"Experiment: {exp.name}")

Datastore: workspaceblobstore
Experiment: Setup_trial


In [7]:
from azureml.core import Dataset

# Describe the '/speed' folder of the datastore as a file dataset.
speed_folder = (ds, '/speed')
unregistered_dataset = Dataset.File.from_files(path=speed_folder)

# Register it under a name so later cells and sessions can refer to it;
# `dataset` ends up bound to the object returned by register().
dataset = unregistered_dataset.register(
    name='speed_dataset',
    workspace=ws,
    description="Satellite images in train, test, real test folders and their pose labels in json files",
)

In [8]:
# create an environment
env = Environment(name='sat_pose_estimation')

# packages to install in the image, split by installer
pip_requirements = [
    'azureml-dataset-runtime[pandas,fuse]',
    'azureml-defaults',
    "torch",
    "torchvision",
    "pytorch-lightning",
    "numpy",
    "matplotlib",
    'pillow',
]
conda_requirements = ['SciPy']

cd = CondaDependencies.create(pip_packages=pip_requirements,
                              conda_packages=conda_requirements)
env.python.conda_dependencies = cd

# Docker base image the environment is built on.
env.docker.base_image = "mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.0.3-cudnn8-ubuntu18.04"

# Register the environment in the workspace so it can be re-used later.
env = env.register(workspace=ws)

In [13]:
# Folder that receives the training script; an already existing folder is fine.
train_script_dir = 'train_script'
os.makedirs(train_script_dir, exist_ok=True)

In [10]:
%%writefile train_script/train.py

import argparse
import os


import torch 
from torch import nn 
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split 
from torchvision import transforms, models
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from utils import PyTorchSatellitePoseEstimationDataset
from submission import SubmissionWriter
from azureml_env_adapter import set_environment_variables
from pytorch_lightning.plugins import DDPPlugin

# The script's output ends up in a log; print a 100-dash rule first so it is
# visually separated from earlier log entries.
separator_rule = '-' * 100
print(separator_rule)


def add_model_specific_args(parent_parser):
    """Return a new parser that extends ``parent_parser`` with data-loading options.

    Adds the integer options ``--num_workers`` (default 8) and ``--batch_size``
    (default 32). The returned parser inherits every argument of
    ``parent_parser`` and does not add a help option of its own.
    """
    integer_options = (('--num_workers', 8), ('--batch_size', 32))
    extended = argparse.ArgumentParser(add_help=False, parents=[parent_parser])
    for flag, fallback in integer_options:
        extended.add_argument(flag, type=int, default=fallback)
    return extended

class SatellitePoseEstimationModel(pl.LightningModule):
    """ResNet-18 regressor whose final layer is replaced by a 7-value linear head.

    Training and validation minimise the MSE between the 7 predicted values
    and the targets. ``test_step`` splits each prediction into its first four
    values (``q``) and its last three values (``r``) and hands them to the
    submission writer.

    Args:
        submission: object exposing ``append_test(filename, q, r)``; it is
            only used by ``test_step``.
            NOTE(review): with the default ``None``, ``test_step`` would fail
            on ``self.submission.append_test`` — confirm a writer is passed
            before running the test loop.
    """

    def __init__(self, submission = None) :
        super().__init__()
        # Start from the pretrained torchvision network and swap its classifier
        # for a linear layer with 7 outputs.
        backbone = models.resnet18(pretrained=True)
        backbone.fc = torch.nn.Linear(backbone.fc.in_features, 7)
        self.model = backbone
        self.submission = submission

    def forward(self, x):
        """Run the wrapped network on ``x``."""
        return self.model(x)

    def configure_optimizers(self):
        """Adam over all parameters with a learning rate of 0.001."""
        return torch.optim.Adam(self.parameters(), lr=0.001)

    def _mse_step(self, batch, split):
        """Compute the MSE loss for ``batch`` and log it under ``split``.

        Logs the 1-based current epoch under the key 'step' and the loss as
        ``{'losses': {split: loss}}``, then returns the loss.
        """
        inputs, targets = batch
        predictions = self.model(inputs)
        loss = F.mse_loss(predictions.float(), targets.float())
        self.log('step', self.trainer.current_epoch+1)
        self.log('losses', {split: loss})
        return loss

    def training_step(self, batch, batch_idx):
        """Loss for one training batch, logged under 'train'."""
        return self._mse_step(batch, 'train')

    def validation_step(self, batch, batch_idx):
        """Loss for one validation batch, logged under 'valid'."""
        return self._mse_step(batch, 'valid')

    def test_step(self, batch, batch_idx):
        """Predict one test batch and append each result to the submission writer."""
        inputs, filenames = batch
        predictions = self.model(inputs)

        # First four columns and last three columns, moved to CPU as numpy arrays.
        q_rows = predictions[:, :4].cpu().numpy()
        r_rows = predictions[:, -3:].cpu().numpy()

        for filename, q, r in zip(filenames, q_rows, r_rows):
            self.submission.append_test(filename, q, r)

class DataModule(pl.LightningDataModule) :
    """Lightning data module providing the train/validation/test dataloaders.

    Args:
        batch_size: batch size used by all three dataloaders.
        num_workers: number of worker processes per dataloader.
        speed_root: root folder passed to ``PyTorchSatellitePoseEstimationDataset``.
        seed: seed of the generator that draws the train/validation split.
            With a fixed seed every process that calls ``setup`` (for example
            each DDP rank) derives the same split, so a sample cannot land in
            the training subset of one process and the validation subset of
            another.
    """

    def __init__(self, batch_size = 32, num_workers = 8, speed_root='', seed = 42):
        super().__init__()
        self.batch_size = batch_size
        #num_workers = 4*gpu_num
        self.num_workers = num_workers
        self.speed_root = speed_root
        self.seed = seed

    def setup(self, stage = None):
        """Build the datasets required for ``stage``.

        For "fit", "validate" or ``None`` the 'train' dataset is split 80/20
        into training and validation subsets; for "test" or ``None`` the
        'test' dataset is created.
        """
        # Resize to 224x224, convert to tensor, then normalise per channel.
        data_transforms = transforms.Compose([transforms.Resize((224, 224)),
                                              transforms.ToTensor(),
                                              transforms.Normalize([0.485, 0.456, 0.406],
                                                                   [0.229, 0.224, 0.225])])
        if stage in (None, "fit", "validate"):
            # Only build the training data when it is needed; a test-only
            # setup no longer touches the 'train' split.
            full_dataset = PyTorchSatellitePoseEstimationDataset('train', self.speed_root, data_transforms)
            total = len(full_dataset)
            train_len = int(total * .8)
            # The validation subset takes the remainder so the two lengths
            # always add up to the dataset size, which random_split requires.
            val_len = total - train_len
            split_generator = torch.Generator().manual_seed(self.seed)
            self.train_dataset, self.val_dataset = random_split(full_dataset,
                                                                [train_len, val_len],
                                                                generator=split_generator)
        if stage == "test" or stage is None:
            self.test_dataset = PyTorchSatellitePoseEstimationDataset('test', self.speed_root, data_transforms)

    def train_dataloader(self):
        """Shuffled loader over the training subset."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)

    def val_dataloader(self):
        """Loader over the validation subset (not shuffled: order does not matter for evaluation)."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)

    def test_dataloader(self):
        """Loader over the test dataset, in dataset order."""
        return DataLoader(self.test_dataset, batch_size = self.batch_size, num_workers = self.num_workers)
    
if __name__ == '__main__':
    # Script entry point: parse the arguments, train the model and always try
    # to write a checkpoint to outputs/<trial_name>.ckpt.
    import traceback

    parser = argparse.ArgumentParser()
    parser.add_argument('--data-path', type=str,
                        dest='data_path',
                        default='data',
                        help='data folder mounting point')

    parser.add_argument("--logdir", default="./logs", type=str)

    # Extend the parser with --num_workers / --batch_size and with the
    # arguments contributed by pl.Trainer (e.g. the max_epochs read below).
    parser = add_model_specific_args(parser)

    parser = pl.Trainer.add_argparse_args(parser)

    # parse the parameters passed to this script
    args = parser.parse_args()

    trial_name = f"first_model_{args.max_epochs}epochs"
    # set azureml env vars for multi-node ddp
    set_environment_variables()

#     MySubmission = SubmissionWriter()

    model = SatellitePoseEstimationModel()
    dm = DataModule(batch_size = args.batch_size, num_workers = args.num_workers, speed_root = args.data_path )

    tb_logger = TensorBoardLogger(args.logdir, name = trial_name)


    # ------------
    # training
    # ------------
    trainer = pl.Trainer.from_argparse_args(args, logger=tb_logger, plugins=DDPPlugin(find_unused_parameters=False))
    try :
        trainer.fit(model, dm)
    except Exception :
        # Report the failure together with its traceback so the cause shows up
        # in the run log. The exception is not re-raised, so the checkpoint
        # below is still written and the script still exits normally.
        # KeyboardInterrupt / SystemExit are no longer swallowed here.
        print("ERROR : The model stoped training !")
        traceback.print_exc()
    finally :
        print('Saving model...')
        # Make sure the target folder exists before writing the checkpoint.
        os.makedirs("outputs", exist_ok=True)
        trainer.save_checkpoint(f"outputs/{trial_name}.ckpt")
#         trainer.test(model = model, datamodule = dm)
#         print(MySubmission.test_results)
#         MySubmission.export(out_dir="./outputs", suffix= trial_name)
    print('Done!')
    print('-'*100)

Overwriting train_script/train.py


In [12]:
from azureml.core.runconfig import MpiConfiguration
from azureml.core.runconfig import DockerConfiguration

# Number of nodes requested for the run; also forwarded to train.py.
num_nodes = 1

# Command-line arguments handed to train_script/train.py.
script_arguments = [
    '--data-path', dataset.as_mount(),
    '--logdir', './logs',
    '--num_workers', 4,
    '--batch_size', 32,
    '--gpus', -1,
    '--max_epochs', 10,
    '--accelerator', 'ddp',
    '--num_nodes', num_nodes,
]

# Distributed-job and Docker runtime settings for the run.
mpi_settings = MpiConfiguration(node_count=num_nodes)
docker_settings = DockerConfiguration(use_docker=True,
                                      shared_volumes=True,
                                      arguments=["--ipc", "host"],
                                      shm_size='2g')

src = ScriptRunConfig(
    source_directory='train_script',
    script='train.py',
    arguments=script_arguments,
    compute_target=compute_target,  # "local" to run locally
    distributed_job_config=mpi_settings,
    docker_runtime_config=docker_settings,
    environment=env,
)

# Submit the model to azure!
run = exp.submit(config=src)


In [13]:
from azureml.tensorboard import Tensorboard

# Attach a TensorBoard instance to the run submitted above.
tracked_runs = [run]
tb = Tensorboard(tracked_runs)

# On success start() returns the URI of the instance as a string; it is left
# as the last expression so the notebook displays it.
tb.start()

http://localhost:6006/


'http://localhost:6006/'

In [11]:
# After your job completes, be sure to stop() the streaming; otherwise it will continue to run.
# `tb` is created by the cell that builds the Tensorboard object, so that cell must have been
# run in the current kernel first — otherwise this call raises NameError.
tb.stop()

NameError: name 'tb' is not defined

In [None]:
#TODO (first priority) : Import a run and get the tensorboard plot of losses

In [50]:
# List the runs recorded under the experiment `exp` and print one summary per run.
# NOTE(review): no filter arguments are passed to Run.list here despite the variable name,
# and each `run_name` is the object yielded by Run.list (printed as a Run(...) summary),
# not a plain name string.
filtered_list_runs = Run.list(exp)
for run_name in filtered_list_runs : 
    print(run_name)

Run(Experiment: Setup_trial,
Id: Setup_trial_1630167182_fd1371b0,
Type: azureml.scriptrun,
Status: Completed)
Run(Experiment: Setup_trial,
Id: Setup_trial_1630165769_c3fde39b,
Type: azureml.scriptrun,
Status: Completed)
Run(Experiment: Setup_trial,
Id: Setup_trial_1630164941_43727c92,
Type: azureml.scriptrun,
Status: Completed)
Run(Experiment: Setup_trial,
Id: Setup_trial_1630161254_e2a88ad8,
Type: azureml.scriptrun,
Status: Completed)
Run(Experiment: Setup_trial,
Id: Setup_trial_1630159084_c9ed9394,
Type: azureml.scriptrun,
Status: Completed)
Run(Experiment: Setup_trial,
Id: Setup_trial_1630157137_efdeac6f,
Type: azureml.scriptrun,
Status: Completed)
Run(Experiment: Setup_trial,
Id: Setup_trial_1630156219_64e4117f,
Type: azureml.scriptrun,
Status: Failed)
Run(Experiment: Setup_trial,
Id: Setup_trial_1630149734_802f2495,
Type: azureml.scriptrun,
Status: Failed)
Run(Experiment: Setup_trial,
Id: Setup_trial_1630070102_5d32daf2,
Type: azureml.scriptrun,
Status: Failed)
Run(Experiment: Set