In [1]:
#GPU count and name
!nvidia-smi -L

GPU 0: GeForce RTX 2080 Ti (UUID: GPU-616398ce-ae25-c52b-7c47-f658d30d9d86)


In [None]:
!pip install tensorflow-gpu==2.2.0

In [None]:
!pip install Keras==2.3.1

In [1]:
!python -c 'import keras; print(keras.__version__)'

Using TensorFlow backend.
2.3.1


## Set paths

In [2]:
import os
DATASET_BASE_DIR = '../datasets/data'
DATASET_VERSION = 'v3'
DATASET_VERSION_DIR = os.path.join(DATASET_BASE_DIR, DATASET_VERSION)
CLASSES_FILE = os.path.join(DATASET_VERSION_DIR, 'classes_iter2.csv')
TRAIN_ANNOTATIONS = os.path.join(DATASET_VERSION_DIR, 'train_iter2.csv')
VAL_ANNOTATIONS = os.path.join(DATASET_VERSION_DIR, 'val_iter2.csv')

TRAINING_BASE_DIR = os.path.join('../trainings', DATASET_VERSION)

## Check directories and files

In [3]:
base_dir = os.getcwd()
base_dir

'/home/aikauel/enap/aerialnet_project/notebooks'

In [4]:
%ls

 alertas_07-08-2020.json            RetinaNet_11_classes.ipynb
 BlobsExtraction.ipynb              RetinaNet_v3.ipynb
'Dibujar predicciones.ipynb'        RetinaNet_v3_iter3.ipynb
 Enap_Dataset_Formatting.ipynb      RetinaNet_v4.ipynb
 Enap_Dataset_Formatting_v3.ipynb   visualization.ipynb
 Enap_Dataset_Preprocessing.ipynb


In [5]:
!wc -l {TRAIN_ANNOTATIONS}

13112 ../datasets/data/v3/train_iter2.csv


In [6]:
!wc -l {VAL_ANNOTATIONS}

548 ../datasets/data/v3/val_iter2.csv


In [7]:
!wc -l {CLASSES_FILE}

7 ../datasets/data/v3/classes_iter2.csv


# New version: (3rd iteration)

## Full model training 

In [8]:
import pandas as pd
import math

BATCH_SIZE = 4

# compute number of annotations to compute number of steps per epoch
df = pd.read_csv(TRAIN_ANNOTATIONS, header=None, names=['img_path', 'x1', 'y1', 'x2', 'y2', 'class'])

countAnn = len(df)-1
countImg = df['img_path'].nunique()
no_steps = math.ceil(countImg/BATCH_SIZE)

print("Count of images: {}".format(countImg))
print("Count of annotations: {}".format(countAnn))
print("Number of steps per epoch: {}".format(no_steps))

Count of images: 3618
Count of annotations: 13112
Number of steps per epoch: 905


In [9]:
CONFIG_FILE = os.path.join(TRAINING_BASE_DIR, 'config.ini')

In [10]:
SNAPSHOTS_DIR_PREV = os.path.join(TRAINING_BASE_DIR, 'snapshots_iter2')

TRAINING_BASE_DIR = '../trainings/v3_iter3'
TENSORBOARD_LOGS_DIR = os.path.join(TRAINING_BASE_DIR, 'logs')
SNAPSHOTS_DIR = os.path.join(TRAINING_BASE_DIR, 'snapshots')
TRAINING_BASE_DIR

'../trainings/v3_iter3'

In [11]:
last_model = os.path.join(SNAPSHOTS_DIR_PREV, 'resnet50_csv_108.h5')

!python ../../keras-retinanet/keras_retinanet/bin/train.py \
--config {CONFIG_FILE} \
--random-transform \
--weights {last_model} \
--initial-epoch 108 \
--weighted-average \
--batch-size {BATCH_SIZE} \
--steps {no_steps} \
--epochs 100 \
--no-resize \
--image-min-side 1000 \
--image-max-side 1000 \
--compute-val-loss \
--multiprocessing \
--workers 2 \
--tensorboard-dir {TENSORBOARD_LOGS_DIR} \
--snapshot-path {SNAPSHOTS_DIR} \
--validation-freq 5 \
--lr 1e-5 \
--reduce-lr-patience 5 \
--reduce-lr-factor 0.1 \
--gamma 2.5 \
csv {TRAIN_ANNOTATIONS} {CLASSES_FILE} \
--val-annotations {VAL_ANNOTATIONS}

Using TensorFlow backend.
Creating model, this may take a second...
2020-10-02 19:53:27.054434: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-10-02 19:53:27.107486: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-10-02 19:53:27.108251: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.755GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-10-02 19:53:27.111568: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-10-02 19:53:27.157987: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.s

tracking <tf.Variable 'Variable:0' shape=(15, 4) dtype=float32, numpy=
array([[ -66.944   ,  -16.736   ,   66.944   ,   16.736   ],
       [-105.984   ,  -26.496   ,  105.984   ,   26.496   ],
       [-169.728   ,  -42.432   ,  169.728   ,   42.432   ],
       [ -43.466465,  -25.775614,   43.466465,   25.775614],
       [ -68.81498 ,  -40.80728 ,   68.81498 ,   40.80728 ],
       [-110.20369 ,  -65.35079 ,  110.20369 ,   65.35079 ],
       [ -33.472   ,  -33.472   ,   33.472   ,   33.472   ],
       [ -52.992   ,  -52.992   ,   52.992   ,   52.992   ],
       [ -84.864   ,  -84.864   ,   84.864   ,   84.864   ],
       [ -25.778217,  -43.462074,   25.778217,   43.462074],
       [ -40.811405,  -68.80803 ,   40.811405,   68.80803 ],
       [ -65.35739 , -110.19256 ,   65.35739 ,  110.19256 ],
       [ -16.736   ,  -66.944   ,   16.736   ,   66.944   ],
       [ -26.496   , -105.984   ,   26.496   ,  105.984   ],
       [ -42.432   , -169.728   ,   42.432   ,  169.728   ]],
      dtype=f

2020-10-02 19:53:31.067353: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1408] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error CUPTI_ERROR_INSUFFICIENT_PRIVILEGES
2020-10-02 19:53:31.068136: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1447] function cupti_interface_->ActivityRegisterCallbacks( AllocCuptiActivityBuffer, FreeCuptiActivityBuffer)failed with error CUPTI_ERROR_INSUFFICIENT_PRIVILEGES
2020-10-02 19:53:31.068209: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1430] function cupti_interface_->EnableCallback( 0 , subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid)failed with error CUPTI_ERROR_INVALID_PARAMETER


### Resume training

In [29]:
last_model = os.path.join(SNAPSHOTS_DIR, 'resnet50_csv_27.h5')

!python ../../keras-retinanet/keras_retinanet/bin/train.py \
--config {CONFIG_FILE} \
--random-transform \
--weights {last_model} \
--initial-epoch 27 \
--weighted-average \
--batch-size {BATCH_SIZE} \
--steps {no_steps} \
--epochs 100 \
--no-resize \
--image-min-side 1000 \
--image-max-side 1000 \
--compute-val-loss \
--multiprocessing \
--workers 2 \
--tensorboard-dir {TENSORBOARD_LOGS_DIR} \
--snapshot-path {SNAPSHOTS_DIR} \
--validation-freq 5 \
--lr 0.5e-5 \
--reduce-lr-patience 5 \
--reduce-lr-factor 0.1 \
--gamma 3.0 \
csv {TRAIN_ANNOTATIONS} {CLASSES_FILE} \
--val-annotations {VAL_ANNOTATIONS}

Using TensorFlow backend.
Creating model, this may take a second...
2020-08-28 01:18:45.031624: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-08-28 01:18:45.062730: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-08-28 01:18:45.063041: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.755GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-08-28 01:18:45.063813: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-08-28 01:18:45.066774: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.s

tracking <tf.Variable 'Variable:0' shape=(15, 4) dtype=float32, numpy=
array([[ -66.944   ,  -16.736   ,   66.944   ,   16.736   ],
       [-105.984   ,  -26.496   ,  105.984   ,   26.496   ],
       [-169.728   ,  -42.432   ,  169.728   ,   42.432   ],
       [ -43.466465,  -25.775614,   43.466465,   25.775614],
       [ -68.81498 ,  -40.80728 ,   68.81498 ,   40.80728 ],
       [-110.20369 ,  -65.35079 ,  110.20369 ,   65.35079 ],
       [ -33.472   ,  -33.472   ,   33.472   ,   33.472   ],
       [ -52.992   ,  -52.992   ,   52.992   ,   52.992   ],
       [ -84.864   ,  -84.864   ,   84.864   ,   84.864   ],
       [ -25.778217,  -43.462074,   25.778217,   43.462074],
       [ -40.811405,  -68.80803 ,   40.811405,   68.80803 ],
       [ -65.35739 , -110.19256 ,   65.35739 ,  110.19256 ],
       [ -16.736   ,  -66.944   ,   16.736   ,   66.944   ],
       [ -26.496   , -105.984   ,   26.496   ,  105.984   ],
       [ -42.432   , -169.728   ,   42.432   ,  169.728   ]],
      dtype=f

Epoch 28/100
2020-08-28 01:19:00.791264: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-08-28 01:19:01.786699: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-08-28 01:19:03.481172: W tensorflow/core/common_runtime/bfc_allocator.cc:245] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.83GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2020-08-28 01:19:03.481277: W tensorflow/core/common_runtime/bfc_allocator.cc:245] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.83GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2020-08-28 01:19:03.880397: W tensorflow/core/common_runtime/bfc_allocator.cc:245] Allocator 

Epoch 32/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9565 and recall: 0.9209
11 instances of class 1 with average precision: 0.9008 and recall: 0.8988
5 instances of class 2 with average precision: 0.4653 and recall: 0.9225
6 instances of class 3 with average precision: 0.7502 and recall: 0.9674
67 instances of class 4 with average precision: 0.7495 and recall: 0.8853
10 instances of class 5 with average precision: 0.0599 and recall: 0.4740
242 instances of class 6 with average precision: 0.2387 and recall: 0.7187
134 instances of class 7 with average precision: 0.3435 and recall: 0.7341
mAP: 0.4410

Epoch 00032: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_32.h5
Epoch 33/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

Epoch 40/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9369 and recall: 0.8919
11 instances of class 1 with average precision: 0.9008 and recall: 0.8993
5 instances of class 2 with average precision: 0.4518 and recall: 0.9017
6 instances of class 3 with average precision: 0.7323 and recall: 0.9650
67 instances of class 4 with average precision: 0.7676 and recall: 0.8516
10 instances of class 5 with average precision: 0.1190 and recall: 0.4783
242 instances of class 6 with average precision: 0.2613 and recall: 0.7332
134 instances of class 7 with average precision: 0.3510 and recall: 0.7007
mAP: 0.4532

Epoch 00040: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_40.h5
Epoch 41/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

Epoch 48/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9605 and recall: 0.9360
11 instances of class 1 with average precision: 0.8831 and recall: 0.8988
5 instances of class 2 with average precision: 0.5688 and recall: 0.9325
6 instances of class 3 with average precision: 0.7211 and recall: 0.9641
67 instances of class 4 with average precision: 0.7486 and recall: 0.8705
10 instances of class 5 with average precision: 0.1855 and recall: 0.5219
242 instances of class 6 with average precision: 0.2760 and recall: 0.7292
134 instances of class 7 with average precision: 0.3674 and recall: 0.7222
mAP: 0.4663

Epoch 00048: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_48.h5
Epoch 49/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

Epoch 56/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9577 and recall: 0.9233
11 instances of class 1 with average precision: 0.9091 and recall: 0.9003
5 instances of class 2 with average precision: 0.3687 and recall: 0.9385
6 instances of class 3 with average precision: 0.7354 and recall: 0.9634
67 instances of class 4 with average precision: 0.7427 and recall: 0.8452
10 instances of class 5 with average precision: 0.0286 and recall: 0.3920
242 instances of class 6 with average precision: 0.2641 and recall: 0.7192
134 instances of class 7 with average precision: 0.3668 and recall: 0.7074
mAP: 0.4558

Epoch 00056: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_56.h5
Epoch 57/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

Epoch 64/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9556 and recall: 0.9226
11 instances of class 1 with average precision: 0.9091 and recall: 0.8985
5 instances of class 2 with average precision: 0.4271 and recall: 0.9401
6 instances of class 3 with average precision: 0.6886 and recall: 0.8033
67 instances of class 4 with average precision: 0.7618 and recall: 0.8801
10 instances of class 5 with average precision: 0.1158 and recall: 0.4654
242 instances of class 6 with average precision: 0.2577 and recall: 0.7241
134 instances of class 7 with average precision: 0.3578 and recall: 0.7244
mAP: 0.4544

Epoch 00064: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_64.h5
Epoch 65/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

Epoch 72/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9593 and recall: 0.9303
11 instances of class 1 with average precision: 0.9091 and recall: 0.9005
5 instances of class 2 with average precision: 0.4015 and recall: 0.9188
6 instances of class 3 with average precision: 0.7193 and recall: 0.9562
67 instances of class 4 with average precision: 0.7417 and recall: 0.8483
10 instances of class 5 with average precision: 0.0238 and recall: 0.4283
242 instances of class 6 with average precision: 0.2717 and recall: 0.7295
134 instances of class 7 with average precision: 0.3635 and recall: 0.7326
mAP: 0.4585

Epoch 00072: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_72.h5
Epoch 73/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

Epoch 80/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9488 and recall: 0.9169
11 instances of class 1 with average precision: 0.8926 and recall: 0.8980
5 instances of class 2 with average precision: 0.4776 and recall: 0.9485
6 instances of class 3 with average precision: 0.6975 and recall: 0.8128
67 instances of class 4 with average precision: 0.7364 and recall: 0.8442
10 instances of class 5 with average precision: 0.0558 and recall: 0.3870
242 instances of class 6 with average precision: 0.2943 and recall: 0.7487
134 instances of class 7 with average precision: 0.3698 and recall: 0.7109
mAP: 0.4686

Epoch 00080: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_80.h5
Epoch 81/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

Epoch 88/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9531 and recall: 0.9304
11 instances of class 1 with average precision: 0.8926 and recall: 0.8997
5 instances of class 2 with average precision: 0.4578 and recall: 0.9467
6 instances of class 3 with average precision: 0.6798 and recall: 0.9551
67 instances of class 4 with average precision: 0.7650 and recall: 0.8649
10 instances of class 5 with average precision: 0.0216 and recall: 0.2604
242 instances of class 6 with average precision: 0.2890 and recall: 0.7384
134 instances of class 7 with average precision: 0.3639 and recall: 0.7034
mAP: 0.4679

Epoch 00088: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_88.h5
Epoch 89/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

Epoch 96/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9353 and recall: 0.9108
11 instances of class 1 with average precision: 0.8843 and recall: 0.8992
5 instances of class 2 with average precision: 0.5176 and recall: 0.9568
6 instances of class 3 with average precision: 0.7221 and recall: 0.9488
67 instances of class 4 with average precision: 0.7442 and recall: 0.8694
10 instances of class 5 with average precision: 0.0159 and recall: 0.4156
242 instances of class 6 with average precision: 0.2891 and recall: 0.7502
134 instances of class 7 with average precision: 0.3519 and recall: 0.7117
mAP: 0.4608

Epoch 00096: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_96.h5
Epoch 97/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

## Increasing gamma up to 4.0 and resume training

In [59]:
last_model = os.path.join(SNAPSHOTS_DIR, 'resnet50_csv_98.h5')

!python ../../keras-retinanet/keras_retinanet/bin/train.py \
--config {CONFIG_FILE} \
--random-transform \
--weights {last_model} \
--initial-epoch 98 \
--weighted-average \
--batch-size {BATCH_SIZE} \
--steps {no_steps} \
--epochs 200 \
--no-resize \
--image-min-side 1000 \
--image-max-side 1000 \
--compute-val-loss \
--multiprocessing \
--workers 2 \
--tensorboard-dir {TENSORBOARD_LOGS_DIR} \
--snapshot-path {SNAPSHOTS_DIR} \
--validation-freq 5 \
--lr 0.5e-5 \
--reduce-lr-patience 5 \
--reduce-lr-factor 0.1 \
--gamma 4.0 \
csv {TRAIN_ANNOTATIONS} {CLASSES_FILE} \
--val-annotations {VAL_ANNOTATIONS}

Using TensorFlow backend.
Creating model, this may take a second...
2020-08-28 13:19:47.475940: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-08-28 13:19:47.506705: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-08-28 13:19:47.507060: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.755GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-08-28 13:19:47.507202: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-08-28 13:19:47.508112: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.s

tracking <tf.Variable 'Variable:0' shape=(15, 4) dtype=float32, numpy=
array([[ -66.944   ,  -16.736   ,   66.944   ,   16.736   ],
       [-105.984   ,  -26.496   ,  105.984   ,   26.496   ],
       [-169.728   ,  -42.432   ,  169.728   ,   42.432   ],
       [ -43.466465,  -25.775614,   43.466465,   25.775614],
       [ -68.81498 ,  -40.80728 ,   68.81498 ,   40.80728 ],
       [-110.20369 ,  -65.35079 ,  110.20369 ,   65.35079 ],
       [ -33.472   ,  -33.472   ,   33.472   ,   33.472   ],
       [ -52.992   ,  -52.992   ,   52.992   ,   52.992   ],
       [ -84.864   ,  -84.864   ,   84.864   ,   84.864   ],
       [ -25.778217,  -43.462074,   25.778217,   43.462074],
       [ -40.811405,  -68.80803 ,   40.811405,   68.80803 ],
       [ -65.35739 , -110.19256 ,   65.35739 ,  110.19256 ],
       [ -16.736   ,  -66.944   ,   16.736   ,   66.944   ],
       [ -26.496   , -105.984   ,   26.496   ,  105.984   ],
       [ -42.432   , -169.728   ,   42.432   ,  169.728   ]],
      dtype=f

Epoch 99/200
2020-08-28 13:20:03.536808: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-08-28 13:20:04.446047: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-08-28 13:20:06.365587: W tensorflow/core/common_runtime/bfc_allocator.cc:245] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.88GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2020-08-28 13:20:06.365755: W tensorflow/core/common_runtime/bfc_allocator.cc:245] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.88GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2020-08-28 13:20:06.523186: W tensorflow/core/common_runtime/bfc_allocator.cc:245] Allocator 

Epoch 103/200
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9495 and recall: 0.9280
11 instances of class 1 with average precision: 0.8636 and recall: 0.8958
5 instances of class 2 with average precision: 0.3491 and recall: 0.8753
6 instances of class 3 with average precision: 0.7206 and recall: 0.9397
67 instances of class 4 with average precision: 0.7300 and recall: 0.8628
10 instances of class 5 with average precision: 0.0250 and recall: 0.4348
242 instances of class 6 with average precision: 0.2717 and recall: 0.7267
134 instances of class 7 with average precision: 0.3436 and recall: 0.7159
mAP: 0.4495

Epoch 00103: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_103.h5
Epoch 104/200
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100%

Epoch 111/200
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9470 and recall: 0.9387
11 instances of class 1 with average precision: 0.7920 and recall: 0.8961
5 instances of class 2 with average precision: 0.4123 and recall: 0.9354
6 instances of class 3 with average precision: 0.7284 and recall: 0.9519
67 instances of class 4 with average precision: 0.7294 and recall: 0.8501
10 instances of class 5 with average precision: 0.0373 and recall: 0.3491
242 instances of class 6 with average precision: 0.2866 and recall: 0.7570
134 instances of class 7 with average precision: 0.3675 and recall: 0.7510
mAP: 0.4610

Epoch 00111: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_111.h5
Epoch 112/200
Process ForkPoolWorker-35:
Process ForkPoolWorker-34:
Process ForkPoolWorker-33:
Traceback (most recent c

## Increasing gamma up to 5.0 and resume training

In [76]:
last_model = os.path.join(SNAPSHOTS_DIR, 'resnet50_csv_108.h5')

!python ../../keras-retinanet/keras_retinanet/bin/train.py \
--config {CONFIG_FILE} \
--random-transform \
--weights {last_model} \
--initial-epoch 108 \
--weighted-average \
--batch-size {BATCH_SIZE} \
--steps {no_steps} \
--epochs 200 \
--no-resize \
--image-min-side 1000 \
--image-max-side 1000 \
--compute-val-loss \
--multiprocessing \
--workers 2 \
--tensorboard-dir {TENSORBOARD_LOGS_DIR} \
--snapshot-path {SNAPSHOTS_DIR} \
--validation-freq 5 \
--lr 1e-4 \
--reduce-lr-patience 5 \
--reduce-lr-factor 0.1 \
--gamma 5.0 \
csv {TRAIN_ANNOTATIONS} {CLASSES_FILE} \
--val-annotations {VAL_ANNOTATIONS}

Using TensorFlow backend.
Creating model, this may take a second...
2020-08-28 19:29:38.706037: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-08-28 19:29:38.735227: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-08-28 19:29:38.735565: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.755GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-08-28 19:29:38.735731: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-08-28 19:29:38.736664: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.s

tracking <tf.Variable 'Variable:0' shape=(15, 4) dtype=float32, numpy=
array([[-133.888   ,  -33.472   ,  133.888   ,   33.472   ],
       [-211.968   ,  -52.992   ,  211.968   ,   52.992   ],
       [-339.456   ,  -84.864   ,  339.456   ,   84.864   ],
       [ -86.93293 ,  -51.551228,   86.93293 ,   51.551228],
       [-137.62996 ,  -81.61456 ,  137.62996 ,   81.61456 ],
       [-220.40738 , -130.70158 ,  220.40738 ,  130.70158 ],
       [ -66.944   ,  -66.944   ,   66.944   ,   66.944   ],
       [-105.984   , -105.984   ,  105.984   ,  105.984   ],
       [-169.728   , -169.728   ,  169.728   ,  169.728   ],
       [ -51.556435,  -86.92415 ,   51.556435,   86.92415 ],
       [ -81.62281 , -137.61606 ,   81.62281 ,  137.61606 ],
       [-130.71478 , -220.38512 ,  130.71478 ,  220.38512 ],
       [ -33.472   , -133.888   ,   33.472   ,  133.888   ],
       [ -52.992   , -211.968   ,   52.992   ,  211.968   ],
       [ -84.864   , -339.456   ,   84.864   ,  339.456   ]],
      dtype=f

Epoch 109/200
2020-08-28 19:29:54.620315: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-08-28 19:29:55.527177: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-08-28 19:29:57.149450: W tensorflow/core/common_runtime/bfc_allocator.cc:245] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.83GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2020-08-28 19:29:57.162597: W tensorflow/core/common_runtime/bfc_allocator.cc:245] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.83GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2020-08-28 19:29:57.228350: W tensorflow/core/common_runtime/bfc_allocator.cc:245] Allocator

# DEBUG

If the model is not learning or has low accuracy, check the dataset and the anchors of the model to determine if the problem lies in a mismatch between the dataset and the anchors.

In [25]:
!python ../../keras-retinanet/keras_retinanet/bin/debug.py \
--config {CONFIG_FILE} \
--show-annotations \
csv {TRAIN_ANNOTATIONS} {CLASSES_FILE}

Using TensorFlow backend.
Traceback (most recent call last):
  File "../../keras-retinanet/keras_retinanet/bin/debug.py", line 327, in <module>
    main()
  File "../../keras-retinanet/keras_retinanet/bin/debug.py", line 305, in main
    generator = create_generator(args)
  File "../../keras-retinanet/keras_retinanet/bin/debug.py", line 112, in create_generator
    config=args.config
  File "../../keras-retinanet/keras_retinanet/bin/../../keras_retinanet/preprocessing/csv_generator.py", line 158, in __init__
    raise_from(ValueError('invalid CSV annotations file: {}: {}'.format(csv_data_file, e)), None)
  File "<string>", line 3, in raise_from
ValueError: invalid CSV annotations file: ../datasets/data/v3/train.csv: line 2: unknown class name: '9' (classes: OrderedDict([('0', 0), ('1', 1), ('2', 2), ('3', 3), ('4', 4), ('5', 5), ('6', 6), ('7', 7)]))


In [84]:
%pwd

'/home/aikauel/enap'

# Evaluate model

Evaluate model with test dataset (test.csv)

In [2]:
TEST_ANNOTATIONS = os.path.join(DATASET_VERSION_DIR, 'test_iter2.csv')

In [6]:
# path to trained weights
last_model = os.path.join(SNAPSHOTS_DIR, 'resnet50_csv_108.h5')
#output_images_path = os.path.join(TRAINING_BASE_DIR, 'test_output')

# evaluate model on test.csv
!python ../../keras-retinanet/keras_retinanet/bin/evaluate.py \
--config {CONFIG_FILE} \
--score-threshold 0.53 \
--iou-threshold 0.35 \
--no-resize \
--max-detections 300 \
csv {TEST_ANNOTATIONS} \
{CLASSES_FILE} \
{last_model}

Using TensorFlow backend.
Loading model, this may take a second...
2020-09-14 11:58:07.326737: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-09-14 11:58:07.355801: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-14 11:58:07.356104: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.755GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-09-14 11:58:07.359888: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-09-14 11:58:07.406472: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so

Running network: N/A% (0 of 37) |        | Elapsed Time: 0:00:00 ETA:  --:--:--2020-09-14 11:58:13.095156: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-09-14 11:58:14.932120: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
Running network: 100% (37 of 37) |#######| Elapsed Time: 0:00:35 Time:  0:00:35
Parsing annotations: 100% (37 of 37) |###| Elapsed Time: 0:00:00 Time:  0:00:00
label 0
label 1
label 2
label 3
label 4
label 5
label 6
label 7
317 instances of class 0 with average precision: 0.8168 TP: 294 FP: 220 precision: 0.5720 recall: 0.9274 f1_score: 0.7076 f2_score: 0.8249
41 instances of class 1 with average precision: 0.3103 TP: 16 FP: 20 precision: 0.4444 recall: 0.3902 f1_score: 0.4156 f2_score: 0.4000
7 instances of class 2 with average precision: 0.0357 TP: 1 FP: 6 precision: 0.1429 recall: 0.1429 f1_score: 0.1429 f2_score: 0.1429
10 

## Test model
#### Test model on images without annotations (unseen in train-val-test)

#### Convert training model to inference model

In [6]:
os.getcwd()

'/home/aikauel/enap/aerialnet_project/notebooks'

In [23]:
SNAPSHOTS_DIR

'../trainings/v3/snapshots_iter2'

In [10]:
# path to trained weights
last_model = os.path.join(SNAPSHOTS_DIR, 'resnet50_csv_108.h5')

# convert trained model to inference model to be used for generating predictions
inference_model = last_model.replace('snapshots/', 'inference/')
!python ../../keras-retinanet/keras_retinanet/bin/convert_model.py {last_model} {inference_model} \
--config {CONFIG_FILE}

Using TensorFlow backend.
2020-09-01 16:20:01.784098: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-09-01 16:20:01.820061: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-01 16:20:01.820541: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.755GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-09-01 16:20:01.820744: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-09-01 16:20:01.822082: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-09-01 16:20:01.823422: I tensorf

#### Set paths

In [11]:
test_images_path = '/home/aikauel/enap/data/vitto/'
test_output_path = os.path.join(TRAINING_BASE_DIR, 'vitto_output_epoch108_threshold53')

In [22]:
!cat {CLASSES_FILE}

0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7

In [21]:
!python ../../aerialnet_project/shared_utils/image_inference_write.py \
--input_dir {test_images_path} \
--model {inference_model} \
--output_dir {test_output_path} \
--labels {CLASSES_FILE} \
--threshold 0.35 \
--no_csv True

Using TensorFlow backend.
/home/aikauel/enap/data/vitto/
Output dir already exist
2020-09-01 16:36:28.201738: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-09-01 16:36:28.231149: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-01 16:36:28.231476: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.755GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-09-01 16:36:28.231641: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-09-01 16:36:28.232575: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic libra

no csv: /home/aikauel/enap/data/vitto/
[INFO] predicting image 1 of 1: a4.jpg
2020-09-01 16:36:31.757737: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-09-01 16:36:33.462599: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
processing time:  8.474244594573975
../trainings/v3/vitto_output_epoch108_threshold53/a4.jpg
[FINAL] Predictions completed!


# Create saved_model to load tfserving model

In [42]:
last_model = os.path.join(SNAPSHOTS_DIR, 'resnet50_csv_107.h5')

saved_model = os.path.dirname(last_model).replace('snapshots_iter2', 'saved_models')

In [43]:
print(last_model, saved_model)

../trainings/v3/snapshots_iter2/resnet50_csv_107.h5 ../trainings/v3/saved_models


In [44]:
!python ../shared_utils/convert_model.py {last_model} {saved_model} \
--config {CONFIG_FILE}

Using TensorFlow backend.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
2020-09-01 18:34:37.042366: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-09-01 18:34:37.074438: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-01 18:34:37.074740: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.755GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-09-01 18:34:37.074883: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-09-01 18:34:37.075773: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successf

[INFO] Model saved


## Visualization
#### Generate detections on test images - IMAGES

In [46]:
os.getcwd()

'/home/aikauel/enap'

In [13]:
# create output directory where you want to save images with bounding boxes
#!mkdir /content/data/output

# generate detections on images
!python ije_retinanet/image_inference_print.py \
-i 'data/test_images' \
-t 0.6 \
-m  {inference_model} \
-o data/output

python: can't open file 'ije_retinanet/image_inference_print.py': [Errno 2] No such file or directory
