In [2]:
#GPU count and name
!nvidia-smi -L

GPU 0: GeForce RTX 2080 Ti (UUID: GPU-616398ce-ae25-c52b-7c47-f658d30d9d86)


In [None]:
!pip install tensorflow-gpu==2.2.0

In [None]:
!pip install Keras==2.3.1

In [1]:
!python -c 'import keras; print(keras.__version__)'

Using TensorFlow backend.
2.3.1


## Set paths

In [1]:
import os
DATASET_BASE_DIR = '../datasets/data'
DATASET_VERSION = 'v3'
DATASET_VERSION_DIR = os.path.join(DATASET_BASE_DIR, DATASET_VERSION)
CLASSES_FILE = os.path.join(DATASET_VERSION_DIR, 'classes.csv')
TRAIN_ANNOTATIONS = os.path.join(DATASET_VERSION_DIR, 'train.csv')
VAL_ANNOTATIONS = os.path.join(DATASET_VERSION_DIR, 'val.csv')

TRAINING_BASE_DIR = os.path.join('../trainings', DATASET_VERSION)

## Check directories and files

In [3]:
base_dir = os.getcwd()
base_dir

'/home/aikauel/enap/aerialnet_project/notebooks'

In [4]:
%ls

BlobsExtraction.ipynb             RetinaNet_v3-Copy1.ipynb
Enap_Dataset_Formatting_v3.ipynb  RetinaNet_v3.ipynb


In [5]:
!wc -l {TRAIN_ANNOTATIONS}

13259 ../datasets/data/v3/train.csv


In [6]:
!wc -l {VAL_ANNOTATIONS}

556 ../datasets/data/v3/val.csv


In [7]:
!wc -l {CLASSES_FILE}

7 ../datasets/data/v3/classes.csv


## Set pretrained model

In [8]:
# utilize best weights from 4 classes model as baseline
PRETRAINED_MODEL = 'aerialnet_project/11_classes/snapshots/resnet50_csv_65.h5'
BATCH_SIZE = 8

# Compute best anchors for dataset annotations

In [19]:
!anchor-optimization {TRAIN_ANNOTATIONS} --no-resize --ratios=5 --threads 100

Using TensorFlow backend.
Process ForkPoolWorker-96:
^C
Process ForkPoolWorker-97:
Process ForkPoolWorker-89:
Process ForkPoolWorker-57:
Process ForkPoolWorker-1:
Process ForkPoolWorker-23:
Process ForkPoolWorker-19:
Process ForkPoolWorker-29:
Process ForkPoolWorker-44:
Process ForkPoolWorker-100:
Process ForkPoolWorker-98:
Process ForkPoolWorker-64:
Process ForkPoolWorker-99:
Process ForkPoolWorker-82:
Process ForkPoolWorker-95:
Process ForkPoolWorker-12:
Process ForkPoolWorker-18:
Process ForkPoolWorker-87:
Process ForkPoolWorker-17:
Process ForkPoolWorker-90:
Process ForkPoolWorker-55:
Process ForkPoolWorker-9:
Process ForkPoolWorker-11:
Process ForkPoolWorker-2:
Process ForkPoolWorker-3:
Process ForkPoolWorker-8:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/aikauel/anaconda3/envs/enap/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/aikauel/anaconda3/envs/enap/li

Save anchor optimization to config.ini file

# Set number of steps per epoch

In [1]:
import pandas as pd
import math
# compute number of annotations to compute number of steps per epoch
df = pd.read_csv(TRAIN_ANNOTATIONS, header=None, names=['img_path', 'x1', 'y1', 'x2', 'y2', 'class'])

countAnn = len(df)-1
countImg = df['img_path'].nunique()
no_steps = math.ceil(countImg/BATCH_SIZE)

print("Count of images: {}".format(countImg))
print("Count of annotations: {}".format(countAnn))
print("Number of steps per epoch: {}".format(no_steps))

NameError: name 'TRAIN_ANNOTATIONS' is not defined

# Train model

In [4]:
CONFIG_FILE = os.path.join(TRAINING_BASE_DIR, 'config.ini')
TENSORBOARD_LOGS_DIR = os.path.join(TRAINING_BASE_DIR, 'logs')
SNAPSHOTS_DIR = os.path.join(TRAINING_BASE_DIR, 'snapshots')

### FCNs training (regression, classification) with freezed backbone  

First, train the model freezing the backbone. The backbone weights will be the weights from the previous model train before.
Set gamma with high value (>2.0) to weight the loss of well-classified classes down, forcing the model to learn on harder (in this case: less examples) classes.

In [10]:
!python ../../keras-retinanet/keras_retinanet/bin/train.py \
--config {CONFIG_FILE} \
--freeze-backbone \
--random-transform \
--weights {PRETRAINED_MODEL} \
--weighted-average \
--batch-size {BATH_SIZE} \
--steps {no_steps} \
--epochs 50 \
--no-resize \
--image-min-side 1000 \
--image-max-side 1000 \
--compute-val-loss \
--multiprocessing \
--workers 2 \
--tensorboard-dir {TENSORBOARD_LOGS_DIR} \
--snapshot-path {SNAPSHOTS_DIR} \
--validation-freq 5 \
--gamma 3.0 \
csv {TRAIN_ANNOTATIONS} {CLASSES_FILE} \
--val-annotations {VAL_ANNOTATIONS}

.4778 - val_regression_loss: 1.3160 - val_classification_loss: 0.1056
Running network: 100% (108 of 108) |#####| Elapsed Time: 0:00:11 Time:  0:00:11
Parsing annotations: 100% (108 of 108) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
215 instances of class 0 with average precision: 0.8306 and recall: 0.8910
105 instances of class 1 with average precision: 0.7785 and recall: 0.8629
38 instances of class 2 with average precision: 0.6369 and recall: 0.8507
30 instances of class 3 with average precision: 0.5981 and recall: 0.8722
45 instances of class 4 with average precision: 0.4321 and recall: 0.8084
7 instances of class 5 with average precision: 0.8332 and recall: 0.9637
14 instances of class 6 with average precision: 0.8948 and recall: 0.9157
13 instances of class 7 with average precision: 0.1715 and recall: 0.4089
16 instances of class 8 with average precision: 0.0387 and recall: 0.5245
1 instances of class 9 with average precision: 0.0000 and recall: 0.0000
27 instan

#### Full model training

With a decent mAP result in previous step, we now unfreeze the backbone weights to train the full model.

In [15]:
BATCH_SIZE = 4

# compute number of annotations to compute number of steps per epoch
df = pd.read_csv(TRAIN_ANNOTATIONS, header=None, names=['img_path', 'x1', 'y1', 'x2', 'y2', 'class'])

countAnn = len(df)-1
countImg = df['img_path'].nunique()
no_steps = math.ceil(countImg/BATCH_SIZE)

print("Count of images: {}".format(countImg))
print("Count of annotations: {}".format(countAnn))
print("Number of steps per epoch: {}".format(no_steps))

Count of images: 3652
Count of annotations: 13258
Number of steps per epoch: 913


In [16]:
last_model = os.path.join(SNAPSHOTS_DIR, 'resnet50_csv_28.h5')

!python ../../keras-retinanet/keras_retinanet/bin/train.py \
--config {CONFIG_FILE} \
--random-transform \
--weights {last_model} \
--initial-epoch 28 \
--weighted-average \
--batch-size {BATCH_SIZE} \
--steps {no_steps} \
--epochs 100 \
--no-resize \
--image-min-side 1000 \
--image-max-side 1000 \
--compute-val-loss \
--multiprocessing \
--workers 2 \
--tensorboard-dir {TENSORBOARD_LOGS_DIR} \
--snapshot-path {SNAPSHOTS_DIR} \
--validation-freq 5 \
--lr 1e-5 \
--reduce-lr-patience 2 \
--reduce-lr-factor 0.1 \
--gamma 3.0 \
csv {TRAIN_ANNOTATIONS} {CLASSES_FILE} \
--val-annotations {VAL_ANNOTATIONS}

Running network: 100% (173 of 173) |#####| Elapsed Time: 0:00:18 Time:  0:00:18
Parsing annotations: 100% (173 of 173) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9491 and recall: 0.9177
11 instances of class 1 with average precision: 0.8523 and recall: 0.8956
5 instances of class 2 with average precision: 0.3770 and recall: 0.8891
6 instances of class 3 with average precision: 0.8502 and recall: 0.9771
67 instances of class 4 with average precision: 0.7331 and recall: 0.8333
10 instances of class 5 with average precision: 0.0343 and recall: 0.2749
242 instances of class 6 with average precision: 0.1895 and recall: 0.6782
2 instances of class 7 with average precision: 0.0147 and recall: 0.4431
6 instances of class 8 with average precision: 0.0054 and recall: 0.2288
134 instances of class 9 with average precision: 0.2577 and recall: 0.6212
mAP: 0.3887

Epoch 00029: saving model to ../trainings/v3/snapshots/resnet50_csv_

## Continue training full model

In [11]:
BATCH_SIZE = 4

# compute number of annotations to compute number of steps per epoch
df = pd.read_csv(TRAIN_ANNOTATIONS, header=None, names=['img_path', 'x1', 'y1', 'x2', 'y2', 'class'])

countAnn = len(df)-1
countImg = df['img_path'].nunique()
no_steps = math.ceil(countImg/BATCH_SIZE)

print("Count of images: {}".format(countImg))
print("Count of annotations: {}".format(countAnn))
print("Number of steps per epoch: {}".format(no_steps))

Count of images: 3652
Count of annotations: 13258
Number of steps per epoch: 913


In [14]:
'''last_model = os.path.join(SNAPSHOTS_DIR, 'resnet50_csv_33.h5')

!python ../../keras-retinanet/keras_retinanet/bin/train.py \
--config {CONFIG_FILE} \
--random-transform \
--weights {last_model} \
--initial-epoch 33 \
--weighted-average \
--batch-size {BATCH_SIZE} \
--steps {no_steps} \
--epochs 100 \
--no-resize \
--image-min-side 1000 \
--image-max-side 1000 \
--compute-val-loss \
--multiprocessing \
--workers 2 \
--tensorboard-dir {TENSORBOARD_LOGS_DIR} \
--snapshot-path {SNAPSHOTS_DIR} \
--validation-freq 5 \
--lr 1e-5 \
--reduce-lr-patience 5 \
--reduce-lr-factor 0.1 \
--gamma 2.5 \
csv {TRAIN_ANNOTATIONS} {CLASSES_FILE} \
--val-annotations {VAL_ANNOTATIONS}'''
# ABORTED

Using TensorFlow backend.
Creating model, this may take a second...
2020-08-27 18:13:34.186578: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-08-27 18:13:34.219122: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-08-27 18:13:34.219428: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.755GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-08-27 18:13:34.220206: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-08-27 18:13:34.223095: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.s

tracking <tf.Variable 'Variable:0' shape=(15, 4) dtype=float32, numpy=
array([[-133.888   ,  -33.472   ,  133.888   ,   33.472   ],
       [-211.968   ,  -52.992   ,  211.968   ,   52.992   ],
       [-339.456   ,  -84.864   ,  339.456   ,   84.864   ],
       [ -86.93293 ,  -51.551228,   86.93293 ,   51.551228],
       [-137.62996 ,  -81.61456 ,  137.62996 ,   81.61456 ],
       [-220.40738 , -130.70158 ,  220.40738 ,  130.70158 ],
       [ -66.944   ,  -66.944   ,   66.944   ,   66.944   ],
       [-105.984   , -105.984   ,  105.984   ,  105.984   ],
       [-169.728   , -169.728   ,  169.728   ,  169.728   ],
       [ -51.556435,  -86.92415 ,   51.556435,   86.92415 ],
       [ -81.62281 , -137.61606 ,   81.62281 ,  137.61606 ],
       [-130.71478 , -220.38512 ,  130.71478 ,  220.38512 ],
       [ -33.472   , -133.888   ,   33.472   ,  133.888   ],
       [ -52.992   , -211.968   ,   52.992   ,  211.968   ],
       [ -84.864   , -339.456   ,   84.864   ,  339.456   ]],
      dtype=f

Epoch 34/100
2020-08-27 18:13:50.407626: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-08-27 18:13:51.387392: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
  1/913 [..............................] - ETA: 3:25:41 - loss: 1.1222 - regression_loss: 0.8542 - classification_loss: 0.26802020-08-27 18:13:54.912787: I tensorflow/core/profiler/lib/profiler_session.cc:159] Profiler session started.
2020-08-27 18:13:54.912827: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1408] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error CUPTI could not be loaded or symbol could not be found.
2020-08-27 18:13:54.912836: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1447] function cupti_interface_->ActivityRegisterCallbacks( AllocCuptiActivityBuffer, FreeCuptiActivityBuffer)failed with error

# New version: (2nd iteration)
### Deleting classes 7 (Juegos) and 8 (Zanja)

In [5]:
import pandas as pd
import math, os
TRAIN_ANNOTATIONS = os.path.join(DATASET_VERSION_DIR, 'train_iter2.csv')
VAL_ANNOTATIONS = os.path.join(DATASET_VERSION_DIR, 'val_iter2.csv')

TENSORBOARD_LOGS_DIR = os.path.join(TRAINING_BASE_DIR, 'logs_iter2')

SNAPSHOTS_DIR = os.path.join(TRAINING_BASE_DIR, 'snapshots')
PRETRAINED_MODEL = os.path.join(SNAPSHOTS_DIR, 'resnet50_csv_33.h5')
# now change snapshots directory
SNAPSHOTS_DIR = os.path.join(TRAINING_BASE_DIR, 'snapshots_iter2')

BATCH_SIZE = 8

df = pd.read_csv(TRAIN_ANNOTATIONS, header=None, names=['img_path', 'x1', 'y1', 'x2', 'y2', 'class'])

countAnn = len(df)-1
countImg = df['img_path'].nunique()
no_steps = math.ceil(countImg/BATCH_SIZE)

print("Count of images: {}".format(countImg))
print("Count of annotations: {}".format(countAnn))
print("Number of steps per epoch: {}".format(no_steps))

Count of images: 3618
Count of annotations: 13112
Number of steps per epoch: 453


In [24]:
!python ../../keras-retinanet/keras_retinanet/bin/train.py \
--config {CONFIG_FILE} \
--freeze-backbone \
--random-transform \
--weights {PRETRAINED_MODEL} \
--weighted-average \
--batch-size {BATCH_SIZE} \
--steps {no_steps} \
--epochs 50 \
--no-resize \
--image-min-side 1000 \
--image-max-side 1000 \
--compute-val-loss \
--multiprocessing \
--workers 2 \
--tensorboard-dir {TENSORBOARD_LOGS_DIR} \
--snapshot-path {SNAPSHOTS_DIR} \
--validation-freq 5 \
--gamma 3.0 \
csv {TRAIN_ANNOTATIONS} {CLASSES_FILE} \
--val-annotations {VAL_ANNOTATIONS}

Using TensorFlow backend.
Creating model, this may take a second...
2020-08-27 18:54:50.993611: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-08-27 18:54:51.022216: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-08-27 18:54:51.022527: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.755GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-08-27 18:54:51.022686: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-08-27 18:54:51.023641: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.s

tracking <tf.Variable 'Variable:0' shape=(15, 4) dtype=float32, numpy=
array([[-33.472   ,  -8.368   ,  33.472   ,   8.368   ],
       [-52.992   , -13.248   ,  52.992   ,  13.248   ],
       [-84.864   , -21.216   ,  84.864   ,  21.216   ],
       [-21.733232, -12.887807,  21.733232,  12.887807],
       [-34.40749 , -20.40364 ,  34.40749 ,  20.40364 ],
       [-55.101845, -32.675396,  55.101845,  32.675396],
       [-16.736   , -16.736   ,  16.736   ,  16.736   ],
       [-26.496   , -26.496   ,  26.496   ,  26.496   ],
       [-42.432   , -42.432   ,  42.432   ,  42.432   ],
       [-12.889109, -21.731037,  12.889109,  21.731037],
       [-20.405703, -34.404015,  20.405703,  34.404015],
       [-32.678696, -55.09628 ,  32.678696,  55.09628 ],
       [ -8.368   , -33.472   ,   8.368   ,  33.472   ],
       [-13.248   , -52.992   ,  13.248   ,  52.992   ],
       [-21.216   , -84.864   ,  21.216   ,  84.864   ]], dtype=float32)> anchors
tracking <tf.Variable 'Variable:0' shape=(15, 4) 

Epoch 1/50
2020-08-27 18:55:00.903692: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-08-27 18:55:02.400414: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
  1/453 [..............................] - ETA: 1:20:30 - loss: 2.7288 - regression_loss: 1.6366 - classification_loss: 1.09212020-08-27 18:55:05.962989: I tensorflow/core/profiler/lib/profiler_session.cc:159] Profiler session started.
2020-08-27 18:55:05.963022: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1408] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error CUPTI could not be loaded or symbol could not be found.
2020-08-27 18:55:05.963032: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1447] function cupti_interface_->ActivityRegisterCallbacks( AllocCuptiActivityBuffer, FreeCuptiActivityBuffer)failed with error C


Epoch 00013: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-07.
Epoch 14/50
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9369 and recall: 0.8980
11 instances of class 1 with average precision: 0.9091 and recall: 0.8986
5 instances of class 2 with average precision: 0.5366 and recall: 0.9194
6 instances of class 3 with average precision: 0.8571 and recall: 0.9845
67 instances of class 4 with average precision: 0.7539 and recall: 0.8750
10 instances of class 5 with average precision: 0.1353 and recall: 0.3601
242 instances of class 6 with average precision: 0.2403 and recall: 0.7075
134 instances of class 7 with average precision: 0.3323 and recall: 0.6701
mAP: 0.4403

Epoch 00014: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_14.h5
Epoch 15/50
Running network: 100% (171 of

returning metrics...
74 instances of class 0 with average precision: 0.9423 and recall: 0.9104
11 instances of class 1 with average precision: 0.9091 and recall: 0.8992
5 instances of class 2 with average precision: 0.5512 and recall: 0.9347
6 instances of class 3 with average precision: 0.8571 and recall: 0.9822
67 instances of class 4 with average precision: 0.7539 and recall: 0.8699
10 instances of class 5 with average precision: 0.1351 and recall: 0.3543
242 instances of class 6 with average precision: 0.2423 and recall: 0.7160
134 instances of class 7 with average precision: 0.3357 and recall: 0.6742
mAP: 0.4428

Epoch 00021: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_21.h5
Epoch 22/50
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:16 Time:  0:00:16
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9423 and recall: 0.9103
11 instances of class 1 wit

## Full model training (2nd iteration)

In [9]:
BATCH_SIZE = 4

# compute number of annotations to compute number of steps per epoch
df = pd.read_csv(TRAIN_ANNOTATIONS, header=None, names=['img_path', 'x1', 'y1', 'x2', 'y2', 'class'])

countAnn = len(df)-1
countImg = df['img_path'].nunique()
no_steps = math.ceil(countImg/BATCH_SIZE)

print("Count of images: {}".format(countImg))
print("Count of annotations: {}".format(countAnn))
print("Number of steps per epoch: {}".format(no_steps))

Count of images: 3618
Count of annotations: 13112
Number of steps per epoch: 905


In [27]:
last_model = os.path.join(SNAPSHOTS_DIR, 'resnet50_csv_17.h5')

!python ../../keras-retinanet/keras_retinanet/bin/train.py \
--config {CONFIG_FILE} \
--random-transform \
--weights {last_model} \
--initial-epoch 17 \
--weighted-average \
--batch-size {BATCH_SIZE} \
--steps {no_steps} \
--epochs 100 \
--no-resize \
--image-min-side 1000 \
--image-max-side 1000 \
--compute-val-loss \
--multiprocessing \
--workers 2 \
--tensorboard-dir {TENSORBOARD_LOGS_DIR} \
--snapshot-path {SNAPSHOTS_DIR} \
--validation-freq 5 \
--lr 1e-5 \
--reduce-lr-patience 5 \
--reduce-lr-factor 0.1 \
--gamma 2.5 \
csv {TRAIN_ANNOTATIONS} {CLASSES_FILE} \
--val-annotations {VAL_ANNOTATIONS}

Using TensorFlow backend.
Creating model, this may take a second...
2020-08-27 21:12:23.470421: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-08-27 21:12:23.499508: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-08-27 21:12:23.499820: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.755GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-08-27 21:12:23.499976: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-08-27 21:12:23.500889: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.s

tracking <tf.Variable 'Variable:0' shape=(15, 4) dtype=float32, numpy=
array([[-133.888   ,  -33.472   ,  133.888   ,   33.472   ],
       [-211.968   ,  -52.992   ,  211.968   ,   52.992   ],
       [-339.456   ,  -84.864   ,  339.456   ,   84.864   ],
       [ -86.93293 ,  -51.551228,   86.93293 ,   51.551228],
       [-137.62996 ,  -81.61456 ,  137.62996 ,   81.61456 ],
       [-220.40738 , -130.70158 ,  220.40738 ,  130.70158 ],
       [ -66.944   ,  -66.944   ,   66.944   ,   66.944   ],
       [-105.984   , -105.984   ,  105.984   ,  105.984   ],
       [-169.728   , -169.728   ,  169.728   ,  169.728   ],
       [ -51.556435,  -86.92415 ,   51.556435,   86.92415 ],
       [ -81.62281 , -137.61606 ,   81.62281 ,  137.61606 ],
       [-130.71478 , -220.38512 ,  130.71478 ,  220.38512 ],
       [ -33.472   , -133.888   ,   33.472   ,  133.888   ],
       [ -52.992   , -211.968   ,   52.992   ,  211.968   ],
       [ -84.864   , -339.456   ,   84.864   ,  339.456   ]],
      dtype=f

2020-08-27 21:12:25.001634: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1363] Profiler found 1 GPUs
2020-08-27 21:12:25.001796: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcupti.so.10.1'; dlerror: libcupti.so.10.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/x86_64-linux-gnu
2020-08-27 21:12:25.001807: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1408] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error CUPTI could not be loaded or symbol could not be found.
2020-08-27 21:12:25.001812: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1447] function cupti_interface_->ActivityRegisterCallbacks( AllocCuptiActivityBuffer, FreeCuptiActivityBuffer)failed with error CUPTI could not be loaded or symbol could not be found.
2020-08-27 21:12:25.001822: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1430] func

Epoch 28/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9498 and recall: 0.9215
11 instances of class 1 with average precision: 0.8788 and recall: 0.8965
5 instances of class 2 with average precision: 0.3893 and recall: 0.9224
6 instances of class 3 with average precision: 0.7341 and recall: 0.9594
67 instances of class 4 with average precision: 0.7159 and recall: 0.8618
10 instances of class 5 with average precision: 0.1199 and recall: 0.3654
242 instances of class 6 with average precision: 0.1948 and recall: 0.6926
134 instances of class 7 with average precision: 0.3108 and recall: 0.6754
mAP: 0.4085

Epoch 00028: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_28.h5
Epoch 29/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

Epoch 36/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9619 and recall: 0.9229
11 instances of class 1 with average precision: 0.8558 and recall: 0.8971
5 instances of class 2 with average precision: 0.5444 and recall: 0.9551
6 instances of class 3 with average precision: 0.7222 and recall: 0.9714
67 instances of class 4 with average precision: 0.7217 and recall: 0.8627
10 instances of class 5 with average precision: 0.2078 and recall: 0.4574
242 instances of class 6 with average precision: 0.2770 and recall: 0.7319
134 instances of class 7 with average precision: 0.3507 and recall: 0.6964
mAP: 0.4592

Epoch 00036: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_36.h5
Epoch 37/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

Epoch 44/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9440 and recall: 0.9243
11 instances of class 1 with average precision: 0.9091 and recall: 0.9013
5 instances of class 2 with average precision: 0.4964 and recall: 0.9377
6 instances of class 3 with average precision: 0.7370 and recall: 0.9475
67 instances of class 4 with average precision: 0.7496 and recall: 0.8768
10 instances of class 5 with average precision: 0.0357 and recall: 0.4319
242 instances of class 6 with average precision: 0.2682 and recall: 0.7326
134 instances of class 7 with average precision: 0.3409 and recall: 0.7019
mAP: 0.4516

Epoch 00044: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_44.h5
Epoch 45/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

### Resume training

In [29]:
last_model = os.path.join(SNAPSHOTS_DIR, 'resnet50_csv_27.h5')

!python ../../keras-retinanet/keras_retinanet/bin/train.py \
--config {CONFIG_FILE} \
--random-transform \
--weights {last_model} \
--initial-epoch 27 \
--weighted-average \
--batch-size {BATCH_SIZE} \
--steps {no_steps} \
--epochs 100 \
--no-resize \
--image-min-side 1000 \
--image-max-side 1000 \
--compute-val-loss \
--multiprocessing \
--workers 2 \
--tensorboard-dir {TENSORBOARD_LOGS_DIR} \
--snapshot-path {SNAPSHOTS_DIR} \
--validation-freq 5 \
--lr 0.5e-5 \
--reduce-lr-patience 5 \
--reduce-lr-factor 0.1 \
--gamma 3.0 \
csv {TRAIN_ANNOTATIONS} {CLASSES_FILE} \
--val-annotations {VAL_ANNOTATIONS}

Using TensorFlow backend.
Creating model, this may take a second...
2020-08-28 01:18:45.031624: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-08-28 01:18:45.062730: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-08-28 01:18:45.063041: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.755GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-08-28 01:18:45.063813: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-08-28 01:18:45.066774: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.s

tracking <tf.Variable 'Variable:0' shape=(15, 4) dtype=float32, numpy=
array([[ -66.944   ,  -16.736   ,   66.944   ,   16.736   ],
       [-105.984   ,  -26.496   ,  105.984   ,   26.496   ],
       [-169.728   ,  -42.432   ,  169.728   ,   42.432   ],
       [ -43.466465,  -25.775614,   43.466465,   25.775614],
       [ -68.81498 ,  -40.80728 ,   68.81498 ,   40.80728 ],
       [-110.20369 ,  -65.35079 ,  110.20369 ,   65.35079 ],
       [ -33.472   ,  -33.472   ,   33.472   ,   33.472   ],
       [ -52.992   ,  -52.992   ,   52.992   ,   52.992   ],
       [ -84.864   ,  -84.864   ,   84.864   ,   84.864   ],
       [ -25.778217,  -43.462074,   25.778217,   43.462074],
       [ -40.811405,  -68.80803 ,   40.811405,   68.80803 ],
       [ -65.35739 , -110.19256 ,   65.35739 ,  110.19256 ],
       [ -16.736   ,  -66.944   ,   16.736   ,   66.944   ],
       [ -26.496   , -105.984   ,   26.496   ,  105.984   ],
       [ -42.432   , -169.728   ,   42.432   ,  169.728   ]],
      dtype=f

Epoch 28/100
2020-08-28 01:19:00.791264: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-08-28 01:19:01.786699: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-08-28 01:19:03.481172: W tensorflow/core/common_runtime/bfc_allocator.cc:245] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.83GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2020-08-28 01:19:03.481277: W tensorflow/core/common_runtime/bfc_allocator.cc:245] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.83GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2020-08-28 01:19:03.880397: W tensorflow/core/common_runtime/bfc_allocator.cc:245] Allocator 

Epoch 32/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9565 and recall: 0.9209
11 instances of class 1 with average precision: 0.9008 and recall: 0.8988
5 instances of class 2 with average precision: 0.4653 and recall: 0.9225
6 instances of class 3 with average precision: 0.7502 and recall: 0.9674
67 instances of class 4 with average precision: 0.7495 and recall: 0.8853
10 instances of class 5 with average precision: 0.0599 and recall: 0.4740
242 instances of class 6 with average precision: 0.2387 and recall: 0.7187
134 instances of class 7 with average precision: 0.3435 and recall: 0.7341
mAP: 0.4410

Epoch 00032: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_32.h5
Epoch 33/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

Epoch 40/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9369 and recall: 0.8919
11 instances of class 1 with average precision: 0.9008 and recall: 0.8993
5 instances of class 2 with average precision: 0.4518 and recall: 0.9017
6 instances of class 3 with average precision: 0.7323 and recall: 0.9650
67 instances of class 4 with average precision: 0.7676 and recall: 0.8516
10 instances of class 5 with average precision: 0.1190 and recall: 0.4783
242 instances of class 6 with average precision: 0.2613 and recall: 0.7332
134 instances of class 7 with average precision: 0.3510 and recall: 0.7007
mAP: 0.4532

Epoch 00040: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_40.h5
Epoch 41/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

Epoch 48/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9605 and recall: 0.9360
11 instances of class 1 with average precision: 0.8831 and recall: 0.8988
5 instances of class 2 with average precision: 0.5688 and recall: 0.9325
6 instances of class 3 with average precision: 0.7211 and recall: 0.9641
67 instances of class 4 with average precision: 0.7486 and recall: 0.8705
10 instances of class 5 with average precision: 0.1855 and recall: 0.5219
242 instances of class 6 with average precision: 0.2760 and recall: 0.7292
134 instances of class 7 with average precision: 0.3674 and recall: 0.7222
mAP: 0.4663

Epoch 00048: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_48.h5
Epoch 49/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

Epoch 56/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9577 and recall: 0.9233
11 instances of class 1 with average precision: 0.9091 and recall: 0.9003
5 instances of class 2 with average precision: 0.3687 and recall: 0.9385
6 instances of class 3 with average precision: 0.7354 and recall: 0.9634
67 instances of class 4 with average precision: 0.7427 and recall: 0.8452
10 instances of class 5 with average precision: 0.0286 and recall: 0.3920
242 instances of class 6 with average precision: 0.2641 and recall: 0.7192
134 instances of class 7 with average precision: 0.3668 and recall: 0.7074
mAP: 0.4558

Epoch 00056: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_56.h5
Epoch 57/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

Epoch 64/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9556 and recall: 0.9226
11 instances of class 1 with average precision: 0.9091 and recall: 0.8985
5 instances of class 2 with average precision: 0.4271 and recall: 0.9401
6 instances of class 3 with average precision: 0.6886 and recall: 0.8033
67 instances of class 4 with average precision: 0.7618 and recall: 0.8801
10 instances of class 5 with average precision: 0.1158 and recall: 0.4654
242 instances of class 6 with average precision: 0.2577 and recall: 0.7241
134 instances of class 7 with average precision: 0.3578 and recall: 0.7244
mAP: 0.4544

Epoch 00064: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_64.h5
Epoch 65/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

Epoch 72/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9593 and recall: 0.9303
11 instances of class 1 with average precision: 0.9091 and recall: 0.9005
5 instances of class 2 with average precision: 0.4015 and recall: 0.9188
6 instances of class 3 with average precision: 0.7193 and recall: 0.9562
67 instances of class 4 with average precision: 0.7417 and recall: 0.8483
10 instances of class 5 with average precision: 0.0238 and recall: 0.4283
242 instances of class 6 with average precision: 0.2717 and recall: 0.7295
134 instances of class 7 with average precision: 0.3635 and recall: 0.7326
mAP: 0.4585

Epoch 00072: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_72.h5
Epoch 73/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

Epoch 80/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9488 and recall: 0.9169
11 instances of class 1 with average precision: 0.8926 and recall: 0.8980
5 instances of class 2 with average precision: 0.4776 and recall: 0.9485
6 instances of class 3 with average precision: 0.6975 and recall: 0.8128
67 instances of class 4 with average precision: 0.7364 and recall: 0.8442
10 instances of class 5 with average precision: 0.0558 and recall: 0.3870
242 instances of class 6 with average precision: 0.2943 and recall: 0.7487
134 instances of class 7 with average precision: 0.3698 and recall: 0.7109
mAP: 0.4686

Epoch 00080: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_80.h5
Epoch 81/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

Epoch 88/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9531 and recall: 0.9304
11 instances of class 1 with average precision: 0.8926 and recall: 0.8997
5 instances of class 2 with average precision: 0.4578 and recall: 0.9467
6 instances of class 3 with average precision: 0.6798 and recall: 0.9551
67 instances of class 4 with average precision: 0.7650 and recall: 0.8649
10 instances of class 5 with average precision: 0.0216 and recall: 0.2604
242 instances of class 6 with average precision: 0.2890 and recall: 0.7384
134 instances of class 7 with average precision: 0.3639 and recall: 0.7034
mAP: 0.4679

Epoch 00088: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_88.h5
Epoch 89/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

Epoch 96/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9353 and recall: 0.9108
11 instances of class 1 with average precision: 0.8843 and recall: 0.8992
5 instances of class 2 with average precision: 0.5176 and recall: 0.9568
6 instances of class 3 with average precision: 0.7221 and recall: 0.9488
67 instances of class 4 with average precision: 0.7442 and recall: 0.8694
10 instances of class 5 with average precision: 0.0159 and recall: 0.4156
242 instances of class 6 with average precision: 0.2891 and recall: 0.7502
134 instances of class 7 with average precision: 0.3519 and recall: 0.7117
mAP: 0.4608

Epoch 00096: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_96.h5
Epoch 97/100
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (1

## Increasing gamma up to 4.0 and resume training

In [59]:
last_model = os.path.join(SNAPSHOTS_DIR, 'resnet50_csv_98.h5')

!python ../../keras-retinanet/keras_retinanet/bin/train.py \
--config {CONFIG_FILE} \
--random-transform \
--weights {last_model} \
--initial-epoch 98 \
--weighted-average \
--batch-size {BATCH_SIZE} \
--steps {no_steps} \
--epochs 200 \
--no-resize \
--image-min-side 1000 \
--image-max-side 1000 \
--compute-val-loss \
--multiprocessing \
--workers 2 \
--tensorboard-dir {TENSORBOARD_LOGS_DIR} \
--snapshot-path {SNAPSHOTS_DIR} \
--validation-freq 5 \
--lr 0.5e-5 \
--reduce-lr-patience 5 \
--reduce-lr-factor 0.1 \
--gamma 4.0 \
csv {TRAIN_ANNOTATIONS} {CLASSES_FILE} \
--val-annotations {VAL_ANNOTATIONS}

Using TensorFlow backend.
Creating model, this may take a second...
2020-08-28 13:19:47.475940: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-08-28 13:19:47.506705: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-08-28 13:19:47.507060: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.755GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-08-28 13:19:47.507202: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-08-28 13:19:47.508112: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.s

tracking <tf.Variable 'Variable:0' shape=(15, 4) dtype=float32, numpy=
array([[ -66.944   ,  -16.736   ,   66.944   ,   16.736   ],
       [-105.984   ,  -26.496   ,  105.984   ,   26.496   ],
       [-169.728   ,  -42.432   ,  169.728   ,   42.432   ],
       [ -43.466465,  -25.775614,   43.466465,   25.775614],
       [ -68.81498 ,  -40.80728 ,   68.81498 ,   40.80728 ],
       [-110.20369 ,  -65.35079 ,  110.20369 ,   65.35079 ],
       [ -33.472   ,  -33.472   ,   33.472   ,   33.472   ],
       [ -52.992   ,  -52.992   ,   52.992   ,   52.992   ],
       [ -84.864   ,  -84.864   ,   84.864   ,   84.864   ],
       [ -25.778217,  -43.462074,   25.778217,   43.462074],
       [ -40.811405,  -68.80803 ,   40.811405,   68.80803 ],
       [ -65.35739 , -110.19256 ,   65.35739 ,  110.19256 ],
       [ -16.736   ,  -66.944   ,   16.736   ,   66.944   ],
       [ -26.496   , -105.984   ,   26.496   ,  105.984   ],
       [ -42.432   , -169.728   ,   42.432   ,  169.728   ]],
      dtype=f

Epoch 99/200
2020-08-28 13:20:03.536808: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-08-28 13:20:04.446047: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-08-28 13:20:06.365587: W tensorflow/core/common_runtime/bfc_allocator.cc:245] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.88GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2020-08-28 13:20:06.365755: W tensorflow/core/common_runtime/bfc_allocator.cc:245] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.88GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2020-08-28 13:20:06.523186: W tensorflow/core/common_runtime/bfc_allocator.cc:245] Allocator 

Epoch 103/200
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9495 and recall: 0.9280
11 instances of class 1 with average precision: 0.8636 and recall: 0.8958
5 instances of class 2 with average precision: 0.3491 and recall: 0.8753
6 instances of class 3 with average precision: 0.7206 and recall: 0.9397
67 instances of class 4 with average precision: 0.7300 and recall: 0.8628
10 instances of class 5 with average precision: 0.0250 and recall: 0.4348
242 instances of class 6 with average precision: 0.2717 and recall: 0.7267
134 instances of class 7 with average precision: 0.3436 and recall: 0.7159
mAP: 0.4495

Epoch 00103: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_103.h5
Epoch 104/200
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100%

Epoch 111/200
Running network: 100% (171 of 171) |#####| Elapsed Time: 0:00:15 Time:  0:00:15
Parsing annotations: 100% (171 of 171) |#| Elapsed Time: 0:00:00 Time:  0:00:00
returning metrics...
74 instances of class 0 with average precision: 0.9470 and recall: 0.9387
11 instances of class 1 with average precision: 0.7920 and recall: 0.8961
5 instances of class 2 with average precision: 0.4123 and recall: 0.9354
6 instances of class 3 with average precision: 0.7284 and recall: 0.9519
67 instances of class 4 with average precision: 0.7294 and recall: 0.8501
10 instances of class 5 with average precision: 0.0373 and recall: 0.3491
242 instances of class 6 with average precision: 0.2866 and recall: 0.7570
134 instances of class 7 with average precision: 0.3675 and recall: 0.7510
mAP: 0.4610

Epoch 00111: saving model to ../trainings/v3/snapshots_iter2/resnet50_csv_111.h5
Epoch 112/200
Process ForkPoolWorker-35:
Process ForkPoolWorker-34:
Process ForkPoolWorker-33:
Traceback (most recent c

## Increasing gamma up to 5.0 and resume training

In [76]:
last_model = os.path.join(SNAPSHOTS_DIR, 'resnet50_csv_108.h5')

!python ../../keras-retinanet/keras_retinanet/bin/train.py \
--config {CONFIG_FILE} \
--random-transform \
--weights {last_model} \
--initial-epoch 108 \
--weighted-average \
--batch-size {BATCH_SIZE} \
--steps {no_steps} \
--epochs 200 \
--no-resize \
--image-min-side 1000 \
--image-max-side 1000 \
--compute-val-loss \
--multiprocessing \
--workers 2 \
--tensorboard-dir {TENSORBOARD_LOGS_DIR} \
--snapshot-path {SNAPSHOTS_DIR} \
--validation-freq 5 \
--lr 1e-4 \
--reduce-lr-patience 5 \
--reduce-lr-factor 0.1 \
--gamma 5.0 \
csv {TRAIN_ANNOTATIONS} {CLASSES_FILE} \
--val-annotations {VAL_ANNOTATIONS}

Using TensorFlow backend.
Creating model, this may take a second...
2020-08-28 19:29:38.706037: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-08-28 19:29:38.735227: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-08-28 19:29:38.735565: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.755GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-08-28 19:29:38.735731: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-08-28 19:29:38.736664: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.s

tracking <tf.Variable 'Variable:0' shape=(15, 4) dtype=float32, numpy=
array([[-133.888   ,  -33.472   ,  133.888   ,   33.472   ],
       [-211.968   ,  -52.992   ,  211.968   ,   52.992   ],
       [-339.456   ,  -84.864   ,  339.456   ,   84.864   ],
       [ -86.93293 ,  -51.551228,   86.93293 ,   51.551228],
       [-137.62996 ,  -81.61456 ,  137.62996 ,   81.61456 ],
       [-220.40738 , -130.70158 ,  220.40738 ,  130.70158 ],
       [ -66.944   ,  -66.944   ,   66.944   ,   66.944   ],
       [-105.984   , -105.984   ,  105.984   ,  105.984   ],
       [-169.728   , -169.728   ,  169.728   ,  169.728   ],
       [ -51.556435,  -86.92415 ,   51.556435,   86.92415 ],
       [ -81.62281 , -137.61606 ,   81.62281 ,  137.61606 ],
       [-130.71478 , -220.38512 ,  130.71478 ,  220.38512 ],
       [ -33.472   , -133.888   ,   33.472   ,  133.888   ],
       [ -52.992   , -211.968   ,   52.992   ,  211.968   ],
       [ -84.864   , -339.456   ,   84.864   ,  339.456   ]],
      dtype=f

Epoch 109/200
2020-08-28 19:29:54.620315: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-08-28 19:29:55.527177: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-08-28 19:29:57.149450: W tensorflow/core/common_runtime/bfc_allocator.cc:245] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.83GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2020-08-28 19:29:57.162597: W tensorflow/core/common_runtime/bfc_allocator.cc:245] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.83GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2020-08-28 19:29:57.228350: W tensorflow/core/common_runtime/bfc_allocator.cc:245] Allocator

# DEBUG

If the model is not learning or has low accuracy, check the dataset and the anchors of the model to determine if the problem lies in a mismatch between the dataset and the anchors.

In [25]:
!python ../../keras-retinanet/keras_retinanet/bin/debug.py \
--config {CONFIG_FILE} \
--show-annotations \
csv {TRAIN_ANNOTATIONS} {CLASSES_FILE}

Using TensorFlow backend.
Traceback (most recent call last):
  File "../../keras-retinanet/keras_retinanet/bin/debug.py", line 327, in <module>
    main()
  File "../../keras-retinanet/keras_retinanet/bin/debug.py", line 305, in main
    generator = create_generator(args)
  File "../../keras-retinanet/keras_retinanet/bin/debug.py", line 112, in create_generator
    config=args.config
  File "../../keras-retinanet/keras_retinanet/bin/../../keras_retinanet/preprocessing/csv_generator.py", line 158, in __init__
    raise_from(ValueError('invalid CSV annotations file: {}: {}'.format(csv_data_file, e)), None)
  File "<string>", line 3, in raise_from
ValueError: invalid CSV annotations file: ../datasets/data/v3/train.csv: line 2: unknown class name: '9' (classes: OrderedDict([('0', 0), ('1', 1), ('2', 2), ('3', 3), ('4', 4), ('5', 5), ('6', 6), ('7', 7)]))


In [84]:
%pwd

'/home/aikauel/enap'

# Evaluate model

Evaluate model with test dataset (test.csv)

In [2]:
TEST_ANNOTATIONS = os.path.join(DATASET_VERSION_DIR, 'test_iter2.csv')

In [6]:
# path to trained weights
last_model = os.path.join(SNAPSHOTS_DIR, 'resnet50_csv_108.h5')
#output_images_path = os.path.join(TRAINING_BASE_DIR, 'test_output')

# evaluate model on test.csv
!python ../../keras-retinanet/keras_retinanet/bin/evaluate.py \
--config {CONFIG_FILE} \
--score-threshold 0.53 \
--iou-threshold 0.35 \
--no-resize \
--max-detections 300 \
csv {TEST_ANNOTATIONS} \
{CLASSES_FILE} \
{last_model}

Using TensorFlow backend.
Loading model, this may take a second...
2020-09-14 11:58:07.326737: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-09-14 11:58:07.355801: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-14 11:58:07.356104: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.755GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-09-14 11:58:07.359888: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-09-14 11:58:07.406472: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so

Running network: N/A% (0 of 37) |        | Elapsed Time: 0:00:00 ETA:  --:--:--2020-09-14 11:58:13.095156: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-09-14 11:58:14.932120: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
Running network: 100% (37 of 37) |#######| Elapsed Time: 0:00:35 Time:  0:00:35
Parsing annotations: 100% (37 of 37) |###| Elapsed Time: 0:00:00 Time:  0:00:00
label 0
label 1
label 2
label 3
label 4
label 5
label 6
label 7
317 instances of class 0 with average precision: 0.8168 TP: 294 FP: 220 precision: 0.5720 recall: 0.9274 f1_score: 0.7076 f2_score: 0.8249
41 instances of class 1 with average precision: 0.3103 TP: 16 FP: 20 precision: 0.4444 recall: 0.3902 f1_score: 0.4156 f2_score: 0.4000
7 instances of class 2 with average precision: 0.0357 TP: 1 FP: 6 precision: 0.1429 recall: 0.1429 f1_score: 0.1429 f2_score: 0.1429
10 

## Test model
#### Test model on images without annotations (unseen in train-val-test)

#### Convert training model to inference model

In [6]:
os.getcwd()

'/home/aikauel/enap/aerialnet_project/notebooks'

In [23]:
SNAPSHOTS_DIR

'../trainings/v3/snapshots_iter2'

In [10]:
# path to trained weights
last_model = os.path.join(SNAPSHOTS_DIR, 'resnet50_csv_108.h5')

# convert trained model to inference model to be used for generating predictions
inference_model = last_model.replace('snapshots/', 'inference/')
!python ../../keras-retinanet/keras_retinanet/bin/convert_model.py {last_model} {inference_model} \
--config {CONFIG_FILE}

Using TensorFlow backend.
2020-09-01 16:20:01.784098: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-09-01 16:20:01.820061: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-01 16:20:01.820541: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.755GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-09-01 16:20:01.820744: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-09-01 16:20:01.822082: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-09-01 16:20:01.823422: I tensorf

#### Set paths

In [11]:
test_images_path = '/home/aikauel/enap/data/vitto/'
test_output_path = os.path.join(TRAINING_BASE_DIR, 'vitto_output_epoch108_threshold53')

In [22]:
!cat {CLASSES_FILE}

0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7

In [21]:
!python ../../aerialnet_project/shared_utils/image_inference_write.py \
--input_dir {test_images_path} \
--model {inference_model} \
--output_dir {test_output_path} \
--labels {CLASSES_FILE} \
--threshold 0.35 \
--no_csv True

Using TensorFlow backend.
/home/aikauel/enap/data/vitto/
Output dir already exist
2020-09-01 16:36:28.201738: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-09-01 16:36:28.231149: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-01 16:36:28.231476: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.755GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-09-01 16:36:28.231641: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-09-01 16:36:28.232575: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic libra

no csv: /home/aikauel/enap/data/vitto/
[INFO] predicting image 1 of 1: a4.jpg
2020-09-01 16:36:31.757737: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-09-01 16:36:33.462599: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
processing time:  8.474244594573975
../trainings/v3/vitto_output_epoch108_threshold53/a4.jpg
[FINAL] Predictions completed!


# Create saved_model to load tfserving model

In [42]:
last_model = os.path.join(SNAPSHOTS_DIR, 'resnet50_csv_107.h5')

saved_model = os.path.dirname(last_model).replace('snapshots_iter2', 'saved_models')

In [43]:
print(last_model, saved_model)

../trainings/v3/snapshots_iter2/resnet50_csv_107.h5 ../trainings/v3/saved_models


In [44]:
!python ../shared_utils/convert_model.py {last_model} {saved_model} \
--config {CONFIG_FILE}

Using TensorFlow backend.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
2020-09-01 18:34:37.042366: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-09-01 18:34:37.074438: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-01 18:34:37.074740: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.755GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2020-09-01 18:34:37.074883: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-09-01 18:34:37.075773: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successf

[INFO] Model saved


## Visualization
#### Generate detections on test images - IMAGES

In [46]:
os.getcwd()

'/home/aikauel/enap'

In [13]:
# create output directory where you want to save images with bounding boxes
#!mkdir /content/data/output

# generate detections on images
!python ije_retinanet/image_inference_print.py \
-i 'data/test_images' \
-t 0.6 \
-m  {inference_model} \
-o data/output

python: can't open file 'ije_retinanet/image_inference_print.py': [Errno 2] No such file or directory
