# TED Trainer

In [1]:
from train import *
from generator.generator import *

In [2]:
import warnings
warnings.filterwarnings('ignore')
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) 

In [3]:
# z = np.load('/Volumes/SSD3/everyone-faze-recode/profile-recode-00005.npz', allow_pickle=True)

# Setup model configs

In [4]:
def get_mock_config():
    """Return a fresh configuration dictionary for this notebook's training run.

    The settings are grouped into thematic sections and merged, in order, into a
    single flat dict. A new dict is created on every call, so callers may mutate
    the result freely.

    Returns:
        dict: flat mapping from option name to value.
    """
    # Code sizes and network-shape related options.
    architecture = {
        'densenet_growthrate': 32,
        'z_dim_app': 64,
        'z_dim_gaze': 2,
        'z_dim_head': 16,
        'decoder_input_c': 32,
        'normalize_3d_codes': True,
        'normalize_3d_codes_axis': 1,
    }

    # Loss selection and loss weighting.
    losses = {
        'triplet_loss_type': 'angular',  # the alternative is 'euclidean'
        'triplet_loss_margin': 0.0,
        'triplet_regularize_d_within': True,
        'all_equal_embeddings': True,
        'embedding_consistency_loss_type': None,  # or 'angular' / 'euclidean'
        'embedding_consistency_loss_warmup_samples': 1000000,
        'backprop_gaze_to_encoder': True,
        'coeff_l1_recon_loss': 1.0,
        'coeff_gaze_loss': 0.1,
        'coeff_embedding_consistency_loss': 2.0,
    }

    # Data loading.
    # Note: non-zero 'num_data_loaders' values cause a parallel "broken pipe"
    # error in Jupyter notebooks on Windows.
    # Batch sizes 32 and 256 were tried as alternatives to 64.
    data_loading = {
        'num_data_loaders': 8,
        'pin_memory': False,
        'batch_size': 64,
    }

    # Optimisation and learning-rate schedule.
    optimisation = {
        'use_apex': False,
        'base_lr': 0.0005,
        'warmup_period_for_lr': 1000000,
        'decay_interval': 0,
        'decay': 0.8,
        'num_training_epochs': 20,
        'l2_reg': 1e-4,
    }

    # How often progress is printed.
    reporting = {
        'print_freq_train': 20,
        'print_freq_test': 5000,
    }

    # 'pick_exactly_per_person' / 'pick_at_least_per_person' are intentionally
    # left unset so that per-profile imbalance is allowed.

    # Data locations. A local alternative used previously was
    # '/Volumes/SSD3/faze-resources/' and '/Volumes/SSD3/everyone-faze-sample/'.
    paths = {
        'resource_path': '../../../../../panny/vcubuntu3/data-archive/faze-resources/',
        'npz_root_path': '../../../../../panny/vcubuntu3/data-archive/faze-recode-profile-npz/',
    }

    config = {}
    # Merge in the original key order so iteration order is unchanged.
    for section in (architecture, losses, data_loading, optimisation, reporting, paths):
        config.update(section)
    return config

# Build context

In [5]:
# Build the Context from the config dict, then retrieve it through Context.get()
# (presumably a shared/singleton instance — TODO confirm in the Context implementation).
config = get_mock_config()
Context.build(config)
ctx = Context.get()

# Sanity check that config entries are readable as attributes on the context;
# the recorded output is 64, matching 'z_dim_app' in get_mock_config().
print(ctx.z_dim_app)

64


# Build network

In [6]:
# Instantiate the TED network using the settings held on the context.
# NOTE(review): `use_triplet` is given the loss-type value ('angular' in this config), not a
# boolean — presumably TED treats any non-None value as "enabled"; confirm against TED.
network = TED(
    growth_rate=ctx.densenet_growthrate,
    z_dim_app=ctx.z_dim_app,
    z_dim_gaze=ctx.z_dim_gaze,
    z_dim_head=ctx.z_dim_head,
    decoder_input_c=ctx.decoder_input_c,
    normalize_3d_codes=ctx.normalize_3d_codes,
    normalize_3d_codes_axis=ctx.normalize_3d_codes_axis,
    use_triplet=ctx.triplet_loss_type,
    backprop_gaze_to_encoder=ctx.backprop_gaze_to_encoder,
)

In [7]:
# Register the network on the context under the key 'ted'; the same key is later passed to
# build_optimizer and run_train in this notebook.
ctx.set_network('ted', network)
# Presumably moves the registered network(s) to the compute device — device selection
# happens inside Context and is not visible here; confirm.
ctx.load_network_to_device()

# Setup Base Loss Policy

In [8]:
# Set up the base loss policy. With verbose=True the recorded run logs a max learning rate
# and the ramp-up coefficients (a, b).
ctx.setup_base_loss_policy(verbose=True)

2020-01-23 04:10:37,937 >>> base lose policy <<<
2020-01-23 04:10:37,938 max learning rate: 0.032000
2020-01-23 04:10:37,938 ramp up a: 0.000002, b: 0.000500


# Build Optimizer

In [9]:
# build_optimizer returns two optimizers for the 'ted' network; register the first as the
# main optimizer and the second as the gaze optimizer.
optimizer, gaze_optimizer = ctx.build_optimizer('ted')
ctx.set_main_optimizer(optimizer)
ctx.set_gaze_optimizer(gaze_optimizer)

2020-01-23 04:10:37,957 Set optimizer as key: optimizer
2020-01-23 04:10:37,958 Set optimizer as key: gaze_optimizer


# Setup Loss function

In [10]:
# Create the loss functions and register them on the context.
# The recorded run lists: gaze, recon_l1, triplet, all_equal.
loss_functions = ctx.make_loss_functions()
ctx.set_loss_functions(loss_functions)

2020-01-23 04:10:37,963 >>> Setup loss functions
2020-01-23 04:10:37,963 list: { gaze, recon_l1, triplet, all_equal } 


# Setup data-generator

In [11]:
# Generate the data bag and attach it to the context. Training-set shuffling is
# explicitly disabled here (shuffle_train=False).
# With verbose=True the recorded run logs size / people statistics for the
# gc/train, gc/val and gc/test splits.
gen = NPZDatasetGenerator(ctx, shuffle_train=False)
databag = gen.generate(verbose=True)
ctx.set_databag(databag)

2020-01-23 04:10:38,988 
>>> Data-Generator prepared <<<
2020-01-23 04:10:38,989 [gc/train] full set size:           1797228
2020-01-23 04:10:38,989 [gc/train] current set size:        1797228
2020-01-23 04:10:38,989 [gc/train] num people:                 1274
2020-01-23 04:10:38,990 [gc/train] mean entries per person:    1410
2020-01-23 04:10:38,990 
2020-01-23 04:10:38,990   [gc/val] full set size:             81280
2020-01-23 04:10:38,990   [gc/val] current set size:          81280
2020-01-23 04:10:38,991   [gc/val] num people:                   50
2020-01-23 04:10:38,991   [gc/val] mean entries per person:    1625
2020-01-23 04:10:38,991 
2020-01-23 04:10:38,991  [gc/test] full set size:            251472
2020-01-23 04:10:38,992  [gc/test] current set size:         251472
2020-01-23 04:10:38,992  [gc/test] num people:                  150
2020-01-23 04:10:38,992  [gc/test] mean entries per person:    1676
2020-01-23 04:10:38,992 
2020-01-23 04:10:38,993 >>> data-bag is setup
2020-0

In [12]:
# Fetch the training dataset and its loader from the context.
train_ds, loader = ctx.get_dataset_and_loader(Context.TAG_TRAIN)
# Optional inspection of the dataset (left disabled):
# print(train_ds.profiles)
# print(len(train_ds))

In [13]:
# """
#  While iterating over the DataLoader, the error
#  "expected Tensor as element 1 in argument 0, but got float"
#  is raised -> even with random (shuffle) set to False, it occurs on some runs and not on others
# """
# date_iter = iter(loader)
# for i in range(100):
#     x = next(date_iter)
#     print(i)
#     # print(x)

# Train

In [14]:
# Enable multi-GPU use when available (the recorded run logged "No multiple GPUs"),
# then start training the network registered under 'ted'.
# NOTE(review): the recorded run aborted shortly after step 30 with
#   AttributeError: 'float' object has no attribute 'numel'
# raised by torch's default_collate inside a DataLoader worker. At least one sample field
# reached collation as a plain Python float where tensors were expected. Presumably the
# dataset's __getitem__ must return a consistent type for every field — confirm in the
# dataset implementation before re-running.
ctx.use_multiple_gpu_if_available()
ctx.run_train('ted')

2020-01-23 04:10:39,022 No multiple GPUs
2020-01-23 04:10:39,150 *** Run Training ***  Steps: [561633]
2020-01-23 04:10:39,152 >>> current step: 0
2020-01-23 04:10:55,557 >>> current step: 10
2020-01-23 04:11:10,625 >>> current step: 20
2020-01-23 04:11:25,761 >>> current step: 30


AttributeError: Caught AttributeError in DataLoader worker process 5.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/collate.py", line 74, in default_collate
    return {key: default_collate([d[key] for d in batch]) for key in elem}
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/collate.py", line 74, in <dictcomp>
    return {key: default_collate([d[key] for d in batch]) for key in elem}
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/collate.py", line 52, in default_collate
    numel = sum([x.numel() for x in batch])
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/collate.py", line 52, in <listcomp>
    numel = sum([x.numel() for x in batch])
AttributeError: 'float' object has no attribute 'numel'


In [None]:
# execute_training_step(current_step)

# if current_step % args.print_freq_train == args.print_freq_train - 1:
#     conv1_wt_lr = optimizer.param_groups[0]['lr']
#     running_loss_means = running_losses.means()
#     logging.info('Losses at [%7d]: %s' %
#                  (current_step + 1,
#                   ', '.join(['%s: %.5f' % v
#                              for v in running_loss_means.items()])))
#     if args.use_tensorboard:
#         tensorboard.add_scalar('train_lr', conv1_wt_lr, current_step + 1)
#         for k, v in running_loss_means.items():
#             tensorboard.add_scalar('train/' + k, v, current_step + 1)
#     running_losses.reset()

# # Print some timing statistics
# if current_step % 100 == 99:
#     if args.use_tensorboard:
#         for k, v in running_timings.means().items():
#             tensorboard.add_scalar('timing/' + k, v, current_step + 1)
#     running_timings.reset()

# # print some memory statistics
# if current_step % 5000 == 0:
#     for i in range(torch.cuda.device_count()):
#         bytes = (torch.cuda.memory_allocated(device=i)
#                  + torch.cuda.memory_cached(device=i))
#         logging.info('GPU %d: probably allocated approximately %.2f GB' % (i, bytes / 1e9))
