You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I am training my model on ml.p2.xlarge and I am using the conda_amazonei_tensorflow2_p36 notebook . I updated the tensorflow to 2.3.0 and keras to 2.4.3 and installed keras-unet package(through which I am applying unet) but while trying to train, I am getting an error :
WARNING:tensorflow:From <ipython-input-16-a832c74cfe86>:7: Model.fit_generator (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.
Instructions for updating:
Please use Model.fit, which supports generators.
Epoch 1/3
---------------------------------------------------------------------------
UnknownError Traceback (most recent call last)
<ipython-input-16-a832c74cfe86> in <module>
5
6 validation_data=(x_val, y_val),
----> 7 callbacks=[callback_checkpoint]
8 )
~/anaconda3/envs/amazonei_tensorflow2_p36/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py in new_func(*args, **kwargs)
322 'in a future version' if date is None else ('after %s' % date),
323 instructions)
--> 324 return func(*args, **kwargs)
325 return tf_decorator.make_decorator(
326 func, new_func, 'deprecated',
~/anaconda3/envs/amazonei_tensorflow2_p36/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1477 use_multiprocessing=use_multiprocessing,
1478 shuffle=shuffle,
-> 1479 initial_epoch=initial_epoch)
1480
1481 @deprecation.deprecated(
~/anaconda3/envs/amazonei_tensorflow2_p36/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
64 def _method_wrapper(self, *args, **kwargs):
65 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
---> 66 return method(self, *args, **kwargs)
67
68 # Running inside `run_distribute_coordinator` already.
~/anaconda3/envs/amazonei_tensorflow2_p36/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
846 batch_size=batch_size):
847 callbacks.on_train_batch_begin(step)
--> 848 tmp_logs = train_function(iterator)
849 # Catch OutOfRangeError for Datasets of unknown size.
850 # This blocks until the batch has finished executing.
~/anaconda3/envs/amazonei_tensorflow2_p36/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
578 xla_context.Exit()
579 else:
--> 580 result = self._call(*args, **kwds)
581
582 if tracing_count == self._get_tracing_count():
~/anaconda3/envs/amazonei_tensorflow2_p36/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
642 # Lifting succeeded, so variables are initialized and we can run the
643 # stateless function.
--> 644 return self._stateless_fn(*args, **kwds)
645 else:
646 canon_args, canon_kwds = \
~/anaconda3/envs/amazonei_tensorflow2_p36/lib/python3.6/site-packages/tensorflow/python/eager/function.py in __call__(self, *args, **kwargs)
2418 with self._lock:
2419 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 2420 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
2421
2422 @property
~/anaconda3/envs/amazonei_tensorflow2_p36/lib/python3.6/site-packages/tensorflow/python/eager/function.py in _filtered_call(self, args, kwargs)
1663 if isinstance(t, (ops.Tensor,
1664 resource_variable_ops.BaseResourceVariable))),
-> 1665 self.captured_inputs)
1666
1667 def _call_flat(self, args, captured_inputs, cancellation_manager=None):
~/anaconda3/envs/amazonei_tensorflow2_p36/lib/python3.6/site-packages/tensorflow/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1744 # No tape is watching; skip to running the function.
1745 return self._build_call_outputs(self._inference_function.call(
-> 1746 ctx, args, cancellation_manager=cancellation_manager))
1747 forward_backward = self._select_forward_and_backward_functions(
1748 args,
~/anaconda3/envs/amazonei_tensorflow2_p36/lib/python3.6/site-packages/tensorflow/python/eager/function.py in call(self, ctx, args, cancellation_manager)
596 inputs=args,
597 attrs=attrs,
--> 598 ctx=ctx)
599 else:
600 outputs = execute.execute_with_cancellation(
~/anaconda3/envs/amazonei_tensorflow2_p36/lib/python3.6/site-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
58 ctx.ensure_initialized()
59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
UnknownError: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
[[node model/conv2d/Conv2D (defined at <ipython-input-16-a832c74cfe86>:7) ]] [Op:__inference_train_function_9610]
Function call stack:
train_function
Not able to find any solution. Please guide.
The text was updated successfully, but these errors were encountered:
I see that you have an issue opened here as as well - aws/sagemaker-python-sdk#1922
I am going to close this one if you don't mind. Let's track this in the other issue since it doesn't seem to have anything to do with pytorch. Feel free to reopen if you think I miss understand something here.
I am training my model on ml.p2.xlarge and I am using the conda_amazonei_tensorflow2_p36 notebook . I updated the tensorflow to 2.3.0 and keras to 2.4.3 and installed keras-unet package(through which I am applying unet) but while trying to train, I am getting an error :
Not able to find any solution. Please guide.
The text was updated successfully, but these errors were encountered: