diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py
index af9224a11292..98c2861fb149 100644
--- a/python/mxnet/metric.py
+++ b/python/mxnet/metric.py
@@ -54,26 +54,6 @@ def update(self, label, pred):
         self.sum_metric += numpy.sum(pred_label == label)
         self.num_inst += label.size
 
-# pylint: disable=pointless-string-statement
-"""
-class LogLoss(EvalMetric):
-    # remove because it because it is too slow
-    def __init__(self):
-        self.eps = 1e-15
-        super(LogLoss, self).__init__('logloss')
-
-    def update(self, label, pred):
-        # pylint: disable=invalid-name
-        pred = pred.asnumpy()
-        label = label.asnumpy().astype('int32')
-        for i in range(label.size):
-            p = pred[i][label[i]]
-            assert(numpy.isnan(p) == False)
-            p = max(min(p, 1 - self.eps), self.eps)
-            self.sum_metric += -numpy.log(p)
-        self.num_inst += label.size
-"""
-# pylint: enable=pointless-string-statement
 
 class CustomMetric(EvalMetric):
     """Custom evaluation metric that takes a NDArray function.
diff --git a/python/mxnet/model.py b/python/mxnet/model.py
index b0b4f46ccb65..959ac75a16d9 100644
--- a/python/mxnet/model.py
+++ b/python/mxnet/model.py
@@ -122,7 +122,7 @@ def _train_multi_device(symbol, ctx, input_shape,
                         begin_round, end_round, optimizer,
                         train_data, eval_data=None, eval_metric=None,
                         iter_end_callback=None, epoch_end_callback=None,
-                        update_on_kvstore=False,
+                        update_on_kvstore=None,
                         logger=None):
     """Internal training function on multiple devices.
 
@@ -183,8 +183,9 @@ def _train_multi_device(symbol, ctx, input_shape,
     -----
     - This function will inplace update the NDArrays in arg_parans and aux_states.
     - Turning update_on_kvstore on and off can affect speed of multi-gpu training.
-      - update_on_kvstore=True works well for inception type nets that contains many small weights.
-      - update_on_kvstore=False works better for Alexnet style net with bulk weights.
+      - It is auto-selected by default (update_on_kvstore=None).
+      - update_on_kvstore=True works well for Inception-style nets that contain many small weights.
+      - update_on_kvstore=False works better for AlexNet-style nets with bulky weights.
     """
     if logger is None:
         logger = logging
@@ -210,10 +211,17 @@ def _train_multi_device(symbol, ctx, input_shape,
     for texec in train_execs:
         texec.copy_params_from(arg_params, aux_params)
 
+    # key-value store
     kv = kvstore.create() if num_device != 1 else None
 
     if kv is None:
         update_on_kvstore = False
+    else:
+        # auto-decide update_on_kvstore from the largest parameter size
+        if update_on_kvstore is None:
+            max_size = max(np.prod(param.shape) for param in arg_params.values())
+            update_on_kvstore = max_size < 1024 * 1024 * 16
+            logger.info('Auto-select update_on_kvstore=%s', str(update_on_kvstore))
 
     opt_state_blocks = []
     # If there are multiple devices, initialize the weights.
@@ -586,7 +594,7 @@ def predict(self, X):
 
     def fit(self, X, y=None, eval_data=None, eval_metric='acc',
             iter_end_callback=None, epoch_end_callback=None,
-            update_on_kvstore=False, logger=None):
+            update_on_kvstore=None, logger=None):
         """Fit the model.
 
         Parameters
@@ -618,6 +626,7 @@ def fit(self, X, y=None, eval_data=None, eval_metric='acc',
         update_on_kvstore: boolean, optional
             Whether to perform parameter update on kvstore instead of training device.
+            By default, the trainer will automatically decide the policy.
         logger : logging logger, optional
             When not specified, default logger will be used.
@@ -711,7 +720,7 @@ def load(prefix, iteration, ctx=None):
 def create(symbol, X, y=None, ctx=None,
            num_round=None, optimizer='sgd', initializer=Xavier(),
            eval_data=None, eval_metric='acc', iter_end_callback=None,
-           update_on_kvstore=False, logger=None, **kwargs):
+           update_on_kvstore=None, logger=None, **kwargs):
     """Functional style to create a model.
 
     This function will be more consistent with functional
@@ -755,6 +764,7 @@ def create(symbol, X, y=None, ctx=None,
         update_on_kvstore: boolean, optional
             Whether to perform parameter update on kvstore instead of training device.
+            By default, the trainer will automatically decide the policy.
         logger : logging logger, optional
     """
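The 16M-element threshold in `_train_multi_device` is the entire auto-selection policy, so it is worth seeing in isolation. Below is a minimal sketch of that heuristic; `pick_update_on_kvstore` and the `_Param` stand-in are hypothetical names introduced for illustration (the patch inlines this logic rather than exposing a helper):

```python
import numpy as np

class _Param(object):
    """Hypothetical stand-in for an NDArray; the heuristic only reads .shape."""
    def __init__(self, shape):
        self.shape = shape

def pick_update_on_kvstore(arg_params, threshold=16 * 1024 * 1024):
    """Mirror the patch's rule: update on the kvstore only when the
    largest parameter has fewer than `threshold` (16M) elements."""
    max_size = max(np.prod(param.shape) for param in arg_params.values())
    return bool(max_size < threshold)

# Inception-style: many small convolution kernels -> kvstore update.
inception_like = {'conv1_weight': _Param((64, 3, 7, 7)),
                  'conv2_weight': _Param((192, 64, 3, 3))}
# AlexNet-style: one bulky fully-connected weight (4096*9216 ~ 37.7M elements)
# -> device-side update.
alexnet_like = {'fc6_weight': _Param((4096, 9216))}

print(pick_update_on_kvstore(inception_like))  # True
print(pick_update_on_kvstore(alexnet_like))    # False
```

The AlexNet-style fc6 weight trips the threshold, matching the docstring's guidance that bulky-weight nets are better served by device-side updates.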
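From the caller's side, the visible change is that `update_on_kvstore` can simply be left unset. The sketch below is an illustrative assumption, not part of the patch: the toy data, the small MLP, and the hyper-parameters are made up, and it is written against the Symbol/FeedForward API of this era (e.g. `mx.symbol.Softmax`):

```python
import numpy as np
import mxnet as mx

# Toy data and a small MLP; shapes and hyper-parameters are illustrative only.
X = np.random.uniform(size=(1000, 100)).astype('float32')
y = np.random.randint(0, 10, size=(1000,)).astype('float32')

data = mx.symbol.Variable('data')
fc1 = mx.symbol.FullyConnected(data=data, name='fc1', num_hidden=64)
act1 = mx.symbol.Activation(data=fc1, name='relu1', act_type='relu')
fc2 = mx.symbol.FullyConnected(data=act1, name='fc2', num_hidden=10)
net = mx.symbol.Softmax(data=fc2, name='sm')

# Leaving update_on_kvstore unset (None) now lets the trainer auto-select
# the policy on multi-device runs; True/False still force it explicitly.
model = mx.model.FeedForward.create(
    net, X=X, y=y,
    ctx=[mx.cpu(0), mx.cpu(1)],
    num_round=2, learning_rate=0.1)
```

Passing `update_on_kvstore=True` or `False` overrides the heuristic, and single-device runs always fall back to device-side updates since no kvstore is created.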