diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index c96d769af..0170e6c7e 100644
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -1829,6 +1829,24 @@ def backward(self, dy):
 def leakyrelu(x, a=0.01):
     return LeakyRelu(a)(x)[0]
 
+class Sqrt(Operation):
+    """Element-wise square root, y = sqrt(x), with gradient 0.5 * x^(-0.5)."""
+    def __init__(self):
+        super(Sqrt, self).__init__()
+
+    def forward(self, x):
+        if training:
+            self.input = x  # kept because backward() reads it
+        return singa.Sqrt(x)
+
+    def backward(self, dy):
+        # dx = dy * 0.5 * x^(-0.5); x is the input stored by forward()
+        dx = singa.MultFloat(singa.PowFloat(self.input, -0.5), 0.5)
+        return singa.__mul__(dy, dx)
+
+
+def sqrt(x):
+    return Sqrt()(x)[0]  # the op call is indexed with [0] to hand back its first output
 
 class Sub(Operation):
     def __init__(self):
@@ -1845,4 +1863,3 @@ def backward(self, dy):
 
 def sub(a, b):
     return Sub()(a,b)[0]
-   
diff --git a/test/python/test_operation.py b/test/python/test_operation.py
index a9b8f7a15..5ba8d0eb6 100755
--- a/test/python/test_operation.py
+++ b/test/python/test_operation.py
@@ -824,6 +824,44 @@ def test_Sub_gpu(self):
         np.testing.assert_array_almost_equal(tensor.to_numpy(tensor.from_raw_tensor(dx0)), DX0, decimal=5)
         np.testing.assert_array_almost_equal(tensor.to_numpy(tensor.from_raw_tensor(dx1)), DX1, decimal=5)
 
+    def test_Sqrt_cpu(self):
+        # Sqrt forward/backward on the CPU device, checked against NumPy.
+        X = np.array([0.1, 1.0, 0.4, 4.0, 0.9, 9.0]).reshape(3, 2).astype(np.float32)
+        DY = np.ones((3, 2), dtype=np.float32)
 
+        x = tensor.from_numpy(X)
+        dy = tensor.from_numpy(DY)
+        x.to_device(cpu_dev)
+        dy.to_device(cpu_dev)
+
+        result = autograd.sqrt(x)
+        dx_raw = result.creator.backward(dy.data)
+
+        # NumPy references: y = sqrt(x) and dx = dy * 0.5 * x^(-0.5)
+        expected_y = np.sqrt(X)
+        expected_dx = np.multiply(0.5 * np.power(X, -0.5), DY)
+        np.testing.assert_array_almost_equal(tensor.to_numpy(result), expected_y, decimal=5)
+        np.testing.assert_array_almost_equal(tensor.to_numpy(tensor.from_raw_tensor(dx_raw)), expected_dx, decimal=5)
+
+    def test_Sqrt_gpu(self):
+        # Sqrt forward/backward on the GPU device, checked against NumPy.
+        X = np.array([0.1, 1.0, 0.4, 4.0, 0.9, 9.0]).reshape(3, 2).astype(np.float32)
+        DY = np.ones((3, 2), dtype=np.float32)
+        # NumPy references: y = sqrt(x) and dx = dy * 0.5 * x^(-0.5)
+        expected_y = np.sqrt(X)
+        expected_dx = np.multiply(0.5 * np.power(X, -0.5), DY)
+
+        x = tensor.from_numpy(X)
+        dy = tensor.from_numpy(DY)
+        x.to_device(gpu_dev)
+        dy.to_device(gpu_dev)
+
+        result = autograd.sqrt(x)
+        dx_raw = result.creator.backward(dy.data)
+
+        np.testing.assert_array_almost_equal(tensor.to_numpy(result), expected_y, decimal=5)
+        np.testing.assert_array_almost_equal(tensor.to_numpy(tensor.from_raw_tensor(dx_raw)), expected_dx, decimal=5)
+    
+    
 if __name__ == '__main__':
     unittest.main()