#Neural network model by Pytorh  
1. Numpy code: Create basic NN model step by step from init model to compute gradient and train model. This code use Pytorch to enable using GPU

2. Pytorch code: Creat basic NN model using pytorch API instead. Code faster, shorter


In [1]:
import torch

device = torch.device("cuda:0")
dtype=torch.float

##1. Numpy code (but using pytorch instead)

Check data type

In [None]:
a = torch.randint(0, 10, [3, 4], device=device, dtype=torch.float)
print(a)
# print(a.shape)
# w = torch.tensor([10, 10, 10], device=device, dtype=torch.float)
w = torch.randint(0, 10, [3, 4], device=device, dtype=torch.float)
print("W before", w)
# print("W shape", w.shape)
# w = w.t()
# print("W after", w)
# a = a.matmul(w)
# print("a after mm", a)
# print(a.shape)
print(a*w)

tensor([[2., 4., 5., 4.],
        [7., 1., 1., 7.],
        [6., 5., 5., 4.]], device='cuda:0')
W before tensor([[4., 3., 9., 4.],
        [6., 3., 2., 5.],
        [4., 6., 2., 2.]], device='cuda:0')
tensor([[ 8., 12., 45., 16.],
        [42.,  3.,  2., 35.],
        [24., 30., 10.,  8.]], device='cuda:0')


In [None]:
x = torch.tensor([1,2,3,4], device=device)
y = torch.tensor([3,4,5], device=device)
x2, y2 = torch.meshgrid(x, x)
print(x2 * torch.eye(4, device=device))

tensor([[1., 0., 0., 0.],
        [0., 2., 0., 0.],
        [0., 0., 3., 0.],
        [0., 0., 0., 4.]], device='cuda:0')


In [2]:
def init_model(input_shape=1000, layer_config=[1000, 500, 10], device="cuda:0", dtype=torch.float):
  # Init parameter of model
  # Return params = [[w1, b1], [w2, b2], ...]
  params = []
  i_shape = input_shape
  for num_n in layer_config:
    w = torch.randn([i_shape, num_n], device=device, dtype=dtype)
    b = torch.randn([1, num_n], device=device, dtype=dtype)
    # w = torch.randint(-10, 10, [i_shape, num_n], device=device, dtype=dtype)
    # b = torch.randint(-10, 10, [1, num_n], device=device, dtype=dtype)
    i_shape = num_n

    params.append([w, b])
  return params

In [5]:
def run_model(x=None, model_params=None, acti_fs=['relu', 'softmax']):
  # forward phase
  # return output of each layer and each step in layer
  # output = [[input], [h1, h1_ac], [h2, h2_ac], ...]
  out_l = x
  assert len(model_params) == len(acti_fs)
  outputs = [[x]]
  for (w, b), ac in zip(model_params, acti_fs):
    out_l_list = []
    out_l = out_l.matmul(w) + b
    out_l_list.append(out_l)
    if ac is 'relu':
      out_l = out_l.clamp(min=0)
    elif ac is 'softmax':
      out_l = torch.nn.Softmax()(out_l)
    out_l_list.append(out_l)
    outputs.append(out_l_list)
  return outputs


In [None]:
def loss_fn(y_pred, y_gt):
  # compute loss and loss gradient
  loss = (y_pred - y_gt).pow(2).sum().item()
  loss_grad = 2.0 * (y_pred - y_gt)
  return loss, loss_grad

In [None]:
def backward(model_params=None, acti_fs=['relu', 'softmax'], output_layers=[], lr=0.001, loss_gra=[0.0], device='cuda:0', dtype=torch.float):
  ## compute gradient of each layer and update weight, bias
  debug = False
  grads = []
  grad_previous_layer = loss_gra
  if debug:
    print("Step 0")
    print("\tgrad_previous_layer", grad_previous_layer.shape)
  # compute gradient from the last layer to first layer
  for idx in range(len(model_params))[::-1]:
    w, b = model_params[idx] # w, b of layer
    ac =  acti_fs[idx]        # activation func of layer
    out_l = output_layers[idx + 1]  # output of layer (because output[0] is input), out_l = [h, h_ac]
    out_pre = output_layers[idx]    # output of previous layer (because output[0] is input)

    # out_l[0] = w * out_pre[-1] + b
    # out_l[1] = ac(out_l[0])
    if debug:
      print("Step", idx)
    if ac == 'softmax':
      grad_softmax = out_l[-1] * (1.0 - out_l[-1] + grad_previous_layer / 2.0) # 1x2
      grad_softmax = grad_softmax.t().matmul(torch.ones([1, grad_softmax.shape[-1]], device=device, dtype=dtype))
      eye_matrix = torch.eye(grad_softmax.shape[-1], device=device, dtype=dtype)
      grad_softmax = grad_softmax * eye_matrix  # = 0 for i#j
      grad_previous_layer = grad_previous_layer.matmul(grad_softmax)

      if debug:
        print("\tgrad_softmax", grad_softmax.shape)
        print("\tgrad_previous_layer", grad_previous_layer.shape)
    elif ac == 'relu':
      grad_previous_layer[out_l[0] < 0] = 0
      if debug:
        print("\tgrad_relu/grad_previous_layer", grad_previous_layer.shape)
    if debug:
      print("\tout_pre", out_pre[-1].shape)

    grad_w = out_pre[-1].t().mm(grad_previous_layer)
    grad_b = grad_previous_layer.clone()
    grads.append([grad_w, grad_b])

    if debug:
      print("\tgrad_w", grad_w.shape)
      print("\tgrad_b", grad_b.shape)
      print("\tw", w.shape)

    grad_previous_layer = grad_previous_layer.mm(w.t())
    if debug:
      print("\tgrad_previous_layer", grad_previous_layer.shape)

  grads = grads[::-1]
  # update weight and bias
  if debug:
    print(len(model_params), len(grads))
  updated_params = []
  for (w, b), (grad_w, grad_b) in zip(model_params, grads):
    w -= lr * grad_w
    b -= lr * grad_b
    # print("\tgrad_w", grad_w)
    updated_params.append([w, b])
  return updated_params

In [None]:
x = torch.randint(0, 10, [1, 3], device=device, dtype=dtype)
y = torch.randint(0, 10, [1, 2], device=device, dtype=dtype)
y[:, 0] = 0
y[:, 1] = 1

In [None]:
print(x)
print(y)

tensor([[8., 9., 5.]], device='cuda:0')
tensor([[0., 1.]], device='cuda:0')


Init and train model

In [None]:
model_params = init_model(3, layer_config=[5, 2])
print("y:", y, "\n")
for i in range(2000):
  outputs = run_model(x=x, model_params=model_params, acti_fs=['relu', 'softmax'])
  if i == 0:
    print(outputs[-1][-1])
  loss, loss_grad = loss_fn(outputs[-1][-1], y)
  if i%100 == 0:
    print(loss)
  model_params = backward(model_params, acti_fs=['relu', 'softmax'], output_layers=outputs, loss_gra=loss_grad, lr=0.07)

print("\n", outputs[-1][-1])

y: tensor([[0., 1.]], device='cuda:0') 

tensor([[0.0082, 0.9918]], device='cuda:0')
0.00013357952411752194
9.512425458524376e-05
7.38535454729572e-05


  if sys.path[0] == '':


6.0351369029376656e-05
5.102079740026966e-05
4.418753815116361e-05
3.896611451637e-05
3.4848366340156645e-05
3.151717100990936e-05
2.876606595236808e-05
2.6456289560883306e-05
2.4490018404321745e-05
2.2795253244112246e-05
2.1319996449165046e-05
2.0023737306473777e-05
1.8875427485909313e-05
1.7852962628239766e-05
1.6934965969994664e-05
1.6106290786410682e-05
1.535378396511078e-05

 tensor([[0.0027, 0.9973]], device='cuda:0')


##2. Pytorch code

In [8]:
# add requires_grad=True when init w and b
def init_model_grad(input_shape=1000, layer_config=[1000, 500, 10], device="cuda:0", dtype=torch.float):
  # Init parameter of model
  # Return params = [[w1, b1], [w2, b2], ...]
  params = []
  i_shape = input_shape
  for num_n in layer_config:
    w = torch.randn([i_shape, num_n], device=device, dtype=dtype, requires_grad=True)
    b = torch.randn([1, num_n], device=device, dtype=dtype, requires_grad=True)
    # w = torch.randint(-10, 10, [i_shape, num_n], device=device, dtype=dtype)
    # b = torch.randint(-10, 10, [1, num_n], device=device, dtype=dtype)
    i_shape = num_n

    params.append([w, b])
  return params

In [6]:
# Init data
x = torch.randint(0, 10, [1, 3], device=device, dtype=dtype)
y = torch.randint(0, 10, [1, 2], device=device, dtype=dtype)
y[:, 0] = 0
y[:, 1] = 1

In [12]:
# Init model
model_params = init_model_grad(3, layer_config=[5, 2])

lr = 0.07

# Train model
for i in range(2000):
  output = run_model(x, model_params=model_params, acti_fs=['relu', 'softmax'])
  y_pred = output[-1][-1]
  loss = (y_pred - y).pow(2).sum()
  if i%100 == 0:
    print("loss", i, ":\t", loss.item())
  
  # enable autograd()
  loss.backward()
  # model_params_update = model_params.clone()
  # update w, b
  with torch.no_grad():
    for w, b in model_params:
      w -= lr * w.grad
      b -= lr * b.grad

      w.grad.zero_()
      b.grad.zero_()
print(y_pred)

  from ipykernel import kernelapp as app


loss 0 :	 0.8558268547058105
loss 100 :	 0.0013833274133503437
loss 200 :	 0.0005539511912502348
loss 300 :	 0.0003328070160932839
loss 400 :	 0.0002338044869247824
loss 500 :	 0.00017850073345471174
loss 600 :	 0.00014349933189805597
loss 700 :	 0.00011948654719162732
loss 800 :	 0.00010205530270468444
loss 900 :	 8.886550494935364e-05
loss 1000 :	 7.855683361412957e-05
loss 1100 :	 7.02936522429809e-05
loss 1200 :	 6.353146454785019e-05
loss 1300 :	 5.790196155430749e-05
loss 1400 :	 5.3145318815950304e-05
loss 1500 :	 4.907741822535172e-05
loss 1600 :	 4.556164640234783e-05
loss 1700 :	 4.249424091540277e-05
loss 1800 :	 3.979599568992853e-05
loss 1900 :	 3.74043156625703e-05
tensor([[0.0042, 0.9958]], device='cuda:0', grad_fn=<SoftmaxBackward>)
