In [3]:
# Code in file tensor/two_layer_net_tensor.py
import torch

# Tensors are created on this device; switch to the commented line for a GPU.
device = torch.device('cpu')
# device = torch.device('cuda')

# Problem sizes: batch size (N), input width (D_in), hidden width (H) and
# output width (D_out).
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Weights of the two layers, drawn from a standard normal distribution.
w1 = torch.randn(D_in, H, device=device)
w2 = torch.randn(H, D_out, device=device)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU (clamp at zero) -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss; it is a zero-dimensional tensor, so .item() turns
    # it into a plain Python number for printing.
    residual = y_pred - y
    loss = residual.pow(2).sum()
    print(t, loss.item())

    # Hand-written backward pass, walking the forward computation in reverse.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # The clamp blocks the gradient wherever the pre-activation was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, applied in place.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

0 27962212.0
1 23586116.0
2 23735140.0
3 24899520.0
4 24513336.0
5 21187266.0
6 15682645.0
7 10080956.0
8 5937800.5
9 3413120.5
10 2034441.75
11 1303015.625
12 907236.75
13 679881.9375
14 538488.125
15 442904.59375
16 373437.03125
17 320017.90625
18 277190.46875
19 241949.890625
20 212505.390625
21 187569.5625
22 166281.71875
23 147917.65625
24 131982.109375
25 118098.328125
26 105971.8125
27 95321.96875
28 85936.1796875
29 77633.53125
30 70275.9453125
31 63734.34765625
32 57897.25
33 52678.9765625
34 48004.33203125
35 43809.78515625
36 40040.765625
37 36643.1328125
38 33580.39453125
39 30814.248046875
40 28309.123046875
41 26037.59375
42 23973.2109375
43 22095.013671875
44 20384.6875
45 18824.931640625
46 17399.720703125
47 16096.6875
48 14904.55859375
49 13814.2373046875
50 12814.60546875
51 11896.2158203125
52 11051.9697265625
53 10274.880859375
54 9559.849609375
55 8901.4775390625
56 8293.5517578125
57 7731.81640625
58 7212.94970703125
59 6732.5703125
60 6287.94091796875
61 5875.95

412 0.0008820065995678306
413 0.0008544131997041404
414 0.0008288935059681535
415 0.0008029300370253623
416 0.0007806764333508909
417 0.0007570317829959095
418 0.0007349321385845542
419 0.0007151938043534756
420 0.0006940520834177732
421 0.0006749729509465396
422 0.0006568673416040838
423 0.0006375535158440471
424 0.0006210028077475727
425 0.0006041654269210994
426 0.0005881584947928786
427 0.0005706666270270944
428 0.0005563328159041703
429 0.0005411757156252861
430 0.0005270182155072689
431 0.0005123081500642002
432 0.0004987611318938434
433 0.0004853340797126293
434 0.00047261829604394734
435 0.00045958286500535905
436 0.0004490898281801492
437 0.0004371441900730133
438 0.0004262023139744997
439 0.00041502073872834444
440 0.0004036753380205482
441 0.0003943161864299327
442 0.0003854167880490422
443 0.000375336006982252
444 0.00036619286402128637
445 0.00035729273804463446
446 0.0003486908390186727
447 0.00033977185375988483
448 0.00033126160269603133
449 0.00032401777571067214
450 0

In [13]:
# Code in file autograd/two_layer_net_autograd.py
import torch

# Tensors are created on this device; switch to the commented line for a GPU.
device = torch.device('cpu')
# device = torch.device('cuda')

# Problem sizes: batch size (N), input width (D_in), hidden width (H) and
# output width (D_out).
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# The weights are flagged with requires_grad=True so that the backward pass
# fills in their .grad attributes.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> clamp at zero -> linear. The gradients come from
    # loss.backward() below, so no hand-written backward pass is needed here.
    hidden = torch.mm(x, w1).clamp(min=0)
    y_pred = torch.mm(hidden, w2)

    # Sum-of-squares loss; a zero-dimensional tensor whose Python value is
    # obtained with .item().
    residual = y_pred - y
    loss = residual.pow(2).sum()
    print(t, loss.item())

    # Populate w1.grad and w2.grad with d(loss)/d(weight).
    loss.backward()

    # The descent step mutates the weights in place; no_grad keeps the update
    # itself out of the autograd graph.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)

    # Reset the gradient buffers by hand once the step has been taken.
    for weight in (w1, w2):
        weight.grad.zero_()


0 30501936.0
1 28909068.0
2 31149196.0
3 32044040.0
4 27620014.0
5 19133780.0
6 10748395.0
7 5474221.0
8 2854488.5
9 1683062.25
10 1141348.0
11 862872.75
12 696719.875
13 583412.6875
14 498459.34375
15 431077.90625
16 375780.09375
17 329543.5
18 290408.1875
19 256975.890625
20 228268.28125
21 203537.34375
22 182111.09375
23 163446.828125
24 147091.9375
25 132702.5
26 120007.3828125
27 108785.5078125
28 98836.0390625
29 89982.921875
30 82060.3125
31 74959.328125
32 68583.0546875
33 62844.38671875
34 57675.67578125
35 53002.64453125
36 48773.0390625
37 44935.625
38 41449.03125
39 38276.45703125
40 35385.76953125
41 32747.75
42 30336.3203125
43 28128.49609375
44 26107.560546875
45 24253.416015625
46 22550.0078125
47 20983.689453125
48 19541.18359375
49 18211.51953125
50 16985.19921875
51 15852.798828125
52 14806.0302734375
53 13837.87109375
54 12941.6748046875
55 12110.251953125
56 11339.21484375
57 10623.443359375
58 9960.40234375
59 9345.830078125
60 8774.072265625
61 8241.6220703125
62

480 0.000768663827329874
481 0.0007501337677240372
482 0.0007320304284803569
483 0.0007156286155804992
484 0.0006983237690292299
485 0.0006834224332123995
486 0.0006664646789431572
487 0.0006522846524603665
488 0.0006375465309247375
489 0.0006232071318663657
490 0.0006102420156821609
491 0.0005956158274784684
492 0.0005832348251715302
493 0.0005718600004911423
494 0.0005588078638538718
495 0.0005455601494759321
496 0.0005338597111403942
497 0.0005226670182310045
498 0.0005118801491335034
499 0.0005010741297155619


In [14]:
# Code in file autograd/two_layer_net_custom_function.py
import torch

class MyReLU(torch.autograd.Function):
  """
  Custom autograd Function that clamps its input at zero. Both passes are
  written by hand as static methods operating on Tensors.
  """

  @staticmethod
  def forward(ctx, x):
    """
    Stash the input on the context object so the backward pass can see it,
    then return the input with every negative entry replaced by zero.
    """
    ctx.save_for_backward(x)
    return torch.clamp(x, min=0)

  @staticmethod
  def backward(ctx, grad_output):
    """
    Given the gradient of the loss with respect to the forward output, return
    the gradient with respect to the forward input: a copy of grad_output with
    zeros wherever the saved input was strictly negative.
    """
    (saved_input,) = ctx.saved_tensors
    # Out-of-place fill: grad_output itself is left untouched.
    return grad_output.masked_fill(saved_input < 0, 0)


# Tensors are created on this device; switch to the commented line for a GPU.
device = torch.device('cpu')
# device = torch.device('cuda')

# Problem sizes: batch size (N), input width (D_in), hidden width (H) and
# output width (D_out).
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Trainable weights; autograd tracks them because requires_grad=True.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
  # Forward pass; the activation goes through the custom Function, which is
  # invoked via MyReLU.apply.
  hidden = MyReLU.apply(torch.mm(x, w1))
  y_pred = torch.mm(hidden, w2)

  # Sum-of-squares loss, printed as a Python number.
  residual = y_pred - y
  loss = residual.pow(2).sum()
  print(t, loss.item())

  # Let autograd fill in w1.grad and w2.grad.
  loss.backward()

  with torch.no_grad():
    # Gradient-descent step on both weight matrices.
    for weight in (w1, w2):
      weight.sub_(learning_rate * weight.grad)

    # Clear the gradient buffers by hand now that the step is done.
    for weight in (w1, w2):
      weight.grad.zero_()

0 33545500.0
1 32456208.0
2 32427304.0
3 28930010.0
4 21313172.0
5 13040536.0
6 7092082.0
7 3840489.25
8 2255524.5
9 1488921.625
10 1088447.125
11 853394.5625
12 697643.5
13 584573.9375
14 497142.53125
15 426859.25
16 369151.0625
17 321054.4375
18 280513.0
19 246060.421875
20 216637.984375
21 191378.703125
22 169576.265625
23 150665.203125
24 134224.015625
25 119879.0078125
26 107314.84375
27 96264.65625
28 86532.03125
29 77918.015625
30 70283.40625
31 63499.7421875
32 57458.86328125
33 52071.6015625
34 47253.3671875
35 42937.43359375
36 39066.6484375
37 35590.0078125
38 32464.654296875
39 29644.99609375
40 27097.388671875
41 24791.779296875
42 22703.294921875
43 20809.49609375
44 19089.203125
45 17525.822265625
46 16104.630859375
47 14809.55859375
48 13628.6435546875
49 12550.8935546875
50 11566.46484375
51 10665.8642578125
52 9842.646484375
53 9088.6943359375
54 8397.298828125
55 7763.0498046875
56 7180.8662109375
57 6645.80908203125
58 6154.05859375
59 5701.6552734375
60 5285.098144

In [18]:
# Code in file nn/two_layer_net_nn.py
import torch

device = torch.device('cpu')
# device = torch.device('cuda') # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
# Here six Linear layers are stacked with a ReLU between consecutive layers.
# After constructing the model we use the .to() method to move it to the
# desired device.
model = torch.nn.Sequential(
          torch.nn.Linear(D_in, H),
          torch.nn.ReLU(),
          torch.nn.Linear(H, H),
          torch.nn.ReLU(),
          torch.nn.Linear(H, H),
          torch.nn.ReLU(),
          torch.nn.Linear(H, H),
          torch.nn.ReLU(),
          torch.nn.Linear(H, H),
          torch.nn.ReLU(),
          torch.nn.Linear(H, D_out),
        ).to(device)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function. Setting
# reduction='sum' means that we are computing the *sum* of squared errors rather
# than the mean; this is for consistency with the examples that compute the
# loss by hand as a sum, but in practice it is more common to use mean squared
# error as a loss by setting reduction='mean'.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
  # Forward pass: compute predicted y by passing x to the model. Module objects
  # override the __call__ operator so you can call them like functions. When
  # doing so you pass a Tensor of input data to the Module and it produces
  # a Tensor of output data.
  y_pred = model(x)

  # Compute and print loss. We pass Tensors containing the predicted and true
  # values of y, and the loss function returns a Tensor containing the loss.
  loss = loss_fn(y_pred, y)
  print(t, loss.item())

  # Zero the gradients before running the backward pass.
  model.zero_grad()

  # Backward pass: compute gradient of the loss with respect to all the learnable
  # parameters of the model. Internally, the parameters of each Module are stored
  # in Tensors with requires_grad=True, so this call will compute gradients for
  # all learnable parameters in the model.
  loss.backward()

  # Update the weights using gradient descent. The update is applied in place
  # on each parameter directly; torch.no_grad() already keeps the step out of
  # the autograd graph, so going through the legacy `.data` attribute (which
  # bypasses autograd's in-place version tracking) is unnecessary.
  with torch.no_grad():
    for param in model.parameters():
      param -= learning_rate * param.grad

0 631.7567138671875
1 631.5046997070312
2 631.2628173828125
3 631.0296020507812
4 630.8043823242188
5 630.5878295898438
6 630.37939453125
7 630.1783447265625
8 629.984130859375
9 629.7962036132812
10 629.6141967773438
11 629.4381713867188
12 629.2685546875
13 629.1041259765625
14 628.9444580078125
15 628.7899169921875
16 628.6401977539062
17 628.4951171875
18 628.3541259765625
19 628.2172241210938
20 628.0845947265625
21 627.9557495117188
22 627.8309326171875
23 627.7094116210938
24 627.5910034179688
25 627.475830078125
26 627.3640747070312
27 627.2552490234375
28 627.1497192382812
29 627.0474243164062
30 626.947998046875
31 626.8509521484375
32 626.7565307617188
33 626.6646118164062
34 626.5753173828125
35 626.4880981445312
36 626.4029541015625
37 626.3198852539062
38 626.2386474609375
39 626.1593017578125
40 626.0817260742188
41 626.0059204101562
42 625.9317016601562
43 625.8589477539062
44 625.787841796875
45 625.7182006835938
46 625.6495971679688
47 625.5824584960938
48 625.5166625

392 566.154541015625
393 565.1474609375
394 564.1192626953125
395 563.06982421875
396 561.9989013671875
397 560.9038696289062
398 559.7865600585938
399 558.6456909179688
400 557.480224609375
401 556.2866821289062
402 555.0668334960938
403 553.82373046875
404 552.559326171875
405 551.2721557617188
406 549.9568481445312
407 548.6155395507812
408 547.2484130859375
409 545.85400390625
410 544.4364624023438
411 542.9908447265625
412 541.5187377929688
413 540.0170288085938
414 538.4851684570312
415 536.932373046875
416 535.3499755859375
417 533.7417602539062
418 532.1004638671875
419 530.4334716796875
420 528.737060546875
421 527.012451171875
422 525.261474609375
423 523.483154296875
424 521.6809692382812
425 519.8504028320312
426 517.9940185546875
427 516.1159057617188
428 514.2053833007812
429 512.2728881835938
430 510.3196105957031
431 508.3424987792969
432 506.34539794921875
433 504.3285827636719
434 502.2977600097656
435 500.2537536621094
436 498.18896484375
437 496.11004638671875
438 4

In [24]:
# Code in file nn/two_layer_net_module.py
import torch

class TwoLayerNet(torch.nn.Module):
  """Two nn.Linear layers with a Tanh non-linearity applied between them."""

  def __init__(self, D_in, H, D_out):
    """
    Build the sub-modules and register them as attributes: a D_in -> H linear
    layer, an H -> D_out linear layer, and a Tanh activation.
    """
    super().__init__()
    self.linear1 = torch.nn.Linear(D_in, H)
    self.linear2 = torch.nn.Linear(H, D_out)
    self.tanh = torch.nn.Tanh()

  def forward(self, x):
    """
    Map an input Tensor to an output Tensor by applying linear1, then tanh,
    then linear2.
    """
    hidden = self.linear1(x)
    activated = self.tanh(hidden)
    return self.linear2(activated)

# Problem sizes: batch size (N), input width (D_in), hidden width (H) and
# output width (D_out).
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the network class defined earlier in this cell.
model = TwoLayerNet(D_in, H, D_out)

# Summed squared-error loss, plus an SGD optimizer that is handed the model's
# learnable parameters (those of its nn.Linear members).
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4)
for t in range(500):
  # Forward pass through the model.
  y_pred = model(x)

  # Evaluate the loss and report it as a Python number.
  loss = loss_fn(input=y_pred, target=y)
  print(t, loss.item())

  # Clear old gradients, backpropagate, then let the optimizer take a step.
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

0 726.4171142578125
1 652.8029174804688
2 588.0338745117188
3 530.7543334960938
4 479.86175537109375
5 434.4570007324219
6 393.80438232421875
7 357.2981262207031
8 324.43560791015625
9 294.79534912109375
10 268.02020263671875
11 243.8040771484375
12 221.8820037841797
13 202.0224609375
14 184.0213165283203
15 167.697265625
16 152.8883056640625
17 139.44903564453125
18 127.24848937988281
19 116.16838836669922
20 106.10179138183594
21 96.95198822021484
22 88.63145446777344
23 81.06119537353516
24 74.16984558105469
25 67.89307403564453
26 62.1729621887207
27 56.957366943359375
28 52.199310302734375
29 47.85651397705078
30 43.89088439941406
31 40.268062591552734
32 36.95705032348633
33 33.92982482910156
34 31.161067962646484
35 28.627830505371094
36 26.309314727783203
37 24.186670303344727
38 22.242767333984375
39 20.4620361328125
40 18.830318450927734
41 17.334718704223633
42 15.963517189025879
43 14.706007957458496
44 13.55245590209961
45 12.493961334228516
46 11.522418975830078
47 10.630

372 1.5570019513688749e-06
373 1.4966150274631218e-06
374 1.4386977227331954e-06
375 1.3819567357131746e-06
376 1.3292841458678595e-06
377 1.2779215694536106e-06
378 1.2269575790924137e-06
379 1.179942955786828e-06
380 1.1342439165673568e-06
381 1.0913391861322452e-06
382 1.0476383067725692e-06
383 1.0074885494759656e-06
384 9.683234338808688e-07
385 9.30930752929271e-07
386 8.949847938310995e-07
387 8.601742820246727e-07
388 8.267255111604754e-07
389 7.952309601932939e-07
390 7.637902399437735e-07
391 7.339456260524457e-07
392 7.051394277368672e-07
393 6.784269430681888e-07
394 6.514486017294985e-07
395 6.269915502343792e-07
396 6.028687948855804e-07
397 5.794765911559807e-07
398 5.569012273554108e-07
399 5.354687004910375e-07
400 5.146674766365322e-07
401 4.948208243149566e-07
402 4.7632090627303114e-07
403 4.5735723119832983e-07
404 4.3954213424513e-07
405 4.226348266911373e-07
406 4.06206567049594e-07
407 3.9119714756452595e-07
408 3.7547511055890936e-07
409 3.6147511650597153e-07


In [25]:
# Code in file nn/dynamic_net.py
import random
import torch

class DynamicNet(torch.nn.Module):
  """
  Fully connected network whose number of hidden layers is drawn at random on
  every forward call, with a single middle layer shared across repetitions.
  """

  def __init__(self, D_in, H, D_out):
    """
    Create the three nn.Linear layers used by forward: D_in -> H, H -> H and
    H -> D_out.
    """
    super().__init__()
    self.input_linear = torch.nn.Linear(D_in, H)
    self.middle_linear = torch.nn.Linear(H, H)
    self.output_linear = torch.nn.Linear(H, D_out)

  def forward(self, x):
    """
    Apply input_linear, then middle_linear between zero and three times (the
    count comes from random.randint(0, 3) on each call), then output_linear.
    Every hidden activation is clamped at zero.

    The graph is rebuilt on each call, so ordinary Python control flow can
    decide its shape, and the same middle_linear Module (hence the same
    weights) is reused for every repetition.
    """
    hidden = torch.clamp(self.input_linear(x), min=0)
    remaining = random.randint(0, 3)
    while remaining > 0:
      hidden = torch.clamp(self.middle_linear(hidden), min=0)
      remaining -= 1
    return self.output_linear(hidden)


# Problem sizes: batch size (N), input width (D_in), hidden width (H) and
# output width (D_out).
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the network class defined earlier in this cell.
model = DynamicNet(D_in, H, D_out)

# Summed squared-error loss and an SGD optimizer; momentum is enabled here
# rather than using plain SGD.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
  # Forward pass through the model.
  y_pred = model(x)

  # Evaluate the loss and report it as a Python number.
  loss = criterion(input=y_pred, target=y)
  print(t, loss.item())

  # Clear old gradients, backpropagate, then let the optimizer take a step.
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

0 609.2322998046875
1 610.7882690429688
2 608.37353515625
3 596.3297119140625
4 583.9382934570312
5 605.0707397460938
6 636.9856567382812
7 604.9656982421875
8 599.88818359375
9 529.9497680664062
10 519.8696899414062
11 602.83154296875
12 591.5848388671875
13 485.18463134765625
14 585.0599975585938
15 579.9739379882812
16 598.32177734375
17 437.7008361816406
18 423.8642578125
19 377.1807556152344
20 330.1797180175781
21 539.158447265625
22 227.48326110839844
23 349.3792419433594
24 329.04803466796875
25 491.70806884765625
26 469.5274963378906
27 539.3211669921875
28 406.7391662597656
29 480.4644470214844
30 231.26942443847656
31 310.3476257324219
32 374.787841796875
33 239.72750854492188
34 260.9079895019531
35 215.00265502929688
36 209.0128936767578
37 134.53436279296875
38 166.14320373535156
39 118.68724822998047
40 144.34193420410156
41 214.37216186523438
42 82.6351318359375
43 77.73656463623047
44 62.3203010559082
45 260.46868896484375
46 163.79067993164062
47 74.95854949951172
48 

394 0.9583476781845093
395 0.17079667747020721
396 0.9882294535636902
397 0.9300455451011658
398 0.24232779443264008
399 0.6004571914672852
400 0.7372729778289795
401 0.2810194790363312
402 0.931012749671936
403 0.9508612155914307
404 0.7758787274360657
405 0.5351707339286804
406 0.4577297866344452
407 0.4200015068054199
408 1.2990292310714722
409 0.38597697019577026
410 0.25748011469841003
411 1.6742830276489258
412 0.49375343322753906
413 0.39947956800460815
414 1.4659653902053833
415 0.2707727253437042
416 0.17175161838531494
417 1.1956827640533447
418 0.4877736568450928
419 0.5483067035675049
420 1.5368738174438477
421 0.5936941504478455
422 0.6280806660652161
423 0.22112813591957092
424 1.2673574686050415
425 0.6329777836799622
426 0.09016742557287216
427 0.7024178504943848
428 0.0729869082570076
429 0.059759967029094696
430 1.051111102104187
431 0.5519311428070068
432 0.4911973476409912
433 0.6946014165878296
434 0.7607543468475342
435 0.57140052318573
436 0.43386879563331604
437

In [43]:
import numpy as np
# Start from an array with zero rows and ten columns so that it can be stacked
# with the others along axis 0.
g = np.empty((0, 10))
a = np.zeros((1, 10))
b = np.ones((2, 10))
print(b)

# Stack the empty array, the row of zeros and the two rows of ones vertically;
# the result (3 rows, 10 columns) replaces `a`.
a = np.concatenate([g, a, b], axis=0)
print(a)

[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]
