In [184]:
# Import libraries and magics

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('bmh')
import pandas as pd
from sklearn.model_selection import train_test_split

In [185]:
def graph(formula, x_range):  
    x = np.array(x_range)  
    y = eval(formula)
    # plt.plot(x, y) 
    #return axis.plot(x, y, color='red')
    return x, y
    #plt.show()

## The Backpropagation Algorithm

In [190]:
# Writing the ReLU activation function

inputs = [0, 2, -1, 3.3, -2.7, 1.1, 2.2, -100]
output = []

In [191]:
def spiral_data(points, classes):
    X = np.zeros((points*classes, 2))
    y = np.zeros((points*classes), dtype='int')
    
    
    for class_number in range(classes):
        ix = range(points*class_number, points*(class_number+1))
        r = np.linspace(0.0, 1, points)  # radius
        t = np.linspace(class_number*4, (class_number+1)*4, points) + np.random.randn(points)*0.2
        X[ix] = np.c_[r*np.sin(t*2.5), r*np.cos(t*2.5)]
        y[ix] = class_number
        
    return X, y

In [192]:
def one_hot_encode(data):

    # Get unique categories in the data  [0, 0, 0,] ---. [[0, 1], [1, 0]]
    unique_categories = np.unique(data)

    # Create an empty array to hold the one-hot encoding
    one_hot_encoded = np.zeros((len(data), len(unique_categories)))

    # Fill in the one-hot encoding
    for i, category in enumerate(data):
        one_hot_encoded[i, np.where(unique_categories == category)] = 1

    return one_hot_encoded

In [193]:
def backprop(training_data, training_labels, o1_raw, o1, output_weights, o2):
    # Output bias
    # m = training_labels.size
    # dSSR = -2 * (training_labels - o2)
    # dW2 = 1/m * dSSR.dot(o1.T)
    # db2 = 1/m * np.sum(dSSR, axis=1, keepdims=True)

    m = training_labels.size
    dSSR = -2 * (training_labels - o2)
    dW2 = dSSR.dot(o1.T)
    db2 = np.sum(dSSR, axis=1, keepdims=True)


    # Hidden
    dZ1 = output_weights.T.dot(dSSR) * relu_prime(o1_raw)
    dW1 = 1/m * dZ1.dot(training_data.T)
    db1 = 1/m * np.sum(dZ1, axis=1, keepdims=True)

    return dW1, db1, dW2, db2

## The Code

In [292]:
# Batch size- how MANY of the samples we want to show at a time

# set seed of random initializations
np.random.seed(2)

X = [[1,0],
     [0,1],
     [-1,0],
     [0,-1],
     [0.5,0.5],
     [-0.5,0.5],
     [0.5, -0.5],
     [-0.5,-0.5]]

t = [1,1,1,1,0,0,0,0]
t_one_hot_encoded = [[0,1],
                     [0,1],
                     [0,1],
                     [0,1],
                     [1,0],
                     [1,0],
                     [1,0],
                     [1,0]]

X = np.array(X)
t = np.array(t)


class Layer_Dense:
    def __init__(self, n_inputs, n_neurons):
        
        # initialize weights as inputs x num neurons instead of other way, so that when we do matrix operations, we don't have to transpose
        #self.weights = 0.10 * np.random.randn(n_inputs, n_neurons) # to keep random values constrained between -1 and 1
        self.weights = np.random.uniform(-0.5, 0.5, (n_neurons, n_inputs))
        self.biases = np.zeros((n_neurons, 1))
    #def forward(self, inputs):
    #    self.output = self.weights.dot(inputs) + self.biases

class Activation_ReLU:
    def forward(self, inputs):
        self.output = np.maximum(0, inputs)
    def derivative(self, netValue): # netValue is an array of values
        self.derivative_value = (netValue > 0)
        return netValue > 0
    
class Activation_Sigmoid:
    def forward(self, inputs):
        self.output = 1/(1+np.exp(-inputs))
    def derivative(self, netValue):
        self.derivative_value = netValue *(1-netValue)
        return self.derivative_value
        
class Activation_Softmax:
    def forward(self, inputs):
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True)) # we want max values WITHIN a batch, not among other batches
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)
        self.output = probabilities
        
class Activation_TanH:
    def forward(self, inputs): # where input is a SCALER, or the NET (sum of weights and inputs)
        self.output = np.tanh(inputs)
    def derivative(self, netValue): # netValue is an array of values
        self.derivative_value = 0.5 * (1 - np.square(netValue))
        return self.derivative_value
        
        
class Loss:
    def calculate(self, output, y): # output is output of model, y is INTENDED target values
        sample_losses = self.forward(output,y)
        data_loss = np.mean(sample_losses) # or the mean loss of every sample in batch
        return data_loss
    
# basically, for the below, the Categorical Cross Entropy for a one-hot encoded target vector is just -log(prediction) at that one hot encoded vector
class Loss_CategoricalCrossEntropy(Loss): # will inherit from base loss class
    def forward(self, y_pred, y_true):
        
        samples = len(y_pred)
        
        y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7) # because -log(0) = infinity, so we never want exactly 0
        
        if len(y_true.shape) == 1: # meaning the target vector is something passed in like [1, 2, 0, 1] where lets say index 0 tells you for batch 0, class 1 is the predicted class
            correct_confidences = y_pred_clipped[range(samples), y_true] # range(samples) means we want ALL the batches, and y_true tells us for each batch, which sample to grab (corresponding to target class)
        elif len(y_true.shape) == 2: # if one-hot encoded vectors are passed for each batch, like [ [1,0,0], [0,1,0],... ]
            correct_confidences = np.sum(y_pred_clipped*y_true.T, axis=1) # for each batch, multiply samples by one-hot encoded vecotr, to only keep data corresponding to target class, and then sum to get only the values we want, not all the 0's
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods # is the vector of values corresponding to -log( sample within a batch corresponding to target class, for each sample within a batch)
        
    

In [293]:
num_points = 1000
num_classes = 2


X, labels = spiral_data(num_points, num_classes)

X = X.T

t = one_hot_encode(labels)

print(len(np.array(t)))

print(X)
print(X.shape)
print(t)
print(t.shape)

2000
[[-0.00000000e+00 -1.81405520e-05 -1.73466528e-03 ...  9.97979335e-01
   8.80843050e-01  9.99107962e-01]
 [ 0.00000000e+00  1.00083661e-03  9.99474056e-04 ...  6.10337564e-03
   4.71290272e-01  4.22289048e-02]]
(2, 2000)
[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [0. 1.]
 [0. 1.]
 [0. 1.]]
(2000, 2)


In [294]:
# Define Network

# Define Layers
dense1 = Layer_Dense(2,8) # input is 2 because input data is just (x,y)
activation1 = Activation_TanH()

dense2 = Layer_Dense(8,1)
activation2 = Activation_TanH()

print(dense1.weights)
print(dense2.weights)




[[ 0.18047218  0.12673595]
 [ 0.4027876   0.26632841]
 [ 0.40484565 -0.31309311]
 [ 0.07301932  0.39097352]
 [ 0.14459498  0.42747351]
 [-0.45108355  0.014763  ]
 [ 0.41902219  0.14581801]
 [-0.20059297  0.43925547]]
[[ 0.25452902  0.05429244 -0.45887834  0.08294022 -0.0452265   0.21835405
   0.09125661  0.21703291]]


In [295]:

# ----------- Hyperparameters ------------ #

eta = 0.005 # define learning rate
EPOCHS = 10
batch_size = 2

# ------------------------------------------------- #

# Define data

#print('initial weights are:', dense1.weights, dense2.weights)

#print('data is', X[:10])
#print(X.shape)

index = 0

for epoch in range(EPOCHS):
    
    error2 = 0
    
    for i in range(1000):

        
        first = index
        second = index+batch_size


        if (first == num_points):
            first = index % num_points
            second = (index+batch_size) % num_points
            index = index % num_points

    # ---------------- For batch training -------------------#

        # Grab the batch of sampels / labels from data
        #input_data = np.array( X[first:second] )
        #target_OHE = np.array( t[first:second] )
    # --------------------------------------------------------#
    
        input_data = np.array(X)
        target_OHE = np.array(labels)

        m = len(np.array(labels))

        # Run Network
        o1_raw = dense1.weights.dot(input_data) + dense1.biases
        activation1.forward(o1_raw)

        o2_raw = dense2.weights.dot(activation1.output) + dense2.biases
        activation2.forward(o2_raw) # output of neural network


        # ------------------- Loss for Categorical Entropy -----------------#
        #loss_function = Loss_CategoricalCrossEntropy()
        #loss = loss_function.calculate(activation2.output, target_OHE)
        # ------------------------------------------------------------------- #
        
        # ---------------------------Accuracy -------------------------------#
        #num_correct = 0
        #if i % 1000 == 0:
        #print('the loss of iteration', i, 'is:', loss)
        #  for j in range(len(target_OHE)):
        #      if (activation2.output[0][j] > 0 and target_OHE[j] == 1) or (activation2.output[0][j] < 0 and target_OHE[j] == -1):
        #          num_correct += 1

        #  print('the accuracy of iteration', i, 'is:', num_correct / len(np.array(target_OHE)))   

        
        for j in range(m):
            error2 += (labels[j]-activation2.output[0][j])**2
        error2 = error2/m

        

        # BP for output layer
        error = target_OHE - activation2.output

        #softmax_derivative = target_OHE.T - activation2.output
        layer2_derivative = activation2.derivative(o2_raw)


        output_error = -error * layer2_derivative * activation2.output

        output_weight_update = output_error.dot(activation1.output.T)


        temp_dense2_weights = dense2.weights + eta * output_weight_update
        temp_dense2_biases = dense2.biases + eta * np.sum(output_error, axis=1, keepdims=True)

        # BP for hidden layer
        scaled_by_weights = dense2.weights.T.dot(output_error) 

        overall_error = scaled_by_weights * activation1.derivative(o1_raw)

        weight_updates = overall_error.dot(input_data.T)

        # update weight parameters
        dense1.weights = dense1.weights + eta * weight_updates
        dense2.weights = temp_dense2_weights

        # Update bias parameters
        dense1.biases = dense1.biases + eta * np.sum(overall_error, axis=1, keepdims=True)
        dense2.biases = temp_dense2_biases

        # ------------ For Batch Training ---------#
        # Update counter
        #index += batch_size
        # ---------------------------------------- #

        
    MSE = error2
    print(f'Training MSE: {MSE}')

Training MSE: 0.3587803912191983
Training MSE: 0.35873211146589573
Training MSE: 0.3587635612587303
Training MSE: 0.35880221316201844
Training MSE: 0.3588447506737724
Training MSE: 0.35889023945681103
Training MSE: 0.35893795610712004
Training MSE: 0.35898730014779834
Training MSE: 0.35903777443475043
Training MSE: 0.35908896937922896


## Code With Other BackProp

In [64]:
# Batch size- how MANY of the samples we want to show at a time

# set seed of random initializations
np.random.seed(2)

X = [[1,0],
     [0,1],
     [-1,0],
     [0,-1],
     [0.5,0.5],
     [-0.5,0.5],
     [0.5, -0.5],
     [-0.5,-0.5]]

t = [1,1,1,1,0,0,0,0]
t_one_hot_encoded = [[0,1],
                     [0,1],
                     [0,1],
                     [0,1],
                     [1,0],
                     [1,0],
                     [1,0],
                     [1,0]]

X = np.array(X)
t = np.array(t)


class Layer_Dense:
    def __init__(self, n_inputs, n_neurons):
        
        # initialize weights as inputs x num neurons instead of other way, so that when we do matrix operations, we don't have to transpose
        #self.weights = 0.10 * np.random.randn(n_inputs, n_neurons) # to keep random values constrained between -1 and 1
        self.weights = np.random.uniform(-0.5, 0.5, (n_inputs, n_neurons))
        self.biases = np.zeros((n_neurons, 1))
    def forward(self, inputs):
        self.output = self.weights.dot(inputs) + self.biases

class Activation_ReLU:
    def forward(self, inputs):
        self.output = np.maximum(0, inputs)
    def derivative(self, netValue): # netValue is an array of values
        self.derivative_value = (netValue > 0)
        return netValue > 0
    
class Activation_Sigmoid:
    def forward(self, inputs):
        return 1/(1+np.exp(-x))
    def derivative(self, netValue):
        return x *(1-x)
        
class Activation_Softmax:
    def forward(self, inputs):
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True)) # we want max values WITHIN a batch, not among other batches
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)
        self.output = probabilities
        
class Activation_TanH:
    def forward(self, inputs): # where input is a SCALER, or the NET (sum of weights and inputs)
        self.output = np.tanh(inputs)
    def derivative(self, netValue): # netValue is an array of values
        self.derivative_value = 0.5 * (1 - np.square(netValue))
        return self.derivative_value
        
        
class Loss:
    def calculate(self, output, y): # output is output of model, y is INTENDED target values
        sample_losses = self.forward(output,y)
        data_loss = np.mean(sample_losses) # or the mean loss of every sample in batch
        return data_loss
    
# basically, for the below, the Categorical Cross Entropy for a one-hot encoded target vector is just -log(prediction) at that one hot encoded vector
class Loss_CategoricalCrossEntropy(Loss): # will inherit from base loss class
    def forward(self, y_pred, y_true):
        
        samples = len(y_pred)
        
        y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7) # because -log(0) = infinity, so we never want exactly 0
        
        if len(y_true.shape) == 1: # meaning the target vector is something passed in like [1, 2, 0, 1] where lets say index 0 tells you for batch 0, class 1 is the predicted class
            correct_confidences = y_pred_clipped[range(samples), y_true] # range(samples) means we want ALL the batches, and y_true tells us for each batch, which sample to grab (corresponding to target class)
        elif len(y_true.shape) == 2: # if one-hot encoded vectors are passed for each batch, like [ [1,0,0], [0,1,0],... ]
            correct_confidences = np.sum(y_pred_clipped*y_true.T, axis=1) # for each batch, multiply samples by one-hot encoded vecotr, to only keep data corresponding to target class, and then sum to get only the values we want, not all the 0's
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods # is the vector of values corresponding to -log( sample within a batch corresponding to target class, for each sample within a batch)
        
    

In [52]:
num_points = 1000
num_classes = 2


X, labels = spiral_data(num_points, num_classes)
#t = one_hot_encode(labels)

print(len(np.array(t)))

print(X)
print(X.shape)
print(t)
print(t.shape)

8
[[-0.00000000e+00  0.00000000e+00]
 [-1.81405520e-05  1.00083661e-03]
 [-1.73466528e-03  9.99474056e-04]
 ...
 [ 9.97979335e-01  6.10337564e-03]
 [ 8.80843050e-01  4.71290272e-01]
 [ 9.99107962e-01  4.22289048e-02]]
(2000, 2)
[1 1 1 1 0 0 0 0]
(8,)


In [53]:
# Define Network

# Define Layers
dense1 = Layer_Dense(2,4) # input is 2 because input data is just (x,y)
activation1 = Activation_Sigmoid()

dense2 = Layer_Dense(4,1)
activation2 = Activation_Sigmoid()

print(dense1.weights)
print(dense2.weights)

[[ 0.01804722  0.0126736   0.04027876  0.02663284]
 [ 0.04048456 -0.03130931  0.00730193  0.03909735]]
[[ 0.0144595 ]
 [ 0.04274735]
 [-0.04510836]
 [ 0.0014763 ]]


In [54]:
#Sigmoid activation 
def sig(x):
    return 1/(1+np.exp(-x))
#derivative of sigmoid activation
def dsig(x):
    return x *(1-x)

np.random.seed(2)

In [55]:
L1 = np.random.uniform(-0.5,0.5,(2,4))
L2 = np.random.uniform(-0.5,0.5,4)

In [56]:
# ----------- Hyperparameters ------------ #

eta = 0.001 # define learning rate
EPOCHS = 10
batch_size = 2

# ------------------------------------------------- #

for epoch in range(EPOCHS):
    error2 = 0
    
    for n in range(num_points):
        #Forward Pass (no bias)
        net_0 = np.sum(L1.T*X[n,:],axis=1) #more explict way to write dot product
        fnet_0 = sig(net_0)
        net_1 = np.sum(L2*fnet_0)
        fnet1 = sig(net_1)

        error2 += (labels[n]-fnet1)**2
        
        #Backwards Pass 
        error = labels[n]- fnet1
        gradient2 = -error*dsig(net_1)*fnet_0 #gradient for L2
        gradient1 = gradient2*L2*dsig(net_0) #gradient for L1
       

        L1[0,:] += gradient1*eta*X[n,0]
        L1[1,:] += gradient1*eta*X[n,1]
        L2 += gradient2*eta
        
    MSE = error2/num_points
    print(f'Training MSE: {MSE}')
    

Training MSE: 0.2059778874908301
Training MSE: 0.17710006990982471
Training MSE: 0.13252270099136024
Training MSE: 0.07613486919158846
Training MSE: 0.029986832968767377
Training MSE: 0.009011028711387023
Training MSE: 0.002762177842354298
Training MSE: 0.0010139654368932426
Training MSE: 0.0004483545968355276
Training MSE: 0.00022995256417176547


## BackProp using Strictly Formulas From Class

In [108]:
# Batch size- how MANY of the samples we want to show at a time

# set seed of random initializations
np.random.seed(2)

X = [[1,0],
     [0,1],
     [-1,0],
     [0,-1],
     [0.5,0.5],
     [-0.5,0.5],
     [0.5, -0.5],
     [-0.5,-0.5]]

t = [1,1,1,1,0,0,0,0]
t_one_hot_encoded = [[0,1],
                     [0,1],
                     [0,1],
                     [0,1],
                     [1,0],
                     [1,0],
                     [1,0],
                     [1,0]]

X = np.array(X)
t = np.array(t)


class Layer_Dense:
    def __init__(self, n_inputs, n_neurons):
        
        # initialize weights as inputs x num neurons instead of other way, so that when we do matrix operations, we don't have to transpose
        #self.weights = 0.10 * np.random.randn(n_inputs, n_neurons) # to keep random values constrained between -1 and 1
        self.weights = np.random.uniform(-0.5, 0.5, (n_inputs, n_neurons))
        self.biases = np.zeros((n_neurons, 1))
    def forward(self, inputs):
        self.output = self.weights.dot(inputs) + self.biases

class Activation_ReLU:
    def forward(self, inputs):
        self.output = np.maximum(0, inputs)
    def derivative(self, netValue): # netValue is an array of values
        self.derivative_value = (netValue > 0)
        return netValue > 0
    
class Activation_Sigmoid:
    def forward(self, inputs):
        self.output = 1/(1+np.exp(-inputs))
    def derivative(self, netValue):
        self.derivative = netValue *(1-netValue)
        return netValue *(1-netValue)
        
class Activation_Softmax:
    def forward(self, inputs):
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True)) # we want max values WITHIN a batch, not among other batches
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)
        self.output = probabilities
        
class Activation_TanH:
    def forward(self, inputs): # where input is a SCALER, or the NET (sum of weights and inputs)
        self.output = np.tanh(inputs)
    def derivative(self, netValue): # netValue is an array of values
        self.derivative_value = 0.5 * (1 - np.square(netValue))
        return self.derivative_value
        
        
class Loss:
    def calculate(self, output, y): # output is output of model, y is INTENDED target values
        sample_losses = self.forward(output,y)
        data_loss = np.mean(sample_losses) # or the mean loss of every sample in batch
        return data_loss
    
# basically, for the below, the Categorical Cross Entropy for a one-hot encoded target vector is just -log(prediction) at that one hot encoded vector
class Loss_CategoricalCrossEntropy(Loss): # will inherit from base loss class
    def forward(self, y_pred, y_true):
        
        samples = len(y_pred)
        
        y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7) # because -log(0) = infinity, so we never want exactly 0
        
        if len(y_true.shape) == 1: # meaning the target vector is something passed in like [1, 2, 0, 1] where lets say index 0 tells you for batch 0, class 1 is the predicted class
            correct_confidences = y_pred_clipped[range(samples), y_true] # range(samples) means we want ALL the batches, and y_true tells us for each batch, which sample to grab (corresponding to target class)
        elif len(y_true.shape) == 2: # if one-hot encoded vectors are passed for each batch, like [ [1,0,0], [0,1,0],... ]
            correct_confidences = np.sum(y_pred_clipped*y_true.T, axis=1) # for each batch, multiply samples by one-hot encoded vecotr, to only keep data corresponding to target class, and then sum to get only the values we want, not all the 0's
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods # is the vector of values corresponding to -log( sample within a batch corresponding to target class, for each sample within a batch)
        
    

In [109]:
num_points = 1000
num_classes = 2


X, labels = spiral_data(num_points, num_classes)
#t = one_hot_encode(labels)

print(X[5])
print(X[5][0])

[-0.00181373  0.00466481]
-0.0018137297526784884


In [113]:
# Define Network

# Define Layers
dense1 = Layer_Dense(2,4) # input is 2 because input data is just (x,y)
activation1 = Activation_Sigmoid()

dense2 = Layer_Dense(4,1)
activation2 = Activation_Sigmoid()

print(dense1.weights)
print(dense2.weights)

[[ 0.41902219  0.14581801 -0.20059297  0.43925547]
 [ 0.25452902  0.05429244 -0.45887834  0.08294022]]
[[-0.0452265 ]
 [ 0.21835405]
 [ 0.09125661]
 [ 0.21703291]]


In [114]:
print(X[:2])

[[-0.00000000e+00  0.00000000e+00]
 [-1.81405520e-05  1.00083661e-03]]


In [117]:
# ----------- Hyperparameters ------------ #

eta = 0.001 # define learning rate
EPOCHS = 10
batch_size = 2

# ------------------------------------------------- #

#for epoch in range(EPOCHS):
for epoch in range(1):

    error2 = 0
    
    #for n in range(num_points):
    for n in range(1):
        #Forward Pass (no bias)
        print('input is', X[:2])
        print('dense.weights is', dense1.weights, 'and its shape is', dense1.weights)
        print('dense2.weights is', dense2.weights, 'and its shape is', dense2.weights)
        net0 = dense1.weights.T.dot(X[:2])
        activation1.forward(net0)
        print('net_0 is', net_0, 'and its shape is', net_0.shape)
        print('fnet_0 is', fnet_0, 'and its shape is', fnet_0.shape)
        
        print(dense2.weights.shape)
        print(activation1.output.shape)
        net1 = dense2.weights.T.dot(activation1.output)
        activation2.forward(net1) 
        
        print('net_1 is', net_1, 'and its shape is', net_1.shape)
        print('fnet_1 is', fnet1, 'and its shape is', fnet1.shape)
        

        error2 += (labels[n]-activation2.output)**2
        
        #Backwards Pass 
        error = labels[n]- activation2.output
        gradient2 = -error*activation2.derivative(net1)*activation2.output #gradient for L2
        gradient1 = gradient2*dense2.weights*activation1.derivative(net0) #gradient for L1
       

        dense1.weights += gradient1*eta*X[:2]
        dense1.weights += gradient1*eta*X[:2]
        dense2.weights += gradient2*eta
        
    MSE = error2/num_points
    print(f'Training MSE: {MSE}')
    

input is [[-0.00000000e+00  0.00000000e+00]
 [-1.81405520e-05  1.00083661e-03]]
dense.weights is [[ 0.41902219  0.14581801 -0.20059297  0.43925547]
 [ 0.25452902  0.05429244 -0.45887834  0.08294022]] and its shape is [[ 0.41902219  0.14581801 -0.20059297  0.43925547]
 [ 0.25452902  0.05429244 -0.45887834  0.08294022]]
dense2.weights is [[-0.0452265 ]
 [ 0.21835405]
 [ 0.09125661]
 [ 0.21703291]] and its shape is [[-0.0452265 ]
 [ 0.21835405]
 [ 0.09125661]
 [ 0.21703291]]
net_0 is [ 0.16191093  0.6566658   0.28601525 -0.08414073] and its shape is (4,)
fnet_0 is [0.54038954 0.65851101 0.57102032 0.47897722] and its shape is (4,)
(4, 1)
(4, 2)
net_1 is -4.950052474590725 and its shape is ()
fnet_1 is 0.0070332206758568685 and its shape is ()


TypeError: 'numpy.ndarray' object is not callable