In [49]:
import numpy as np

def mse_loss(y_hat, y):
    return np.mean(np.square(y_hat - y))

def derivative_mse_loss(y_true,y_pred):
    return 2 * ( y_pred-y_true) / y_true.size

def mse_loss_grad(y_hat, y):
    return 2 * ( y_hat-y) / y.shape[0]

class FFN:
    def __init__(self, d_model=10, units=100) -> None:
        self.num_layers = 2
        self.d_model = d_model
        self.units = units
        self.weight1 = None
        self.bias1 = None
        self.weight2 = None
        self.bias2 = None

    def init_weights(self):
        self.weight1 = np.random.randn(self.d_model, self.units) * 0.01
        self.bias1 = np.zeros((1, self.units))
        self.weight2 = np.random.randn(self.units, self.d_model) * 0.01
        self.bias2 = np.zeros((1, self.d_model))

    def forward(self, inputs):
        self.Z1 = np.dot(inputs, self.weight1) + self.bias1
        self.A1 = np.maximum(0, self.Z1)  # ReLU activation
        Z2 = np.dot(self.A1, self.weight2) + self.bias2  # Linear activation for regression
        return Z2

    def backward(self, grads, inputs):
        dZ2 = grads  # Gradient of loss wrt output (Z2) since no activation
        dW2 = np.dot(self.A1.T, dZ2)
        db2 = np.sum(dZ2, axis=0, keepdims=True)
        dA1 = np.dot(dZ2, self.weight2.T)
        dZ1 = dA1 * (self.Z1 > 0)  # ReLU derivative
        dW1 = np.dot(inputs.T, dZ1)
        db1 = np.sum(dZ1, axis=0, keepdims=True)
        return dW1, db1, dW2, db2

    def update(self, dW1, db1, dW2, db2, lr=0.01):
        self.weight1 -= lr * dW1
        self.bias1 -= lr * db1
        self.weight2 -= lr * dW2
        self.bias2 -= lr * db2

    def train(self, X, y, epochs=100, lr=0.01):
        for i in range(epochs):
            y_hat = self.forward(X)
            loss = mse_loss(y_hat, y)
            if i % 100 == 0:
                print(f"Epoch {i} loss: {loss}")
            grads = mse_loss_grad(y_hat, y)
            #grads= derivative_mse_loss(y,y_hat)
            dW1, db1, dW2, db2 = self.backward(grads, X)
            self.update(dW1, db1, dW2, db2, lr)

        return self.weight1, self.bias1, self.weight2, self.bias2

    def predict(self, X):
        return self.forward(X)


# Set the model parameters
d_model = 20
units = 32
max_seq_len=100
learning_rate = 0.1


# Initialize the FFN model
ffn = FFN(units=units, d_model=d_model)
ffn.init_weights()

# Training example (with dummy data)
X_train = np.random.randn(max_seq_len, d_model)  # 100 samples, each with d_model features
y_train = np.random.randn(max_seq_len, d_model)  # 100 target values

# Train the model
ffn.train(X_train, y_train, epochs=1001, lr=learning_rate)

# Make a prediction with the trained model
#prediction = ffn.predict(inputs)
#print(ffn.predict(X_train).shape)


Epoch 0 loss: 1.0118065162695764
Epoch 100 loss: 0.45554832164516423
Epoch 200 loss: 0.37989061628251014
Epoch 300 loss: 0.3367182440923296
Epoch 400 loss: 0.35536868393489374
Epoch 500 loss: 0.3080069746031077
Epoch 600 loss: 0.2584868493679288
Epoch 700 loss: 0.24668133816565377
Epoch 800 loss: 0.2380488174860455
Epoch 900 loss: 0.25492989625617596
Epoch 1000 loss: 0.2433922537709417


(array([[ 0.09222483,  0.23610442, -0.07960877,  0.26102906, -0.16104752,
          0.71732121,  0.0571656 , -0.61153176, -0.74936487, -0.7064529 ,
          0.14107594,  0.05653899,  0.32498236,  0.17200699, -0.1643701 ,
         -0.32769989,  0.35786743,  0.15550948,  0.13467892, -0.01100896,
          0.16502404,  0.12487318,  0.28128402, -0.30300196, -0.01525377,
         -0.58009673,  0.77326102, -0.38910487,  0.0522092 ,  0.4630669 ,
          0.19471979,  0.11907894],
        [-0.23950785, -0.2485489 ,  0.04368154, -0.40599549, -0.09085084,
          0.37679343, -0.22278931, -0.22314474, -0.27944323,  0.21005009,
         -0.7452149 , -0.80305265,  0.07408985, -0.41528652, -0.54923709,
          0.33252409,  0.5760936 ,  0.16916737,  0.51260101,  0.34219234,
         -0.71268865, -0.57388881, -0.18933534, -0.26909683,  0.41169055,
         -0.33648075,  0.01327882,  0.11022188, -0.4914657 , -0.64768665,
          0.21688631,  0.56505411],
        [-0.14696368,  0.04059544,  0.82