In [2]:
import numpy as np

In [3]:
class Linear:
    """Fully connected layer computing ``inputs @ weights + bias``.

    Weights are stored with shape ``(in_features, out_features)`` so that the
    gradient produced by :meth:`backward` has the same shape as the weights
    and can be subtracted from them in place by an optimizer.
    """

    def __init__(
        self, 
        in_features: int, 
        out_features: int, 
        bias: bool = True
    ) -> None:
        """Create the layer and randomly initialise its parameters.

        Args:
            in_features: Size of the last axis of every input.
            out_features: Size of the last axis of every output.
            bias: When False the layer has no bias and ``self.bias`` is None.
        """
        self.in_features = in_features
        self.out_features = out_features

        self.weights = np.zeros((self.in_features, self.out_features))
        # The bias placeholder must exist before init_weights(), which only
        # draws a bias when this attribute is not None.
        self.bias = np.zeros(self.out_features) if bias else None

        # Filled in by __call__ (input cache) and backward (gradients).
        self.input = None
        self.weights_grad = None
        self.bias_grad = None

        self.init_weights()

    def init_weights(self, how: str = 'normal') -> None:
        """(Re)draw the parameters.

        Args:
            how: Initialisation scheme. Only ``'normal'`` (mean 0, std 0.01)
                is implemented.

        Raises:
            ValueError: If ``how`` names an unsupported scheme.
        """
        if how != 'normal':
            raise ValueError(f"unsupported weight initialisation: {how!r}")

        # Same (in, out) layout as the allocation in __init__.
        self.weights = np.random.normal(
            0.0, 0.01, size=(self.in_features, self.out_features)
        )
        if self.bias is not None:
            self.bias = np.random.normal(0.0, 0.01, size=self.out_features)

    def __call__(self, inputs: np.ndarray) -> np.ndarray:
        """Apply the layer to ``inputs`` and remember them for backward().

        Args:
            inputs: Array whose last axis has length ``in_features``; either a
                single vector ``(in_features,)`` or a batch
                ``(..., in_features)``.

        Returns:
            Array with the last axis replaced by ``out_features``.
        """
        inputs = np.asarray(inputs)
        assert inputs.shape[-1] == self.in_features, (
            f"expected last axis of size {self.in_features}, "
            f"got input of shape {inputs.shape}"
        )
        self.input = inputs  # needed to compute the weight gradient later
        output = inputs @ self.weights
        if self.bias is not None:
            output = output + self.bias
        return output

    def backward(self, gradient: np.ndarray) -> np.ndarray:
        """Back-propagate ``gradient`` (dLoss/dOutput) through the layer.

        Stores ``weights_grad`` (same shape as ``weights``) and ``bias_grad``
        (shape ``(out_features,)``) on the instance.

        Args:
            gradient: Gradient of the loss with respect to the layer output,
                shaped like the output of the most recent forward call.

        Returns:
            Gradient of the loss with respect to the layer input.

        Raises:
            RuntimeError: If called before any forward pass.
        """
        if self.input is None:
            raise RuntimeError("backward() called before a forward pass")

        gradient = np.asarray(gradient)
        # Flatten any leading (batch) axes so single vectors and batches go
        # through the same matrix products.
        flat_inputs = self.input.reshape(-1, self.in_features)
        flat_gradient = gradient.reshape(-1, self.out_features)

        self.weights_grad = flat_inputs.T @ flat_gradient
        self.bias_grad = flat_gradient.sum(axis=0)
        return gradient @ self.weights.T

In [4]:
# Small demo layer: 2 input features mapped to 3 outputs, with a bias term.
linear1 = Linear(in_features=2, out_features=3, bias=True)

In [5]:
# Two-element sample vector used to exercise the layer.
inputs = np.array((0.5, 0.3))

In [6]:
# Echo the shape of the sample vector (the notebook prints the result below).
inputs.shape

(2,)

In [10]:
# Rebuild the sample vector and push it through the demo layer.
inputs = np.array((0.5, 0.3))
output = linear1(inputs)

In [14]:
class Relu():
    """Rectified linear unit activation: ``max(0, x)`` element-wise."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Apply ReLU and cache the result for :meth:`backward`.

        Args:
            x: Input array of any shape.

        Returns:
            Array of the same shape with negative entries replaced by 0.
        """
        self.output = np.maximum(0, x)   
        return self.output

    def backward(self, grad: np.ndarray) -> np.ndarray:
        """Back-propagate ``grad`` through the activation.

        The derivative of ReLU is 1 where the output is positive and 0
        elsewhere, so the incoming gradient is passed through a 0/1 mask
        rather than being scaled by the activation value.

        Args:
            grad: Gradient of the loss with respect to the ReLU output.

        Returns:
            Gradient of the loss with respect to the ReLU input.
        """
        return grad * (self.output > 0)

In [None]:
class MSE:
    """Mean squared error loss, averaged over every element."""

    def __call__(self, predict_values: np.ndarray, true_values: np.ndarray) -> float:
        """Compute the loss and cache the residual for :meth:`backward`.

        Args:
            predict_values: Model predictions.
            true_values: Targets; must have the same shape as the predictions.

        Returns:
            Mean of the squared differences over all elements.
        """
        assert predict_values.shape == true_values.shape, (
            f"shape mismatch: {predict_values.shape} vs {true_values.shape}"
        )
        self.error = predict_values - true_values
        return np.mean(self.error ** 2)
    
    def backward(self) -> np.ndarray:
        """Return the gradient of the loss with respect to the predictions.

        The forward pass averages over all elements, so the normaliser is the
        total element count, not just the length of the last axis.
        """
        return (2.0 / self.error.size) * self.error

In [None]:
from typing import Any

class Softmax:
    """Softmax activation taken over the last axis of the input."""

    def __call__(self, inputs: np.ndarray) -> np.ndarray:
        """Compute softmax probabilities and cache them for :meth:`backward`.

        Args:
            inputs: Logits; a single vector or a batch with classes on the
                last axis.

        Returns:
            Array of the same shape whose last axis sums to 1.
        """
        # Subtracting the maximum leaves the result unchanged but keeps
        # np.exp from overflowing on large logits.
        shifted = inputs - np.max(inputs, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        self.output = exps / np.sum(exps, axis=-1, keepdims=True)
        return self.output

    def backward(self, gradient: np.ndarray) -> np.ndarray:
        """Back-propagate ``gradient`` through the softmax.

        Uses the closed form of the Jacobian-vector product,
        ``s * (g - sum(g * s))``, which avoids building the full
        ``diag(s) - s s^T`` Jacobian.

        Args:
            gradient: Gradient of the loss with respect to the softmax output,
                shaped like the output of the most recent forward call.

        Returns:
            Gradient of the loss with respect to the logits.
        """
        weighted_sum = np.sum(gradient * self.output, axis=-1, keepdims=True)
        return self.output * (gradient - weighted_sum)

In [None]:
class Optimizer:
    """Base class for optimizers; both methods are intentionally no-ops."""

    def __init__(self) -> None:
        """Create the optimizer. The base class keeps no state."""
        return None

    def step(self, model: Any, loss: Any) -> None:
        """Perform one update. The base implementation does nothing.

        Args:
            model: Object being optimised (unused here).
            loss: Loss object (unused here).
        """
        return None


class SGD(Optimizer):
    """Plain stochastic gradient descent: ``param -= learning_rate * grad``."""

    def __init__(self, learning_rate: float) -> None:
        """Store the step size.

        Args:
            learning_rate: Multiplier applied to every gradient.
        """
        super().__init__()
        self.learning_rate = learning_rate
        
    def step(self, model: Any, loss: Any) -> None:
        """Back-propagate the loss through ``model`` and update its layers.

        Args:
            model: Object exposing ``layers``, an ordered collection of layers
                in forward order. Each layer must provide
                ``backward(gradient)`` returning the gradient with respect to
                its input.
            loss: Loss object whose ``backward()`` returns the gradient with
                respect to the model output.
        """
        gradient = loss.backward()
        # Back-propagation walks the layers last-to-first, handing each layer
        # the gradient produced by the layer that follows it.
        for layer in reversed(list(model.layers)):
            # backward() runs before the update so the returned gradient is
            # computed with the weights used in the forward pass.
            gradient = layer.backward(gradient)

            # Parameter-free layers (e.g. activations) have nothing to update.
            if getattr(layer, 'weights', None) is None:
                continue

            layer.weights -= self.learning_rate * layer.weights_grad
            if getattr(layer, 'bias', None) is not None:
                layer.bias -= self.learning_rate * layer.bias_grad
                
class Adam(Optimizer):
    """Adam optimizer with bias-corrected moment estimates.

    NOTE(review): parameters are presumably small wrapper objects exposing
    ``.data`` (a NumPy array updated in place) and ``.grad`` (a NumPy array,
    or an object with its own ``.data``) -- no such type is visible in this
    file, confirm against the caller.
    """

    def __init__(self, params, lr=0.001, betas=(0.9, 0.999), eps=1e-8):
        """Set hyper-parameters and allocate per-parameter state.

        Args:
            params: Iterable of parameter objects (see class note).
            lr: Step size.
            betas: Decay rates for the first and second moment estimates.
            eps: Constant added to the denominator for numerical stability.
        """
        super().__init__()
        # Materialise once so a generator can be iterated on every step.
        self.params = list(params)
        self.lr = lr
        self.betas = betas
        self.eps = eps
        self.t = 0
        # State is kept in lists parallel to self.params because NumPy arrays
        # (and objects wrapping them) are not reliable dictionary keys.
        # Naming follows the original code: ``v`` is the first moment and
        # ``m`` the second, the reverse of the Adam paper's convention.
        self.v = [np.zeros_like(param.data, dtype=float) for param in self.params]
        self.m = [np.zeros_like(param.data, dtype=float) for param in self.params]

    def step(self):
        """Apply one Adam update to every parameter that has a gradient."""
        self.t += 1
        beta1, beta2 = self.betas
        for index, param in enumerate(self.params):
            grad = param.grad
            if grad is None:
                continue  # nothing was back-propagated to this parameter
            if not isinstance(grad, np.ndarray):
                # Wrapper-style gradient: unwrap its array. ndarray.data is a
                # raw memory buffer, so plain arrays must not be unwrapped.
                grad = getattr(grad, 'data', grad)
            grad = np.asarray(grad)

            self.v[index] = beta1 * self.v[index] + (1 - beta1) * grad
            self.m[index] = beta2 * self.m[index] + (1 - beta2) * (grad * grad)
            # Bias correction compensates for the zero-initialised moments.
            v_corrected = self.v[index] / (1 - beta1 ** self.t)
            m_corrected = self.m[index] / (1 - beta2 ** self.t)
            param.data -= self.lr * v_corrected / (np.sqrt(m_corrected) + self.eps)
