In [106]:
from typing import Callable, Dict, Tuple, List
from numpy import ndarray
import numpy as np
#weights are dict with str as key and value as ndarray
#forward will return two values, first value is a float
#and second value is the weight

def init_weights(inputsize:int,hiddensize:int, outputsize:int) -> Dict[str,ndarray]:
    weights: Dict[str,ndarray] = {}
#     weights['W'] = np.full((n_in,1),1/n_in)
    weights['W1'] = np.random.randn(inputsize,hiddensize)
    weights['W2'] = np.random.randn(hiddensize,outputsize)
    weights['B1'] = np.random.randn(1,hiddensize)
    weights['B2'] = np.random.randn(1,1)
    return weights


def forward(X: ndarray,y:ndarray,weights:Dict[str,ndarray])  -> Tuple[float,Dict[str,ndarray]]: # Tuple(forward info, loss)
    
    # weights['B']
    # weights ['W']
    
#     print(weights['W'].shape)
    
    ### Assert batch sizes of X and y are equal
    assert X.shape[0] == y.shape[0]
    ### Assert that X and w can be dotted
    assert X.shape[1] == weights['W1'].shape[0]
    
    ### Assert that B is just a value (is shape (1,1))
    assert weights['B1'].shape[0] == 1
    assert weights['B1'].shape[1] == weights['W1'].shape[1]
    assert weights['B2'].shape[0] == 1
    assert weights['B2'].shape[1] == 1
    
    ### compute M1
    M1 = X @ weights['W1']
    
    ### compute N1
    N1 = M1 + weights['B1']
    
    ### compute sigmoid
    O1 = 1/(1+np.exp(-N1))
    
    ### compute V
    M2 = O1@ weights['W2']
    
    ### compute alpha
    P = M2 + weights['B2']
    
    ### compute L
    L = np.mean(np.power(P-y,2))
#     print('Loss', L)
    
    # save the information of N,P,L in a dictionary called forward_inf
    forward_info: Dict[str,ndarray] = {} # initializing dictionary with data type specified
        
    ### set the forward_info to remember X,N,P,y
    # for example
    #maybe use this for calculating gradients
    forward_info['X'] = X
    forward_info['M1'] = M1
    forward_info['N1'] = N1
    forward_info['B1'] = weights['B1']
    forward_info['B2'] = weights['B2']
    forward_info['O1'] = O1
    forward_info['W1'] = weights['W1']
    forward_info['W2'] = weights['W2']
    forward_info['M2'] = M2
    forward_info['P'] = P
    forward_info['y'] = y
    
    return forward_info, L

In [107]:
def backward(forward_info: Dict[str,ndarray],weights: Dict[str,ndarray]) -> Dict[str,ndarray]:
    
#     y = forward_info['y']
#     P = forward_info['P']
#     N = forward_info['N']
#     X = forward_info['X']

#     alpha = forward_info['alpha']
    y = forward_info['y']
    P = forward_info['P']
    M2 = forward_info['M2']
    W2 = forward_info['W2']
    W1 = forward_info['W1']
    O1 = forward_info['O1']
    B1 = forward_info['B1']
    B2 = forward_info['B2']
    N1 = forward_info['N1']
    M1 = forward_info['M1']
    X = forward_info['X']
    

    dLdP = 2* (P - y) #m,1
    
    dPdM2 = np.ones_like(M2) #m,1 #shape of M2
    
    dLdM2 = dLdP * dPdM2 #m,1
    
    dM2dO1 = W2.T #1,h
    
    dLdO1 = dLdM2 @ dM2dO1 #m,h
    
    dO1dN1 = O1*(1-O1) #m,h
    
    dLdN1 = dLdO1 * dO1dN1 #m,h
    
    dN1dM1 = np.ones_like(M1) #m,h #shape of M1
    
    dLdM1 = dLdN1 * dN1dM1 #m,h  
    
    dM1dW1 = X.T #n,mq  
    
    dLdW1 = dM1dW1 @ dLdM1 #
    
    dN1dB1 = np.ones_like(B1) #shape of B1
    
    dLdB1 = (dLdN1 * dN1dB1).sum(axis = 0)
    
    dM2dW2 = O1.T #h,m 
    
    dLdW2 = dM2dW2  @ dLdM2 
    
    dPdB2 = np.ones_like(B2) #shape of B2
    
    dLdB2 = (dLdP * dPdB2).sum(axis = 0)

    
    grads: Dict[str,ndarray] = {}

    grads['W2'] = dLdW2  
    grads['B2'] = dLdB2
    grads['W1'] = dLdW1
    grads['B1'] = dLdB1
    
    return grads

In [108]:
def permuteXY(X, y):
    perm = np.random.permutation(X.shape[0])
    return X[perm],y[perm]

def train(X: ndarray, y:ndarray, max_iter: int = 1000, learning_rate: float = 0.01, 
          batch_size: int= 100) -> None: # the weights change #< -- mini-batch gradient descent
    np.random.seed(42)
    start = 0 #<--  initialize start index for mini-batch (we are gonna do without replacement) # no data will be used more than once -> without replacement
    
    #get my weights dict
    inputsize = X.shape[1]
    hiddensize = 13
    outputsize = 1
    weights = init_weights(inputsize,hiddensize,outputsize) #<-- init_weights look up there^
    
    #shuffle my X a little bit to increase generalizing power
    X,y = permuteXY(X,y)
    
    for i in range(max_iter):
        # in case all data used
        # index is exceeded
        if start >=  X.shape[0]: 
            # shuffle X again
            X,y = permuteXY(X,y)
            # restart the start index
            start = 0
            
        # if batch_size exceeds the last guy, reduce the batch size
        if start + batch_size > X.shape[0]:
            batch_size = X.shape[0] - start
            
        X_batch,y_batch = X[start:start+batch_size] , y[start:start+batch_size]
        start += batch_size
        
        # perform first prediction
        forward_info, loss = forward(X_batch,y_batch,weights)
        
        # calculate gradients
        loss_grad = backward(forward_info,weights)
        
        # update W and B
        weights['B1'] -= learning_rate * loss_grad['B1']
        weights['W1'] -= learning_rate * loss_grad['W1']
        weights['B2'] -= learning_rate * loss_grad['B2']
        weights['W2'] -= learning_rate * loss_grad['W2']
        
    return weights
    

In [109]:
##### lets load some data
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

#### so please load boston
X, y = load_boston(return_X_y=True)
#boston = load_boston()
#X = boston.data
#y = boston.target

#### please standardize them
scaler = StandardScaler()
X = scaler.fit_transform(X)

#### train test split them
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size =0.3,random_state = 42)

# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

##### reshape y to (m,1) < --- because our code want 1 there
y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

In [110]:
weights = train(X_train,y_train,max_iter = 10000, learning_rate = 3e-4, batch_size = 20)
weights

{'W1': array([[ 0.1542346 , -0.78919109, -0.20190233,  1.61576053, -0.42313543,
         -0.63328183,  0.47762574, -0.18569829, -0.93419023,  0.50038345,
         -0.66240852, -0.52412739, -0.24362405],
        [-1.49087208, -1.29912297, -0.05100142, -0.79220485,  0.81992276,
         -0.81581593, -0.46925406,  0.97960232,  0.27745478,  0.13095733,
         -0.19587148,  0.03859562,  0.19217519],
        [-0.75816157,  0.42384316, -0.11492181,  0.0052847 , -0.44344517,
          2.12819708,  0.97374521, -0.11525674,  0.84299193, -0.96327729,
          0.57519418, -2.03176496, -0.83700061],
        [-0.11318904,  1.08292305, -0.18994501, -0.26914934,  0.11230511,
         -0.3851478 , -0.30017092, -0.0829786 ,  1.05711018,  0.93603477,
         -1.03382181,  1.82868086,  0.22395735],
        [-0.77664981,  0.64090359,  0.29104044,  1.16934098, -1.42585901,
         -0.32804685,  0.707869  ,  0.94368154, -0.63809569,  0.10827806,
         -1.45785133, -1.17274395,  0.61427461],
        [

In [111]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

def predict(X: ndarray, weights: Dict[str, ndarray]):
    O1 = sigmoid(X @ weights['W1'] + weights['B1'])
    return O1 @ weights['W2'] + weights['B2']

In [112]:
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

ypred = predict(X_test,weights)
mean_s_r = mean_squared_error(y_test,ypred)
r2 =r2_score(y_test,ypred)

In [100]:
print(mean_s_r)
print(r2)

12.200716061748745
0.8362607811584442
