In [1]:
import numpy as np

In [2]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

def test_sigmoid():
    test_cases = [(0,0.5),
                  (-100,0),
                 (100,1)]
    
    for x,y in test_cases:
        pred = round(sigmoid(x),4)
        assert pred == y, f'{pred} neq {y}'
        
test_sigmoid()

In [3]:
def neg_log_loss(pred,y):
    return -(y * np.log(pred) + (1-y) * np.log(1-pred) )

def test_neg_log_loss():
    
    test_cases = [(0.000001,0,0), (0.9999999,1,0),(0.3,0,round(-np.log(0.7),4))]
    
    for pred,y,loss in test_cases:
        print(pred,y,loss)
        neg_loss = round(neg_log_loss(pred,y),4)
        assert neg_loss == loss , f"{neg_loss=} neq {loss=}"

test_neg_log_loss()

1e-06 0 0
0.9999999 1 0
0.3 0 0.3567


In [4]:
sigmoid(100)

1.0

In [5]:
sigmoid(0)

0.5

# Logistic Regression

In [6]:
sample_size = 20
input_dim = 3
X = np.random.randn(input_dim,sample_size)
Y = np.random.choice([0,1],sample_size).reshape(1,sample_size)

In [7]:
Y.shape, X.shape

((1, 20), (3, 20))

In [8]:
hidden_node_size = 1
W = np.random.randn(hidden_node_size,input_dim)
B = np.random.randn(1)

In [9]:
W.shape, B.shape

((1, 3), (1,))

In [10]:
z = np.matmul(W,X) + B
z.shape

(1, 20)

In [11]:
z

array([[ 1.30108329,  3.3559704 ,  2.29305178,  5.39143942,  0.22332182,
        -0.90609519, -1.40676835,  0.59137067,  1.91820636,  5.14922994,
         3.55974381, -1.16140935,  3.4926294 , -1.02240655,  2.09740392,
         4.76738831, -0.19158711, -1.1040475 , -3.45294174,  3.42278861]])

In [12]:
a = sigmoid(np.matmul(W,X) + B) 
a.shape

(1, 20)

In [13]:
np.log([1,2,3,np.e])

array([0.        , 0.69314718, 1.09861229, 1.        ])

In [14]:
L = Y * np.log(a) + (1-Y) * np.log(1-a)
L.shape

(1, 20)

In [15]:
L

array([[-1.54185984, -0.03428114, -2.38923239, -5.39598448, -0.81102926,
        -0.33939587, -1.6258505 , -1.03192489, -0.13703663, -5.15501703,
        -0.02804903, -1.43375798, -3.52259664, -1.32969137, -2.21320699,
        -4.77585492, -0.60193483, -1.39037353, -0.03116178, -3.45488917]])

## Gradient Descent for Logistic Regression
Let's derive the equation below.

Below is coming from the fact that $\dfrac{d lnx}{x} = \dfrac{1}{x}$


$$
\dfrac{dL}{da} = \dfrac{Y}{a} + \dfrac{1-Y}{1-a}
$$

Then if we go one step back to calculate $\dfrac{dL}{dz}$ using chain rule:

$$
\dfrac{dL}{dz} = \dfrac{dL}{da} * \dfrac{da}{dz}
$$

where

$$
\dfrac{d\sigma (x)}{dx} = \sigma(x) * (1-\sigma(x))
$$

yielding

$$
\dfrac{dL}{dz} = (\dfrac{Y}{a} + \dfrac{1-Y}{1-a}) * a * (1-a) \\
= a -Y
$$


Finally: 

$$
\dfrac{dL}{dW} = (a - Y) * X 
$$

And 

$$
\dfrac{dL}{dB} = (a-Y)
$$

## Backward Pass (with for loop)

In [16]:
np.array([0,1,0.1]) * 3 

array([0. , 3. , 0.3])

In [17]:
dW = np.zeros(W.shape)
dB = np.zeros(B.shape)


dW, dB

(array([[0., 0., 0.]]), array([0.]))

In [28]:
epoch_num = 5000
l_r = 0.01
for e in range(epoch_num):

    dW = np.zeros(W.shape)
    dB = np.zeros(B.shape)
    total_cost = 0
    for i in range(sample_size):
        x_ = X[:,i]

        z = np.matmul(W,x_) + B 

        a = sigmoid(z)
        y = Y[:,i]
        l = neg_log_loss(a,y)
        dW += (a-y) * x_ 

        dB += (a-y)
        total_cost+=l
    if e% 1000 == 999:
        print(f"{total_cost=}")
    # update params 
    W -= l_r * dW/sample_size
    B -= l_r * dB/sample_size 

total_cost=array([12.0334063])
total_cost=array([12.0334063])
total_cost=array([12.0334063])
total_cost=array([12.0334063])
total_cost=array([12.0334063])


## Backward Pass (without for loop)

Now lets write everything as matrix operations

In [165]:
sample_size = 20
input_dim = 50
X = np.random.randn(input_dim,sample_size)
Y = np.random.choice([0,1],sample_size).reshape(1,sample_size)
hidden_node_size = 1
W = np.random.randn(hidden_node_size,input_dim)
B = np.random.randn(1)

In [181]:
epoch = 500
l_r = 0.01
for e in range(epoch):
    Z = np.matmul(W,X) + B

    A = sigmoid(Z)
    
    L = neg_log_loss(A,Y)
    # print(f"{Z.shape=} {A.shape=} {L.shape=}")
    #update
    dW =  np.matmul(A-Y,X.T) 
    dB = (A-Y)
    # print(f"{dW.shape} {dB.shape}")
    W -= l_r * dW / sample_size 
    B -= l_r * np.sum(dB) / sample_size

In [182]:
np.sum(L)

0.7670965759214501

In [193]:
np.round(sigmoid(np.matmul(W, X)+ B),3).reshape(1,-1)[0], Y[0]

(array([0.026, 0.986, 0.024, 0.   , 0.055, 0.004, 0.088, 0.   , 0.994,
        0.956, 0.069, 0.039, 0.95 , 0.957, 0.971, 0.962, 0.941, 0.986,
        0.06 , 0.916]),
 array([0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1]))

In [122]:
sigmoid(np.matmul(W, X[:,1])+ B), Y[:,2]

(array([0.94871062]), array([0]))

In [14]:
import numpy as np
X = np.array([
    [1.,0,0],
    [2.,0,1],
    [3.,1,0]
])
Y = np.array([0,0,1])

In [96]:
# initialize
W = np.random.randn(1,3)
b = np.zeros((1,1))

In [97]:
W.shape

(1, 3)

In [98]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

In [99]:
def get_loss(A,Y):
    return Y * np.log(A) + (1-Y) * np.log(1-A)

In [136]:
# forward pass
Z = np.matmul(W,X.T)+b
A = sigmoid(Z)
loss = -np.mean(get_loss(A,Y))

# backward pass
a = 1.
dZ = A - Y
dW = dZ * X
db = dZ 
W = W - a*dW
b = b - a*db 
loss

0.042024429071527564

In [137]:
A 

array([[1.32594564e-02, 4.80993472e-03, 9.30107718e-01],
       [8.81729041e-03, 8.41571985e-02, 9.16170066e-01],
       [6.58262196e-03, 5.58451217e-04, 9.08362179e-01]])

In [80]:
A.shape

(3, 3)