In [31]:
import numpy as np

In [32]:
def sigmoid(x):
    """
    Sigmoid function
    """
    return 1/(1+np.exp(-x))

def sigmoid_prime(x):
    """
    Derivative of the sigmoid function
    """
    return sigmoid(x) * (1 - sigmoid(x))

# input values:
learnrate = 0.5
x = np.array([1, 2, 3, 4])
y = np.array(0.5)

In [33]:
# Initial weights
w = np.array([0.5, -0.5, 0.3, 0.1])

#Calculate the node's linear combination of inputs and weights
h = np.dot(w,x)

#Calculate output of neural network
nn_output = sigmoid(h)

#Calculate error of neural network
error = y - nn_output

# Calculate the error term
error_term = sigmoid_prime(h)*error

#Calculate change in weights
del_w = learnrate * error_term * x

print ('example:')
print ('--------------')
print ('input:')
print ('-----')
print ('learnrate:', learnrate)
print ('        x:', x)
print ('        y:', y)
print ('-----')
print ('initialization:')
print ('        w:', w)
print ('-----')
print ('single iteration:')
print ('      w*x:',h)
print ('        o:',nn_output)
print ('      err:',error)
print ("(sigmoid)'*err:",error_term)
print ('delta_w:',del_w)
print ('w after update:',w+del_w)

example:
--------------
input:
-----
learnrate: 0.5
        x: [1 2 3 4]
        y: 0.5
-----
initialization:
        w: [ 0.5 -0.5  0.3  0.1]
-----
single iteration:
      w*x: 0.7999999999999998
        o: 0.6899744811276125
      err: -0.1899744811276125
(sigmoid)'*err: -0.04063738360460799
delta_w: [-0.02031869 -0.04063738 -0.06095608 -0.08127477]
w after update: [ 0.47968131 -0.54063738  0.23904392  0.01872523]


In [47]:
#toy data
X = [0.5,2.5]
Y = [0.2,0.9]

w_values = []
b_values = []
loss_values = []

def f(w,b,x):
    return 1. / (1. + np.exp(-(w*x)+b))
  
def error(w,b):
  err = 0.0
  for x,y in zip(X,Y):
    fx = f(w,b,x)
    err += 0.5*(fx-y)**2
  return(err)

def grad_w(w,b, x, y):
    y_pred = f(w,b,x)
    return (y_pred - y) * y_pred * (1 - y_pred) * x

def grad_b(w,b, x, y):
    y_pred = f(w,b,x)
    return (y_pred - y) * y_pred * (1 - y_pred)

def gradient_descent():
  w,b,eta = 0, -8, 1.0
  #values for illustration, we can choose randomly
  for i in range(1000):
    #iterating for 1000 epochs
      dw, db = 0, 0
      for x, y in zip(X, Y):
          dw += grad_w(w,b,x, y)
          db += grad_b(w,b,x, y)
      w -= eta*dw
      b -= eta*db
      print ('w0[%d]: %10.8f, w1[%d]:%10.8f,  err: %10.8f' %(i,b,i,w,error(w,b)))  
      w_values.append(w)
      b_values.append(b)
      loss_values.append(error(w,b))

gradient_descent()

w0[0]: -8.00030149, w1[0]:-0.00021757,  err: 0.32469834
w0[1]: -8.00060293, w1[1]:-0.00043513,  err: 0.32469838
w0[2]: -8.00090434, w1[2]:-0.00065268,  err: 0.32469843
w0[3]: -8.00120570, w1[3]:-0.00087024,  err: 0.32469847
w0[4]: -8.00150701, w1[4]:-0.00108778,  err: 0.32469851
w0[5]: -8.00180828, w1[5]:-0.00130532,  err: 0.32469856
w0[6]: -8.00210951, w1[6]:-0.00152285,  err: 0.32469860
w0[7]: -8.00241069, w1[7]:-0.00174038,  err: 0.32469865
w0[8]: -8.00271184, w1[8]:-0.00195790,  err: 0.32469869
w0[9]: -8.00301293, w1[9]:-0.00217542,  err: 0.32469873
w0[10]: -8.00331399, w1[10]:-0.00239293,  err: 0.32469878
w0[11]: -8.00361500, w1[11]:-0.00261044,  err: 0.32469882
w0[12]: -8.00391597, w1[12]:-0.00282794,  err: 0.32469886
w0[13]: -8.00421689, w1[13]:-0.00304544,  err: 0.32469890
w0[14]: -8.00451778, w1[14]:-0.00326293,  err: 0.32469895
w0[15]: -8.00481861, w1[15]:-0.00348041,  err: 0.32469899
w0[16]: -8.00511941, w1[16]:-0.00369789,  err: 0.32469903
w0[17]: -8.00542016, w1[17]:-0.003