In [3]:
import numpy as np
import math

### Using equations described above to calculate w0, W and V for 1 iteration

#### Sample toy set
#### label  feature1  feature2  feature3  feature4(categorical)
#### 0      1         1         5         0
#### 1      2         3         4         1
#### 0      4         10        11        1
#### 1      1         1         5         0

In [4]:
learning_rate = 0.01
num_latent_factors = 2
mean = 0
std = 0.01
reg_bias = 0.01
reg_independent = 0.01
reg_interaction = np.full(num_latent_factors, 0.01)

In [5]:
w0 = 0.0
W = np.zeros(4)
V = np.random.normal(mean, std, (4,2))

In [6]:
x1 = np.dot(V[0],V[1])*1 + np.dot(V[0],V[2])*5 + np.dot(V[0],V[3])*0 + np.dot(V[1],V[2])*5 + np.dot(V[1],V[3])*0 + np.dot(V[2],V[3])*0
x2 = np.dot(V[0],V[1])*6 + np.dot(V[0],V[2])*8 + np.dot(V[0],V[3])*2 + np.dot(V[1],V[2])*12 + np.dot(V[1],V[3])*3 + np.dot(V[2],V[3])*4
x3 = np.dot(V[0],V[1])*40 + np.dot(V[0],V[2])*44 + np.dot(V[0],V[3])*4 + np.dot(V[1],V[2])*110 + np.dot(V[1],V[3])*10 + np.dot(V[2],V[3])*11
x4 = np.dot(V[0],V[1])*1 + np.dot(V[0],V[2])*5 + np.dot(V[0],V[3])*0 + np.dot(V[1],V[2])*5 + np.dot(V[1],V[3])*0 + np.dot(V[2],V[3])*0

### First part of gradient equation

In [7]:
gradient_part_1= (1.0/(1.0+math.exp(-0*x1))) - 0
gradient_part_2= (1.0/(1.0+math.exp(-1*x2))) - 1
gradient_part_3= (1.0/(1.0+math.exp(-0*x3))) - 0
gradient_part_4= (1.0/(1.0+math.exp(-1*x4))) - 1

### w0 gradient 

In [10]:
w0_gradient = sum([gradient_part_1,gradient_part_2,gradient_part_3,gradient_part_4])/4
w0_gradient

-6.663834502987354e-05

### W gradient

In [11]:
gradient1w= gradient_part_1*np.array([1,1,5,0])
gradient2w= gradient_part_2*np.array([2,3,4,1])
gradient3w= gradient_part_3*np.array([4,10,11,1])
gradient4w= gradient_part_4*np.array([1,1,5,0])

In [12]:
W_gradient = sum([gradient1w,gradient2w,gradient3w,gradient4w])/4
W_gradient 

array([ 2.49880857e-01,  8.74828353e-01,  8.74719313e-01, -5.25043691e-05])

### V gradient

In [13]:
grad_part_11 = 1 * (np.multiply(V[1],1) + np.multiply(V[2],5) + np.multiply(V[3],0))
grad_part_12 = 1 * (np.multiply(V[0],1) + np.multiply(V[2],5) + np.multiply(V[3],0))
grad_part_13 = 5 * (np.multiply(V[1],1) + np.multiply(V[0],1) + np.multiply(V[3],0))
grad_part_14 = 0 * (np.multiply(V[1],1) + np.multiply(V[2],5) + np.multiply(V[0],1))

In [14]:
gradient1V= gradient_part_1*np.array([grad_part_11,grad_part_12,grad_part_13,grad_part_14])

In [15]:
grad_part_21 = 2 * (np.multiply(V[1],3) + np.multiply(V[2],4) + np.multiply(V[3],1))
grad_part_22 = 3 * (np.multiply(V[0],2) + np.multiply(V[2],4) + np.multiply(V[3],1))
grad_part_23 = 4 * (np.multiply(V[1],3) + np.multiply(V[0],2) + np.multiply(V[3],1))
grad_part_24 = 1 * (np.multiply(V[1],3) + np.multiply(V[2],4) + np.multiply(V[0],2))

In [16]:
gradient2V= gradient_part_2*np.array([grad_part_21,grad_part_22,grad_part_23,grad_part_24])

In [17]:
grad_part_31 = 4 * (np.multiply(V[1],10) + np.multiply(V[2],11) + np.multiply(V[3],1))
grad_part_32 = 10 * (np.multiply(V[0],4) + np.multiply(V[2],11) + np.multiply(V[3],1))
grad_part_33 = 11 * (np.multiply(V[1],10) + np.multiply(V[0],4) + np.multiply(V[3],1))
grad_part_34 = 1 * (np.multiply(V[1],10) + np.multiply(V[2],11) + np.multiply(V[0],4))

In [18]:
gradient3V= gradient_part_3*np.array([grad_part_31,grad_part_32,grad_part_33,grad_part_34])

In [19]:
grad_part_41 = 1 * (np.multiply(V[1],1) + np.multiply(V[2],5) + np.multiply(V[3],0))
grad_part_42 = 1 * (np.multiply(V[0],1) + np.multiply(V[2],5) + np.multiply(V[3],0))
grad_part_43 = 5 * (np.multiply(V[1],1) + np.multiply(V[0],1) + np.multiply(V[3],0))
grad_part_44 = 0 * (np.multiply(V[1],1) + np.multiply(V[2],5) + np.multiply(V[0],1))

In [20]:
gradient4V= gradient_part_4*np.array([grad_part_41,grad_part_42,grad_part_43,grad_part_44])

In [24]:
V_gradient = sum([gradient1V,gradient2V,gradient3V,gradient4V])/4
V_gradient

array([[ 0.01323777, -0.03360293],
       [-0.14321238, -0.06859905],
       [ 0.12363136, -0.16737963],
       [ 0.0022569 , -0.01137254]])

### parameter updates

In [25]:
w0 = w0 - learning_rate * (w0_gradient + 2 * reg_bias * w0)
w0

6.663834502987354e-07

In [26]:
W = W - learning_rate * (W_gradient + 2 * reg_independent * W)
W

array([-2.49880857e-03, -8.74828353e-03, -8.74719313e-03,  5.25043691e-07])

In [27]:
V = V - learning_rate * (V_gradient + 2 * np.multiply(reg_interaction, V))
V

array([[-0.006133  , -0.01629556],
       [ 0.01452971, -0.00729877],
       [-0.01004112,  0.00141236],
       [-0.01125931,  0.00616726]])