# FFM 尝试

## 数据的重编码

### 创建数据集

In [1]:
import pandas as pd
import numpy as np
# Show three digits of precision in both NumPy and pandas output.
np.set_printoptions(precision=3)
pd.options.display.precision = 3

In [3]:
import mxnet as mx

In [2]:
# Toy dataset: a string feature A, two integer features B and C, and a 0/1 label y.
data = pd.DataFrame(
    [("a1", 1, 3, 0), ("a2", 2, 2, 0), ("a1", 3, 4, 1)],
    columns=["A", "B", "C", "y"],
)
data

Unnamed: 0,A,B,C,y
0,a1,1,3,0
1,a2,2,2,0
2,a1,3,4,1


In [3]:
# Label column, kept separately from the features.
ydata = data.loc[:, 'y']
ydata

0    0
1    0
2    1
Name: y, dtype: int64

In [4]:
# Feature frame: an independent copy of the data without the label column.
xdata = data.drop(labels='y', axis='columns').copy()


In [5]:
xdata

Unnamed: 0,A,B,C
0,a1,1,3
1,a2,2,2
2,a1,3,4


### 转换部分类型为分类变量

In [6]:
# Mark B as categorical so it is one-hot encoded instead of being kept as a number.
xdata['B'] = xdata['B'].astype('category')

### 转换成稀疏形式

In [7]:
# Field names of the discrete (one-hot encoded) features and of the continuous ones.
categories, continues = ['A', 'B'], ['C']

In [8]:
# One-hot encode the categorical fields; dummy columns are named "<field>:<value>".
xdata_trnsf = pd.get_dummies(
    xdata,
    prefix=categories,
    prefix_sep=":",
    sparse=True,
)
print(xdata_trnsf)

   C  A:a1  A:a2  B:1  B:2  B:3
0  3     1     0    1    0    0
1  2     0     1    0    1    0
2  4     1     0    0    0    1


In [9]:
# Collect the first-level (field) and second-level (full column name) labels
# that are used to rebuild the column index.
import re
label_v2 = xdata_trnsf.columns.tolist()
label_v1 = [name.partition(':')[0] for name in label_v2]

In [10]:
# Two-level column index: field_id on the outside, index_id (one-hot column) inside.
index = pd.MultiIndex.from_tuples(
    list(zip(label_v1, label_v2)),
    names=['field_id', 'index_id'],
)
xdata_trnsf.columns = index
xdata_trnsf

field_id,C,A,A,B,B,B
index_id,C,A:a1,A:a2,B:1,B:2,B:3
0,3,1,0,1,0,0
1,2,0,1,0,1,0
2,4,1,0,0,0,1


###  对数据进行重新编码

In [11]:
xdata_trnsf

field_id,C,A,A,B,B,B
index_id,C,A:a1,A:a2,B:1,B:2,B:3
0,3,1,0,1,0,0
1,2,0,1,0,1,0
2,4,1,0,0,0,1


## 初始化随机矩阵

In [40]:
# Sizes of the weight tensor: number of fields, number of one-hot indices,
# and the latent factor size K.
numOfField = len(xdata_trnsf.columns.levels[0])
numOfIndex = len(xdata_trnsf.columns.levels[1])
numOfK     = 2

# Random initial weights, labelled by index_id (major axis) and field_id (minor axis).
# For a reproducible tensor while debugging, swap the random array for
# np.arange(numOfK * numOfIndex * numOfField).reshape(numOfK, numOfIndex, numOfField).
Wmatrix = pd.Panel(
    np.random.rand(numOfK, numOfIndex, numOfField),
    major_axis=xdata_trnsf.columns.levels[1],
    minor_axis=xdata_trnsf.columns.levels[0],
)

# Wmatrix holds, for every index_id, its factor values with respect to every field.
Wmatrix

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 6 (major_axis) x 3 (minor_axis)
Items axis: 0 to 1
Major_axis axis: A:a1 to C
Minor_axis axis: A to C

### 从panel中获取元素的方法

In [41]:
# Select item 0 (the first latent dimension): one row per index_id, one column per field_id.
Wmatrix.loc[0,:,:]

field_id,A,B,C
index_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A:a1,0.784,0.369,0.178
A:a2,0.929,0.155,0.909
B:1,0.902,0.406,0.864
B:2,0.302,0.267,0.111
B:3,0.948,0.711,0.044
C,0.108,0.103,0.856


## 求出$\phi$的方法

### 获取第一个人的信息

In [42]:
# Take the sample with row label 0 as a single-column sparse frame whose rows are
# the (field_id, index_id) pairs.
record = xdata_trnsf.loc[0,:].to_frame().to_sparse()
record


Unnamed: 0_level_0,Unnamed: 1_level_0,0
field_id,index_id,Unnamed: 2_level_1
C,C,3
A,A:a1,1
A,A:a2,0
B,B:1,1
B,B:2,0
B,B:3,0


In [43]:
# Outer product of the sample with itself: entry (i, j) is x_i * x_j.
record_matrix = record.dot(record.T)
# Keep every unordered pair once: upper triangle with the diagonal cleared.
upper = np.triu(record_matrix.values)
np.fill_diagonal(upper, 0)
record_matrix = pd.DataFrame(
    upper,
    index=record_matrix.index,
    columns=record_matrix.columns,
)
record_matrix

Unnamed: 0_level_0,field_id,C,A,A,B,B,B
Unnamed: 0_level_1,index_id,C,A:a1,A:a2,B:1,B:2,B:3
field_id,index_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
C,C,0,3,0,3,0,0
A,A:a1,0,0,0,1,0,0
A,A:a2,0,0,0,0,0,0
B,B:1,0,0,0,0,0,0
B,B:2,0,0,0,0,0,0
B,B:3,0,0,0,0,0,0


### 利用 record_matrix 的索引，从 Wmatrix 中取出 xToFactor 和 factorToX 两个 tensor

#### 从多层索引中获得值的方法

In [44]:
# Row labels of record_matrix, split into their two index levels.
field_idUsed = record_matrix.index.get_level_values('field_id')
index_idUsed = record_matrix.index.get_level_values('index_id')

# xToFactor: rows are index_ids, columns are the field of each index_id.
# factorToX: the same tensor with its last two axes swapped (as a copy).
xToFactor = Wmatrix.loc[:, index_idUsed, field_idUsed]
factorToX = xToFactor.transpose(0, 2, 1, copy=True)

In [45]:
xToFactor.iloc[0,:,:]

field_id,C,A,A,B,B,B
index_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C,0.856,0.108,0.108,0.103,0.103,0.103
A:a1,0.178,0.784,0.784,0.369,0.369,0.369
A:a2,0.909,0.929,0.929,0.155,0.155,0.155
B:1,0.864,0.902,0.902,0.406,0.406,0.406
B:2,0.111,0.302,0.302,0.267,0.267,0.267
B:3,0.044,0.948,0.948,0.711,0.711,0.711


In [46]:
factorToX.iloc[0,:,:]

index_id,C,A:a1,A:a2,B:1,B:2,B:3
field_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C,0.856,0.178,0.909,0.864,0.111,0.044
A,0.108,0.784,0.929,0.902,0.302,0.948
A,0.108,0.784,0.929,0.902,0.302,0.948
B,0.103,0.369,0.155,0.406,0.267,0.711
B,0.103,0.369,0.155,0.406,0.267,0.711
B,0.103,0.369,0.155,0.406,0.267,0.711


### 计算$\phi$

In [47]:
(xToFactor.values*(factorToX.values))[0,...]

array([[0.734, 0.019, 0.099, 0.089, 0.011, 0.005],
       [0.019, 0.614, 0.728, 0.333, 0.111, 0.35 ],
       [0.099, 0.728, 0.864, 0.14 , 0.047, 0.147],
       [0.089, 0.333, 0.14 , 0.165, 0.108, 0.288],
       [0.011, 0.111, 0.047, 0.108, 0.071, 0.19 ],
       [0.005, 0.35 , 0.147, 0.288, 0.19 , 0.506]])

In [48]:
record_matrix

Unnamed: 0_level_0,field_id,C,A,A,B,B,B
Unnamed: 0_level_1,index_id,C,A:a1,A:a2,B:1,B:2,B:3
field_id,index_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
C,C,0,3,0,3,0,0
A,A:a1,0,0,0,1,0,0
A,A:a2,0,0,0,0,0,0
B,B:1,0,0,0,0,0,0
B,B:2,0,0,0,0,0,0
B,B:3,0,0,0,0,0,0


In [49]:
# Element-wise product of the two weight tensors, masked and scaled by x_i * x_j.
phi = xToFactor.values * factorToX.values * record_matrix.values
phi[0]

array([[0.   , 0.058, 0.   , 0.268, 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.333, 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   ]])

In [50]:
# Collapse every pairwise term (all factors, all pairs) into the scalar phi.
phi = np.sum(phi)
phi

1.596910081397567

In [51]:
# Predicted probability: logistic (sigmoid) transform of phi.
np.exp(phi) / (np.exp(phi) + 1)

0.831586083234489

### 计算 $\frac{\exp(-y\phi)}{1+\exp(-y\phi)}$

In [52]:
# Label of the first sample and the factor exp(-y*phi) / (1 + exp(-y*phi)).
# NOTE(review): this form is normally paired with labels in {-1, +1}; the labels
# in this notebook are 0/1, so y = 0 always yields 0.5 — confirm the label coding.
y = ydata.loc[0]
g_phi = np.exp(-y * phi) / (np.exp(-y * phi) + 1)

In [53]:
g_phi

0.5

### 计算偏导数

#### 计算 xx $*$ xf 和 xx $*$ fx（xx = record_matrix，xf = xToFactor，fx = factorToX）


In [54]:
# Per-pair gradient contributions: the first term is factorToX masked by record_matrix
# and scaled by g_phi; the second term is xToFactor masked by record_matrix with its
# last two axes swapped, added as is.
# NOTE(review): `*` binds tighter than `+`, so g_phi multiplies only the first term —
# confirm whether both terms were meant to be scaled.
grad = g_phi * factorToX.values * record_matrix.values + (xToFactor.values * record_matrix.values).transpose(0,2,1)

In [55]:
grad

array([[[0.   , 0.267, 0.   , 1.295, 0.   , 0.   ],
        [0.325, 0.   , 0.   , 0.451, 0.   , 0.   ],
        [0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
        [0.31 , 0.369, 0.   , 0.   , 0.   , 0.   ],
        [0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
        [0.   , 0.   , 0.   , 0.   , 0.   , 0.   ]],

       [[0.   , 0.252, 0.   , 0.429, 0.   , 0.   ],
        [0.927, 0.   , 0.   , 0.09 , 0.   , 0.   ],
        [0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
        [2.431, 0.482, 0.   , 0.   , 0.   , 0.   ],
        [0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
        [0.   , 0.   , 0.   , 0.   , 0.   , 0.   ]]])

In [56]:
label_v1

['C', 'A', 'A', 'B', 'B', 'B']

In [57]:
from collections import OrderedDict
# Distinct field names in order of first appearance.
orderCol = list(OrderedDict((field, None) for field in label_v1))
orderCol

['C', 'A', 'B']

In [58]:
from collections import Counter
# Number of one-hot columns that belong to each field.
counterOfCol = {field: count for field, count in Counter(label_v1).items()}
counterOfCol

{'A': 2, 'B': 3, 'C': 1}

In [59]:
# Slice boundaries of each field's block of columns, so that the per-field
# sub-matrices can be extracted: a leading 0 followed by the running column totals.
slcs = np.insert(np.cumsum([counterOfCol[field] for field in orderCol]), 0, 0)
slcs

array([0, 1, 3, 6], dtype=int32)

### 计算真实的梯度

In [60]:
# Collapse the per-pair gradient to one column per field: for each field, sum the
# rows slcs[i]:slcs[i+1] (axis 1), stack the per-field results, then reorder the
# axes so the field axis comes last.
# NOTE(review): the sum runs over the row axis and the remaining column axis is
# later treated as the index_id axis — confirm this orientation matches the weight
# each gradient entry belongs to.
grad = np.stack(
    [
        grad[:,slcs[i]:slcs[i+1],:].sum(1) 
            for i in range(len(slcs)-1) 
    ]
).transpose(1,2,0)
grad

array([[[0.   , 0.325, 0.31 ],
        [0.267, 0.   , 0.369],
        [0.   , 0.   , 0.   ],
        [1.295, 0.451, 0.   ],
        [0.   , 0.   , 0.   ],
        [0.   , 0.   , 0.   ]],

       [[0.   , 0.927, 2.431],
        [0.252, 0.   , 0.482],
        [0.   , 0.   , 0.   ],
        [0.429, 0.09 , 0.   ],
        [0.   , 0.   , 0.   ],
        [0.   , 0.   , 0.   ]]])

In [61]:
# Stochastic gradient descent: recover the field / index labels of the sample.
field_idUsed = record_matrix.index.get_level_values('field_id')
index_idUsed = record_matrix.index.get_level_values('index_id')
print(field_idUsed, '\n', index_idUsed)

Index(['C', 'A', 'A', 'B', 'B', 'B'], dtype='object', name='field_id') 
 Index(['C', 'A:a1', 'A:a2', 'B:1', 'B:2', 'B:3'], dtype='object', name='index_id')


In [62]:
Wmatrix.loc[:,index_idUsed,orderCol].shape,grad.shape

((2, 6, 3), (2, 6, 3))

In [63]:
orderCol

['C', 'A', 'B']

In [64]:
Wmatrix.values[0,...]

array([[0.784, 0.369, 0.178],
       [0.929, 0.155, 0.909],
       [0.902, 0.406, 0.864],
       [0.302, 0.267, 0.111],
       [0.948, 0.711, 0.044],
       [0.108, 0.103, 0.856]])

In [65]:
grad[:,:,:].transpose()

array([[[0.   , 0.   ],
        [0.267, 0.252],
        [0.   , 0.   ],
        [1.295, 0.429],
        [0.   , 0.   ],
        [0.   , 0.   ]],

       [[0.325, 0.927],
        [0.   , 0.   ],
        [0.   , 0.   ],
        [0.451, 0.09 ],
        [0.   , 0.   ],
        [0.   , 0.   ]],

       [[0.31 , 2.431],
        [0.369, 0.482],
        [0.   , 0.   ],
        [0.   , 0.   ],
        [0.   , 0.   ],
        [0.   , 0.   ]]])

In [66]:
# SGD step: shrink the weights by (1 - lamda), then subtract yita times the
# gradient, one field column at a time.
lamda = 0.01
yita = 1
for i, j in enumerate(orderCol):
    Wmatrix.loc[:, index_idUsed, j] = (
        (1 - lamda) * Wmatrix.loc[:, index_idUsed, j]
        - yita * grad[:, :, i].T
    )

In [67]:
Wmatrix.values[0,...]

array([[ 0.776, -0.004, -0.091],
       [ 0.92 ,  0.154,  0.9  ],
       [ 0.442,  0.402, -0.44 ],
       [ 0.299,  0.264,  0.11 ],
       [ 0.939,  0.704,  0.043],
       [-0.218, -0.208,  0.848]])

# 完整的随机梯度下降法

In [75]:
import pandas as pd
import numpy as np
from collections import OrderedDict
from collections import Counter
import re

np.set_printoptions(precision=3)
pd.set_option( "display.precision",3)


data = pd.DataFrame({'A':['a1','a2','a1'],'B':[1,2,3],'C':[3,2,4],'y':[0,0,1]})

ydata = data['y']
xdata = data.drop( 'y',axis = 1).copy()
xdata.B = xdata.B.astype('category')


categories = ['A','B']
continues  = ['C']


xdata_trnsf = pd.get_dummies(xdata,prefix=categories,prefix_sep = ":",sparse = True)




# 拿到一级label和二级label,重构索引
label_v1 = [ label.split(':')[0] for label in xdata_trnsf.columns.tolist()]
label_v2 = xdata_trnsf.columns.tolist()



index = pd.MultiIndex.from_arrays([label_v1,label_v2],names = ['field_id','index_id'])
xdata_trnsf.columns = index
# xdata_trnsf

# ## 初始化随机矩阵


numOfField = xdata_trnsf.columns.levels[0].tolist().__len__()
numOfIndex = xdata_trnsf.columns.levels[1].tolist().__len__()
numOfK     = 2
Wmatrix = pd.Panel(np.random.rand(numOfK,numOfIndex,numOfField)
                  ,major_axis = xdata_trnsf.columns.levels[1]
                  ,minor_axis = xdata_trnsf.columns.levels[0]
                  )



# SGD start
ephoch = 100
# 随机梯度下降：

lamda  = 0.01
yita   = 1


# use shuffled userList:
userList = [0,1,2]
for user in userList:
    # get user record
    record = xdata_trnsf.loc[user,:].to_frame().to_sparse()
    
    # get record matrix
    record_matrix = record.dot(record.T)
    upper = np.triu(record_matrix.values)
    upper[range(upper.shape[0]),range(upper.shape[0]) ]= 0
    record_matrix = pd.DataFrame(upper  ,index = record_matrix.index , columns = record_matrix.columns)


    
    field_idUsed = record_matrix.index.get_level_values(level = 'field_id')
    index_idUsed = record_matrix.index.get_level_values(level = 'index_id')

    xToFactor   = Wmatrix.loc[:,index_idUsed,field_idUsed] 
    factorToX   = xToFactor.transpose(0,2,1,copy =True)

    
    # calculate phi
    phi = (xToFactor.values*(factorToX.values))*record_matrix.values
    
    
    # ### 计算$ \frac{\exp{(-y\phi)}}{1+\exp{(-y\phi)}}$
    y = ydata[0]
    g_phi = np.exp(-y*phi)/(1+np.exp(-y*phi))


    # calculating grad
    grad = g_phi * factorToX.values * record_matrix.values + (xToFactor.values * record_matrix.values).transpose(0,2,1)


    orderCol = list(OrderedDict.fromkeys(label_v1))
    counterOfCol = dict(Counter(label_v1))

    
    # get the slices so that can subtract sub matrix 
    slcs = [counterOfCol[i] for i in orderCol]
    slcs = np.cumsum(slcs)
    slcs = np.insert(slcs,0,0)
    
    grad = np.stack(
        [
            grad[:,slcs[i]:slcs[i+1],:].sum(1) 
                for i in range(len(slcs)-1) 
        ]
    ).transpose(1,2,0)

    for i,j in enumerate(orderCol):
        Wmatrix.loc[:,index_idUsed,j] =        (1-lamda)*Wmatrix.loc[:,index_idUsed,j] - yita*grad[:,:,i].transpose()

        
print(Wmatrix.values)

[[[ 4.648e-01  7.820e-05  5.586e-03]
  [ 1.011e-01 -3.881e-03 -2.140e-04]
  [ 1.753e-02  9.553e-01 -2.369e-01]
  [ 8.785e-02  2.117e-01 -1.936e-03]
  [ 3.267e-01  4.384e-01 -8.865e-01]
  [-1.815e-01 -3.412e+00  7.490e-02]]

 [[ 1.247e-01  4.645e-05  1.197e-01]
  [ 1.332e-01 -9.326e-03 -5.967e-03]
  [ 4.452e-01  9.527e-01 -1.728e-01]
  [ 3.806e-01  5.885e-02 -7.638e-03]
  [ 3.770e-01  7.178e-01 -6.501e-01]
  [-4.604e+00 -2.386e+00  3.689e-01]]]
