# FFM 尝试

## 数据的重编码

### 创建数据集

In [1]:
import pandas as pd
import numpy as np
np.set_printoptions(precision=3)
pd.set_option( "display.precision",3)

In [2]:
data = pd.DataFrame({'A':['a1','a2','a1'],'B':[1,2,1],'C':[3,2,4],'y':[0,0,1]})
data

Unnamed: 0,A,B,C,y
0,a1,1,3,0
1,a2,2,2,0
2,a1,1,4,1


In [3]:
ydata = data['y']
ydata

0    0
1    0
2    1
Name: y, dtype: int64

In [4]:
xdata = data.drop( 'y',axis = 1).copy()


In [5]:
xdata

Unnamed: 0,A,B,C
0,a1,1,3
1,a2,2,2
2,a1,1,4


### 转换部分类型为分类变量

In [6]:
xdata.B = xdata.B.astype('category')

### 转换成稀疏形式

In [7]:
# 拿到离散变量的field name
categories = ['A','B']
continues  = ['C']

In [8]:
xdata_trnsf = pd.get_dummies(xdata,prefix=categories,prefix_sep = ":",sparse = True)
print(xdata_trnsf)

   C  A:a1  A:a2  B:1  B:2
0  3     1     0    1    0
1  2     0     1    0    1
2  4     1     0    1    0


In [9]:
# 拿到一级label和二级label,重构索引
import re
label_v1 = [ label.split(':')[0] for label in xdata_trnsf.columns.tolist()]
label_v2 = xdata_trnsf.columns.tolist()
index = pd.MultiIndex.from_arrays([label_v1,label_v2],names = ['field_id','index_id'])
xdata_trnsf.columns = index

In [10]:
print(xdata_trnsf)

field_id  C    A        B    
index_id  C A:a1 A:a2 B:1 B:2
0         3    1    0   1   0
1         2    0    1   0   1
2         4    1    0   1   0


## 初始化随机矩阵

In [11]:
numOfField = xdata_trnsf.columns.levels[0].tolist().__len__()
numOfIndex = xdata_trnsf.columns.levels[1].tolist().__len__()
numOfK     = 1
# Wmatrix = pd.Panel(np.random.rand(numOfK,numOfIndex,numOfField)
#                   ,major_axis = xdata_trnsf.columns.levels[1]
#                   ,minor_axis = xdata_trnsf.columns.levels[0]
#                   )

Wmatrix = pd.Panel(np.array(range(numOfK*numOfIndex*numOfField)).reshape(numOfK,numOfIndex,numOfField)
                  ,major_axis = xdata_trnsf.columns.levels[1]
                  ,minor_axis = xdata_trnsf.columns.levels[0]
                  )


Wmatrix
# wmatrix 是每一个向量对于每一个field的factor

<class 'pandas.core.panel.Panel'>
Dimensions: 1 (items) x 5 (major_axis) x 3 (minor_axis)
Items axis: 0 to 0
Major_axis axis: A:a1 to C
Minor_axis axis: A to C

### 从panel中获取元素的方法

In [38]:
# 获取所有index 在这些field上面的向量的一维度部分
Wmatrix.loc[0,:,:]

field_id,A,B,C
index_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A:a1,0,1,2
A:a2,3,4,5
B:1,6,7,8
B:2,9,10,11
C,12,13,14


## 求出$\phi$的方法

### 获取第一个人的信息

In [13]:
record = xdata_trnsf.loc[0,:].to_frame().to_sparse()
record

Unnamed: 0_level_0,Unnamed: 1_level_0,0
field_id,index_id,Unnamed: 2_level_1
C,C,3
A,A:a1,1
A,A:a2,0
B,B:1,1
B,B:2,0


In [14]:
record_matrix = record.dot(record.T)
upper = np.triu(record_matrix.values)
upper[range(upper.shape[0]),range(upper.shape[0]) ]= 0
record_matrix = pd.DataFrame(upper  ,index = record_matrix.index , columns = record_matrix.columns)
record_matrix

Unnamed: 0_level_0,field_id,C,A,A,B,B
Unnamed: 0_level_1,index_id,C,A:a1,A:a2,B:1,B:2
field_id,index_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
C,C,0,3,0,3,0
A,A:a1,0,0,0,1,0
A,A:a2,0,0,0,0,0
B,B:1,0,0,0,0,0
B,B:2,0,0,0,0,0


### 利用record_matrix 生成xToFactor 和 factorToX 两个tensor

#### 从多层索引中获得值的方法

In [15]:
field_idUsed = record_matrix.index.get_level_values(level = 'field_id')
index_idUsed = record_matrix.index.get_level_values(level = 'index_id')

In [16]:
xToFactor   = Wmatrix.loc[:,index_idUsed,field_idUsed] 
factorToX   = xToFactor.transpose(0,2,1,copy =True)

In [17]:
xToFactor.iloc[0,:,:]

field_id,C,A,A,B,B
index_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C,14,12,12,13,13
A:a1,2,0,0,1,1
A:a2,5,3,3,4,4
B:1,8,6,6,7,7
B:2,11,9,9,10,10


In [18]:
factorToX.iloc[0,:,:]

index_id,C,A:a1,A:a2,B:1,B:2
field_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C,14,2,5,8,11
A,12,0,3,6,9
A,12,0,3,6,9
B,13,1,4,7,10
B,13,1,4,7,10


### 计算$\phi$

In [19]:
(xToFactor.values*(factorToX.values))[0,...]

array([[196,  24,  60, 104, 143],
       [ 24,   0,   0,   6,   9],
       [ 60,   0,   9,  24,  36],
       [104,   6,  24,  49,  70],
       [143,   9,  36,  70, 100]])

In [20]:
record_matrix

Unnamed: 0_level_0,field_id,C,A,A,B,B
Unnamed: 0_level_1,index_id,C,A:a1,A:a2,B:1,B:2
field_id,index_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
C,C,0,3,0,3,0
A,A:a1,0,0,0,1,0
A,A:a2,0,0,0,0,0
B,B:1,0,0,0,0,0
B,B:2,0,0,0,0,0


In [21]:
phi = (xToFactor.values*(factorToX.values))*record_matrix.values
phi[0,:,:]

array([[  0,  72,   0, 312,   0],
       [  0,   0,   0,   6,   0],
       [  0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0]], dtype=int64)

In [22]:
phi = phi.sum()
phi

390

In [23]:
# prob
np.exp(phi)/(1+np.exp(phi))

1.0

### 计算$ \frac{\exp{(-y\phi)}}{1+\exp{(-y\phi)}}$

In [24]:
y = ydata[0]
g_phi = np.exp(-y*phi)/(1+np.exp(-y*phi))

In [25]:
g_phi

0.5

### 计算偏导数