In [1]:
from itertools import count 
from collections import defaultdict
from scipy.sparse import csr 
import numpy as np 
import pandas as pd 
# 将字典表达的特征转换成矩阵形式
from sklearn.feature_extraction import DictVectorizer

import tensorflow as tf 

from tqdm.autonotebook import tqdm
import datetime

import warnings
warnings.filterwarnings("ignore")



# 读取数据

In [8]:
columns = ["user", "item", "rating", "timestamp"]

data = pd.read_csv("../data/ml-1m/ratings.dat", sep="::", lineterminator="\n", names=columns)

In [32]:
data = data[:100000]

In [33]:
index = np.random.permutation(np.arange(data.shape[0]))
## 划分训练集和测试集
train_ix = index[:int(data.shape[0]*0.8)]
test_ix = index[int(data.shape[0]*0.8):]

train = data.loc[train_ix, :]
test = data.loc[test_ix, :]

In [34]:
train.shape

(80000, 4)

In [35]:
test.shape

(20000, 4)

# 对特征进行向量化转换

In [36]:
# 这个函数的作用相当于one-hot编码
def vectorize_dic(dic, ix=None, p=None, n=0, g=0):
    '''
    dic: 表达成字典形式的特征，键为特征名,值为该特征对应的所有数据,list型 
    ix: index迭代器
    p: 特征空间的维度 
    n: 表示数据量，也就是记录的个数 
    g: 表示特征的个数 
    '''
    if ix == None:
        ix = dict()
    
    nz = n*g 
    ## 建立一个存储所有列标的数组
    col_ix = np.empty(nz, dtype=int)
    
    i = 0 
    for k, lis in dic.items():
        for t in range(len(lis)):
            ## 获取每个特征对应的不同特征值的数量
            ix[str(lis[t]) + str(k)] = ix.get(str(lis[t])+str(k), 0) + 1 
            ## 对应的返回值的列标号
            col_ix[i + t*g] = ix[str(lis[t]) + str(k)]      
        i += 1
        
    ## 对应的行标号
    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)
    if p==None:
        ## 表示所有特征可能取值的总数量
        p = len(ix)
    
    ixx = np.where(col_ix < p)
    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix

## 获取转换后的特征值

In [37]:
X_train, ix = vectorize_dic({"users": train["user"].values, 
                            "items": train["item"].values}, 
                           n=len(train.index), g=2)

X_test, ix = vectorize_dic({"users": test["user"].values, 
                           "items": test["item"].values}, 
                           ix, X_train.shape[1],
                           n=len(test.index), g=2)

In [38]:
len(ix.values())

3933

In [39]:
len(set(ix.values()))

331

In [40]:
y_train = train["rating"].values
y_test = test["rating"].values

In [41]:
X_train = X_train.todense()
X_test = X_test.todense()

In [46]:
y_train.shape

(80000,)

In [42]:
X_train.shape

(80000, 3875)

In [43]:
X_test.shape

(20000, 3875)

# 定义计算图

In [44]:
# 定义参数
n, p = X_train.shape  ## n表示记录数，p表示特征数
k  = 10   ## 表示中间向量v的维度

In [49]:
## 定义计算图
inputX = tf.placeholder(tf.float32, shape=[None, p], name="inputX")
inputY = tf.placeholder(tf.float32, shape=[None, 1], name="inputY")

## 定义偏置和一维特征的系数
w0 = tf.Variable(tf.zeros([1]), name="w0")
w = tf.Variable(tf.zeros([p]), name="w")

## 定义二维特征的系数
v = tf.Variable(tf.random_normal([k, p], mean=0, stddev=0.01), name="v")

## 得到维度 [batch, 1]
linear_terms = tf.add(w0, tf.reduce_sum(tf.multiply(w, inputX), 1, keepdims=True), name="linear")
## 交互特征，输出维度 [batch, 1]
pair_interactions = 0.5 * tf.reduce_sum(tf.subtract(
            tf.pow(tf.matmul(inputX, tf.transpose(v)), 2), 
            tf.matmul(tf.pow(inputX, 2), tf.transpose(tf.pow(v, 2)))), 
                                       axis=1, keepdims=True, name="pair")

y_hat = tf.add(linear_terms, pair_interactions, name="predictions")

## 添加l2正则项
lambda_w = tf.constant(0.001, name="lambda_w")
lambda_v = tf.constant(0.001, name="lmabda_v")

l2_norm = tf.reduce_sum(tf.add(tf.multiply(lambda_w, tf.pow(w, 2)),
                              tf.multiply(lambda_v, tf.pow(v, 2))))

error = tf.reduce_mean(tf.square(inputY - y_hat))
loss = tf.add(error, l2_norm)


train_op = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)

Instructions for updating:
Use tf.cast instead.


## 训练

In [54]:
def batcher(X_, y_=None, batch_size=-1):
    n_samples = X_.shape[0]
    
    if batch_size == -1: 
        batch_size = n_samples
    if batch_size < 1: 
        raise ValueError(f"Parameter batch_size={batch_size} is unsupported")
    
    for i in range(0, n_samples, batch_size):
        upper_bound = min(i+batch_size, n_samples)
        ret_x = X_[i:upper_bound]
        ret_y = None
        if y_ is not None: 
            ret_y = y_[i:i+batch_size]
            yield (ret_x, ret_y)

In [57]:
epochs = 10 
batch_size = 1000 

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    
    for epoch in tqdm(range(epochs), unit="epoch"):
        perm = np.random.permutation(X_train.shape[0])
        ## 按照batch进行训练
        for bX, bY in batcher(X_train[perm], y_train[perm], batch_size):
            _, t = sess.run([train_op, loss], feed_dict={inputX: bX.reshape(-1, p), inputY: bY.reshape(-1, 1)})
        
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"{now} | EPOCH: {epoch} | LOSS: {t}")
            
    
    errors = []
    
    for bX, bY in batcher(X_test, y_test):
        errors.append(sess.run(error, feed_dict={inputX: bX.reshape(-1, p), inputY: bY.reshape(-1, 1)}))
        #print(errors)
        
    RMSE = np.sqrt(np.array(errors).mean())
    print(RMSE)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

2019-08-14 11:48:20 | EPOCH: 0 | LOSS: 1.6427619457244873
2019-08-14 11:48:22 | EPOCH: 1 | LOSS: 1.2301772832870483
2019-08-14 11:48:23 | EPOCH: 2 | LOSS: 1.3363068103790283
2019-08-14 11:48:24 | EPOCH: 3 | LOSS: 1.2514501810073853
2019-08-14 11:48:26 | EPOCH: 4 | LOSS: 1.2924153804779053
2019-08-14 11:48:27 | EPOCH: 5 | LOSS: 1.2045905590057373
2019-08-14 11:48:28 | EPOCH: 6 | LOSS: 1.1927227973937988
2019-08-14 11:48:30 | EPOCH: 7 | LOSS: 1.2391865253448486
2019-08-14 11:48:31 | EPOCH: 8 | LOSS: 1.207564353942871
2019-08-14 11:48:32 | EPOCH: 9 | LOSS: 1.3152328729629517

1.1216104


In [13]:
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator

In [14]:
df = pd.DataFrame({"num": [1, 2, 3, 4], "str": ['a', 'a', 'b', None]})

In [15]:
proto = GenericFeatureStatisticsGenerator().ProtoFromDataFrames([{'name': 'test', "table": df}])

In [18]:
from IPython.core.display import display, HTML

In [24]:
# Create the feature stats for the datasets and stringify it.
import base64
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator

protostr = base64.b64encode(proto.SerializeToString()).decode("utf-8")

In [None]:
proto = GenericFeatureStatisticsGenerator().ProtoFromDataFrames([{'name': 'test', "table": df}])

In [25]:
# Display the facets overview visualization for this data
from IPython.core.display import display, HTML

HTML_TEMPLATE = """
        <script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
        <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html" >
        <facets-overview id="elem"></facets-overview>
        <script>
          document.querySelector("#elem").protoInput = "{protostr}";
        </script>"""
html = HTML_TEMPLATE.format(protostr=protostr)
display(HTML(html))

# 使用xlearn

In [1]:
import xlearn

import pandas as pd 
import numpy as np 

import os 

In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
import matplotlib.pyplot as plt 
import seaborn as sns 

from collections import Counter

In [4]:
from tqdm.autonotebook import tqdm



In [5]:
# 解决中文乱码问题
plt.rc("font", family="SimHei", size="15")

In [6]:
train = pd.read_csv("../data/criteo/criteo_data.csv")

In [7]:
train.describe()

Unnamed: 0,Label,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13
count,1000000.0,576997.0,1000000.0,764946.0,750884.0,975464.0,778260.0,959166.0,999338.0,959166.0,576997.0,959166.0,229943.0,750884.0
mean,0.254949,3.2384,93.965184,21.094269,6.95427,18616.53,116.761653,14.592224,13.378894,105.450931,0.572658,2.507449,0.979991,7.630003
std,0.435833,8.944151,349.884675,344.010001,8.554014,67686.65,456.056847,58.277459,32.345836,217.405401,0.679937,4.82537,6.003874,35.071487
min,0.0,0.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,2.0,2.0,829.0,8.0,1.0,2.0,10.0,0.0,0.0,0.0,2.0
50%,0.0,1.0,2.0,5.0,4.0,2548.0,33.0,3.0,8.0,39.0,0.0,1.0,0.0,4.0
75%,1.0,3.0,32.0,14.0,9.0,10163.0,102.0,11.0,20.0,110.0,1.0,3.0,0.0,9.0
max,1.0,780.0,18522.0,65535.0,507.0,2527030.0,233523.0,8807.0,5064.0,19327.0,8.0,147.0,768.0,6702.0


In [34]:
# 连续值跨度比较大，对所有的连续值分箱
## 分箱节点为：0 25 75 95 100
for col in continous:
    ## 计算几个分位点
    ## 取出所有的非nan值
    value = train[np.isnan(train[col]).astype('int8') == 0][col].values
    Q0 = np.min(value)
    Q1 = np.percentile(value, 25)
    Q2 = np.percentile(value, 50)
    Q3 = np.percentile(value, 75)
    Q4 = np.percentile(value, 95)
    Q5 = np.max(value)
    bins = [Q0, Q1, Q2, Q3, Q4, Q5]
    bins = sorted(set(bins))
    labels = list(map(str, list(range(len(bins)-1))))
    print(bins, labels)
    train[f"C_{col}"] = pd.cut(train[col], bins=bins, labels=labels)


[0.0, 1.0, 3.0, 14.0, 780.0] ['0', '1', '2', '3']
[-2, 0.0, 2.0, 32.0, 442.0, 18522] ['0', '1', '2', '3', '4']
[0.0, 2.0, 5.0, 14.0, 59.0, 65535.0] ['0', '1', '2', '3', '4']
[0.0, 2.0, 4.0, 9.0, 24.0, 507.0] ['0', '1', '2', '3', '4']
[0.0, 829.0, 2548.0, 10163.0, 64556.0, 2527030.0] ['0', '1', '2', '3', '4']
[0.0, 8.0, 33.0, 102.0, 473.0, 233523.0] ['0', '1', '2', '3', '4']
[0.0, 1.0, 3.0, 11.0, 57.0, 8807.0] ['0', '1', '2', '3', '4']
[0.0, 2.0, 8.0, 20.0, 43.0, 5064.0] ['0', '1', '2', '3', '4']
[0.0, 10.0, 39.0, 110.0, 427.0, 19327.0] ['0', '1', '2', '3', '4']
[0.0, 1.0, 2.0, 8.0] ['0', '1', '2']
[0.0, 1.0, 3.0, 9.0, 147.0] ['0', '1', '2', '3']
[0.0, 4.0, 768.0] ['0', '1']
[0.0, 2.0, 4.0, 9.0, 26.0, 6702.0] ['0', '1', '2', '3', '4']


In [35]:
train.head()

Unnamed: 0,Label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C_I4,C_I5,C_I6,C_I7,C_I8,C_I9,C_I10,C_I11,C_I12,C_I13
0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,181.0,...,,1,0.0,3.0,0.0,3.0,0.0,1.0,,0.0
1,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,4.0,...,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,,1.0
2,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,245.0,...,3.0,0,2.0,2.0,0.0,3.0,0.0,1.0,0.0,4.0
3,0,,893,,,4392.0,,0.0,0.0,0.0,...,,2,,,,,,,,
4,0,3.0,-1,,0.0,2.0,0.0,3.0,0.0,0.0,...,,0,,1.0,,,0.0,0.0,,


In [36]:
train_df = train[:int(train.shape[0]*0.8)]
test_df = train[int(train.shape[0]*0.8):]

In [37]:
# 选出连续型特征
con = [f for f in train_df.columns if f.startswith("I")]
cat = [f for f in train_df.columns if f.startswith("C")]

In [42]:
# 定义将数据转换为xlearn格式的数据
class FFMFormat:
    def __init__(self, vector_feat, one_hot_feat, continus_feat):
        '''
        vector_feat: 表示多个有意义的字符组成的特征，可以理解为向量型特征，缺失值用"-1"填充 
        one_hot_feat: 表示可以使用One-hot编码的特征，缺失值使用-1填充 
        continus_feat: 表示连续型特征，经过归一化处理的 
        '''
        self.field_index_ = None  # 记录场索引信息
        self.feature_index_ = None # 记录特征索引信息
        self.vector_feat = vector_feat
        self.one_hot_feat = one_hot_feat
        self.continus_feat = continus_feat
        
    def fit(self, df):
        ## 每一列对应一个场
        self.field_index_ = {col: i for i, col in enumerate(df.columns)}
        self.feature_index_ = {}
        last_idx = 0 
        for col in tqdm(df.columns):
            ## 如果对应列是one-hot型特征
            if col in self.one_hot_feat:
                print("cat: ", col)
                df[col] = df[col].astype(str)
                ## 求出该变量中共有多少种不同的值
                vals = [v for v in np.unique(df[col].values) if str(v) != "nan"]
                ## 获得对应的one-hot只有的特征名
                names = np.asarray(list(map(lambda x: col+"_"+x, vals)))
                tmp = dict(zip(names, range(last_idx, last_idx+len(names))))
                self.feature_index_[col] = tmp
                last_idx += len(names)
            elif col in self.vector_feat:
                ## 这是字符串型特征
                vals = []
                for data in df[col].apply(str):
                    if data != "nan":
                        ## 按照空格进行分割
                        for word in data.strip().split():
                            vals.append(word)
                vals = np.unique(vals)
                vals = filter(lambda x: x!="nan", vals)
                names = np.asarray(list(map(lambda x: col+"_"+x, vals)))
                tmp = dict(zip(names, range(last_idx, last_idx+len(names))))
                self.feature_index_[col] = tmp
                last_idx += len(names)
            elif col in self.continus_feat:
                ## 最后如果是数值型特征
                print("con: ", col)
                self.feature_index_[col] = last_idx
                last_idx += 1 
        return self 
    
    # 对每一行进行转换
    def transform_row_(self, row):
        ffm = []
        
        for col, val in row.loc[row != 0].to_dict().items():
            if col in self.one_hot_feat:
                name = f"{col}_{val}"
                if name in self.feature_index_[col]:
                    ffm.append("{}:{}:1".format(self.field_index_[col], self.feature_index_[col][name]))
            elif col in self.vector_feat:
                for word in str(val).split():
                    name = f"{col}_{word}"
                    if name in self.feature_index_[col]:
                        ffm.append("{}:{}:1".format(self.field_index_[col], self.feature_index_[col][name]))
            elif col in self.continus_feat:
                if str(val) != "nan": 
                    ffm.append("{}:{}:{}".format(self.field_index_[col], self.feature_index_[col], val))
        return " ".join(ffm)
    
    def transform(self, df):
        return pd.Series({idx: self.transform_row_(row) for idx, row in tqdm(df.iterrows())})
    
    def fit_transform(self, df):
        self.fit(df)
        return self.transform(df)
    
    

In [43]:
def convert_to_ffm(train_df, test_df=None, vector_fe=[], onehot_fe=[], contin_fe=[], path="./", label=None):
    if test_df is not None:
        df_ = pd.concat([train_df, test_df], axis=0, sort=False, ignore_index=True)
    else:
        df_ = train_df
    
    trans = FFMFormat(vector_fe, onehot_fe, contin_fe)
    user_ffm = trans.fit_transform(df_)
    
    train_ = user_ffm[:train_df.shape[0]]
    if test_df is not None:
        test_ffm = user_ffm[train_df.shape[0]:]
    
    if label:
        Y = np.array(train_df.pop(label))
    else:
        raise ValueError("Please give the label")
    
    train_ffm = pd.DataFrame()
    train_ffm["Label"] = Y.astype(str) 
    train_ffm["feature"] = train_
    train_ffm['all'] = train_ffm[['Label', "feature"]].apply(lambda row: " ".join(row), axis=1, raw=True)
    train_ffm.drop(["Label", "feature"], axis=1, inplace=True)
    
    
    ## 生成训练集和验证集
    train_ffm[:int(train_ffm.shape[0]*0.8)].to_csv(os.path.join(path, "train_ffm.txt"), index=False, header=False, sep=" ")
    train_ffm[int(train_ffm.shape[0]*0.8):].to_csv(os.path.join(path, "valid_ffm.txt"), index=False, header=False, sep=" ")
    
    if test_df is not None:
        test_ffm.to_csv(os.path.join(path, "test_ffm.txt"), index=False, header=False, sep=" ")
    
    '''
    with open(os.path.join(path, "train_ffm.txt"), "w") as f: 
        for idx, row in train_ffm.iterrows():
            f.write(row['all'] + "\n")
    '''

In [38]:
def preprocess(train_df, test_df=None, contin_fe=[]):
    '''
    只需要处理连续型特征即可
    '''
    if test_df is not None:
        df_ = pd.concat([train_df, test_df], axis=0, sort=False, ignore_index=True)
    else:
        df_ = train_df
    
    ss = StandardScaler()
    df_[contin_fe] = ss.fit_transform(df_[contin_fe])
    
    train_df = df_[:train_df.shape[0]]
    if test_df is not None:
        test_df = df_[train_df.shape[0]:]
        return train_df, test_df
    return train_df, None

In [46]:
convert_to_ffm(train_df, test_df, onehot_fe=cat, contin_fe=[], label="Label")

HBox(children=(IntProgress(value=0, max=53), HTML(value='')))

cat:  C1
cat:  C2
cat:  C3
cat:  C4
cat:  C5
cat:  C6
cat:  C7
cat:  C8
cat:  C9
cat:  C10
cat:  C11
cat:  C12
cat:  C13
cat:  C14
cat:  C15
cat:  C16
cat:  C17
cat:  C18
cat:  C19
cat:  C20
cat:  C21
cat:  C22
cat:  C23
cat:  C24
cat:  C25
cat:  C26
cat:  C_I1
cat:  C_I2
cat:  C_I3
cat:  C_I4
cat:  C_I5
cat:  C_I6
cat:  C_I7
cat:  C_I8
cat:  C_I9
cat:  C_I10
cat:  C_I11
cat:  C_I12
cat:  C_I13



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [5]:
ffm_model = xlearn.create_ffm()

In [6]:
ffm_model.setTrain("./train_ffm.txt")

In [7]:
ffm_model.setValidate("./valid_ffm.txt")

In [8]:
param = {"task": "binary", "lr": 0.1, "lambda": 0.002, "metric":"acc"}

In [9]:
ffm_model.fit(param, "./model.out")

In [10]:
ffm_model.setTest("./test_ffm.txt")

In [11]:
ffm_model.setSigmoid()

In [12]:
ffm_model.predict("./model.out", "./output.txt")

## 使用FM模型

In [None]:
class FMFormat:
    def 