### 使用 TensorFlow 构建推荐系统

### 1 数据准备 - 以 MovieLens 为例

In [None]:
# Note cell: the string below records the field order of one rating record
# (user, item, rating, timestamp); it is only displayed, nothing reads it.
"Movieslens 数据格式: user,item,rating,timestamp"

### 2 数据预处理部分

In [72]:
# from __future__ import absolute_import,division,print_function => print ("this python3.x ") 需要按照python3 语法规则
#coding:utf-8
import numpy as np
import pandas as pd


# Function 1: load the raw ratings file into a typed DataFrame.
def read_data_and_process(filename, sep = "\t"):
    """Read a header-less ratings file and return it with compact dtypes.

    Parameters
    ----------
    filename : str or file-like
        Path or buffer handed straight to ``pandas.read_csv``.
    sep : str, default "\\t"
        Field separator; the python parsing engine is used, so multi-character
        separators such as "::" are accepted.

    Returns
    -------
    pandas.DataFrame
        Columns ``user``, ``item``, ``rate``, ``timestamp``. ``user`` and
        ``item`` are decremented by one and stored as int32; ``rate`` is
        stored as float32; ``timestamp`` is left as parsed.
    """
    ratings = pd.read_csv(
        filepath_or_buffer = filename,
        sep = sep,
        header = None,
        names = ["user", "item", "rate", "timestamp"],
        engine = 'python',
    )

    # Shift both id columns down by one, then narrow them to int32.
    for id_column in ('user', 'item'):
        ratings[id_column] = (ratings[id_column] - 1).astype(np.int32)

    # Ratings are kept as 32-bit floats.
    ratings['rate'] = ratings['rate'].astype(np.float32)
    return ratings
    

# Draws random mini-batches so that each training step sees a different sample
# of rows (the original note gives reducing over-fitting as the motivation).
class ShuffleBatchData(object):
    """Endless iterator that yields random mini-batches from parallel columns.

    Parameters
    ----------
    inputs : sequence of equal-length 1-D sequences
        One entry per column (e.g. users, items, rates). Entry ``i`` of every
        column belongs to the same sample.
    batch_size : int, default 10
        Number of rows drawn (with replacement) per call to ``next``.

    Attributes
    ----------
    inputs : numpy.ndarray, shape (len, num_cols)
        The columns stacked side by side. Stacking puts every column into one
        common NumPy dtype.
    num_cols : int
        Number of columns supplied.
    len : int
        Number of rows (length of the first column).
    """

    def __init__ (self,inputs,batch_size = 10):
        self.batch_size = batch_size
        self.num_cols = len(inputs)
        self.len      = len(inputs[0])
        # Convert EACH column to an array, stack the columns as rows, then
        # transpose so a single row index selects one aligned sample.
        # (Previously np.array() wrapped the whole generator, which produced
        # a 0-d object array instead of a (len, num_cols) matrix.)
        self.inputs   = np.transpose(
            np.vstack([np.array(inputs[i]) for i in range(self.num_cols)])
        )

    def __len__ (self):
        """Return the number of rows available for sampling."""
        return self.len

    def __iter__(self):
        """The object is its own iterator."""
        return self

    def __next__(self):
        """Python 3 iterator protocol; delegates to ``next``."""
        return self.next()

    def next(self):
        """Draw ``batch_size`` random row indices and return those samples.

        Returns
        -------
        list of numpy.ndarray
            One 1-D array of length ``batch_size`` per column, row-aligned.
        """
        ids = np.random.randint(0,self.len,(self.batch_size, ) )
        out = self.inputs[ids,:]

        return [out[:,columns_index] for columns_index in range(self.num_cols)]

# Yields the data sequentially, exactly once per pass; used for evaluation.
class OneEpochTestDataProcesing(ShuffleBatchData):
    """Sequential, single-pass batch iterator.

    Parameters
    ----------
    inputs : sequence of equal-length 1-D sequences
        Same layout as the parent class: one entry per column.
    batch_size : int, default 10
        Approximate rows per batch. A value <= 0 means "return everything in
        a single batch".

    Attributes
    ----------
    idx_group : list of numpy.ndarray
        Row-index arrays, one per batch, in order.
    group_id : int
        Index of the next batch to return.
    """

    def __init__(self,inputs,batch_size = 10):
        # super() must name THIS class; naming the parent skips the parent's
        # __init__ and calls object.__init__ with arguments, which fails.
        super(OneEpochTestDataProcesing,self).__init__(inputs,batch_size = batch_size)
        if batch_size>0:
            # True division + int(): array_split needs an integer section
            # count, and Python 2 integer division would round down first.
            # At least one section so an empty input does not raise.
            num_groups = max(1, int(np.ceil(self.len / float(batch_size))))
            self.idx_group  = np.array_split(np.arange(self.len), num_groups)
        else:
            self.idx_group  = [np.arange(self.len)]
        self.group_id = 0

    def next(self):
        """Return the next sequential batch.

        Returns
        -------
        list of numpy.ndarray
            One 1-D array per column for the current index group.

        Raises
        ------
        StopIteration
            After the last group; the cursor is reset so the object can be
            iterated again.
        """
        # ">=" (not ">"): group_id == len(idx_group) is already past the end.
        if self.group_id >= len(self.idx_group):
            self.group_id = 0
            raise StopIteration
        out = self.inputs[self.idx_group[self.group_id],:]
        self.group_id += 1

        return [out[:,i] for i in range(self.num_cols)]
            

### 3 搭建model

In [None]:
import tensorflow as tf
# Builds the matrix-factorisation network.
def interface_svd(user_batch,item_batch,user_num,item_num,dim = 5, device = "/cpu:0"):
    """Build the inference graph of a biased matrix-factorisation model.

    The prediction for each (user, item) pair is
    ``sum(embd_user * embd_item) + global_bias + bias_user + bias_item``.

    Parameters
    ----------
    user_batch, item_batch : tensors of ids
        Indices looked up in the user / item variables.
    user_num, item_num : int
        Number of rows of the user / item bias and embedding variables.
    dim : int, default 5
        Width of each embedding vector.
    device : str, default "/cpu:0"
        Device on which the arithmetic (not the variables) is placed.

    Returns
    -------
    (infer, regularizer)
        ``infer`` is the predicted rating per pair; ``regularizer`` is the sum
        of the L2 losses of the looked-up user and item embeddings.
    """
    # Variables and lookups are pinned to the CPU regardless of `device`.
    with tf.device("/cpu:0"):
        # Bias variables.
        global_bias = tf.get_variable("global_bias",shape = [])
        w_bias_user = tf.get_variable('embd_bias_user',shape = [user_num])
        w_bias_item = tf.get_variable('embd_bias_item',shape = [item_num])

        # Per-sample bias values.
        bias_user = tf.nn.embedding_lookup(w_bias_user,user_batch,name = "bias_user")
        # NOTE(review): the op name "item_user" looks like a typo for
        # "bias_item"; left unchanged in case something looks it up by name.
        bias_item = tf.nn.embedding_lookup(w_bias_item,item_batch,name = "item_user")

        # Fixed: the initializer is tf.truncated_normal_initializer
        # (there is no `tf.truncated` module).
        w_user = tf.get_variable("embd_user",shape = [user_num,dim], initializer = tf.truncated_normal_initializer(stddev = 0.02))
        w_item = tf.get_variable("embd_item",shape = [item_num,dim], initializer = tf.truncated_normal_initializer(stddev = 0.02))

        # Per-sample user and item embedding vectors.
        embd_user = tf.nn.embedding_lookup(w_user,user_batch, name = "embedding_user")
        embd_item = tf.nn.embedding_lookup(w_item,item_batch,name = "embedding_item")

    # Fixed: `td` was an undefined name; the module alias is `tf`.
    with tf.device(device):
        # Dot product of the user and item vectors (multiply, then sum over axis 1).
        infer = tf.reduce_sum(tf.multiply(embd_user,embd_item),1)

        # Add the bias terms.
        infer = tf.add(infer,global_bias)
        infer = tf.add(infer,bias_user)
        # NOTE(review): op name 'svc_inference' is presumably meant to be
        # 'svd_inference'; left unchanged for the same reason as above.
        infer = tf.add(infer,bias_item,name = 'svc_inference')

        # Regularisation term over the looked-up embeddings.
        regularizer = tf.add(tf.nn.l2_loss(embd_user),tf.nn.l2_loss(embd_item),name = 'svd_regularizer')
        return infer, regularizer

# Builds the loss and the training step.
def optimization(infer,regularizer,rate_batch,learning_rate = 0.001,reg = 0.10,device = "/cpu:0"):
    """Build the cost tensor and the Adam training op.

    The cost is ``l2_loss(infer - rate_batch) + reg * regularizer``.

    Parameters
    ----------
    infer : tensor
        Predicted ratings.
    regularizer : tensor
        Regularisation term to be weighted by ``reg``.
    rate_batch : tensor
        Observed ratings; must have the same dtype as ``infer`` for the
        subtraction.
    learning_rate : float, default 0.001
        Learning rate passed to ``tf.train.AdamOptimizer``.
    reg : float, default 0.10
        Weight of the regularisation term.
    device : str, default "/cpu:0"
        Device for the loss and optimiser ops. (The previous default
        "./cpu:0" was not a valid device string.)

    Returns
    -------
    (cost, train_optimizer)
        The scalar cost tensor and the op that performs one update step.

    Raises
    ------
    AssertionError
        If no global step exists in the current graph; create it before
        calling this function.
    """
    global_step = tf.train.get_global_step()
    assert global_step is not None, "create the global step before calling optimization()"

    # Loss and optimiser are placed on the requested device.
    with tf.device(device):
        cost_l2  = tf.nn.l2_loss(tf.subtract(infer,rate_batch))
        penalty  = tf.constant(reg,dtype = tf.float32, shape = [], name = 'l2')
        cost     = tf.add(cost_l2, tf.multiply(regularizer,penalty))
        train_optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost,global_step = global_step)
    return cost,train_optimizer

### 4.在实际数据上训练模型 


In [None]:
import time
from collections import deque
import numpy as np
import tensorflow as tf
from six import next 
from tensorflow.core.framework import summary_pb2
# Seed NumPy's global generator so the shuffle and batch sampling are repeatable.
np.random.seed(123)

# Model constants.
# USER_NUM / ITEM_NUM size the user and item variables; presumably the id
# counts of the ratings file being loaded — verify against the data.
BATCH_SIZE =  2000
USER_NUM =  6040
ITEM_NUM =  3952
DIM  = 15 # factor (embedding) dimension

# Maximum number of training epochs.
EPOCH_MAX = 200

# Train on the CPU. Fixed: "./cpu:0" is not a valid TensorFlow device string;
# device specs start with "/".
DEVICE = "/cpu:0"
 
# Truncation helper.
def clip(x):
    """Clamp ``x`` element-wise to the closed interval [1, 5]."""
    lowest, highest = 1, 5
    return np.clip(x, lowest, highest)

# Summary helper used for visualisation.
def make_scalar_summary(name,value):
    """Wrap one scalar in a ``Summary`` protobuf tagged with ``name``."""
    scalar_entry = summary_pb2.Summary.Value(tag = name, simple_value = value)
    return summary_pb2.Summary(value = [scalar_entry])

# Loads the ratings with read_data_and_process and splits them into train/test.
def get_data(filename = "./movielens/ml-1m/ratings.dat", sep = "::", train_fraction = 0.9):
    """Load the ratings file, shuffle it, and split it into train and test sets.

    Parameters
    ----------
    filename : str, default "./movielens/ml-1m/ratings.dat"
        Ratings file passed to ``read_data_and_process``.
    sep : str, default "::"
        Field separator of that file.
    train_fraction : float, default 0.9
        Share of the shuffled rows that goes to the training set; must lie
        strictly between 0 and 1.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        The first ``int(rows * train_fraction)`` shuffled rows and the
        remainder. The test frame's index restarts at 0.

    Raises
    ------
    ValueError
        If ``train_fraction`` is not in the open interval (0, 1).
    """
    if not 0.0 < train_fraction < 1.0:
        raise ValueError("train_fraction must be between 0 and 1 (exclusive)")

    df = read_data_and_process(filename,sep = sep)
    rows = len(df)
    # Shuffle the rows with a random permutation of positions; drop=True keeps
    # the old index from being added back as a column.
    df   = df.iloc[np.random.permutation(rows)].reset_index(drop = True)
    split_index = int(rows * train_fraction)
    df_train    = df[0:split_index]
    df_test     = df[split_index:].reset_index(drop = True )
    print(df_train.shape,df_test.shape)
    return df_train,df_test

# Training procedure.
def svd(train,test):
    """Train the matrix-factorisation model and report RMSE once per epoch.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with ``user``, ``item`` and ``rate`` columns.

    Side effects
    ------------
    Prints one line per epoch (epoch, train RMSE, test RMSE, elapsed seconds)
    and writes scalar summaries to '/InternalData/log'.
    """
    # Number of mini-batches that make up one epoch. (Previously this was
    # len(train), which made an "epoch" len(train) batches long.) At least 1
    # so the modulo and the deque below stay valid for tiny training sets.
    samples_per_batch = max(1, len(train) // BATCH_SIZE)

    # Random mini-batches for training.
    iter_train = ShuffleBatchData([train["user"],train["item"], train["rate"] ],batch_size = BATCH_SIZE)

    # Test data: batch_size = -1 yields the whole test set as a single batch.
    iter_test  = OneEpochTestDataProcesing([test["user"], test["item"], test["rate"]], batch_size = -1)

    # Input placeholders. Ratings are float32 so they can be subtracted from
    # the float predictions (an int32 placeholder raises a dtype mismatch).
    user_batch = tf.placeholder(tf.int32,shape = [None], name = "id_user")
    item_batch = tf.placeholder(tf.int32,shape = [None],name = "id_item")
    rate_batch = tf.placeholder(tf.float32,shape = [None])

    # Build the graph.
    infer, regularizer  = interface_svd(user_batch,item_batch,USER_NUM,ITEM_NUM,DIM,DEVICE)

    # The global step must exist before optimization() looks it up.
    # Fixed: tf.contrib has no top-level get_or_create_global_step.
    tf.train.get_or_create_global_step()
    cost,train_optimizer  = optimization(infer,regularizer,rate_batch,learning_rate = 0.001,reg = 0.10,device = DEVICE)

    # Initialise all variables.
    init_op = tf.global_variables_initializer()

    # Training loop.
    with tf.Session() as sess:
        sess.run(init_op)
        summary_writer = tf.summary.FileWriter(logdir = '/InternalData/log', graph = sess.graph )
        try:
            print("{} {} {} {}".format("epoch","train_error","val_error","elapsed_time"))
            # Rolling window holding the squared errors of the most recent epoch.
            errors = deque(maxlen = samples_per_batch)
            start  = time.time()
            for i in range(EPOCH_MAX * samples_per_batch):
                users,items,rates = next(iter_train)
                _,pred_batch = sess.run([train_optimizer,infer], feed_dict = {user_batch:users,
                                                                             item_batch:items,
                                                                             rate_batch:rates})
                pred_batch = clip(pred_batch)
                errors.append(np.power(pred_batch - rates,2) )
                if i%samples_per_batch  == 0:
                    train_errors = np.sqrt(np.mean(errors))
                    test_errors  = np.array([])

                    # Separate names so the training batch is not shadowed.
                    for test_users,test_items,test_rates in iter_test:
                        test_pred = sess.run(infer, feed_dict = {user_batch:test_users,
                                                                 item_batch:test_items})

                        test_pred = clip(test_pred)
                        test_errors = np.append(test_errors,np.power(test_pred - test_rates,2 ))

                    end  = time.time()
                    test_errors_sqrt = np.sqrt(np.mean(test_errors))
                    print("{:3d} {:f} {:f} {:f}".format(i//samples_per_batch,train_errors,test_errors_sqrt,end - start))

                    train_err_summary = make_scalar_summary("training_error",train_errors)
                    test_err_summary = make_scalar_summary("test_error",test_errors_sqrt)
                    summary_writer.add_summary(train_err_summary,i)
                    summary_writer.add_summary(test_err_summary,i)
                    start = end
        finally:
            # Flush and release the event file even if training is interrupted.
            summary_writer.close()


In [None]:
# Load the ratings and obtain the shuffled train/test split.
"获取数据集"
train_datasets, test_datasets = get_data()
# Train the SVD model on that split; progress is printed by svd().
"SVD训练数据"
svd(train_datasets,test_datasets)

In [None]:
# NOTE(review): interactive documentation lookup; its result is not used by
# any visible cell — looks like leftover exploration.
help(pd.read_csv)

In [58]:
# Scratch cell: reads a two-column CSV and then overwrites both columns with
# constants. NOTE(review): purpose unclear from the notebook — confirm it is
# still needed.
filename  = "./RawData/ReleasaeFilesCRC12.5.csv"
col_names = ["Baseline","Date"]
# header = None: the first line of the file is read as data, not as column names.
DF   = pd.read_csv(filename, sep = ",", header = None, names = col_names, engine = 'python')
# Every row's Baseline becomes 1 and Date becomes the string "2019"; the
# values read from the file are discarded.
DF['Baseline'] =  1
DF['Date'] = "2019"
# Cast both columns to float32 (the "2019" strings are converted to numbers).
for col in ("Baseline","Date"):
    DF[col] =  DF[col].astype(np.float32)
    
    

In [85]:
# NOTE(review): interactive documentation lookup for reset_index (used in
# get_data); the printed help text follows as cell output.
help(pd.DataFrame.reset_index)

Help on method reset_index in module pandas.core.frame:

reset_index(self, level=None, drop=False, inplace=False, col_level=0, col_fill='') unbound pandas.core.frame.DataFrame method
    For DataFrame with multi-level index, return new DataFrame with
    labeling information in the columns under the index names, defaulting
    to 'level_0', 'level_1', etc. if any are None. For a standard index,
    the index name will be used (if set), otherwise a default 'index' or
    'level_0' (if 'index' is already taken) will be used.
    
    Parameters
    ----------
    level : int, str, tuple, or list, default None
        Only remove the given levels from the index. Removes all levels by
        default
    drop : boolean, default False
        Do not try to insert index into dataframe columns. This resets
        the index to the default integer index.
    inplace : boolean, default False
        Modify the DataFrame in place (do not create a new object)
    col_level : int or str, default 0

In [47]:
# NOTE(review): `DataFrame` is not assigned in any visible cell (the CSV cell
# binds `DF`), and this cell's execution count (47) precedes that cell's (58).
# Presumably it relies on earlier kernel state — confirm before Run All.
DataFrame

Unnamed: 0,Baseline,Date
0,BaselineNames,Date
1,CORE_RILC:CORE_RILC_FHM_API_00_002,1-Jan-19
2,CORE_RILC:CORE_RILC_PDDAPI_00_001,2-Jan-19
3,CORE_RILC:CORE_RILC_PDDDK_00_011,3-Jan-19
4,CORE_RILC:CORE_RILC_PPCCBITI_01_002,4-Jan-19
5,CORE_RILC:CORE_RILC_PPCFHM_01_001,5-Jan-19
6,CORE_RILC:CORE_RILC_PPCFHMI_00_003,6-Jan-19
7,CORE_RILC:CORE_RILC_PPCFSS_01_001,7-Jan-19
8,CORE_RILC:CORE_RILC_PPCFSSI_00_003,8-Jan-19
9,CORE_RILC:CORE_RILC_PPCMCCMS_00_003,9-Jan-19
