In [1]:
#-*-coding:utf8-*-

"""
author:YJM
date:20190420
util function

"""
import os
import sys
import numpy as np
import pandas as pd
import operator

In [2]:
# Record which major Python version is running; later code can branch on it.
is_py3 = sys.version_info[0] > 2
if not is_py3:
    # Python 2 only: re-expose sys.setdefaultencoding and switch the implicit
    # str/unicode conversion to UTF-8.
    reload(sys)
    sys.setdefaultencoding("utf-8")


### 通过movies.csv获取电影信息

In [3]:
def get_item_info(input_file):
    """
    Load item (movie) metadata from a comma-separated text file.

    The first line is treated as a header and skipped. Every other line is
    split on ','. The first field is the item id and the last field is the
    genre; everything in between is joined back with ',' to form the title,
    because a title may itself contain commas. Lines with fewer than three
    fields are ignored. Surrounding quote characters are not stripped.

    Args:
        input_file: path of the item info file
    Return:
        a dict: key itemid, value [title, genre]; an empty dict when
        input_file does not exist
    """
    if not os.path.exists(input_file):
        return {}
    item_info = {}
    # The context manager closes the handle even if parsing raises; the
    # previous `fp.closed` only read an attribute and never closed the file.
    with open(input_file) as fp:
        for linenum, line in enumerate(fp):
            if linenum == 0:
                # header row
                continue
            item = line.strip().split(',')
            if len(item) < 3:
                # malformed row
                continue
            itemid, genre = item[0], item[-1]
            # With exactly three fields this is just item[1]; with more it
            # re-joins the pieces of a title that was split on its own commas.
            title = ','.join(item[1:-1])
            item_info[itemid] = [title, genre]
    return item_info

In [4]:
# Build the itemid -> [title, genre] lookup from the movies file.
item_dict = get_item_info("../data/movies.csv")
# item_dict

### 通过ratings15000.csv获取平均分

In [5]:
def get_ave_score(input_file):
    """
    Compute the average rating of every item in a rating file.

    The first line is treated as a header and skipped. Each remaining line is
    split on ','; lines with fewer than four fields are ignored. The second
    field is used as the item id and the third as the numeric rating.

    Args:
        input_file: path of the user/item/rating file
    Return:
        dict: {itemid: avg_score}, averages rounded to 3 decimal places;
        an empty dict when input_file does not exist
    """
    if not os.path.exists(input_file):
        return {}
    # itemid -> [number of ratings, sum of ratings]
    record_dict = {}
    # The context manager closes the handle even if parsing raises; the
    # previous `fp.closed` only read an attribute and never closed the file.
    with open(input_file) as fp:
        for linenum, line in enumerate(fp):
            if linenum == 0:
                # header row
                continue
            item = line.strip().split(',')
            if len(item) < 4:
                continue
            itemid, rating = item[1], float(item[2])
            if itemid not in record_dict:
                record_dict[itemid] = [0, 0]
            record_dict[itemid][0] += 1
            record_dict[itemid][1] += rating
    # Turn the (count, sum) pairs into per-item averages.
    score_dict = {}
    for itemid in record_dict:
        count, total = record_dict[itemid]
        score_dict[itemid] = round(total / count, 3)
    return score_dict

In [6]:
# Compute the itemid -> average rating mapping from the ratings file.
scor_dict=get_ave_score("../data/ratings15000.csv")
# scor_dict

### 获取数据

In [7]:
def get_train_data(input_file):
    """
    Build a balanced list of positive / negative training samples per user.

    A rating >= 4 counts as a positive sample (label 1); anything lower is a
    negative candidate. For every user the same number of positives and
    negatives is kept (the smaller of the two counts). Positives are taken in
    file order; negatives are the candidates with the highest item average
    score, and are emitted with label 0.

    :param input_file: user item rating file
    :return: a list: [(userid, itemid, label), (userid, itemid, label), ...];
        an empty list when input_file does not exist
    """
    if not os.path.exists(input_file):
        # Return the documented type (a list) rather than a dict.
        return []
    score_dict = get_ave_score(input_file)  # itemid -> average rating
    neg_dict = {}  # userid -> [(itemid, item average score), ...]
    pos_dict = {}  # userid -> [(itemid, 1), ...]
    train_data = []
    score_thr = 4  # ratings at or above this value are positive samples
    # The context manager closes the handle even if parsing raises; the
    # previous `fp.closed` only read an attribute and never closed the file.
    with open(input_file) as fp:
        for linenum, line in enumerate(fp):
            if linenum == 0:
                # header row
                continue
            item = line.strip().split(',')
            if len(item) < 4:
                continue
            userid, itemid, rating = item[0], item[1], float(item[2])
            if userid not in pos_dict:
                pos_dict[userid] = []
            if userid not in neg_dict:
                neg_dict[userid] = []
            if rating >= score_thr:
                pos_dict[userid].append((itemid, 1))
            else:
                # Fall back to 0 when the item has no average score.
                score = score_dict.get(itemid, 0)
                neg_dict[userid].append((itemid, score))
    for userid in pos_dict:
        data_num = min(len(pos_dict[userid]), len(neg_dict.get(userid, [])))
        if data_num <= 0:
            # A user with only positives or only negatives contributes nothing.
            continue
        train_data += [(userid, zuhe[0], zuhe[1]) for zuhe in pos_dict[userid]][:data_num]
        # Sort negatives by average score, descending, and keep as many as
        # there are positives.
        sorted_neg_list = sorted(neg_dict[userid], key=lambda element: element[1], reverse=True)[:data_num]
        # The average score was only needed for ranking; the label is 0.
        train_data += [(userid, zuhe[0], 0) for zuhe in sorted_neg_list]
    return train_data

In [8]:
# Build the (userid, itemid, label) training samples from the ratings file.
train_data=get_train_data("../data/ratings15000.csv")
# train_data

### 将lfm的模型进行训练

In [9]:
def lfm_train(train_data,F,alpha,beta,step):
    """
    Train latent factor vectors for users and items.

    Args:
        train_data: iterable of (userid, itemid, label) samples
        F: length of every user vector and every item vector
        alpha: regularization factor
        beta: initial learning rate; multiplied by 0.9 after each iteration
        step: number of iterations over train_data
    return:
        (user_vec, item_vec)
        user_vec: dict, key userid, value: vector of length F
        item_vec: dict, key itemid, value: vector of length F
    """
    user_vec = {}
    item_vec = {}
    for step_index in range(step):
        for data_instance in train_data:
            userid,itemid,label = data_instance
            # Vectors are created lazily the first time an id is seen.
            if userid not in user_vec:
                user_vec[userid] = init_model(F)
            if itemid not in item_vec:
                item_vec[itemid] = init_model(F)
            # Prediction error for this sample.
            delta = label - model_predict(user_vec[userid],item_vec[itemid])
            for index in range(F):
                # Move each component along the error signal and shrink it by
                # the regularization term alpha, scaled by the learning rate.
                # Walkthrough of the algorithm:
                # https://www.bilibili.com/video/av43219418/?p=4
                # The item update reads the user component just updated above.
                user_vec[userid][index] += beta*(delta*item_vec[itemid][index]-alpha*user_vec[userid][index])
                item_vec[itemid][index] += beta*(delta*user_vec[userid][index]-alpha*item_vec[itemid][index])
        # Decay once per iteration. Decaying per sample shrinks beta by
        # 0.9**len(train_data) each pass, which stops learning almost at once.
        beta = beta*0.9
    return user_vec,item_vec

### 初始化模型

In [10]:
def init_model(vector_len):
    """
    Create a fresh latent vector.

    Args:
        vector_len: the len of vector
    Return:
        a 1-D numpy array of vector_len samples drawn from the standard
        normal distribution (numpy's global random state)
    """
    initial_vector = np.random.standard_normal(size=(vector_len,))
    return initial_vector

### 计算用户向量和物品向量的余弦夹角

In [11]:
def model_predict(user_vector,item_vector):
    """
    Cosine similarity between a user vector and an item vector.

    Args:
        user_vector: 1-D numeric vector
        item_vector: 1-D numeric vector of the same length
    Return:
        dot(user, item) / (|user| * |item|); 0.0 when either vector has zero
        norm, so the caller never receives nan from a division by zero
    """
    norm_product = np.linalg.norm(user_vector)*np.linalg.norm(item_vector)
    if norm_product == 0:
        return 0.0
    res = np.dot(user_vector,item_vector)/norm_product
    return res

### 模型训练函数

In [12]:
def model_train_process(train_file="../data/ratings15000.csv",userid='24',F=50,alpha=0.01,beta=0.1,step=50):
    """
    Train the latent factor model and print recommendations for one user.

    Every argument defaults to the value that used to be hard-coded, so
    calling model_train_process() with no arguments behaves as before.

    Args:
        train_file: user item rating file used to build the training samples
        userid: user to produce recommendations for
        F: user vector len, item vector len
        alpha: regularization factor
        beta: learning rate
        step: iteration num
    """
    train_data = get_train_data(train_file)
    user_vec,item_vec = lfm_train(train_data,F,alpha,beta,step)
    recom_result = give_recom_result(user_vec,item_vec,userid)
    print(recom_result)
    # ana_recom_result(train_data, userid, recom_result) prints the titles
    # behind these ids.

### 模型训练执行

In [13]:
# Run the full train + recommend pipeline with its built-in settings.
# NOTE(review): model_train_process calls give_recom_result, which is defined
# in a later cell; on a fresh kernel run top to bottom this call is reached
# before that definition exists. Run the give_recom_result cell first.
model_train_process()

[-0.66064846 -0.16885769  0.79534224 -1.27571948  0.84113057  0.24479349
  1.43179505  0.32890686  1.03915417 -1.38712579  1.48815038 -0.93173227
  0.87145757 -0.24535336 -0.69717929 -0.46525241  0.09268469 -0.12284571
 -1.29893595  1.86794282  0.55676862  0.02324428 -2.19629988  0.24781382
  0.67215346 -0.78426479  1.93674938 -1.0061453  -0.10278354 -0.15676927
 -0.76201603 -0.40614936 -0.80985163 -0.67269515 -0.51470005 -1.59799938
  0.37796039 -0.34689199  0.86682136  0.79710087  0.69021231  2.25656631
 -1.14916437  1.41115855 -0.74669644 -0.18372573 -0.78697867 -1.11539709
 -0.79728243  0.82415269]
[ 0.67435815 -0.27749904  0.16110073  0.04809448  1.25067757  2.61046909
  0.84544087  0.36419418  0.42240687  0.35511915 -0.33726155 -0.91387933
 -0.71393123  0.28262529  0.34511031 -1.49130747 -1.10742634 -1.27262947
 -0.08070769  0.52028931  1.31620715 -0.72288403  1.38157166 -1.06200667
  0.64611597 -0.27037724  0.73061164  0.39454701 -2.02337138  0.20120939
 -1.30318312 -1.56644232 

### 推荐过程

In [14]:
def give_recom_result(user_vec,item_vec,userid,fix_num=10):
    """
    Rank all items for one user by cosine similarity of their latent vectors.

    Args:
        user_vec: dict, key userid, value: user vector
        item_vec: dict, key itemid, value: item vector
        userid: user to recommend for
        fix_num: maximum number of recommendations to return (default 10)
    Return:
        a list: [(itemid, score), ...] sorted by score descending, scores
        rounded to 3 decimal places; an empty list when userid has no vector
    """
    # A user without a trained vector gets no recommendations.
    if userid not in user_vec:
        return []
    user_vector = user_vec[userid]
    # The user norm does not depend on the item, so compute it once.
    user_norm = np.linalg.norm(user_vector)
    record = {}  # itemid -> cosine similarity with the user vector
    for itemid in item_vec:
        item_vector = item_vec[itemid]
        # Divide by the PRODUCT of the norms. The earlier
        # `dot/(|u|)*(|v|)` multiplied by the item norm instead.
        norm_product = user_norm*np.linalg.norm(item_vector)
        if norm_product == 0:
            record[itemid] = 0.0
        else:
            record[itemid] = np.dot(user_vector,item_vector)/norm_product
    # dict.items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    ranked = sorted(record.items(),key=operator.itemgetter(1),reverse=True)[:fix_num]
    recom_list = []
    for itemid,score in ranked:
        recom_list.append((itemid,round(float(score),3)))
    return recom_list

### 评估推荐结果

In [None]:
def ana_recom_result(train_data,userid,recom_list):
    """
    Print a user's positive training items, then the recommended items.

    Args:
        train_data: list of (userid, itemid, label) samples
        userid: user whose history and recommendations are printed
        recom_list: list of (itemid, score) recommendations for that user
    """
    item_info = get_item_info("../data/movies.csv")
    # Items this user rated positively (label 1) in the training data.
    for tmp_userid,itemid,label in train_data:
        if label == 1 and tmp_userid == userid:
            print(item_info[itemid])
    # Print the recommendations once, after the history. They were previously
    # nested in the loop above and repeated for every training record.
    print("recom result")
    for zuhe in recom_list:
        print(item_info[zuhe[0]])