# MSD 歌曲推荐——推荐
1. 定义不同模型中用户对item的打分函数
2. 装载训练好的模型
3. 根据模型，预测用户对item的打分
4. 读取测试数据
5. 根据用户对item的打分产生推荐，并计算推荐结果的评价指标

In [3]:
# coding: utf-8
# -*- coding:utf-8 -*-
import sys
# reload(sys)
# sys.setdefaultencoding("utf-8")

import pandas as pd
import numpy as np

import _pickle as cPickle
import scipy.io as sio

#距离
import scipy.spatial.distance as ssd

from numpy.random import random  

## 1. 对给定用户，推荐物品/计算打分
不同的推荐算法，打分不同

### 1.1 基于用户的协同过滤

In [4]:
### 预测用户uid对item iid的打分
### similarity_matrix为用户与用户之间的相似性矩阵
def user_CF_pred(uid, iid, similarity_matrix): 
    sim_accumulate=0.0  
    rat_acc=0.0 
    for user_id in item_users[iid]:  #对item iid打过分的所有用户
        #计算当前用户与给item i打过分的用户之间的相似度
        #sim = user_similarity(user_id, uid)
        sim = similarity_matrix[user_id,uid]
            
        if sim != 0: 
            rat_acc += sim * (user_item_scores[user_id,iid] - users_mu[user_id])   #用户user对item i的打分
            sim_accumulate += np.abs(sim)  
        
    if sim_accumulate != 0:   
        score = users_mu[uid] + rat_acc/sim_accumulate
    else: #no similar users,return average rates of the user
        score = users_mu[uid]
    
    return score

### 1.2 基于物品的协同过滤
利用用户打过分的item中，与item最相似的n_Knns最物品计算预测打分

In [5]:
### 预测用户uid对item iid的打分
### similarity_matrix为item与item之间的相似性矩阵
### n_Knns最相似的物品的数目
def item_CF_pred(uid, iid, similarity_matrix, n_Knns): 
    sim_accumulate=0.0  
    rat_acc=0.0 
    n_nn_items = 0
    
    #相似度排序
    cur_items_similarity = np.array(similarity_matrix[iid,:])
    cur_items_similarity = cur_items_similarity.flatten()
    sort_index = sorted(((e,i) for i,e in enumerate(list(cur_items_similarity))), reverse=True)
    
    for i in range(0,len(sort_index)):
        cur_item_index = sort_index[i][1]
        
        if n_nn_items >= n_Knns:  #相似的items已经足够多（>n_Knns）
            break;
        
        if cur_item_index in user_items[uid]: #对用户打过分的item
           #计算当前用户打过分item与其他item之间的相似度
            #sim = item_similarity(cur_item_index, iid)
            sim = similarity_matrix[iid, cur_item_index]
            
            if sim != 0: 
                rat_acc += sim * (user_item_scores[uid, cur_item_index])   #用户user对item i的打分
                sim_accumulate += np.abs(sim)  
        
            n_nn_items += 1
        
    if sim_accumulate != 0:   
        score = rat_acc/sim_accumulate
    else:   #no similar items,return average rates of the user  
        score = users_mu[uid]
    
    if score <0:
        score = 0.0
    
    return score

### 1.3 基于SVD的协同过滤

In [6]:
def svd_CF_pred(uid, iid):  
    score = mu + bi[iid] + bu[uid] + np.sum(qi[iid]* pu[uid])  
    return score  

## 2. 从文件读入训练好的模型

In [7]:
# A method for loading data from JSON file
#用于读取训练好的svd模型
def load_json(filepath):
    with open(filepath, 'r') as file:
        dict_ = json.load(file)

        mu = dict_['mu']
        K = dict_['K']

        bi = np.asarray(dict_['bi'])
        bu = np.asarray(dict_['bu'])
    
        qi = np.asarray(dict_['qi'])
        pu = np.asarray(dict_['pu'])

In [17]:
import json
#用户和item的索引
users_index = cPickle.load(open("users_index.pkl", 'rb'))
items_index = cPickle.load(open("items_index.pkl", 'rb'))

n_users = len(users_index)
n_items = len(items_index)
    
#用户-物品关系矩阵R
user_item_scores = sio.mmread("user_item_scores").todense()
    
#倒排表
##每个用户播放的歌曲
user_items = cPickle.load(open("user_items.pkl", 'rb'))
##事件参加的用户
item_users = cPickle.load(open("item_users.pkl", 'rb'))

#所有用户之间的相似度
similarity_matrix_users = cPickle.load(open("users_similarity_played.pkl", 'rb'))

#所有item之间的相似度
similarity_matrix_items = cPickle.load(open("items_similarity_played.pkl", 'rb'))

#svd模型
load_json('svd_model.json')

#每个用户的平均打分
users_mu = cPickle.load(open("users_mu.pkl", 'rb'))


#训练好的svd模型
#load_json('svd_model.json')
with open('svd_model.json') as file:
    dict_ = json.load(file)

    mu = dict_['mu']
    K = dict_['K']

    bi = np.asarray(dict_['bi'])
    bu = np.asarray(dict_['bu'])
    
    qi = np.asarray(dict_['qi'])
    pu = np.asarray(dict_['pu'])

## 3. 根据模型，预测用户对item的打分
不同的推荐算法，只是预测打分函数不同，
user_items_scores[i] = user_CF_pred(cur_user_id, i)  #预测打分

In [18]:
#user：用户
#返回推荐items及其打分（DataFrame）

N_KNNS = 10

def recommend(user):
    cur_user_id = users_index[user]
    
    #训练集中该用户打过分的item
    cur_user_items = user_items[cur_user_id]

    #该用户对所有item的打分
    user_items_scores = np.zeros(n_items)

    #预测打分
    for i in range(n_items):  # all items 
        if i not in cur_user_items: #训练集中没打过分
            #user_items_scores[i] = user_CF_pred(cur_user_id, i, similarity_matrix_users)  #预测打分
            #user_items_scores[i] = item_CF_pred(cur_user_id, i, similarity_matrix_items, N_KNNS)  #预测打分
            user_items_scores[i] = svd_CF_pred(cur_user_id, i)  #预测打分
    
    #推荐
    #Sort the indices of user_item_scores based upon their value，Also maintain the corresponding score
    sort_index = sorted(((e,i) for i,e in enumerate(list(user_items_scores))), reverse=True)
    
    #Create a dataframe from the following
    columns = ['item_id', 'score']
    df = pd.DataFrame(columns=columns)
         
    #Fill the dataframe with top 20 (n_rec_items) item based recommendations
    #sort_index = sort_index[0:n_rec_items]
    #Fill the dataframe with all items based recommendations
    for i in range(0,len(sort_index)):
        cur_item_index = sort_index[i][1] 
        cur_item = list (items_index.keys()) [list (items_index.values()).index (cur_item_index)]
            
        if ~np.isnan(sort_index[i][0]) and cur_item_index not in cur_user_items:
            df.loc[len(df)]=[cur_item, sort_index[i][0]]
    
    return df

In [19]:
#recommend('b21e1b6b14b7b3b8b8e683e82ede0e59ad64e9f7')

##  4. 读取测试数据

In [20]:
dpath = './data/'
df_triplet_test = pd.read_csv( dpath  + 'triplet_dataset_sub_test.csv')

## 5. 测试，并计算评价指标
PR、覆盖度、RMSE

In [None]:
#统计总的用户
unique_users_test = df_triplet_test['user'].unique()

#为每个用户推荐的item的数目
N_RS_ITEMS = 10


#性能评价参数初始化，用户计算Percison和Recall
n_hits = 0
n_total_rec_items = 0
n_test_items = 0

#所有被推荐商品的集合（对不同用户），用于计算覆盖度
all_rec_items = set()

#残差平方和，用与计算RMSE
rss_test = 0.0

#对每个测试用户
for user in unique_users_test:
    #测试集中该用户打过分的电影（用于计算评价指标的真实值）
    if user not in users_index:   #user在训练集中没有出现过，新用户不能用协同过滤
        print(str(user) + ' is a new user.\n')
        continue
   
    #测试集真实值
    df_user_records_test= df_triplet_test[df_triplet_test.user == user]
    
    #对每个测试用户，计算该用户对训练集中未出现过的商品的打分，并基于该打分进行推荐（top N_RS_ITEMS）
    #返回结果为DataFrame
    df_rec_items = recommend(user)
    for i in range(N_RS_ITEMS):
        item = df_rec_items.iloc[i]['item_id']
        
        if item in df_user_records_test['song'].values:
            n_hits += 1
        all_rec_items.add(item)
    
    #计算rmse
    #对测试集中的每条记录，计算真实值与预测之间的RMSE
    for i in range(df_user_records_test.shape[0]):
        item = df_user_records_test.iloc[i]['song']
        score = df_user_records_test.iloc[i]['fractional_play_count']
        
        df1 = df_rec_items[df_rec_items.item_id == item]
        if(df1.shape[0] == 0): #item不在推荐列表中，可能是新item在训练集中没有出现过，或者该用户已经打过分新item不能被协同过滤推荐
            print(str(item) + ' is a new item or  user \n')
            continue
        pred_score = df1['score'].values[0]
        
        rss_test += (pred_score - score)**2     #残差平方和
    
    #推荐的item总数
    n_total_rec_items += N_RS_ITEMS
    
    #真实item的总数
    n_test_items += df_user_records_test.shape[0]

#Precision & Recall
precision = n_hits / (1.0*n_total_rec_items)
recall = n_hits / (1.0*n_test_items)

#覆盖度：推荐商品占总需要推荐商品的比例
coverage = len(all_rec_items) / (1.0* n_items)

#打分的均方误差
rmse=np.sqrt(rss_test / df_triplet_test.shape[0])  

de27b74444dae039f76e421362c6a914da9f8b41 is a new user.

467e0e46181933c7e1a936e513ca55fbab4edaed is a new user.



In [55]:
precision

0.019425444596443228

In [56]:
recall

0.018933333333333333

In [57]:
coverage

0.10375

In [58]:
rmse

0.054327201333121231

           Precision       Recall         Coverage     RMSE
user_CF    0.010534        0.010267       0.35375      0.050265
item_CF    0.018878        0.0184         0.99375      0.051091
svd_CF     0.019425        0.018933       0.10375      0.054327            