#  MSD 歌曲推荐——协同过滤推荐预处理
1. 隐式播放次数 --> 显式打分
2. 训练数据/测试数据分割
3. 对训练数据，建立倒排表，比实时查询数据库快
4. 计算每个用户的平均打分
5. 对训练数据，预计算所有用户之间的相似度,保存用户相似度矩阵
6. 对训练数据，预计算所有物品之间的相似度，保存物品相似度矩阵
7. 用训练数据训练SVD模型，保存SVD模型

In [3]:
# -*- coding:utf-8 -*-
import sys
# reload(sys)
# sys.setdefaultencoding("utf-8")

## 工具包

In [5]:
import pandas as pd
import numpy as np

from collections import defaultdict
import scipy.sparse as ss

#保存数据
import _pickle as cPickle
import scipy.io as sio

#距离
import scipy.spatial.distance as ssd

from numpy.random import random

%matplotlib inline

## 读取数据
用户(800)、歌曲（800）及其播放次数

In [6]:
# Directory holding the input CSV; the train/test splits are written here too.
dpath = './data/'

# Load the (user, song, play_count) triplets of the sub-sampled dataset.
triplet_csv = dpath + 'triplet_dataset_sub.csv'
df_triplet = pd.read_csv(triplet_csv)

## 1. 隐式反馈 --> 打分

In [7]:
# Total play count per user (one row per user).
user_totals = (
    df_triplet[['user', 'play_count']]
    .groupby('user')
    .sum()
    .reset_index()
    .rename(columns={'play_count': 'total_play_count'})
)

# Attach each user's total to every one of their rows, then express each
# song's play count as a fraction of that total; this fraction is the
# explicit "rating" used by the rest of the notebook.
df_triplet = df_triplet.merge(user_totals)
df_triplet['fractional_play_count'] = (
    df_triplet['play_count'] / df_triplet['total_play_count']
)

# The per-user totals are no longer needed once merged.
del user_totals

In [8]:
df_triplet.head()

Unnamed: 0,user,song,play_count,total_play_count,fractional_play_count
0,4e11f45d732f4861772b2906f81a7d384552ad12,SOCKSGZ12A58A7CA4B,1,259,0.003861
1,4e11f45d732f4861772b2906f81a7d384552ad12,SOCVTLJ12A6310F0FD,1,259,0.003861
2,4e11f45d732f4861772b2906f81a7d384552ad12,SODLLYS12A8C13A96B,3,259,0.011583
3,4e11f45d732f4861772b2906f81a7d384552ad12,SOEGIYH12A6D4FC0E3,1,259,0.003861
4,4e11f45d732f4861772b2906f81a7d384552ad12,SOFRQTD12A81C233C0,2,259,0.007722


## 2. 训练数据/测试数据分割

In [9]:
from sklearn.model_selection import train_test_split

# Split the row labels (not the rows themselves) so that the same partition
# can be applied to df_triplet afterwards. 80% of the labels go to training;
# the fixed random_state makes the split reproducible.
total_index = df_triplet.index
train_index, test_index = train_test_split(
    total_index,
    train_size=0.8,
    random_state=7,
)



In [11]:
# Select the rows of each partition.
# NOTE(review): .iloc is positional while train_index/test_index hold index
# labels; this presumably relies on df_triplet having a default 0..n-1 index.
df_triplet_train = df_triplet.iloc[train_index]
df_triplet_test = df_triplet.iloc[test_index]

# Persist both partitions next to the input data.
for split_name, split_df in (('train', df_triplet_train),
                             ('test', df_triplet_test)):
    split_df.to_csv(dpath + 'triplet_dataset_sub_%s.csv' % split_name)

## 3. 对训练数据，事先计算好倒排表，比实时查询数据库快
用户和item重新建索引

In [12]:
# Distinct users and songs that appear in the training split.
users = list(df_triplet_train['user'].unique())
items = list(df_triplet_train['song'].unique())
n_users = len(users)
n_items = len(items)

print("number of Users :%d" % n_users)
print("number of Songs :%d" % n_items)

# Inverted indexes:
#   user_items[u] -> set of item indexes the user has played
#   item_users[i] -> set of user indexes that have played the item
user_items = defaultdict(set)
item_users = defaultdict(set)

# Sparse user x item matrix holding the fractional play count of each pair.
user_item_scores = ss.dok_matrix((n_users, n_items))

# Map the raw user ids / song ids to dense 0-based indexes.
users_index = {u: i for i, u in enumerate(users)}
items_index = {e: i for i, e in enumerate(items)}

n_records = df_triplet_train.shape[0]

# Walk the three columns in lockstep instead of calling .iloc[i] three times
# per record, which builds a fresh row object on every access.
for user, song, score in zip(df_triplet_train['user'],
                             df_triplet_train['song'],
                             df_triplet_train['fractional_play_count']):
    user_index_i = users_index[user]
    item_index_i = items_index[song]

    user_items[user_index_i].add(item_index_i)    # songs played by this user
    item_users[item_index_i].add(user_index_i)    # users who played this song

    user_item_scores[user_index_i, item_index_i] = score

# Persist the inverted indexes. Each file is opened in a `with` block so the
# handle is flushed and closed deterministically.
with open("user_items.pkl", 'wb') as f:
    cPickle.dump(user_items, f)
with open("item_users.pkl", 'wb') as f:
    cPickle.dump(item_users, f)

# Persist the user-item score matrix in Matrix Market format for later use.
sio.mmwrite("user_item_scores", user_item_scores)

# Persist the user id -> index table.
with open("users_index.pkl", 'wb') as f:
    cPickle.dump(users_index, f)
# Persist the song id -> index table.
with open("items_index.pkl", 'wb') as f:
    cPickle.dump(items_index, f)

number of Users :786
number of Songs :800


## 4. 计算每个用户的平均打分 和所有用户的平均打分

In [13]:
# Mean score of every user over the items that user has scored.
users_mu = np.zeros(n_users)
for u in range(n_users):
    rated_items = user_items[u]   # items this user has a score for
    r_acc = 0.0
    for i in rated_items:
        r_acc += user_item_scores[u, i]

    # Every user index comes from a training row, so rated_items is non-empty.
    users_mu[u] = r_acc / len(rated_items)

# `with` guarantees the pickle file is flushed and closed.
with open("users_mu.pkl", 'wb') as f:
    cPickle.dump(users_mu, f)

# Global mean score over all training records.
mu = df_triplet_train['fractional_play_count'].mean()  #average rating
with open("mu.pkl", 'wb') as f:
    cPickle.dump(mu, f)

## 5.  预先计算好所有用户之间的相似度

### 5.1.1 计算两个用户之间的相似度
以播放比例为特征

In [14]:
def user_similarity_playcount(uid1, uid2 ):
    """Mean-centred cosine similarity between two users.

    Only items scored by BOTH users are considered. Each score is centred by
    subtracting the corresponding user's mean score (``users_mu``) before the
    cosine similarity is taken.

    Args:
        uid1: index of the first user.
        uid2: index of the second user.

    Returns:
        The similarity as a float; 0.0 when the users share no item or when
        the cosine is undefined (NaN).
    """
    # Items scored by both users, visited in uid1's iteration order.
    common = [item for item in user_items[uid1] if item in user_items[uid2]]

    if not common:
        # No co-rated item: treat the users as unrelated.
        return 0.0

    mean1 = users_mu[uid1]
    mean2 = users_mu[uid2]

    # Mean-centred scores of each user on the common items.
    centred1 = np.array([user_item_scores[uid1, item] - mean1 for item in common])
    centred2 = np.array([user_item_scores[uid2, item] - mean2 for item in common])

    similarity = 1 - ssd.cosine(centred1, centred2)

    # A zero-norm vector (all scores equal to the user's mean) yields NaN.
    return 0.0 if np.isnan(similarity) else similarity

### 5.1.2 计算两个用户之间的相似度
以是否播放过歌曲为特征

In [15]:
def user_similarity_played(uid1, uid2 ):
    """Jaccard similarity between the sets of songs played by two users.

    Args:
        uid1: index of the first user.
        uid2: index of the second user.

    Returns:
        ``len(A & B) / len(A | B)`` as a float, where A and B are the song
        sets of the two users, or 0 when the users share no song.
    """
    songs1 = user_items[uid1]   # songs played by uid1
    songs2 = user_items[uid2]   # songs played by uid2

    shared = songs1 & songs2
    if len(shared) == 0:
        # Nothing in common (also covers two empty sets).
        return 0

    either = songs1 | songs2
    return float(len(shared)) / float(len(either))

### 5.2 计算好所有用户之间的相似性
对用户比较少、用户比较固定的系统适用

In [16]:
# Dense, symmetric user x user similarity matrix (Jaccard on played songs).
# Kept as np.matrix so the pickled object has the same type as before.
users_similarity_matrix = np.matrix(np.zeros(shape=(n_users, n_users)), float)

for ui in range(n_users):
    # A user is fully similar to itself.
    users_similarity_matrix[ui,ui] = 1.0

    # Progress indicator.
    if(ui % 100 == 0):
        print ("ui=%d " % (ui))

    # Only the pairs with uj > ui are computed; the value is mirrored.
    for uj in range(ui+1,n_users):
        sim = user_similarity_played(ui, uj)
        users_similarity_matrix[uj,ui] = sim
        users_similarity_matrix[ui,uj] = sim

# `with` guarantees the pickle file is flushed and closed.
with open("users_similarity_played.pkl", 'wb') as f:
    cPickle.dump(users_similarity_matrix, f)

ui=0 
ui=100 
ui=200 
ui=300 
ui=400 
ui=500 
ui=600 
ui=700 


## 6. 事先计算好所有item之间的相似性

### 6.1.1 计算两个item之间的相似度
以播放次数/播放比例为特征

In [17]:
def item_similarity_playcount(iid1, iid2):
    """Mean-centred cosine similarity between two items.

    Only users who scored BOTH items are considered. Each score is centred by
    subtracting the scoring user's mean (``users_mu``) before the cosine
    similarity is taken.

    Args:
        iid1: index of the first item.
        iid2: index of the second item.

    Returns:
        The similarity; 0 when no user scored both items, 0.0 when the
        cosine is undefined (NaN).
    """
    # Users who scored both items, visited in iid1's iteration order.
    common = [user for user in item_users[iid1] if user in item_users[iid2]]

    if not common:
        # No co-rating user: treat the items as unrelated.
        return 0

    # Mean-centred scores each common user gave to the two items.
    centred1 = np.array([user_item_scores[user, iid1] - users_mu[user] for user in common])
    centred2 = np.array([user_item_scores[user, iid2] - users_mu[user] for user in common])

    similarity = 1 - ssd.cosine(centred1, centred2)

    # A zero-norm vector makes the cosine undefined (NaN).
    return 0.0 if np.isnan(similarity) else similarity

### 6.1.2 计算两个item之间的相似度
以是否播放为特征
比以播放次数为特征计算快

In [18]:
def item_similarity_played(iid1, iid2 ):
    """Jaccard similarity between the sets of users who played two items.

    Args:
        iid1: index of the first item.
        iid2: index of the second item.

    Returns:
        ``len(A & B) / len(A | B)`` as a float, where A and B are the user
        sets of the two items, or 0 when no user played both.
    """
    listeners1 = item_users[iid1]   # users who played iid1
    listeners2 = item_users[iid2]   # users who played iid2

    shared = listeners1 & listeners2
    if len(shared) == 0:
        # No common listener (also covers two empty sets).
        return 0

    either = listeners1 | listeners2
    return float(len(shared)) / float(len(either))

### 6.2 计算所有item之间的相似性
对item比较少、Item比较固定的系统适用

In [19]:
# Dense, symmetric item x item similarity matrix (Jaccard on listeners).
# Kept as np.matrix so the pickled object has the same type as before.
items_similarity_matrix = np.matrix(np.zeros(shape=(n_items, n_items)), float)

for i in range(n_items):
    # An item is fully similar to itself.
    items_similarity_matrix[i,i] = 1.0

    # Progress indicator.
    if(i % 100 == 0):
        print ("i=%d " % (i) )

    # Only the pairs with j > i are computed; the value is mirrored.
    for j in range(i+1,n_items):
        sim = item_similarity_played(i, j)
        items_similarity_matrix[j,i] = sim
        items_similarity_matrix[i,j] = sim

# `with` guarantees the pickle file is flushed and closed.
with open("items_similarity_played.pkl", 'wb') as f:
    cPickle.dump(items_similarity_matrix, f)

i=0 
i=100 
i=200 
i=300 
i=400 
i=500 
i=600 
i=700 


## 7. SVD模型训练

### 7.1 模型初始化

In [20]:
# Number of latent factors.
K = 40

# Bias terms of items and users, one column vector each, starting at zero.
bi = np.zeros((n_items, 1))
bu = np.zeros((n_users, 1))

# Latent factor matrices, one K-dimensional row per user / item. Every entry
# is a uniform [0, 1) sample scaled by sqrt(K) / 10. The user factors are
# drawn first and the item factors second, in row-major order, which is the
# same order a row-by-row fill would consume the random stream.
pu = random((n_users, K)) / 10 * (np.sqrt(K))
qi = random((n_items, K)) / 10 * (np.sqrt(K))

### 7.2 根据当前参数，预测用户uid对Item（iid）的打分

In [21]:
def svd_pred(uid, iid):
    """Predict the score of user ``uid`` for item ``iid`` with the current model.

    The prediction is the global mean plus the item bias, the user bias and
    the inner product of the item and user latent vectors.

    Args:
        uid: user index (row of ``bu`` / ``pu``).
        iid: item index (row of ``bi`` / ``qi``).

    Returns:
        The predicted score. Because ``bi[iid]`` and ``bu[uid]`` are rows of
        column vectors, the result is a one-element array.
    """
    interaction = np.sum(qi[iid] * pu[uid])
    score = mu + bi[iid] + bu[uid] + interaction

    # No clipping is applied: the targets are fractional play counts, not
    # ratings on a 1-5 scale.
    return score

### 7.3 模型训练

In [22]:
#gamma：为学习率
#Lambda：正则参数
#steps：迭代次数

# steps:  number of passes over the training data
# gamma:  learning rate (decayed after every pass)
# Lambda: L2 regularisation strength
steps=50
gamma=0.04
Lambda=0.15

# Total number of training records.
n_records = df_triplet_train.shape[0]

# Resolve every record to (user index, item index, rating) once, instead of
# doing three .iloc row lookups per sample in every pass.
train_uids = [users_index[u] for u in df_triplet_train['user']]
train_iids = [items_index[s] for s in df_triplet_train['song']]
train_ratings = df_triplet_train['fractional_play_count'].values

for step in range(steps):
    print ('The ' + str(step) + '-th  step is running' )
    rmse_sum=0.0

    # Visit the training samples in a fresh random order on every pass.
    kk = np.random.permutation(n_records)
    for line in kk:
        # One training sample per SGD update.
        uid = train_uids[line]
        iid = train_iids[line]
        rating = train_ratings[line]

        # Prediction residual.
        eui = rating - svd_pred(uid, iid)
        # Accumulate the squared residual for the per-pass RMSE.
        rmse_sum += eui**2

        # Stochastic gradient step on the biases.
        bu[uid] += gamma * (eui - Lambda * bu[uid])
        bi[iid] += gamma * (eui - Lambda * bi[iid])

        # qi[iid] is a view into qi; copy it so the user-factor update below
        # uses the item factors from BEFORE this step's in-place update.
        temp = qi[iid].copy()
        qi[iid] += gamma * (eui* pu[uid]- Lambda*qi[iid] )
        pu[uid] += gamma * (eui* temp - Lambda*pu[uid])

    # Decay the learning rate after each pass.
    gamma=gamma*0.93
    print ("the rmse of this step on train data is ",np.sqrt(rmse_sum/n_records))

The 0-th  step is running
the rmse of this step on train data is  [0.88016747]
The 1-th  step is running
the rmse of this step on train data is  [0.1440474]
The 2-th  step is running
the rmse of this step on train data is  [0.09634916]
The 3-th  step is running
the rmse of this step on train data is  [0.08015438]
The 4-th  step is running
the rmse of this step on train data is  [0.0717879]
The 5-th  step is running
the rmse of this step on train data is  [0.06604374]
The 6-th  step is running
the rmse of this step on train data is  [0.06218259]
The 7-th  step is running
the rmse of this step on train data is  [0.05910418]
The 8-th  step is running
the rmse of this step on train data is  [0.05652856]
The 9-th  step is running
the rmse of this step on train data is  [0.05445164]
The 10-th  step is running
the rmse of this step on train data is  [0.05273756]
The 11-th  step is running
the rmse of this step on train data is  [0.05136268]
The 12-th  step is running
the rmse of this step on 

### 7.4 保存模型参数

In [29]:
# A method for saving object data to JSON file
import json
def save_json(filepath):
    """Write the current SVD model parameters to ``filepath`` as JSON.

    The module-level parameters ``mu``, ``K``, ``bi``, ``bu``, ``qi`` and
    ``pu`` are stored under keys of the same names; the arrays are converted
    to nested lists so they can be serialised.

    Args:
        filepath: path of the JSON file to create or overwrite.
    """
    model = {
        'mu': mu,
        'K': K,
        'bi': bi.tolist(),
        'bu': bu.tolist(),
        'qi': qi.tolist(),
        'pu': pu.tolist(),
    }

    # Serialise first, then write the text in one go.
    payload = json.dumps(model)
    with open(filepath, 'w') as out:
        out.write(payload)

In [30]:
# A method for loading data from JSON file
def load_json(filepath):
    """Load SVD model parameters from a JSON file written by ``save_json``.

    The loaded values are rebound to the module-level names ``mu``, ``K``,
    ``bi``, ``bu``, ``qi`` and ``pu`` (the ones ``svd_pred`` and ``save_json``
    read), and also returned. Binding them to locals only would discard them
    when the function returns.

    Args:
        filepath: path of the JSON file to read.

    Returns:
        A dict with keys ``'mu'``, ``'K'``, ``'bi'``, ``'bu'``, ``'qi'`` and
        ``'pu'``; the last four are NumPy arrays.

    Raises:
        KeyError: if the file lacks one of the expected keys.
    """
    global mu, K, bi, bu, qi, pu

    with open(filepath, 'r') as file:
        dict_ = json.load(file)

    mu = dict_['mu']
    K = dict_['K']

    # Nested lists back to arrays.
    bi = np.asarray(dict_['bi'])
    bu = np.asarray(dict_['bu'])

    qi = np.asarray(dict_['qi'])
    pu = np.asarray(dict_['pu'])

    return {'mu': mu, 'K': K, 'bi': bi, 'bu': bu, 'qi': qi, 'pu': pu}

In [31]:
# Write the trained SVD parameters to svd_model.json (relative path).
save_json('svd_model.json')