In [19]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn import model_selection, metrics
import time

In [2]:
df = pd.read_csv('/Users/JQC/Desktop/ml-100k/u.data', sep='\t', \
                 names=['user_id','item_id','rating','timestamp'])
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [16]:
df.shape

(100000, 4)

In [4]:
n_users = df.user_id.nunique()
n_items = df.item_id.nunique()
n_users, n_items

(943, 1682)

In [9]:
[*df.itertuples()][:10]

[Pandas(Index=0, user_id=196, item_id=242, rating=3, timestamp=881250949),
 Pandas(Index=1, user_id=186, item_id=302, rating=3, timestamp=891717742),
 Pandas(Index=2, user_id=22, item_id=377, rating=1, timestamp=878887116),
 Pandas(Index=3, user_id=244, item_id=51, rating=2, timestamp=880606923),
 Pandas(Index=4, user_id=166, item_id=346, rating=1, timestamp=886397596),
 Pandas(Index=5, user_id=298, item_id=474, rating=4, timestamp=884182806),
 Pandas(Index=6, user_id=115, item_id=265, rating=2, timestamp=881171488),
 Pandas(Index=7, user_id=253, item_id=465, rating=5, timestamp=891628467),
 Pandas(Index=8, user_id=305, item_id=451, rating=3, timestamp=886324817),
 Pandas(Index=9, user_id=6, item_id=86, rating=3, timestamp=883603013)]

In [11]:
df.user_id.unique().max(), df.item_id.unique().max()

(943, 1682)

In [12]:
user_item_matrix = np.zeros((n_users, n_items))
# 把用户所有的评分的电影对应到矩阵中
for line in df.itertuples():
    user_item_matrix[line[1]-1,line[2]-1] = line[3]

In [13]:
user_item_matrix[:10, :10]

array([[5., 3., 4., 3., 3., 5., 4., 1., 5., 3.],
       [4., 0., 0., 0., 0., 0., 0., 0., 0., 2.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [4., 3., 0., 0., 0., 0., 0., 0., 0., 0.],
       [4., 0., 0., 0., 0., 0., 2., 4., 4., 0.],
       [0., 0., 0., 5., 0., 0., 5., 5., 5., 4.],
       [0., 0., 0., 0., 0., 0., 3., 0., 0., 0.],
       [0., 0., 0., 0., 0., 5., 4., 0., 0., 0.],
       [4., 0., 0., 4., 0., 0., 4., 0., 4., 0.]])

In [25]:
user_item_matrix.shape

(943, 1682)

In [14]:
def LFM_grad_desc(R, K=2, max_iter=1000, alpha=0.0001, lamda=0.002 ):
    """
    LFM随机梯度下降求解
    R 为用户~评分矩阵
    K 隐含特征向量维度
    max_iter 最大学习次数
    alpha  步长 学习率
    lamda 正则化系数
    """
    M = R.shape[0]
    N = R.shape[1]
    
    # 随机生成
    P = np.random.rand(M, K)
    Q = np.random.rand(N, K)
    Q = Q.T
    
    # 开始迭代
    for step in range(max_iter):
        # 对所有的用户和物品进行遍历
        for u in range(M):
            for i in range(N):
                # 取出当前用户有评分的数据 计算偏差
                if R[u][i] > 0:
                    # 预测值和真实值的偏差
                    error_ui = np.dot(P[u, :], Q[:, i]) - R[u][i]
                    # 按照随机梯度下降算法更新 Pu Qi
                    for k in range(K):
                        P[u][k] = P[u][k] - alpha *(2 * error_ui * Q[k][i] + 2 * lamda * P[u][k]) 
                        Q[k][i] = Q[k][i] - alpha * (2 * error_ui * P[u][k] + 2 * lamda * Q[k][i])
        # 所有的u,i遍历完, 所有的特征向量更新完成, 得到P, Q,计算预测评分矩阵
#         pred_r = np.dot(P, Q)
        cost = 0
        # 计算当前损失
        for u in range(M):
            for i in range(N):
                if R[u][i] > 0:
                    cost += (np.dot(P[u, :], Q[:, i]) - R[u][i]) ** 2
                    # 加上正则项
                    for k in range(K):
                        cost += lamda * (P[u][k] ** 2 + Q[k][i] ** 2)
        if cost < 0.0001:
            break
                # 计算当前损失函数
    return P, Q.T, cost
    

In [21]:
start_time = time.time()

K = 5
max_iter = 5000
alpha = 0.0002
lamda = 0.004

# user_item_matrix 数据量太大 只取前10个参加运算
P, Q, cost = LFM_grad_desc(user_item_matrix[:10, :10], K, max_iter, alpha, lamda)

stop_time = time.time()

print ('Optimization time : ', (stop_time - start_time)/60., 'minutes')
print('cost', cost)

Optimization time :  0.11092429558436076 minutes
cost 1.352299971502676


In [23]:
# pd.DataFrame({'user_id': np.array(range(1, 11)), 'item_id': })
pd.DataFrame(user_item_matrix[:10, :10])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0
1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,4.0,0.0
6,0.0,0.0,0.0,5.0,0.0,0.0,5.0,5.0,5.0,4.0
7,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,0.0
9,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,0.0


In [24]:
pred_r = np.dot(P, Q.T)
pd.DataFrame(pred_r)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,4.951419,3.046813,3.996317,2.9971,2.996804,4.991639,4.001093,1.01344,4.984624,3.001425
1,3.865357,2.993616,2.786902,3.284245,2.601122,4.212949,3.706697,3.171575,3.690556,2.176123
2,2.683202,2.099226,1.601583,2.580292,2.354685,2.741286,2.116567,2.763645,2.412367,2.000967
3,2.838995,1.873842,2.177975,2.370572,1.716413,2.858592,2.79655,1.139803,2.609502,1.49547
4,4.050629,2.924564,2.709017,3.172211,3.171571,4.10144,2.975482,2.792371,3.858503,2.774708
5,3.938175,3.322656,2.325576,2.698563,3.47497,4.227492,2.054192,4.005664,3.996053,2.713161
6,5.547954,4.190762,3.688158,5.084573,4.351623,5.787891,4.904502,4.974183,5.109473,3.869004
7,3.190696,2.489418,2.118979,3.054255,2.440935,3.38488,3.00248,3.072616,2.906734,2.079657
8,4.466339,3.646772,3.152568,3.559327,3.08663,4.999545,3.98984,4.101666,4.403317,2.474611
9,4.171672,3.191028,2.894254,3.881194,3.037162,4.455667,4.058509,3.740186,3.849437,2.666552
