In [1]:
import gc
import heapq
import os
import time
from operator import itemgetter
from queue import PriorityQueue

import numpy as np
import pandas as pd
import sklearn
import torch
from sklearn.model_selection import KFold



# Root of the local MovieLens download. The original hard-coded location stays
# the default; set the MOVIELENS_DIR environment variable to run the notebook
# against a copy stored somewhere else.
path = os.environ.get("MOVIELENS_DIR", "D:/kaggle/MovieLens/")
if not path.endswith(("/", "\\")):
    path += "/"
path2 = path + "ml-latest-small/"

# Only the first two CSV columns are read (displayed as userId and movieId in
# the next cell); the rating values themselves are not used.
# Alternative input kept from the original notebook (presumably the full data set):
#data = pd.read_csv( path+"rating.csv", usecols=[0,1])
data = pd.read_csv(path2 + "ratings.csv", usecols=[0, 1])

In [2]:
# Sanity check of the loaded frame: (rows, columns), then the first rows.
print(data.shape)
data.head()

(100836, 2)


Unnamed: 0,userId,movieId
0,1,1
1,1,3
2,1,6
3,1,47
4,1,50


In [3]:
# Drop sparse data: keep movies that have more than MIN_INTERACTIONS users, then
# keep users that still have more than MIN_INTERACTIONS movies among the
# remaining rows.
#
# The helper count columns are attached with groupby().transform('count')
# rather than merge(): merging a second time would create user_count_x /
# user_count_y and make the filter fail with a KeyError, so the merge-based
# version of this cell could only ever be executed once per kernel.
MIN_INTERACTIONS = 3

data = data.assign(user_count=data.groupby('movieId')['userId'].transform('count'))
data = data[data['user_count'] > MIN_INTERACTIONS]
data = data.assign(movie_count=data.groupby('userId')['movieId'].transform('count'))
data = data[data['movie_count'] > MIN_INTERACTIONS].reset_index(drop=True)

In [4]:
# Number of distinct values per column after filtering (ids and the two helper count columns).
data.nunique()

userId          610
movieId        4180
user_count      174
movie_count     257
dtype: int64

In [5]:
# Shuffled 8-fold split over the interaction rows; only the first fold is used
# as the train/test partition for the rest of the notebook.
kf = KFold(n_splits=8, shuffle=True, random_state=22)
train_index, valid_index = next(kf.split(data))
train = data.iloc[train_index]
test = data.iloc[valid_index]

# user id -> set of movie ids in the training rows
user_items = {uid: set(rows['movieId']) for uid, rows in train.groupby("userId")}

# movie id -> set of user ids in the training rows
item_users = {mid: set(rows['userId']) for mid, rows in train.groupby("movieId")}

# user id -> set of held-out movie ids
test_user_items = {uid: set(rows['movieId']) for uid, rows in test.groupby("userId")}

In [6]:
def Recall_Precision(obj, test, topN):
    """Compute recall and precision of top-N recommendations.

    Parameters
    ----------
    obj : object
        Recommender exposing ``recommend(user, length=...)`` that returns an
        iterable of ``(item, score)`` pairs.
    test : dict
        Maps each user to the set of held-out items of that user.
    topN : int
        Number of recommendations requested per user.

    Returns
    -------
    tuple
        ``(recall, precision)`` where recall is hits / number of held-out
        items and precision is hits / (number of users * topN). A ratio whose
        denominator is zero (empty ``test`` or ``topN == 0``) is reported as
        0.0 instead of raising ZeroDivisionError.
    """
    hit = 0
    n_recall = 0
    n_precision = 0
    for user, true in test.items():
        rank = {it[0] for it in obj.recommend(user, length=topN)}
        hit += len(true & rank)
        n_recall += len(true)
        # Precision is measured against the requested list length, even when
        # the recommender returns fewer items.
        n_precision += topN
    recall = hit / n_recall if n_recall else 0.0
    precision = hit / n_precision if n_precision else 0.0
    return recall, precision


def Coverage(obj, test, topN):
    """Ratio of distinct recommended items to distinct items in ``test``.

    Parameters
    ----------
    obj : object
        Recommender exposing ``recommend(user, length=...)`` that returns an
        iterable of ``(item, score)`` pairs.
    test : dict
        Maps each user to a collection of items; its users are the ones
        recommendations are generated for, and the union of its items is the
        denominator.
    topN : int
        Number of recommendations requested per user.

    Returns
    -------
    float
        ``len(recommended items) / len(items in test)``. The recommended items
        are not restricted to the items in ``test``, so the value can exceed 1.
        Returns 0.0 when ``test`` contains no items instead of raising
        ZeroDivisionError.
    """
    recommend_items = set()
    all_items = set()
    for user, items in test.items():
        all_items.update(items)
        recommend_items.update(it[0] for it in obj.recommend(user, length=topN))
    if not all_items:
        return 0.0
    return len(recommend_items) / len(all_items)

def Popularity(obj, train, topN):
    """Average log-popularity of the recommended items.

    If the recommended items are all very popular the novelty of the
    recommendations is low; a lower value means more novel recommendations.

    Parameters
    ----------
    obj : object
        Recommender exposing ``recommend(user, length=...)`` that returns an
        iterable of ``(item, score)`` pairs.
    train : dict
        Maps each user to the collection of items of that user. An item's
        popularity is the number of users whose collection contains it.
    topN : int
        Number of recommendations requested per user.

    Returns
    -------
    float
        Mean of ``log(1 + popularity)`` over all recommended (user, item)
        pairs. Items absent from ``train`` count as popularity 0 instead of
        raising KeyError, and 0.0 is returned when nothing was recommended
        instead of raising ZeroDivisionError.
    """
    item_popularity = dict()
    for items in train.values():
        for item in items:
            item_popularity[item] = item_popularity.get(item, 0) + 1

    total = 0.0
    n = 0
    for user in train:
        recommended = {it[0] for it in obj.recommend(user, length=topN)}
        for item in recommended:
            total += np.log(1 + item_popularity.get(item, 0))
            n += 1

    return total / n if n else 0.0

In [7]:
class LFM(object):
    """Latent factor model for top-N recommendation from implicit feedback.

    Every user and every item gets a vector of ``factors`` numbers; the score
    of a (user, item) pair is the dot product of the two vectors. Observed
    interactions are positive samples (label 1) and negatives (label 0) are
    sampled from the most popular items the user has not interacted with.

    After training, ``P`` maps user id -> factor vector and ``Q`` maps item id
    -> factor vector, so ids do not have to be small consecutive integers.

    Parameters
    ----------
    ratio : int
        Number of negative samples drawn per positive sample. According to the
        original author this has the largest impact on performance.
    factors : int
        Length of the latent vectors.
    epoch : int
        Number of passes over all users.
    alpha : float
        Gradient descent step size; multiplied by 0.9 after every epoch.
    lambda_ : float
        L2 regularization weight. Used by ``train`` only; ``train_batch`` does
        not apply it.
    seed : int or None
        Seed for negative sampling and factor initialization. ``None`` (the
        default) keeps the runs non-deterministic.
    """

    def __init__(self, ratio=2, factors=20, epoch=10, alpha=0.1, lambda_=0.01, seed=None):
        self.ratio = ratio                 # negatives per positive sample
        self.factors = factors
        self.epoch = epoch
        self.alpha = alpha                 # gradient descent step size
        self.lambda_ = lambda_             # regularization parameter
        self.istorch = False               # True when P/Q hold torch tensors
        self.seed = seed
        self.rng = np.random.default_rng(seed)

    def Random_Select_Negative_Sample(self, hist_items):
        """Build the labelled sample set for one user.

        Returns a dict ``item -> label`` holding every item of ``hist_items``
        with label 1 plus up to ``len(hist_items) * ratio`` negatives with
        label 0. Negatives are taken from the head of ``self.items_pool``
        (most popular first); each candidate is kept with probability 0.9 and
        items already present in the sample are skipped. Requires
        ``init_items_pool`` to have been called.
        """
        items_pool = self.items_pool
        all_sample = {item: 1 for item in hist_items}

        n_wanted = int(len(hist_items) * self.ratio)
        if n_wanted <= 0:
            return all_sample

        # Look at three times as many candidates as needed, capped by the pool size.
        n_candidates = min(3 * n_wanted, len(items_pool))
        keep = self.rng.random(n_candidates) > 0.1

        n = 0
        for item in items_pool[:n_candidates][keep]:
            if item in all_sample:
                continue
            all_sample[item] = 0
            n += 1
            if n >= n_wanted:
                break

        return all_sample

    def init_items_pool(self):
        """Store the item ids ordered by decreasing number of users in ``self.items_pool``."""
        by_popularity = sorted(
            self.item_users,
            key=lambda item: len(self.item_users[item]),
            reverse=True,
        )
        self.items_pool = np.array(by_popularity)

    def train(self, user_items, item_users):
        """Fit the model with per-sample SGD on the squared error (numpy).

        Parameters
        ----------
        user_items : dict
            user id -> set of item ids the user interacted with.
        item_users : dict
            item id -> set of user ids; must contain every item that occurs
            in ``user_items``.
        """
        # Reset the backend flag so that a previous train_batch() call does not
        # make recommend() apply torch operations to numpy factors.
        self.istorch = False
        self.user_items = user_items
        self.item_users = item_users
        lambda_ = self.lambda_
        alpha = self.alpha
        factors = self.factors

        self.init_items_pool()
        # Initialize the parameters
        self.P = {user: self.rng.standard_normal(factors) for user in user_items}
        self.Q = {item: self.rng.standard_normal(factors) for item in item_users}

        nums = 0
        for step in range(self.epoch):
            for user, positive_simple in user_items.items():
                nums += 1
                if nums % 100 == 0:
                    print(nums)
                p_u = self.P[user]
                all_sample = self.Random_Select_Negative_Sample(positive_simple)
                for item, r_ui in all_sample.items():
                    q_i = self.Q[item]
                    e_ui = r_ui - np.dot(p_u, q_i)
                    # Compute both steps from the current values before
                    # applying either, so the item update does not see the
                    # already-updated user vector.
                    step_p = e_ui * q_i - lambda_ * p_u
                    step_q = e_ui * p_u - lambda_ * q_i
                    p_u += alpha * step_p
                    q_i += alpha * step_q
            alpha *= 0.9

    def train_batch(self, user_items, item_users):
        """Fit the model with PyTorch: one SGD step per user on the mean BCE loss.

        The arguments are the same as for ``train``. For each user the
        positive and sampled negative items form one batch; the sigmoid of the
        dot products is compared with the 0/1 labels. ``lambda_`` is not used
        here.
        """
        self.istorch = True
        self.user_items = user_items
        self.item_users = item_users
        alpha = self.alpha
        factors = self.factors

        self.init_items_pool()
        # Initialize the parameters (a dedicated generator only when seeded, so
        # that unseeded runs stay random)
        gen = None if self.seed is None else torch.Generator().manual_seed(self.seed)
        self.P = {user: torch.randn(factors, generator=gen, requires_grad=True)
                  for user in user_items}
        self.Q = {item: torch.randn(factors, 1, generator=gen, requires_grad=True)
                  for item in item_users}

        params = list(self.P.values()) + list(self.Q.values())
        if not params:
            # Nothing to train; the optimizer rejects an empty parameter list.
            return

        criterion = torch.nn.BCELoss(reduction='mean')
        optimizer = torch.optim.SGD(params, lr=alpha)

        nums = 0
        for step in range(self.epoch):
            for user, positive_simple in user_items.items():
                all_sample = self.Random_Select_Negative_Sample(positive_simple)
                if not all_sample:
                    continue
                target = torch.tensor(list(all_sample.values()), dtype=torch.float32).reshape(-1, 1)
                item_factors = torch.cat([self.Q[item] for item in all_sample], dim=1)
                dot = torch.mm(self.P[user].reshape(1, factors), item_factors)
                active = torch.sigmoid(dot).reshape(-1, 1)
                loss = criterion(active, target)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                nums += 1
                if nums % 50 == 0:
                    print(torch.sum(loss))

            # Decay the step size inside the optimizer; changing the local
            # variable alone has no effect on an already constructed optimizer.
            alpha *= 0.9
            for group in optimizer.param_groups:
                group['lr'] = alpha

    def _score(self, user, item):
        """Dot product of the user and item factors, for either backend."""
        if self.istorch:
            return torch.dot(self.P[user], self.Q[item].reshape(-1)).item()
        return np.dot(self.P[user], self.Q[item])

    def _candidate_scores(self, user):
        """Yield ``(item, score)`` for every training item the user has not seen.

        Yields nothing for a user that was not part of the training data.
        """
        if user not in self.P:
            return
        # Use the interactions the model was trained on, not a notebook global.
        seen = self.user_items[user]
        for item in self.item_users:
            if item not in seen:
                yield item, self._score(user, item)

    # Faster ranking
    def recommend_que(self, user, length=10):
        """Same result as ``recommend``, computed with a bounded heap.

        Returns at most ``length`` ``(item, score)`` pairs ordered by
        decreasing score, and an empty list when ``length <= 0`` or the user
        is unknown.
        """
        if length <= 0:
            return []
        return heapq.nlargest(length, self._candidate_scores(user), key=itemgetter(1))

    def recommend(self, user, length=10):
        """Return the ``length`` best unseen items for ``user``.

        Returns a list of ``(item, score)`` pairs ordered by decreasing score.
        Items the user interacted with in the training data are excluded. An
        empty list is returned when ``length <= 0`` or the user is unknown.
        """
        if length <= 0:
            return []
        ranked = sorted(self._candidate_scores(user), key=itemgetter(1), reverse=True)
        return ranked[:length]

In [8]:
# Fit the latent factor model with default hyper-parameters on the training
# fold, using the PyTorch trainer (prints the loss every 50 users).
model = LFM()
model.train_batch( user_items, item_users)

tensor(1.8138, grad_fn=<SumBackward0>)
tensor(1.8943, grad_fn=<SumBackward0>)
tensor(2.0914, grad_fn=<SumBackward0>)
tensor(1.6769, grad_fn=<SumBackward0>)
tensor(2.1429, grad_fn=<SumBackward0>)
tensor(2.6874, grad_fn=<SumBackward0>)
tensor(2.3087, grad_fn=<SumBackward0>)
tensor(1.7491, grad_fn=<SumBackward0>)
tensor(1.6318, grad_fn=<SumBackward0>)
tensor(2.6693, grad_fn=<SumBackward0>)
tensor(1.5460, grad_fn=<SumBackward0>)
tensor(1.9970, grad_fn=<SumBackward0>)
tensor(1.8382, grad_fn=<SumBackward0>)
tensor(1.7222, grad_fn=<SumBackward0>)
tensor(2.3476, grad_fn=<SumBackward0>)
tensor(1.5463, grad_fn=<SumBackward0>)
tensor(1.8179, grad_fn=<SumBackward0>)
tensor(1.9656, grad_fn=<SumBackward0>)
tensor(2.1844, grad_fn=<SumBackward0>)
tensor(2.0298, grad_fn=<SumBackward0>)
tensor(1.7107, grad_fn=<SumBackward0>)
tensor(1.7266, grad_fn=<SumBackward0>)
tensor(1.1026, grad_fn=<SumBackward0>)
tensor(1.8687, grad_fn=<SumBackward0>)
tensor(1.3192, grad_fn=<SumBackward0>)
tensor(2.0091, grad_fn=<S

In [13]:
# Top recommendations for user 9 (default length of 10) as (movieId, score) pairs.
model.recommend(9)

[(7115, 17.085277557373047),
 (1810, 16.6064453125),
 (2416, 15.810047149658203),
 (56941, 14.358726501464844),
 (70, 14.159635543823242),
 (88405, 14.028036117553711),
 (725, 13.834428787231445),
 (3251, 13.271860122680664),
 (3927, 13.159055709838867),
 (2950, 13.127809524536133)]

In [12]:
# Held-out (test fold) movies of user 9, to compare with the recommended movies.
test_user_items[9]

{2011, 2023, 5481, 5988}