In [1]:
#-*-coding:utf8-*-

"""
author:YJM

date:20190420

util function

"""
from __future__ import division
import os
import sys
import numpy as np
import pandas as pd
import operator

### 通过movies.csv获取电影信息

In [2]:
def get_item_info(input_file):
    """
    Load movie metadata from a movies.csv-style file.

    The first line is treated as a header and skipped. Every other line is
    split on commas: the first field is the item id, the last field is the
    genre string, and everything in between is re-joined with commas to form
    the title (so titles that contain commas survive). Lines with fewer than
    three fields are ignored. Quote characters around a title are kept as-is.

    Args:
        input_file: path to the item info file
    Return:
        a dict: key: itemid (str), value: [title, genre];
        an empty dict when the file does not exist
    """
    if not os.path.exists(input_file):
        return {}
    item_info = {}
    # The context manager guarantees the handle is closed, even when a line
    # fails to parse (the previous `fp.closed` only read an attribute).
    with open(input_file) as fp:
        next(fp, None)  # skip the header line; tolerate an empty file
        for line in fp:
            fields = line.strip().split(',')
            if len(fields) < 3:
                continue
            # With exactly three fields the join yields fields[1] unchanged.
            item_info[fields[0]] = [','.join(fields[1:-1]), fields[-1]]
    return item_info

### 图算法的数据格式

In [3]:
def get_graph_from_data(input_file, score_thr=4.0):
    """
    Build the user-item bipartite graph used by the graph-based recommender.

    The first line of the file is treated as a header and skipped. Each
    remaining line is split on commas into userid, itemid, rating (extra
    fields are ignored; lines with fewer than three fields are skipped).
    Only ratings >= score_thr create an edge. Item nodes are stored with an
    "item_" prefix so they cannot collide with user ids.

    Args:
        input_file: user item rating file
        score_thr: minimum rating for a user-item pair to count as an edge
    Return:
        a dict: {UserA: {item_b: 1, item_c: 1}, item_b: {UserA: 1}};
        an empty dict when the file does not exist
    Raises:
        ValueError: if a rating field cannot be converted to float
    """
    if not os.path.exists(input_file):
        return {}
    graph = {}
    # `with` closes the file even if float() raises on a malformed rating.
    with open(input_file) as fp:
        next(fp, None)  # skip the header line; tolerate an empty file
        for line in fp:
            fields = line.strip().split(",")
            if len(fields) < 3:
                continue
            userid, itemid, rating = fields[0], "item_" + fields[1], fields[2]
            if float(rating) < score_thr:
                continue
            # Edges are stored in both directions: user -> item and item -> user.
            graph.setdefault(userid, {})[itemid] = 1
            graph.setdefault(itemid, {})[userid] = 1
    return graph

In [4]:
# Build the user-item graph from ratings15000.csv; later cells read `graph`.
graph=get_graph_from_data("../data/ratings15000.csv")
# graph  (uncomment to display the full graph dict)

In [None]:
# Scratch cell: walks every edge of `graph` and prints the zero-initialised
# score of the edge's source node, to eyeball the traversal order.
tmp_rank = dict.fromkeys(graph, 0)
tmp_rank  # bare expression; only a cell's last expression is displayed
for out_point in graph:
    out_dict = graph[out_point]
    print("--------------------")
    for inner_point, value in out_dict.items():
        print(tmp_rank[out_point])
        

### PersonalRank 算法模型

In [5]:
def personal_rank(graph,root,alpha,iter_num,recom_num=10):
    """
    Run the PersonalRank random walk from `root` and return the top items.

    Args:
        graph: user item graph, {node: {neighbor: 1, ...}, ...}
        root: the user node to recommend for
        alpha: probability of continuing the random walk; with probability
            1-alpha the walk returns to the start node
        iter_num: maximum number of iterations
        recom_num: maximum number of recommended items to return
    Return:
        a dict: key: itemid, value: pr; holds at most recom_num entries and
        is empty when root is not a node of the graph
    """
    # Without this guard an unknown root raised KeyError at graph[root] below.
    if root not in graph:
        return {}

    # Every node starts at 0 except root, which starts at 1.
    rank = {point: 0 for point in graph}
    rank[root] = 1

    for iter_index in range(iter_num):
        # Scores accumulated during this iteration.
        tmp_rank = {point: 0 for point in graph}
        for out_point, out_dict in graph.items():
            out_degree = len(out_dict)
            for inner_point in out_dict:
                # Each node passes alpha * its score, split evenly over its
                # out-edges. Contributions are rounded to 4 decimals so the
                # exact-equality convergence test below can succeed.
                tmp_rank[inner_point] += round(alpha*rank[out_point]/out_degree,4)
                # NOTE(review): the restart mass (1-alpha) is added once for
                # every edge pointing at root rather than once per iteration.
                # Left unchanged so scores stay comparable with earlier runs.
                if inner_point == root:
                    tmp_rank[inner_point] += round(1-alpha,4)
        # Stop early once an iteration no longer changes any score.
        if tmp_rank == rank:
            print("out"+str(iter_index))
            break
        rank = tmp_rank

    # Walk nodes from highest to lowest score, keeping only item nodes the
    # root has not interacted with yet.
    recom_result = {}
    for point, pr_score in sorted(rank.items(),key=operator.itemgetter(1),reverse=True):
        # Checked before inserting so exactly recom_num items are returned
        # (the previous post-insert `>` test returned recom_num + 1).
        if len(recom_result) >= recom_num:
            break
        if len(point.split('_')) < 2:  # no "_" in the name: not an item node
            continue
        if point in graph[root]:  # already rated by root
            continue
        recom_result[point] = pr_score
    return recom_result

In [8]:
def get_one_user_recom(user="2", alpha=0.8, iter_num=100,
                       rating_file="../data/ratings15000.csv",
                       item_file="../data/movies.csv"):
    """
    Print the recommendation result for one fixed user.

    First prints the [title, genre] entry of every item the user already has
    an edge to, then the line "result------------", then each recommended
    item's entry followed by its score. Items missing from the item file are
    printed by raw id instead of raising KeyError.

    Args:
        user: user id to recommend for
        alpha: walk-continuation probability passed to personal_rank
        iter_num: maximum number of iterations passed to personal_rank
        rating_file: user item rating file used to build the graph
        item_file: item info file used to look up titles and genres
    Return:
        None; results are printed
    """
    graph = get_graph_from_data(rating_file)
    # Covers both a missing rating file (empty graph) and an unknown user.
    if user not in graph:
        print("user " + str(user) + " not found in " + rating_file)
        return
    recom_result = personal_rank(graph, user, alpha, iter_num)
    item_info = get_item_info(item_file)
    # Items the user already rated, printed for comparison with the result.
    for itemid in graph[user]:
        pure_itemid = itemid.split("_")[1]  # strip the "item_" prefix
        print(item_info.get(pure_itemid, pure_itemid))
    print("result------------")
    for itemid in recom_result:
        pure_itemid = itemid.split("_")[1]
        print(item_info.get(pure_itemid, pure_itemid))
        print(recom_result[itemid])

In [9]:
get_one_user_recom()
# TODO: persist the recommendation result to disk or to a key-value store

out31
['Grumpier Old Men (1995)', 'Comedy|Romance']
["Mr. Holland's Opus (1995)", 'Drama']
['From Dusk Till Dawn (1996)', 'Action|Comedy|Horror|Thriller']
['Braveheart (1995)', 'Action|Drama|War']
['Star Wars: Episode IV - A New Hope (1977)', 'Action|Adventure|Sci-Fi']
['Legends of the Fall (1994)', 'Drama|Romance|War|Western']
['Jurassic Park (1993)', 'Action|Adventure|Sci-Fi|Thriller']
['Blade Runner (1982)', 'Action|Sci-Fi|Thriller']
['Terminator 2: Judgment Day (1991)', 'Action|Sci-Fi']
['North by Northwest (1959)', 'Action|Adventure|Mystery|Romance|Thriller']
['2001: A Space Odyssey (1968)', 'Adventure|Drama|Sci-Fi']
['Star Wars: Episode V - The Empire Strikes Back (1980)', 'Action|Adventure|Sci-Fi']
['Star Wars: Episode VI - Return of the Jedi (1983)', 'Action|Adventure|Sci-Fi']
['Alien (1979)', 'Horror|Sci-Fi']
['"Femme Nikita, La (Nikita) (1990)"', 'Action|Crime|Romance|Thriller']
['Stand by Me (1986)', 'Adventure|Drama']
['Back to the Future (1985)', 'Adventure|Comedy|Sci-Fi']