In [1]:
# encoding:utf-8

import os
import csv
import heapq
import math
from operator import itemgetter
from datetime import datetime
from collections import defaultdict

# Load the review data file (user id, movie id, rating, timestamp).
def load_reviews(path, **kwargs):
    """Yield one review record per row of a delimited text file.

    By default the file is read as tab-separated with the columns
    ``userid``, ``movieid``, ``rating`` and ``timestamp``; any keyword
    argument is forwarded to ``csv.DictReader`` and overrides these defaults.

    Each yielded row has ``userid``, ``movieid`` and ``rating`` converted to
    ``int`` and ``timestamp`` converted to a ``datetime`` via
    ``datetime.fromtimestamp``.
    """
    reader_args = dict(
        fieldnames=('userid', 'movieid', 'rating', 'timestamp'),
        delimiter='\t',
    )
    reader_args.update(kwargs)

    with open(path, 'rt', encoding="utf-8") as handle:
        for record in csv.DictReader(handle, **reader_args):
            # Convert in the same order as the columns are validated:
            # movie id, user id, rating, then the timestamp.
            for field in ('movieid', 'userid', 'rating'):
                record[field] = int(record[field])
            record['timestamp'] = datetime.fromtimestamp(float(record['timestamp']))
            yield record

# Helper for locating the data files.
def relative_path(path):
    """Return ``path`` as a normalized path under the base directory.

    The base directory is the parent of ``os.path.realpath('__file__')``.
    Note that ``'__file__'`` is a string literal here, not the module
    attribute, so it is resolved against the current working directory.
    """
    base_dir = os.path.dirname(os.path.realpath('__file__'))
    return os.path.normpath(os.path.join(base_dir, path))

# Read the movie information (movie id, title, release date, video, url, genres).
def load_movies(path, **kwargs):
    """Yield one movie record per row of a delimited text file.

    By default the file is read as ``|``-separated with the columns
    ``movieid``, ``title``, ``release``, ``video`` and ``url``; any extra
    columns are collected into a list under the ``genre`` key.  Keyword
    arguments are forwarded to ``csv.DictReader`` and override the defaults.

    Each yielded row has ``movieid`` converted to ``int`` and ``release``
    parsed with the ``%d-%b-%Y`` format, or set to ``None`` when it is empty.
    """
    reader_args = {
        'fieldnames': ('movieid', 'title', 'release', 'video', 'url'),
        'delimiter': '|',
        'restkey': 'genre',
    }
    reader_args.update(kwargs)

    with open(path, 'rt', encoding="utf-8") as handle:
        for record in csv.DictReader(handle, **reader_args):
            record['movieid'] = int(record['movieid'])
            released = record['release']
            if released:
                record['release'] = datetime.strptime(released, '%d-%b-%Y')
            else:
                record['release'] = None
            yield record

# User-based recommender system.
class MovieLens(object):
    """In-memory movie/review data set with similarity-based rating prediction.

    After construction:

    * ``movies`` maps movie id -> movie record (as yielded by ``load_movies``).
    * ``reviews`` maps user id -> {movie id -> review record} (records as
      yielded by ``load_reviews``).
    """

    def __init__(self, udata, uitem):
        """Remember the review file (``udata``) and movie file (``uitem``)
        paths and load both into memory immediately."""
        self.udata = udata
        self.uitem = uitem
        self.movies = {}
        self.reviews = defaultdict(dict)
        self.load_dataset()

    def load_dataset(self):
        """Load both files into memory, indexed by id."""
        for movie in load_movies(self.uitem):
            self.movies[movie['movieid']] = movie

        for review in load_reviews(self.udata):
            self.reviews[review['userid']][review['movieid']] = review

    def reviews_for_movie(self, movieid):
        """Yield every stored review of ``movieid`` (one per user who rated it)."""
        for user_reviews in self.reviews.values():
            if movieid in user_reviews:
                yield user_reviews[movieid]

    def average_reviews(self):
        """Yield ``(movieid, mean rating, number of reviews)`` for every movie.

        A movie without any review yields ``(movieid, 0.0, 0)`` instead of
        raising ``ZeroDivisionError``.
        """
        for movieid in self.movies:
            ratings = [r['rating'] for r in self.reviews_for_movie(movieid)]
            if ratings:
                average = sum(ratings) / float(len(ratings))
            else:
                average = 0.0
            yield (movieid, average, len(ratings))

    def top_rated(self, n=10):
        """Return the ``n`` entries of ``bayesian_average()`` with the highest
        average, as a list of ``(movieid, average, number of reviews)``."""
        return heapq.nlargest(n, self.bayesian_average(), key=itemgetter(1))

    def bayesian_average(self, c=59, m=3):
        """Yield ``(movieid, smoothed average, number of reviews)`` per movie.

        The smoothed average is ``(c * m + sum(ratings)) / (c + len(ratings))``,
        i.e. the ratings are blended with ``c`` pseudo-votes of value ``m``.
        """
        for movieid in self.movies:
            ratings = [r['rating'] for r in self.reviews_for_movie(movieid)]
            average = ((c * m) + sum(ratings)) / float(c + len(ratings))
            yield (movieid, average, len(ratings))

    def share_preferences(self, criticA, criticB):
        """Return ``{movieid: (rating by criticA, rating by criticB)}`` for the
        movies both critics have reviewed.

        Raises:
            KeyError: if either critic has no entry in ``self.reviews``.
        """
        if criticA not in self.reviews:
            raise KeyError("Couldn't find critic '%s' in data " % criticA)
        if criticB not in self.reviews:
            raise KeyError("Couldn't find critic '%s' in data " % criticB)
        reviewsA = self.reviews[criticA]
        reviewsB = self.reviews[criticB]
        shared = set(reviewsA) & set(reviewsB)
        return {
            movieid: (reviewsA[movieid]['rating'], reviewsB[movieid]['rating'])
            for movieid in shared
        }

    def shared_critics(self, movieA, movieB):
        """Return ``{critic: (rating of movieA, rating of movieB)}`` for the
        critics who have reviewed both movies.

        Raises:
            KeyError: if either movie has no entry in ``self.movies``.
        """
        if movieA not in self.movies:
            raise KeyError("Couldn't find movie '%s' in data " % movieA)
        if movieB not in self.movies:
            raise KeyError("Couldn't find movie '%s' in data " % movieB)
        return {
            critic: (seen[movieA]['rating'], seen[movieB]['rating'])
            for critic, seen in self.reviews.items()
            if movieA in seen and movieB in seen
        }

    def _shared_ratings(self, itemA, itemB, prefs):
        """Return the paired ratings for two users or two movies, per ``prefs``."""
        if prefs == 'users':
            return self.share_preferences(itemA, itemB)
        if prefs == 'movies':
            return self.shared_critics(itemA, itemB)
        raise Exception("No preferences of type '%s'." % prefs)

    def euclidean_distance(self, criticA, criticB, prefs='users'):
        """Return a similarity score derived from the Euclidean distance
        between the paired ratings of two users (``prefs='users'``) or two
        movies (``prefs='movies'``).

        Returns 0 when there are no paired ratings; otherwise
        ``1 / (1 + distance)``, which is 1 for identical ratings and
        approaches 0 as the distance grows.

        Raises:
            KeyError: if either id is unknown.
            Exception: if ``prefs`` is neither ``'users'`` nor ``'movies'``.
        """
        preferences = self._shared_ratings(criticA, criticB, prefs)
        if not preferences:
            return 0
        sum_of_squares = sum(pow(a - b, 2) for a, b in preferences.values())
        return 1 / (1 + math.sqrt(sum_of_squares))

    def pearson_correlation(self, criticA, criticB, prefs='users'):
        """Return the absolute value of the Pearson correlation coefficient
        between the paired ratings of two users (``prefs='users'``) or two
        movies (``prefs='movies'``).

        Returns 0 when there are no paired ratings or when either side has
        zero variance.

        Raises:
            KeyError: if either id is unknown.
            Exception: if ``prefs`` is neither ``'users'`` nor ``'movies'``.
        """
        preferences = self._shared_ratings(criticA, criticB, prefs)
        length = len(preferences)
        if length == 0:
            return 0
        # Accumulate the sums needed by the single-pass Pearson formula.
        sumA = sumB = sumSquareA = sumSquareB = sumProducts = 0
        for a, b in preferences.values():
            sumA += a
            sumB += b
            sumSquareA += pow(a, 2)
            sumSquareB += pow(b, 2)
            sumProducts += a * b
        numerator = (sumProducts * length) - (sumA * sumB)
        denominator = math.sqrt(
            ((sumSquareA * length) - pow(sumA, 2))
            * ((sumSquareB * length) - pow(sumB, 2))
        )
        if denominator == 0:
            return 0
        # NOTE(review): abs() discards the sign, so strongly negatively
        # correlated pairs score as high as positively correlated ones.
        return abs(numerator / denominator)

    def similar_critics(self, user, metric='euclidean', n=None):
        """Score every other critic's similarity to ``user``.

        Args:
            user: id of the user to compare against.
            metric: ``'euclidean'`` or ``'pearson'``.
            n: when truthy, return only the ``n`` best matches.

        Returns:
            A ``{critic: score}`` dict, or, when ``n`` is truthy, a list of
            the ``n`` highest ``(critic, score)`` tuples.

        Raises:
            KeyError: if ``user`` or ``metric`` is unknown.
        """
        metrics = {
            'euclidean': self.euclidean_distance,
            'pearson': self.pearson_correlation,
        }
        distance = metrics.get(metric, None)
        if user not in self.reviews:
            raise KeyError("Unknown user, '%s'." % user)
        if not distance or not callable(distance):
            raise KeyError("Unknown or unprogrammed distance metric '%s'." % metric)
        # A user is never compared with themselves.
        critics = {
            critic: distance(user, critic)
            for critic in self.reviews
            if critic != user
        }
        if n:
            return heapq.nlargest(n, critics.items(), key=itemgetter(1))
        return critics

    def predict_ranking(self, user, movie, metric='euclidean', critics=None):
        """Predict the rating ``user`` would give ``movie``.

        The prediction is the mean of the ratings given to ``movie`` by the
        other critics, weighted by each critic's similarity to ``user``.

        Args:
            user: id of the user to predict for.
            movie: id of the movie to predict.
            metric: similarity metric used when ``critics`` is not supplied.
            critics: optional precomputed ``{critic: similarity}`` dict; when
                falsy it is computed with ``similar_critics``.

        Returns:
            The weighted mean, or 0.0 when the similarity weights of the
            critics who rated ``movie`` sum to zero.
        """
        critics = critics or self.similar_critics(user, metric=metric)
        total = 0.0
        simsum = 0.0
        for critic, similarity in critics.items():
            if movie in self.reviews[critic]:
                total += similarity * self.reviews[critic][movie]['rating']
                simsum += similarity
        if simsum == 0.0:
            return 0.0
        return total / simsum

    def predict_all_rankings(self, user, metric='euclidean', n=None):
        """Predict ``user``'s rating for every movie.

        Returns:
            A ``{movieid: predicted rating}`` dict, or, when ``n`` is truthy,
            a list of the ``n`` highest ``(movieid, predicted rating)`` tuples.
        """
        # Compute the similarities once and reuse them for every movie.
        critics = self.similar_critics(user, metric=metric)
        movies = {
            movie: self.predict_ranking(user, movie, metric, critics)
            for movie in self.movies
        }
        if n:
            return heapq.nlargest(n, movies.items(), key=itemgetter(1))
        return movies


if __name__ == '__main__':
    # Locate the data files and build the in-memory model.
    data = relative_path('data/ml-100k/u.data')
    item = relative_path('data/ml-100k/u.item')
    model = MovieLens(data, item)

    # User-based predictions.
    # Print the ten top-rated movies.
    print ("排名前十的电影为：")
    for movie_id, average, count in model.top_rated(10):
        movie_title = model.movies[movie_id]['title']
        print ("[%0.3f average rating (%i reviews)] %s" % (average, count, movie_title))

    # Distance between two users in preference space.
    preference_similarity = model.euclidean_distance(643, 916)
    print ("用户643和用户916的偏好相似性为：", preference_similarity)

    # Correlation between the same two users.
    correlation = model.pearson_correlation(643, 916)
    print ("用户643和用户916的相关性为：", correlation)

    # Best-matching critics for one user, by Pearson coefficient.
    print ("用户766相似（皮尔逊系数）的影评人为：")
    for critic, score in model.similar_critics(766, 'pearson', n=10):
        print ("%4i: %0.3f" % (critic, score))

    # Best-matching critics for the same user, by Euclidean distance.
    print("用户766相似（欧氏距离）的影评人为：")
    for critic, score in model.similar_critics(766, 'euclidean', n=10):
        print ("%4i: %0.3f" % (critic, score))

    # Predict one user's rating for one movie with each metric.
    euclidean_prediction = model.predict_ranking(422, 50, 'euclidean')
    print ("用户422对电影50的评分（欧氏距离）为：", euclidean_prediction)
    pearson_prediction = model.predict_ranking(422, 50, 'pearson')
    print ("用户422对电影50的评分（皮尔逊系数）为：", pearson_prediction)

    # Predict one user's rating for every movie and print the top ten.
    print ("用户532对所有电影评分中排名前十的是：")
    for movie_id, predicted in model.predict_all_rankings(532, 'pearson', 10):
        print ('%0.3f: %s' % (predicted, model.movies[movie_id]['title']))




排名前十的电影为：
[4.234 average rating (583 reviews)] Star Wars (1977)
[4.224 average rating (298 reviews)] Schindler's List (1993)
[4.196 average rating (283 reviews)] Shawshank Redemption, The (1994)
[4.172 average rating (243 reviews)] Casablanca (1942)
[4.135 average rating (267 reviews)] Usual Suspects, The (1995)
[4.123 average rating (413 reviews)] Godfather, The (1972)
[4.120 average rating (390 reviews)] Silence of the Lambs, The (1991)
[4.098 average rating (420 reviews)] Raiders of the Lost Ark (1981)
[4.082 average rating (209 reviews)] Rear Window (1954)
[4.066 average rating (350 reviews)] Titanic (1997)
用户643和用户916的偏好相似性为： 0.08563786063744523
用户643和用户916的相关性为： 0.3942560673329039
用户766相似（皮尔逊系数）的影评人为：
  15: 1.000
  32: 1.000
  39: 1.000
  40: 1.000
  46: 1.000
  54: 1.000
  68: 1.000
  74: 1.000
 112: 1.000
 133: 1.000
用户766相似（欧氏距离）的影评人为：
  46: 1.000
  61: 1.000
  79: 1.000
 101: 1.000
 105: 1.000
 129: 1.000
 139: 1.000
 157: 1.000
 181: 1.000
 191: 1.000
用户422对电影50的评分（欧氏距离）为： 4