In [5]:
# encoding:utf-8

import os
import csv
import heapq
import math
from operator import itemgetter
from datetime import datetime
from collections import defaultdict

# Load the user rating (review) data file
def load_reviews(path, **kwargs):
    """Lazily read a delimited ratings file, yielding one dict per row.

    Args:
        path: Location of the ratings file; it is opened as UTF-8 text.
        **kwargs: Extra keyword arguments for ``csv.DictReader``. They
            override the defaults (tab delimiter and the field names
            ``userid``, ``movieid``, ``rating``, ``timestamp``).

    Yields:
        The row dict with ``movieid``, ``userid`` and ``rating`` converted
        to ``int`` and ``timestamp`` converted from epoch seconds to a
        ``datetime`` via ``datetime.fromtimestamp``.
    """
    reader_args = dict(
        fieldnames=('userid', 'movieid', 'rating', 'timestamp'),
        delimiter='\t',
    )
    reader_args.update(kwargs)

    with open(path, 'rt', encoding="utf-8") as handle:
        for record in csv.DictReader(handle, **reader_args):
            # Integer columns are converted in place on the row dict.
            for column in ('movieid', 'userid', 'rating'):
                record[column] = int(record[column])
            record['timestamp'] = datetime.fromtimestamp(
                float(record['timestamp'])
            )
            yield record

# 辅助数据导入
def relative_path(path):
    """Join ``path`` onto a base directory and return it normalized.

    NOTE(review): ``'__file__'`` below is a quoted string literal, not the
    module's ``__file__`` variable, so the base directory is the parent of
    a path named ``__file__`` resolved against the current working
    directory, not the directory containing this source file.
    """
    anchor = os.path.realpath('__file__')
    base_dir, _ = os.path.split(anchor)
    return os.path.normpath(os.path.join(base_dir, path))

# 读取电影信息
def load_movies(path, **kwargs):
    """Lazily read a delimited movie-metadata file, one dict per row.

    Args:
        path: Location of the movie file; it is opened as UTF-8 text.
        **kwargs: Extra keyword arguments for ``csv.DictReader``. They
            override the defaults: ``|`` delimiter, the field names
            ``movieid``, ``title``, ``release``, ``video``, ``url``, and
            ``restkey='genre'`` (surplus columns are collected in a list
            under ``genre``).

    Yields:
        The row dict with ``movieid`` converted to ``int`` and ``release``
        parsed with the ``%d-%b-%Y`` format, or ``None`` when the release
        field is empty or missing.
    """
    reader_args = {
        'fieldnames': ('movieid', 'title', 'release', 'video', 'url'),
        'delimiter': '|',
        'restkey': 'genre',
    }
    reader_args.update(**kwargs)

    with open(path, 'rt', encoding="utf-8") as handle:
        for record in csv.DictReader(handle, **reader_args):
            record['movieid'] = int(record['movieid'])
            released = record['release']
            if released:
                record['release'] = datetime.strptime(released, '%d-%b-%Y')
            else:
                record['release'] = None
            yield record

# 基于物品的推荐系统，这里指的是电影
class MovieLens(object):
    """Item-based recommender built on MovieLens-style rating data.

    Movies are indexed by movie id and reviews are indexed first by user id
    and then by movie id, so both "all reviews of a user" and "a user's
    review of a movie" are dictionary lookups.
    """

    def __init__(self, udata, uitem):
        """Remember the two data file paths and load them into memory.

        Args:
            udata: Path of the ratings file (passed to ``load_reviews``).
            uitem: Path of the movie file (passed to ``load_movies``).
        """
        self.udata = udata
        self.uitem = uitem
        # movieid -> movie row dict
        self.movies = {}
        # userid -> {movieid -> review row dict}
        self.reviews = defaultdict(dict)
        self.load_dataset()

    def load_dataset(self):
        """Load both data files into ``self.movies`` and ``self.reviews``."""
        for movie in load_movies(self.uitem):
            self.movies[movie['movieid']] = movie

        for review in load_reviews(self.udata):
            self.reviews[review['userid']][review['movieid']] = review

    def reviews_for_movie(self, movieid):
        """Yield every stored review (one per user) of ``movieid``."""
        for user_reviews in self.reviews.values():
            if movieid in user_reviews:
                yield user_reviews[movieid]

    def _ratings_for_movie(self, movieid):
        """Return the list of numeric ratings given to ``movieid``."""
        return [r['rating'] for r in self.reviews_for_movie(movieid)]

    def average_reviews(self):
        """Yield ``(movieid, mean rating, review count)`` for every movie.

        A movie without any review yields a mean of ``0.0`` with a count of
        ``0`` instead of raising ``ZeroDivisionError``.
        """
        for movieid in self.movies:
            ratings = self._ratings_for_movie(movieid)
            average = sum(ratings) / float(len(ratings)) if ratings else 0.0
            yield (movieid, average, len(ratings))

    def top_rated(self, n=10):
        """Return the ``n`` entries of ``bayesian_average`` with the highest mean."""
        return heapq.nlargest(n, self.bayesian_average(), key=itemgetter(1))

    def bayesian_average(self, c=59, m=3):
        """Yield ``(movieid, smoothed mean, review count)`` for every movie.

        The mean is computed as if ``c`` additional reviews with rating
        ``m`` had been cast, which pulls sparsely rated movies towards ``m``.
        """
        for movieid in self.movies:
            ratings = self._ratings_for_movie(movieid)
            average = ((c * m) + sum(ratings)) / float(c + len(ratings))
            yield (movieid, average, len(ratings))

    def shared_critics(self, movieA, movieB):
        """Return the ratings of users who reviewed both movies.

        Returns:
            Dict mapping user id to ``(rating of movieA, rating of movieB)``.

        Raises:
            KeyError: If either movie id is not in ``self.movies``.
        """
        if movieA not in self.movies:
            raise KeyError("Couldn't find movie '%s' in data" % movieA)
        if movieB not in self.movies:
            raise KeyError("Couldn't find movie '%s' in data" % movieB)
        # Single pass over the users instead of building two sets first.
        return {
            critic: (rated[movieA]['rating'], rated[movieB]['rating'])
            for critic, rated in self.reviews.items()
            if movieA in rated and movieB in rated
        }

    def share_preferences(self, criticA, criticB):
        """Return the ratings of movies that both users reviewed.

        Returns:
            Dict mapping movie id to ``(criticA's rating, criticB's rating)``.

        Raises:
            KeyError: If either user id has no entry in ``self.reviews``.
        """
        if criticA not in self.reviews:
            raise KeyError("Couldn't find critic '%s' in data" % criticA)
        if criticB not in self.reviews:
            raise KeyError("Couldn't find critic '%s' in data" % criticB)
        reviewsA = self.reviews[criticA]
        reviewsB = self.reviews[criticB]
        return {
            movieid: (reviewsA[movieid]['rating'], reviewsB[movieid]['rating'])
            for movieid in reviewsA
            if movieid in reviewsB
        }

    def _paired_ratings(self, criticA, criticB, prefs):
        """Return the rating pairs shared by two movies or by two users.

        ``prefs='movies'`` treats the two ids as movie ids, ``prefs='users'``
        treats them as user ids; anything else raises ``ValueError``.
        """
        if prefs == 'users':
            return self.share_preferences(criticA, criticB)
        if prefs == 'movies':
            return self.shared_critics(criticA, criticB)
        raise ValueError("No preferences of type '%s'." % prefs)

    def euclidean_distance(self, criticA, criticB, prefs='movies'):
        """Return a Euclidean-distance based similarity.

        The distance ``d`` between the two shared rating vectors is mapped
        to ``1 / (1 + d)``, so identical ratings give ``1``. When nothing is
        shared the result is ``0``.

        Args:
            criticA, criticB: Movie ids (``prefs='movies'``) or user ids
                (``prefs='users'``).
            prefs: Which kind of id the first two arguments are.
        """
        preferences = self._paired_ratings(criticA, criticB, prefs)
        if len(preferences) == 0:
            return 0
        sum_of_squares = sum(pow(a - b, 2) for a, b in preferences.values())
        return 1 / (1 + math.sqrt(sum_of_squares))

    def pearson_correlation(self, criticA, criticB, prefs='movies'):
        """Return the absolute Pearson correlation of the shared ratings.

        Returns ``0`` when nothing is shared or when either rating vector
        has zero variance (the coefficient is undefined then).

        NOTE(review): the absolute value is returned, so a strong negative
        correlation scores the same as a strong positive one.

        Args:
            criticA, criticB: Movie ids (``prefs='movies'``) or user ids
                (``prefs='users'``).
            prefs: Which kind of id the first two arguments are.
        """
        preferences = self._paired_ratings(criticA, criticB, prefs)
        length = len(preferences)
        if length == 0:
            return 0
        # Accumulate the sums needed by the computational Pearson formula.
        sumA = sumB = sumSquareA = sumSquareB = sumProducts = 0
        for a, b in preferences.values():
            sumA += a
            sumB += b
            sumSquareA += pow(a, 2)
            sumSquareB += pow(b, 2)
            sumProducts += a * b
        numerator = (sumProducts * length) - (sumA * sumB)
        denominator = math.sqrt(
            ((sumSquareA * length) - pow(sumA, 2))
            * ((sumSquareB * length) - pow(sumB, 2))
        )
        if denominator == 0:
            return 0
        return abs(numerator / denominator)

    def similar_items(self, movie, metric='euclidean', n=None):
        """Score every other movie by its similarity to ``movie``.

        Args:
            movie: Id of the reference movie.
            metric: ``'euclidean'`` or ``'pearson'``.
            n: When truthy, return only the ``n`` most similar movies.

        Returns:
            A dict ``{movieid: similarity}`` or, when ``n`` is given, a list
            of ``(movieid, similarity)`` tuples sorted by similarity,
            highest first.

        Raises:
            KeyError: If ``movie`` is unknown or ``metric`` is unsupported.
        """
        metrics = {
            'euclidean': self.euclidean_distance,
            'pearson': self.pearson_correlation,
        }
        distance = metrics.get(metric, None)
        # Movie ids live in self.movies; self.reviews is keyed by user id.
        if movie not in self.movies:
            raise KeyError("Unknown movie, '%s'." % movie)
        if not distance or not callable(distance):
            raise KeyError("Unknown or unprogrammed distance metric '%s'." % metric)
        items = {
            item: distance(item, movie, prefs='movies')
            for item in self.movies
            if item != movie
        }
        if n:
            return heapq.nlargest(n, items.items(), key=itemgetter(1))
        return items

    def predict_items_recommendation(self, user, movie, metric='euclidean'):
        """Predict the rating ``user`` would give ``movie``.

        The prediction is the mean of the user's ratings of other movies,
        weighted by how similar each of those movies is to ``movie``.
        Returns ``0.0`` when the weights sum to zero, which includes a user
        without any review.
        """
        similarities = self.similar_items(movie, metric=metric)
        # .get() keeps the defaultdict from inserting an empty entry for a
        # user id that has no reviews.
        user_reviews = self.reviews.get(user, {})
        total = 0.0
        simsum = 0.0
        for relmovie, similarity in similarities.items():
            if relmovie in user_reviews:
                total += similarity * user_reviews[relmovie]['rating']
                simsum += similarity
        if simsum == 0.0:
            return 0.0
        return total / simsum


if __name__ == '__main__':
    # Locate the two data files and build the in-memory model from them.
    data = relative_path('data/ml-100k/u.data')
    item = relative_path('data/ml-100k/u.item')
    model = MovieLens(data, item)

    # Item-based demo 1: the ten movies most similar (Pearson) to movie 766.
    print ("所有与电影766相似（皮尔逊）的系数前十为：")
    top_ten = model.similar_items(766, 'pearson', n=10)
    for movie, similarity in top_ten:
        title = model.movies[movie]['title']
        print ('%0.3f : %s' % (similarity, title))

    # Item-based demo 2: predicted rating of user 232 for movie 52.
    predicted = model.predict_items_recommendation(232, 52, 'pearson')
    print ("用户232对电影52的评分预测为 ", predicted)

所有与电影766相似（皮尔逊）的系数前十为：
1.000 : Copycat (1995)
1.000 : Usual Suspects, The (1995)
1.000 : From Dusk Till Dawn (1996)
1.000 : White Balloon, The (1995)
1.000 : Taxi Driver (1976)
1.000 : Apollo 13 (1995)
1.000 : Net, The (1995)
1.000 : To Wong Foo, Thanks for Everything! Julie Newmar (1995)
1.000 : Hoop Dreams (1994)
1.000 : I.Q. (1994)
用户232对电影52的评分预测为  3.980443976004237
