In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

In [8]:
def get_movie_dataset():
    # 加载基于所有电影的标签
    # all-tag.csv
    _tags = pd.read_csv("ml-latest-small/tags.csv", usecols=range(1, 3)).dropna()
    tags = _tags.groupby("movieId").agg(list)

    # 加载电影列表数据
    movies = pd.read_csv("ml-latest-small/movies.csv", index_col="movieId")
    # 将电影的类别词分开
    movies['genres'] = movies['genres'].apply(lambda x: x.split("|"))

    # 为每部电影匹配对应的标签数据， 如果没有将会是NAN
    movies_index = set(movies.index) & set(tags.index)
    new_tags = tags.loc[list(movies_index)]
    ret = movies.join(new_tags)

    # 构建电影数据集， 包含电影ID， 电影名称， 类别和标签四个字段
    # 如果电影没有标签数据， 就替换为空列表
    # map(fun, 可迭代对象)
    df = map(lambda x: (x[0], x[1], x[2], x[2]+x[3]) if x[3] is not np.nan else (x[0], x[1], x[2], []), ret.itertuples())
    movies_dataset = pd.DataFrame(df, columns=['movieId', 'title', 'genres', 'tags'])

    movies_dataset.set_index("movieId", inplace=True)
    return movies_dataset

movies_dataset = get_movie_dataset()
movies_dataset.head()

Unnamed: 0_level_0,title,genres,tags
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[Adventure, Animation, Children, Comedy, Fanta..."
2,Jumanji (1995),"[Adventure, Children, Fantasy]","[Adventure, Children, Fantasy, fantasy, magic ..."
3,Grumpier Old Men (1995),"[Comedy, Romance]","[Comedy, Romance, moldy, old]"
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",[]
5,Father of the Bride Part II (1995),[Comedy],"[Comedy, pregnancy, remake]"


In [9]:
from gensim.models import TfidfModel
from pprint import pprint
from gensim.corpora import Dictionary

def create_movie_profile(movie_dataset):
    '''
    使用tfidf，分析提取topn关键词
    :param movie_dataset:
    :return:
    '''
    dataset = movie_dataset["tags"].values

    from gensim.corpora import Dictionary
    # 根据数据集建立词袋，并统计词频，将所有词放入一个词典，使用索引进行获取
    dct = Dictionary(dataset)
    # 根据将每条数据，返回对应的词索引和词频
    corpus = [dct.doc2bow(line) for line in dataset]
    # 训练TF-IDF模型，即计算TF-IDF值
    model = TfidfModel(corpus)

    _movie_profile = []
    for i, data in enumerate(movie_dataset.itertuples()):
        mid = data[0]
        title = data[1]
        genres = data[2]
        vector = model[corpus[i]]
        movie_tags = sorted(vector, key=lambda x: x[1], reverse=True)[:30]
        topN_tags_weights = dict(map(lambda x: (dct[x[0]], x[1]), movie_tags))
        # 将类别词的添加进去，并设置权重值为1.0
        for g in genres:
            topN_tags_weights[g] = 1.0
        topN_tags = [i[0] for i in topN_tags_weights.items()]
        _movie_profile.append((mid, title, topN_tags, topN_tags_weights))

    movie_profile = pd.DataFrame(_movie_profile, columns=["movieId", "title", "profile", "weights"])
    movie_profile.set_index("movieId", inplace=True)
    return movie_profile

movie_profile = create_movie_profile(movies_dataset)

In [10]:
movie_profile.weights

Unnamed: 0_level_0,title,profile,weights
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[pixar, fun, Animation, Children, Fantasy, Adv...","{'pixar': 0.837374709121301, 'fun': 0.34531665..."
2,Jumanji (1995),"[game, magic board game, Robin Williams, fanta...","{'game': 0.49506005899914796, 'magic board gam..."
3,Grumpier Old Men (1995),"[moldy, old, Romance, Comedy]","{'moldy': 0.669101789463952, 'old': 0.66910178..."
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]","{'Comedy': 1.0, 'Drama': 1.0, 'Romance': 1.0}"
5,Father of the Bride Part II (1995),"[pregnancy, remake, Comedy]","{'pregnancy': 0.7029528753875794, 'remake': 0...."


In [11]:
def create_inverted_table(movie_profile):
    inverted_table = {}
    for mid, weights in movie_profile['weights'].iteritems():
        for tag, weight in weights.items():
            # 到inverted_table dict 用tag作为key去取值， 如果取不到就返回[]
            _ = inverted_table.get(tag, [])
            _.append((mid, weight))
            inverted_table.setdefault(tag, _)
    return inverted_table

inverted_table = create_inverted_table(movie_profile)

In [78]:
import collections
from functools import reduce

def create_user_profile():
    watch_record = pd.read_csv("ml-latest-small/ratings.csv")
    train, test = train_test_split(watch_record, test_size=0.2, random_state=42)

    watch_record = train.iloc[:,:2].groupby("userId").agg(list)
    # print(watch_record)

    movie_dataset = get_movie_dataset()
    movie_profile = create_movie_profile(movie_dataset)

    user_profile = {}
    for uid, mids in watch_record.itertuples():
        record_movie_prifole = movie_profile.loc[list(mids)]
        counter = collections.Counter(reduce(lambda x, y: list(x)+list(y), record_movie_prifole["profile"].values))
        # 兴趣词
        interest_words = counter.most_common(50)
        maxcount = interest_words[0][1]
        interest_words = [(w,round(c/maxcount, 4)) for w,c in interest_words]
        user_profile[uid] = interest_words

    return user_profile, train, test

user_profile, train, test = create_user_profile()

In [132]:
user_profile

{1: [('Action', 1.0),
  ('Adventure', 0.9444),
  ('Comedy', 0.9222),
  ('Drama', 0.7556),
  ('Thriller', 0.6111),
  ('Fantasy', 0.5222),
  ('Crime', 0.5),
  ('Children', 0.4667),
  ('Sci-Fi', 0.4444),
  ('Animation', 0.3222),
  ('Romance', 0.2889),
  ('War', 0.2444),
  ('Musical', 0.2444),
  ('Mystery', 0.2),
  ('Horror', 0.1889),
  ('Disney', 0.0889),
  ('Western', 0.0778),
  ('twist ending', 0.0556),
  ('suspense', 0.0556),
  ('superhero', 0.0556),
  ('aliens', 0.0556),
  ('sci-fi', 0.0444),
  ('classic', 0.0444),
  ('Vietnam', 0.0444),
  ('disturbing', 0.0444),
  ('psychology', 0.0444),
  ('dark comedy', 0.0444),
  ('great soundtrack', 0.0333),
  ('space opera', 0.0333),
  ('time travel', 0.0333),
  ('thought-provoking', 0.0333),
  ('King Arthur', 0.0333),
  ('religion', 0.0333),
  ('violence', 0.0333),
  ('archaeology', 0.0333),
  ('imdb top 250', 0.0333),
  ('serial killer', 0.0222),
  ('heist', 0.0222),
  ('mindfuck', 0.0222),
  ('off-beat comedy', 0.0222),
  ('crime', 0.0222),
 

In [82]:
train_dict = dict()
for i in train.userId.unique():
    train_dict[i] = {}
    for j in train[train.userId == i].movieId:
        train_dict[i][j] = train[(train.userId == i) & (train.movieId == j)].rating.values[0]

In [83]:
test_dict = dict()
for i in test.userId.unique():
    test_dict[i] = {}
    for j in test[test.userId == i].movieId:
        test_dict[i][j] = test[(test.userId == i) & (test.movieId == j)].rating.values[0]

In [87]:
# 为用户产生推荐结果
recommendation = {}
for uid, interest_words in tqdm(user_profile.items()):
    result_table = {}   # 电影id: [0.2, 0.5]
    for interest_word, interest_weight in interest_words:
        related_movies = inverted_table[interest_word]
        for mid, relate_weight in related_movies:
            if mid in train_dict[uid]:
                continue
            _ = result_table.get(mid, [])
            _.append(interest_weight)    #只考虑用户的兴趣程度
            # _.append(relate_weight)   # 只考虑兴趣词与电影的关联程度
            # _.append(interest_weight * relate_weight)     # 二者都考虑
            result_table.setdefault(mid, _)


    rs_result = map(lambda x: (x[0], sum(x[1])), result_table.items())
    rs_result = sorted(rs_result, key=lambda x: x[1], reverse=True)
    recommendation[uid] = {i[0]:i[1] for i in rs_result}
    # break

  0%|          | 0/610 [00:00<?, ?it/s]

In [73]:
hit = 0
for item, w in sorted(recommendation[1].items(), key=lambda x: x[1], reverse=True)[0:nitems]:
    if item in test_dict[1]:
        hit += 1

In [100]:
nitems = 10
hit = 0
mae = 0
n_recall = 0
n_precision = 0
n_mae = 0

for user in train_dict:
    test_items = test_dict.get(user, {})
    rank = recommendation[user]
    for item, w in sorted(rank.items(), key=lambda x: x[1], reverse=True)[0:10]:
        if item in test_items:
            hit += 1
            mae += abs(w - test_items[item])
            n_mae += 1
    n_recall += len(test_items)
    n_precision += nitems

In [101]:
print("精准率：", hit / (1.0 * n_precision))

ZeroDivisionError: float division by zero

KeyError: 187031

In [105]:
user

431

In [133]:
recommendation[1][81132]

5.2