In [6]:
import io
from surprise import KNNBaseline, Dataset, get_dataset_dir

In [11]:
def read_item_names():
    """Build raw-id <-> name lookup tables from the ml-100k ``u.item`` file.

    The file is located at ``get_dataset_dir() + '/ml-100k/ml-100k/u.item'``
    and is read as ISO-8859-1 text with ``|``-separated fields. The first
    field of each line is used as the raw item id and the second field as
    the item name; all remaining fields are ignored.

    Blank lines and lines with fewer than two fields are skipped rather
    than raising ``IndexError``.

    Returns:
        tuple[dict, dict]: ``(rid_to_name, name_to_rid)``. Both keys and
        values are strings exactly as they appear in the file. If the same
        name occurs on several lines, ``name_to_rid`` keeps the id of the
        last such line.
    """
    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            fields = line.split('|')
            # A blank or malformed line has no name field; skip it.
            if len(fields) < 2:
                continue
            rid, name = fields[0], fields[1]
            rid_to_name[rid] = name
            name_to_rid[name] = rid

    return rid_to_name, name_to_rid

In [2]:
# Load the built-in 'ml-100k' dataset through Surprise's Dataset.load_builtin.
data = Dataset.load_builtin('ml-100k')

In [3]:
# Build a trainset from the whole dataset.
trainset = data.build_full_trainset()
# Pearson-baseline similarity. user_based=True would compute user-user
# similarities; False computes item-item similarities.
sim_options = dict(name='pearson_baseline', user_based=False)
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x116ed9be0>

In [12]:
# Build the raw-id -> name and name -> raw-id lookup dicts from the item file.
rid_to_name, name_to_rid = read_item_names()

In [35]:
# Look up the raw item id of the query movie by its title.
toy_story_raw_id = name_to_rid['Toy Story (1995)']
# When the trainset is built, every raw id is mapped to a unique integer
# called the inner id, which is what Surprise works with. The trainset's
# to_inner_uid(), to_inner_iid(), to_raw_uid() and to_raw_iid() methods
# convert between raw ids and inner ids.
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
# Nearest-neighbour lookup: fetch the 10 closest items (as inner ids).
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)
# Convert the neighbours' inner ids back to raw ids. This is a list under
# its own name so `toy_story_neighbors` is not rebound to a one-shot
# generator that is left exhausted once the names below are built.
toy_story_neighbor_raw_ids = [algo.trainset.to_raw_iid(inner_id)
                              for inner_id in toy_story_neighbors]
# Map raw ids to movie names.
toy_story_neighbors_names = [rid_to_name[rid]
                             for rid in toy_story_neighbor_raw_ids]

print()
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors_names:
    print(movie)


The 10 nearest neighbors of Toy Story are:
Beauty and the Beast (1991)
Raiders of the Lost Ark (1981)
That Thing You Do! (1996)
Lion King, The (1994)
Craft, The (1996)
Liar Liar (1997)
Aladdin (1992)
Cool Hand Luke (1967)
Winnie the Pooh and the Blustery Day (1968)
Indiana Jones and the Last Crusade (1989)


### 测试余弦相似度推荐的电影

In [36]:
# Item-item KNNBaseline again, this time with cosine similarity.
# Note: this rebinds `sim_options` and `algo`, so the previously fitted
# model is no longer reachable through `algo`.
sim_options = dict(name='cosine', user_based=False)
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x120ad8f28>

In [37]:
# Re-read the id <-> name lookup dicts (same call as in the earlier cell).
rid_to_name, name_to_rid = read_item_names()

# Look up the raw item id of the query movie by its title.
toy_story_raw_id = name_to_rid['Toy Story (1995)']
# When the trainset is built, every raw id is mapped to a unique integer
# called the inner id, which is what Surprise works with. The trainset's
# to_inner_uid(), to_inner_iid(), to_raw_uid() and to_raw_iid() methods
# convert between raw ids and inner ids.
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
# Nearest-neighbour lookup: fetch the 10 closest items (as inner ids).
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)
# Convert the neighbours' inner ids back to raw ids. This is a list under
# its own name so `toy_story_neighbors` is not rebound to a one-shot
# generator that is left exhausted once the names below are built.
toy_story_neighbor_raw_ids = [algo.trainset.to_raw_iid(inner_id)
                              for inner_id in toy_story_neighbors]
# Map raw ids to movie names.
toy_story_neighbors_names = [rid_to_name[rid]
                             for rid in toy_story_neighbor_raw_ids]

print()
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors_names:
    print(movie)


The 10 nearest neighbors of Toy Story are:
So Dear to My Heart (1949)
My Life and Times With Antonin Artaud (En compagnie d'Antonin Artaud) (1993)
Somebody to Love (1994)
Crows and Sparrows (1949)
Total Eclipse (1995)
Mr. Jones (1993)
Convent, The (Convento, O) (1995)
Incognito (1997)
Every Other Weekend (1990)
Homage (1995)
