## 1 导入常用库

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## 2 读入相关文件

In [2]:
path = '../ml-100k/'

names = ['user_id', 'item_id', 'rating', 'timestamp']
df_data = pd.read_csv(path+'u.data', sep='\t', names=names)
df_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
names = ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',\
                'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',\
                'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
df_item = pd.read_csv(path+'u.item', sep='|', names=names, engine='python')
df_item.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## 3 构建电影相似矩阵

In [4]:
# 构建电影相似矩阵
movie_matrix = df_item[['unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',\
                                      'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']].values.reshape(1682,19)
print("movie_matrix.shape : ", movie_matrix.shape)

# 计算余弦相似度
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(movie_matrix)
print("cosine_sim.shape : ", cosine_sim.shape)
print(cosine_sim)

movie_matrix.shape :  (1682, 19)
cosine_sim.shape :  (1682, 1682)
[[1.         0.         0.         ... 0.         0.57735027 0.        ]
 [0.         1.         0.57735027 ... 0.         0.         0.        ]
 [0.         0.57735027 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.70710678]
 [0.57735027 0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.70710678 0.         1.        ]]


In [5]:
# 基于内容推荐Top10,  输入movieId
def get_recommendations(df_item, movieId, cosine_sim, K):
    sim = cosine_sim[movieId - 1]    # 找出该 movieId 对应的向量
    sim_index = np.argsort(-sim)             # 排序找到最相关的索引
    K_sim_index = sim_index[:K+1] + 1
    res = df_item[df_item['movie id'].isin(K_sim_index)]
    return res

In [6]:
res_1 = get_recommendations(df_item, 1, cosine_sim, 10)
res_1[['movie id', 'movie title', 'release date']]

Unnamed: 0,movie id,movie title,release date
0,1,Toy Story (1995),01-Jan-1995
94,95,Aladdin (1992),01-Jan-1992
239,240,Beavis and Butt-head Do America (1996),20-Dec-1996
376,377,Heavyweights (1994),01-Jan-1994
421,422,Aladdin and the King of Thieves (1996),01-Jan-1996
476,477,Matilda (1996),02-Aug-1996
968,969,Winnie the Pooh and the Blustery Day (1968),01-Jan-1968
995,996,"Big Green, The (1995)",01-Jan-1995
1077,1078,Oliver & Company (1988),29-Mar-1988
1218,1219,"Goofy Movie, A (1995)",01-Jan-1995


## 4 使用KNN方法找到最相近的 TOP K 部电影，与上述方式做个对比。

In [7]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

neigh = NearestNeighbors(10, 0.4)
neigh.fit(movie_matrix)

K_sim_index = neigh.kneighbors(movie_matrix[0].reshape(1,-1), 11, return_distance=False)+1

In [8]:
K_sim_index[0]

array([  1, 422, 169, 189, 261, 240, 102,  95, 259, 243, 225])

In [9]:
knn_res_1 = df_item[df_item['movie id'].isin(K_sim_index[0])]
knn_res_1[['movie id', 'movie title', 'release date']]

Unnamed: 0,movie id,movie title,release date
0,1,Toy Story (1995),01-Jan-1995
94,95,Aladdin (1992),01-Jan-1992
101,102,"Aristocats, The (1970)",01-Jan-1970
168,169,"Wrong Trousers, The (1993)",01-Jan-1993
188,189,"Grand Day Out, A (1992)",01-Jan-1992
224,225,101 Dalmatians (1996),27-Nov-1996
239,240,Beavis and Butt-head Do America (1996),20-Dec-1996
242,243,Jungle2Jungle (1997),07-Mar-1997
258,259,George of the Jungle (1997),01-Jan-1997
260,261,Air Bud (1997),01-Aug-1997


查看两种方法推荐的电影有多少是一样的

In [10]:
left = res_1[['movie id', 'movie title', 'release date']]
right = knn_res_1[['movie id', 'movie title', 'release date']]
res = pd.merge(left, right, how='inner', on=['movie id'])
res

Unnamed: 0,movie id,movie title_x,release date_x,movie title_y,release date_y
0,1,Toy Story (1995),01-Jan-1995,Toy Story (1995),01-Jan-1995
1,95,Aladdin (1992),01-Jan-1992,Aladdin (1992),01-Jan-1992
2,240,Beavis and Butt-head Do America (1996),20-Dec-1996,Beavis and Butt-head Do America (1996),20-Dec-1996
3,422,Aladdin and the King of Thieves (1996),01-Jan-1996,Aladdin and the King of Thieves (1996),01-Jan-1996


- 从上面结果看，10部中有3部推荐的完全一样。
- 如果考虑，模型融合的方式，则这两种方法都推荐的这3部电影，鲁棒性更强

## 5 Approximated KNN

再使用Approximated KNN算法，从速度和推荐效果来对比 KNN 算法

安装[Faiss](https://github.com/facebookresearch/faiss)

![](../picture/5.png)
![](../picture/6.png)
![](../picture/7.png)

参考[在多个GPU上运行Faiss以及性能测试](http://www.cnblogs.com/DjangoBlog/p/8615540.html)，构建一个简单的例子

    1.1在CPU上运行
    Faiss的所有算法都是围绕index展开的。不管运行搜索还是聚类，首先都要建立一个index。

    import faiss
    # make faiss available
    index = faiss.IndexFlatL2(d)
    # build the index
    # d is the dimension of data
    在运行上述代码后，就可以添加数据并运行搜索了。

    index.add(xb)
    # xb is the base data
    D, I = index.search(xq, k)
    # xq is the query data
    # k is the num of neigbors you want to search
    # D is the distance matrix between xq and k neigbors
    # I is the index matrix of k neigbors

In [11]:
import faiss

In [12]:
dim = 19     #电影的特征数量
faiss_index = faiss.IndexFlatL2(dim)
faiss_index.add(movie_matrix)

AssertionError: 

上面错误的原因在于，其只能搜索连续的值，解决方法参考自：

[AssertionError: assert x.flags.contiguous #459](https://github.com/facebookresearch/faiss/issues/459)

In [13]:
dim = 19     #电影的特征数量
faiss_index = faiss.IndexFlatL2(dim)
faiss_index.add(np.ascontiguousarray(movie_matrix[1:].astype('float32')))

In [14]:
k = 10                          # we want to see 10 nearest neighbors
D, I = faiss_index.search(np.ascontiguousarray(movie_matrix.astype('float32')), k)     # actual search
print(I[:5])                   # neighbors of the 5 first queries
print(D[-5:])                  # neighbors of the 5 last queries

[[ 420  136  187  100  137  167   92  238  223   93]
 [ 928  116 1011  564  115  980 1103 1014  825    0]
 [ 357  464  771    1  593  613  973  941  216  196]
 [  72    2   63  148   45   91  314  234   43   32]
 [ 346    3 1206  327  647 1067 1211  330   98   53]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [15]:
I[0]+1

array([421, 137, 188, 101, 138, 168,  93, 239, 224,  94])

- 此种方法推荐的10部电影的 moive id 为 I[0] + 1，看起来有几个和之前的 id相差 1
- fiass: [1, 421, 137, 188, 101, 138, 168,  93, 239, 224,  94]
- knn : [  1, 422, 169, 189, 261, 240, 102,  95, 259, 243, 225]
- content-based : [   1,   95,  240,  377,  422,  477,  969,  996, 1078, 1219, 1470]
- 其中，421,188,168,94,224,239,101加1后都能找到对应的ID，这个原因需要找时间分析一下