# import packages

In [2]:
# import packages
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import wordcloud as wc
import scipy.sparse as sparse
import xgboost as xgb
import sklearn

from xgboost import XGBRegressor
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold,train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Allow duplicate OpenMP runtimes to be loaded in the same process.
os.environ.update(KMP_DUPLICATE_LIB_OK="TRUE")
# Notebook-wide switch: when True, fitted similarity matrices are written to disk.
saving = False

# load data

In [3]:
# Read the raw MovieLens tables.
movielens_rating = pd.read_csv("./input/ratings.csv")
movielens_link = pd.read_csv("./input/links.csv")
movielens_movie = pd.read_csv("./input/movies.csv")
movielens_score = pd.read_csv("./input/genome-scores.csv")
movielens_tag = pd.read_csv("./input/genome-tags.csv")

# Map every raw movieId to the index label of its row in links.csv.
movielens_dict = {
    movie_id: link_index
    for link_index, movie_id in movielens_link["movieId"].to_dict().items()
}

# Re-index movies through the mapping and shift user/tag ids down by one.
movielens_rating["movieId"] = movielens_rating["movieId"].apply(movielens_dict.get)
movielens_rating["userId"] -= 1
movielens_score["movieId"] = movielens_score["movieId"].apply(movielens_dict.get)
movielens_score["tagId"] -= 1
movielens_tag["tagId"] -= 1

# Work on a small subset: movies with a remapped id below 10000, then the
# first 10000 ratings and the first 1000 genome scores.
movielens_rating = movielens_rating[movielens_rating["movieId"] < 10000]
movielens_rating = movielens_rating.iloc[:10000]
movielens_score = movielens_score.iloc[:1000]

# Recommendation

In [5]:
# define transformers
class SparseMatrixTransformer(BaseEstimator, TransformerMixin):
    """Turn (row, col, value) triplets into a scipy CSR matrix.

    Parameters
    ----------
    row, col, value : str, optional
        Column names holding the row index, column index and cell value.
        They are only used when the input is a pandas DataFrame.
    shape : tuple of int, optional
        Shape of the matrix built by ``transform``. Calling ``fit``
        overwrites it with ``(max row index + 1, max col index + 1)``.
    """

    def __init__(self, row=None, col=None, value=None, shape=None):
        self.row = row
        self.col = col
        self.value = value
        self.shape = shape

    def fit(self, X, y=None):
        """Derive the matrix shape from the largest row and column index in X.

        X is either a DataFrame (indexed through ``self.row`` / ``self.col``)
        or a 2-D array whose columns 0 and 1 are the row and column indices,
        mirroring what ``transform`` accepts.
        """
        if isinstance(X, pd.DataFrame):
            rows, cols = X[self.row], X[self.col]
        else:
            rows, cols = X[:, 0], X[:, 1]
        # int() keeps the shape integral even when the id columns are float typed
        self.shape = (int(rows.max()) + 1, int(cols.max()) + 1)
        return self

    def transform(self, X, y=None):
        """Build a CSR matrix of ``self.shape`` from the triplets in X.

        For a DataFrame the configured column names are used; otherwise X is
        indexed as a 2-D array with columns (row, col, value).
        """
        # isinstance (rather than an exact type check) also accepts DataFrame subclasses
        if isinstance(X, pd.DataFrame):
            triplets = (X[self.value], (X[self.row], X[self.col]))
        else:
            triplets = (X[:, 2], (X[:, 0], X[:, 1]))
        return sparse.csr_matrix(sparse.coo_matrix(triplets, self.shape))

In [6]:
# user x movie rating matrix builder; its shape is learned from the ratings table
user_movie_transformer = SparseMatrixTransformer(
    row="userId",
    col="movieId",
    value="rating",
).fit(movielens_rating)

# movie x tag relevance matrix builder; it is not fitted, its shape is given
# explicitly: the movie count of the rating matrix by the number of tags
movie_tag_shape = (user_movie_transformer.shape[1], movielens_tag.tagId.max() + 1)
movie_tag_transformer = SparseMatrixTransformer(
    row="movieId",
    col="tagId",
    value="relevance",
    shape=movie_tag_shape,
)

In [7]:
# base class for recommendation system
class RecommenderMixin(BaseEstimator, TransformerMixin):
    """Shared evaluation helpers for recommenders.

    Subclasses must provide ``fit(X, y)`` and ``predict(X)``; ``predict`` is
    expected to return an integer array of item indices with one row per user
    and ``n_rec`` columns.

    Parameters
    ----------
    n_rec : int
        Number of items recommended to every user.
    """

    def __init__(self, n_rec=10):
        self.n_rec = n_rec

    def evaluate(self, X_train, X_valid, silent=False):
        """Print hit-based metrics of the recommendations made from X_train.

        Parameters
        ----------
        X_train, X_valid : scipy sparse matrix of shape (n_users, n_items)
            Training and held-out ratings. A recommended item counts as a hit
            when the user has a non-zero entry for it in ``X_valid``.
        silent : bool
            Suppresses the progress messages only; the metrics are always printed.
        """
        n_users, n_items = X_train.shape
        X = X_train + X_valid
        # Mean stored rating of every item (0 for items nobody rated), computed
        # column-wise instead of slicing the matrix one item at a time.
        counts = X.getnnz(axis=0)
        sums = np.asarray(X.sum(axis=0)).ravel()
        item_rating = np.divide(sums, counts, out=np.zeros(n_items), where=counts > 0)
        # The original statement only evaluated this mask and discarded it;
        # zero out NaN means (possible when the ratings themselves contain NaN).
        item_rating[np.isnan(item_rating)] = 0
        mean_rating = X.data.mean()
        if not silent: print("predicting...")
        recommend = self.predict(X_train)
        if not silent: print("evaluating...")
        # Held-out rating of every (user, recommended item) pair; 0 means "not rated".
        user_index = np.arange(n_users).repeat(self.n_rec)
        intersect = np.asarray(X_valid[user_index, recommend.flatten()].reshape(n_users, -1))
        is_hit = intersect != 0
        hit = is_hit.sum()
        precision = hit / recommend.size
        recall = hit / X_valid.nnz
        coverage = np.unique(recommend).size / n_items
        # rating the users themselves gave to the hit items
        hit_rating = intersect[is_hit].mean()
        # overall mean rating of the hit items
        recom_rating = np.average(item_rating[recommend[is_hit]])
        print("accuracy=%.4f" % precision)
        print("recall=%.4f" % recall)
        print("cover=%.4f" % coverage)
        print("user's vote to the hit movies=%.4f" % (hit_rating))
        print("average vote to the hit movies=%.4f" % (recom_rating))
        print("average vote to all movies=%.4f" % (mean_rating))

    def fit_predict(self, X, y=None):
        """Fit on X and return the recommendations for the same X."""
        return self.fit(X, y).predict(X)

In [8]:
# item based collaborative filtering
# calculating the similarity between movies based on users' votes for the movies
class ItemBasedCF(RecommenderMixin):
    """Item-based collaborative filtering on a user x item rating matrix.

    Parameters
    ----------
    n_rec : int
        Number of items recommended per user.
    n_sim : int
        Number of most similar items kept per item when fitting.
    file_name : str, optional
        Path of a similarity matrix written by ``save``; when given it is
        loaded immediately and ``n_sim`` is derived from it.
    baseline : float
        Value subtracted from every stored rating before similarities and
        scores are computed.
    """

    def __init__(self, n_rec=20, n_sim=20, file_name=None, baseline=3.0):
        super().__init__(n_rec=n_rec)
        self.baseline = baseline
        # Stored so that every constructor argument is available as an attribute,
        # which scikit-learn's get_params()/repr rely on.
        self.file_name = file_name
        if file_name:
            self.similar_matrix = sparse.load_npz(file_name)
            # The matrix is (n_items, n_items), so shape[1] is the item count, not
            # the number of neighbours. Assumes the file was produced by save()
            # after fit(), which stores n_sim entries per item row.
            n_rows = self.similar_matrix.shape[0]
            self.n_sim = self.similar_matrix.nnz // n_rows if n_rows else n_sim
        else:
            self.n_sim = n_sim
            self.similar_matrix = None

    def save(self, file_name):
        """Write the fitted similarity matrix to ``file_name`` in .npz format."""
        sparse.save_npz(file_name, self.similar_matrix)

    def fit(self, X, y=None, silent=False):
        """Compute the sparsified item-item similarity matrix from ratings X.

        X is a sparse (n_users, n_items) matrix. Stored ratings are centred by
        ``baseline``, every item column is scaled to unit norm, and the
        ``n_sim`` largest similarities of each item are kept.
        """
        # explicit import: `scipy.sparse` alone does not promise the linalg submodule
        from scipy.sparse.linalg import norm as sparse_norm

        if not silent: print("calculating the similar matrix...")
        n_items = X.shape[1]
        user_movie = X.copy()
        user_movie.data -= self.baseline
        norms = sparse_norm(user_movie, axis=0)
        norms[norms == 0] = 1  # leave all-zero item columns untouched
        # explicit matrix product (the original `*=` silently fell back to this)
        user_movie = user_movie * sparse.diags(1 / norms)
        # CSR so that the row-by-row scan below is cheap
        movie_sim_matrix = (user_movie.T * user_movie).tocsr()
        if not silent: print("sparsifying similar matrix...")
        # argpartition raises when asked for more entries than there are items
        n_keep = min(self.n_sim, n_items)
        row = np.arange(n_items).repeat(n_keep)
        col = np.array([
            np.argpartition(sim_row.toarray().flatten(), -n_keep)[-n_keep:]
            for sim_row in movie_sim_matrix
        ]).flatten()
        kept = np.asarray(movie_sim_matrix[row, col]).flatten()
        self.similar_matrix = sparse.csr_matrix(sparse.coo_matrix((kept, (row, col)), (n_items, n_items)))
        if not silent: print("fitting done.")
        return self

    def transform(self, X, y=None):
        """Return the (n_users, n_items) score matrix: centred ratings times similarities."""
        user_movie = X.copy()
        user_movie.data -= self.baseline
        return user_movie * self.similar_matrix

    def predict(self, X):
        """Return, per user row of X, the indices of the ``n_rec`` best scored items.

        Items the user already has a stored rating for get a score of -inf.
        The returned indices are not ordered by score.
        """
        watched_movies = X.copy()
        watched_movies.data.fill(np.inf)
        movie_favor = self.transform(X) - watched_movies
        return np.array([
            np.argpartition(favor_row.toarray().flatten(), -self.n_rec)[-self.n_rec:]
            for favor_row in movie_favor
        ])

In [9]:
# content based recommendation system
# calculating the similarity between movies based on their relevance to each tag
class ContentBasedRS(RecommenderMixin):
    """Content-based recommender using a movie x tag relevance matrix.

    Parameters
    ----------
    n_rec : int
        Number of items recommended per user.
    n_sim : int
        Number of strongest tags kept per movie and number of most similar
        movies kept per movie when fitting.
    file_name : str, optional
        Path of a similarity matrix written by ``save``; when given it is
        loaded immediately and ``n_sim`` is derived from it.
    baseline : float
        Value subtracted from every stored rating before scoring.
    """

    def __init__(self, n_rec=10, n_sim=50, file_name=None, baseline=3.0):
        super().__init__(n_rec=n_rec)
        self.baseline = baseline
        # Stored so that every constructor argument is available as an attribute,
        # which scikit-learn's get_params()/repr rely on.
        self.file_name = file_name
        if file_name:
            self.similar_matrix = sparse.load_npz(file_name)
            # The matrix is (n_items, n_items), so shape[1] is the item count, not
            # the number of neighbours. Assumes the file was produced by save()
            # after fit(), which stores n_sim entries per item row.
            n_rows = self.similar_matrix.shape[0]
            self.n_sim = self.similar_matrix.nnz // n_rows if n_rows else n_sim
        else:
            self.n_sim = n_sim
            self.similar_matrix = None

    def save(self, file_name):
        """Write the fitted similarity matrix to ``file_name`` in .npz format."""
        sparse.save_npz(file_name, self.similar_matrix)

    def fit(self, X, y=None, silent=False):
        """Compute the sparsified movie-movie similarity matrix from relevances X.

        X is a sparse (n_items, n_features) matrix. Only the ``n_sim`` largest
        relevances of each movie are kept, similarities are the dot products of
        those reduced rows, and the ``n_sim`` largest similarities per movie
        are stored.
        """
        n_items, n_features = X.shape
        if not silent: print("sparsifying relevance matrix...")
        # argpartition raises when asked for more entries than a row holds
        n_tags_kept = min(self.n_sim, n_features)
        row = np.arange(n_items).repeat(n_tags_kept)
        col = np.array([
            np.argpartition(tag_row.toarray().flatten(), -n_tags_kept)[-n_tags_kept:]
            for tag_row in X
        ]).flatten()
        relevance_matrix = sparse.csr_matrix(
            sparse.coo_matrix((np.asarray(X[row, col]).flatten(), (row, col)), (n_items, n_features))
        )
        if not silent: print("calculating the similar matrix...")
        movie_sim_matrix = relevance_matrix * relevance_matrix.T
        if not silent: print("sparsifying similar matrix...")
        n_items_kept = min(self.n_sim, n_items)
        row = np.arange(n_items).repeat(n_items_kept)
        col = np.array([
            np.argpartition(sim_row.toarray().flatten(), -n_items_kept)[-n_items_kept:]
            for sim_row in movie_sim_matrix
        ]).flatten()
        kept = np.asarray(movie_sim_matrix[row, col]).flatten()
        self.similar_matrix = sparse.csr_matrix(sparse.coo_matrix((kept, (row, col)), (n_items, n_items)))
        if not silent: print("fitting done.")
        return self

    def transform(self, X, y=None):
        """Return the (n_users, n_items) score matrix: centred ratings times similarities."""
        user_movie = X.copy()
        user_movie.data -= self.baseline
        return user_movie * self.similar_matrix

    def predict(self, X):
        """Return, per user row of X, the indices of the ``n_rec`` best scored items.

        Items the user already has a stored rating for get a score of -inf.
        The returned indices are not ordered by score.
        """
        watched_movies = X.copy()
        watched_movies.data.fill(np.inf)
        movie_favor = self.transform(X) - watched_movies
        return np.array([
            np.argpartition(favor_row.toarray().flatten(), -self.n_rec)[-self.n_rec:]
            for favor_row in movie_favor
        ])

In [10]:
# Fit the content-based recommender on the movie x tag relevance matrix
# built from the genome scores; %time reports how long the fit takes.
contentRS=ContentBasedRS()
%time contentRS.fit(movie_tag_transformer.transform(movielens_score))
# Persist the similarity matrix only when the notebook-level `saving` flag is set.
if saving:contentRS.save("ContentBasedRS.npz")

sparsifying relevance matrix...
calculating the similar matrix...
sparsifying similar matrix...
fitting done.
Wall time: 10.1 s


In [11]:
# Fit the item-based collaborative filtering recommender on the user x movie
# rating matrix; %time reports how long the fit takes.
itemCF=ItemBasedCF()
%time itemCF.fit(user_movie_transformer.transform(movielens_rating))
# Persist the similarity matrix only when the notebook-level `saving` flag is set.
if saving:itemCF.save("ItemBasedCF.npz")

calculating the similar matrix...
sparsifying similar matrix...
fitting done.
Wall time: 1.3 s


# 为指定用户推荐10个最相关的电影以及10个次之的电影的代码(假设：userId = 15)

In [12]:
# example: the movies recommended to one given user
def user_recommend(rs, user):
    """Return the movie rows that recommender ``rs`` picks for ``user``.

    ``user`` is a row index of the user x movie rating matrix; the result is a
    slice of ``movielens_movie`` with a fresh 0..n-1 index.
    """
    rating_matrix = user_movie_transformer.transform(movielens_rating)
    # double brackets keep the selection two-dimensional (one user row)
    single_user = rating_matrix[[user]]
    picked = rs.predict(single_user).flatten()
    recommended = movielens_movie.iloc[picked]
    return recommended.reset_index(drop=True)

user_recommend(itemCF, 15)

Unnamed: 0,movieId,title,genres
0,2231,Rounders (1998),Drama
1,3462,Modern Times (1936),Comedy|Drama|Romance
2,527,Schindler's List (1993),Drama|War
3,1213,Goodfellas (1990),Crime|Drama
4,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
5,7099,Nausicaä of the Valley of the Wind (Kaze no ta...,Adventure|Animation|Drama|Fantasy|Sci-Fi
6,6618,Shaolin Soccer (Siu lam juk kau) (2001),Action|Comedy
7,2762,"Sixth Sense, The (1999)",Drama|Horror|Mystery
8,4226,Memento (2000),Mystery|Thriller
9,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi


# 为指定电影推荐与之最相关的10个电影以及10个次之的电影的代码(假设：movieId = 7506)

In [13]:
# example: the movies stored as most similar to one given movie
def movie_similar(rs, movie):
    """Return the neighbours of ``movie`` in ``rs.similar_matrix``, best first.

    The result holds the matching ``movielens_movie`` rows plus a ``similar``
    column with the stored similarity, sorted in descending order.
    """
    index = rs.similar_matrix[movie].indices
    scores = rs.similar_matrix[movie, index].toarray().flatten()
    neighbours = movielens_movie.iloc[index].reset_index(drop=True)
    score_frame = pd.DataFrame({"similar": scores})
    # both frames carry a 0..n-1 index, so the join pairs them positionally
    joined = pd.merge(neighbours, score_frame, left_index=True, right_index=True)
    ranked = joined.sort_values("similar", ascending=False)
    return ranked.reset_index(drop=True)

movie_similar(itemCF, 7506)

Unnamed: 0,movieId,title,genres,similar
0,3415,"Mirror, The (Zerkalo) (1975)",Drama,0.0
1,3417,"Crimson Pirate, The (1952)",Adventure|Comedy,0.0
2,3452,Romeo Must Die (2000),Action|Crime|Romance|Thriller,0.0
3,3434,Death Wish 5: The Face of Death (1994),Action|Drama,0.0
4,3432,Death Wish 3 (1985),Action|Drama,0.0
5,3431,Death Wish 2 (1982),Action|Drama,0.0
6,3430,Death Wish (1974),Action|Crime|Drama,0.0
7,3429,Creature Comforts (1989),Animation|Comedy,0.0
8,3428,"Champ, The (1979)",Drama,0.0
9,3427,Coogan's Bluff (1968),Crime,0.0


# 查看指定user的已经观看的电影

应该优先给用户推荐没有看过的电影

In [None]:
# list the movies a given user has already rated
def user_watched(user):
    """Return title, genres and rating of every movie rated by ``user``."""
    watched_movies = movielens_rating[movielens_rating.userId == user]
    # movieId holds row positions into movielens_movie after the remapping
    movie_rows = movielens_movie.iloc[watched_movies.movieId]
    table = {
        "title": movie_rows["title"],
        "genres": movie_rows["genres"],
        "rating": watched_movies.rating.values,
    }
    return pd.DataFrame(table)

user_watched(5)

# 对推荐系统进行评估和比较

In [14]:
# evaluating and comparing the recommendation system
rating_train,rating_valid=train_test_split(movielens_rating[["userId","movieId","rating"]])
rating_train=user_movie_transformer.transform(rating_train)
rating_valid=user_movie_transformer.transform(rating_valid)
%time ItemBasedCF().fit(rating_train).evaluate(rating_train,rating_valid)
%time ContentBasedRS().fit(movie_tag_transformer.transform(movielens_score)).evaluate(rating_train,rating_valid)

calculating the similar matrix...
sparsifying similar matrix...
fitting done.
predicting...
evaluating...
accuracy=0.0714
recall=0.0520
cover=0.0766
user's vote to the hit movies=4.1192
average vote to the hit movies=4.0894
average vote to all movies=3.5358
Wall time: 1.43 s
sparsifying relevance matrix...
calculating the similar matrix...
sparsifying similar matrix...
fitting done.
predicting...
evaluating...
accuracy=0.0044
recall=0.0016
cover=0.0169
user's vote to the hit movies=2.1250
average vote to the hit movies=2.4583
average vote to all movies=3.5358
Wall time: 11.2 s
