## Key assumptions:
Collaborative Filtering, doesn’t need anything else except users’ historical preference on a set of items. Because it’s based on historical data, the core assumption here is that the users who have agreed in the past tend to also agree in the future.


Implementation largely inspired by:
https://www.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/

In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from collections import defaultdict
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split, GridSearchCV

In [2]:
anime = pd.read_csv("data/anime.csv")
rating = pd.read_csv("data/rating.csv")

In [3]:
n_users = rating.user_id.unique().shape[0]
n_items = rating.anime_id.unique().shape[0]
print(str(n_users) + ' users')
print(str(n_items) + ' items')

73515 users
11200 items


In [4]:
# Replace -1 with 0, na with 0:
rating = rating.fillna(0)
rating['rating'].replace({-1: 0}, inplace=True)

In [5]:
# Create a mapping of animeid to anime name:
anime_name_mapping = dict(zip(anime.anime_id, anime.name))

inverse_name_anime_mapping = dict(zip(anime.name, anime.anime_id))

In [6]:
# Create a mapping for a new id for anime

anime_id_list = sorted(list(rating['anime_id'].unique()))
anime_id2_mapping = {v:i for i,v in enumerate(anime_id_list)}


user_id_list = sorted(list(rating['user_id'].unique()))
user_id2_mapping = {v:i for i,v in enumerate(user_id_list)}

In [7]:
inverse_anime_id2_mapping = {i:v for v, i in anime_id2_mapping.items()}

In [8]:
rating['user_id2'] = rating['user_id'].map(user_id2_mapping)
rating['anime_id2'] = rating['anime_id'].map(anime_id2_mapping)

In [9]:
from scipy.sparse import csr_matrix

row_idx = []
col_idx = []
dat = []

for row in rating.itertuples():
    row_idx.append(row[4])
    col_idx.append(row[5])
    dat.append(row[3])

row_idx = np.array(row_idx)
col_idx = np.array(col_idx)
dat = np.array(dat)
    
ratings_mat = csr_matrix((dat, (row_idx, col_idx)), shape=(n_users, n_items))

In [10]:
ratings_mat.eliminate_zeros()

In [11]:
sparsity = float(len(ratings_mat.nonzero()[0]))
sparsity /= (ratings_mat.shape[0] * ratings_mat.shape[1])
sparsity *= 100
print('Sparsity: {:4.2f}%'.format(sparsity))

Sparsity: 0.77%


In [12]:
def train_test_split(ratings):
    test_row_idx = []
    test_col_idx = []
    test_dat = []
    
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        test_ratings = np.random.choice(ratings[user, :].nonzero()[1], 
                                        size=int(0.2*len(ratings[user, :].nonzero()[1])), 
                                        replace=False)
        
        test_row_idx.extend([user]*len(test_ratings))
        test_col_idx.extend(test_ratings)
        test_dat.extend(list(train[user, test_ratings].toarray()[0]))
        
        train[user, test_ratings] = 0
        
    test = csr_matrix((test_dat, (test_row_idx, test_col_idx)), shape=(n_users, n_items))
    
    train.eliminate_zeros()
        
    # Test and training are truly disjoint
#     assert(np.all((train * test) == 0)) 
    return train, test

train, test = train_test_split(ratings_mat)

In [63]:
# from sklearn.metrics.pairwise import cosine_similarity

# def pairwise_cosine_similarity(ratings,  kind='user', dense_output=False):
#     if kind == 'user':
#         similarities_sparse = cosine_similarity(ratings, dense_output=dense_output)
#     else:
#         similarities_sparse = cosine_similarity(ratings.T, dense_output=dense_output)
#     return similarities_sparse



# def cosine_similarity_n_space(m1, m2, batch_size=100):
#     assert m1.shape[1] == m2.shape[1]
#     ret = np.ndarray((m1.shape[0], m2.shape[0]))
#     for row_i in range(0, int(m1.shape[0] / batch_size) + 1):
#         start = row_i * batch_size
#         end = min([(row_i + 1) * batch_size, m1.shape[0]])
#         if end <= start:
#             break # cause I'm too lazy to elegantly handle edge cases
#         rows = m1[start: end]
#         sim = cosine_similarity(rows, m2) # rows is O(1) size
#         ret[start: end] = sim
#     return ret

In [69]:
# from sklearn.metrics.pairwise import cosine_similarity

# user_similarity = pairwise_cosine_similarity(train, dense_output=False, kind='user')
# item_similarity = pairwise_cosine_similarity(train, dense_output=False, kind='item')

In [13]:
# 80% train data

anime_svd = TruncatedSVD(n_components=10)
anime_features = anime_svd.fit_transform(train.T)

print ("anime_features.shape = {0}".format(anime_features.shape))


user_svd = TruncatedSVD(n_components=10)
user_features = user_svd.fit_transform(train)

print ("user_features.shape = {0}".format(user_features.shape))

anime_features.shape = (11200, 10)
user_features.shape = (73515, 10)


## Approach 1: Memory-based Approach using correlation (Unscalable):

One way to generate recommendations is for a user can enter the name of an anime and get top-k recommendations of other animes that are similar

In [14]:
corr_mat = np.corrcoef(anime_features)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [15]:
def top_k_corr_movie(anime_id, k=5):
    anime_id2 = anime_id2_mapping[anime_id]
    corr_anime = corr_mat[anime_id2].copy()
    corr_anime[np.isnan(corr_anime)] = 0
    top_anime_id2s = corr_anime.argsort()[-k:][::-1]
    top_anime_ids = [inverse_anime_id2_mapping[a] for a in top_anime_id2s]
    top_animes = [anime_name_mapping[i] for i in top_anime_ids]
    return top_animes

In [16]:
chosen_anime = 'Steins;Gate'
chosen_anime_id = inverse_name_anime_mapping[chosen_anime]
top_k_corr_movie(chosen_anime_id, k=10)

['Steins;Gate',
 'Fate/Zero',
 'Fate/Zero 2nd Season',
 'Steins;Gate Movie: Fuka Ryouiki no Déjà vu',
 'Steins;Gate: Oukoubakko no Poriomania',
 'Bakemonogatari',
 'Nisemonogatari',
 'Psycho-Pass',
 'Shinsekai yori',
 'Angel Beats!']

## Approach 2: Model-based Approach

* We can train a model to take in the truncated latent features and produce predictions, even for a new user, as long as the new user provides ratings for a few movies

* We'll use the library: scikit-surprise

In [20]:
rating['rating'].max()

10

In [21]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(rating[['user_id', 'anime_id', 'rating']], reader)

In [23]:
# Train-test split using scikit-surprise

trainset, testset = train_test_split(data, test_size=0.2)

In [28]:
SVD_model = SVD()
SVD_model.fit(trainset)
predictions = SVD_model.test(testset)

In [27]:
accuracy.rmse(predictions)

RMSE: 1.9546


1.954637929524839

Let's predict a rating for a user, for eg, user_id=1:

In [35]:
user1 = rating[(rating['user_id'] == 1) &  (rating['rating'] > 0)]
user1['anime'] = user1['anime_id'].map(anime_name_mapping)
user1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user1['anime'] = user1['anime_id'].map(anime_name_mapping)


Unnamed: 0,user_id,anime_id,rating,user_id2,anime_id2,anime
47,1,8074,10,0,5214,Highschool of the Dead
81,1,11617,10,0,6567,High School DxD
83,1,11757,10,0,6606,Sword Art Online
101,1,15451,10,0,7255,High School DxD New


Looks like user1 loves Ecchi/Romance/Fantasy genre, probably belonging to more of a teenage male demographic

What's this user's predicted rating for Naruto?

In [38]:
SVD_model.predict(uid=1, iid=20)

Prediction(uid=1, iid=20, r_ui=None, est=1.7854776896379807, details={'was_impossible': False})

Let's see what are the top 10 predictions for this user in the test set

In [60]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [65]:
top_n = get_top_n(predictions, n=10)

top_k_dict = {}

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    top_k_dict[uid] = [iid for (iid, _) in user_ratings]

In [67]:
# Top 10 animes for user 1:
[anime_name_mapping[a_id] for a_id in top_k_dict[1]]

['Sword Art Online II',
 'Highschool of the Dead: Drifters of the Dead',
 'Sakurasou no Pet na Kanojo',
 'Accel World',
 'Date A Live',
 'Kiss x Sis (TV)',
 'Naruto',
 'Kuroshitsuji',
 'IS: Infinite Stratos',
 'Strike the Blood']

Looks pretty legit! Seems like this model has learnt to personalize user 1 's preference