In [10]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
# Anime catalogue. Its MAL_ID column is renamed to anime_id so it carries the
# same key name that the merge in the next cell joins on.
anime_data = pd.read_csv("data/anime.csv").rename(columns={"MAL_ID": "anime_id"})

# Per-user anime list entries, read as-is.
anime_user_ratings = pd.read_csv("data/animelist.csv")

# Two-column lookup: anime_id -> Name.
anime_id_to_name = anime_data.loc[:, ["anime_id", "Name"]]

In [3]:
# Attach the title to every list entry. The left join keeps every row of
# anime_user_ratings; rows whose anime_id is missing from the lookup get NaN
# in Name. Only the five columns used by the rest of the notebook are kept.
ratings_df = (
    anime_user_ratings
    .merge(anime_id_to_name, on="anime_id", how="left")
    .loc[:, ["user_id", "Name", "anime_id", "rating", "watching_status"]]
)
ratings_df.head()

Unnamed: 0,user_id,Name,anime_id,rating,watching_status
0,0,Basilisk: Kouga Ninpou Chou,67,9,1
1,0,Fairy Tail,6702,7,1
2,0,Gokusen,242,10,1
3,0,Kuroshitsuji,4898,0,1
4,0,One Piece,21,10,1


In [4]:
# Row counts per user and per anime, taken BEFORE any of the filtering below,
# so the thresholds apply to the unfiltered list.
n_ratings_by_user = ratings_df['user_id'].value_counts()
n_ratings_per_anime = ratings_df['anime_id'].value_counts()

# A row is kept only when all three conditions hold:
#   * watching_status is 1 or 2
#   * its user has at least 100 rows
#   * its anime has at least 20000 rows
# NOTE(review): this status filter was previously described as "completed or
# dropped" -- confirm that codes 1 and 2 carry that meaning in this dataset.
active_users = n_ratings_by_user[n_ratings_by_user >= 100].index
popular_anime = n_ratings_per_anime[n_ratings_per_anime >= 20000].index
keep = (
    ratings_df["watching_status"].isin([1, 2])
    & ratings_df["user_id"].isin(active_users)
    & ratings_df["anime_id"].isin(popular_anime)
)
ratings_df = ratings_df[keep].copy()

ratings_df.head()

Unnamed: 0,user_id,Name,anime_id,rating,watching_status
75,1,Ano Hi Mita Hana no Namae wo Bokutachi wa Mada...,9989,0,1
76,1,Ansatsu Kyoushitsu,24833,0,1
77,1,Bakuman.,7674,7,1
78,1,Black Clover,34572,0,1
79,1,Boruto: Naruto Next Generations,34566,7,1


In [5]:
# Headline numbers for the filtered ratings.
# NOTE(review): rows with rating 0 are part of this mean; if 0 stands for
# "no score given" rather than a real score, the average is biased low -- confirm.
Avg_Rating = ratings_df['rating'].mean()

# All four values go through the template (previously the user count was
# passed to print() as a separate argument, outside .format()).
print(
    "Average Rating: {}\nNumber of Ratings: {}\nUnique Anime: {}\nUnique Users: {}".format(
        Avg_Rating,
        len(ratings_df),
        ratings_df['anime_id'].nunique(),
        ratings_df['user_id'].nunique(),
    )
)

Average Rating: 6.3713422487674745 
Number of Ratings: 49095534 
Unique Anime: 1501
Unique Users:  241288


In [6]:
# Small user x anime grid built from the 20 users with the most ratings and
# the 20 anime with the most ratings.

# Rows are selected with isin() on the top-20 ids. Joining the count Series
# instead (as before) added a helper column per join, and the second join
# left two columns both named 'rating_r'.
g = ratings_df.groupby('user_id')['rating'].count()
top_users = g.sort_values(ascending=False).head(20)
top_r = ratings_df[ratings_df['user_id'].isin(top_users.index)]

g = ratings_df.groupby('anime_id')['rating'].count()
top_animes = g.sort_values(ascending=False).head(20)
top_r = top_r[top_r['anime_id'].isin(top_animes.index)]

# aggfunc is given by name: passing the np.sum callable emits a FutureWarning
# on recent pandas. (user, anime) pairs with no row in top_r come out as NaN
# and are replaced with a rating of 0.
pivot = pd.crosstab(top_r.user_id, top_r.anime_id, top_r.rating, aggfunc="sum").fillna(0)

In [35]:
# Rating grid with one row per anime title (Name) and one column per user_id.
# pivot_table's default aggregation is the mean, so repeated (Name, user_id)
# pairs are averaged into a single cell.
pivottable = pd.pivot_table(ratings_df, values="rating", index="Name", columns="user_id")
# Pairs with no rating row are NaN after the pivot; store them as 0.
pivottable = pivottable.fillna(0)
pivottable

user_id,1,2,3,4,5,6,7,8,11,12,...,353389,353390,353391,353392,353393,353395,353396,353398,353400,353403
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Bungaku Shoujo"" Movie",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Sign,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
07-Ghost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11eyes,0.0,0.0,0.0,0.0,0.0,7.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
3-gatsu no Lion,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.0,7.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zutto Mae kara Suki deshita.: Kokuhaku Jikkou Iinkai,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0
ef: A Tale of Melodies.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ef: A Tale of Memories.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,9.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
xxxHOLiC,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Sparse (CSR) copy of the rating grid, same row order as pivottable.
matrix = csr_matrix(pivottable.to_numpy())

# Exhaustive ("brute") nearest-neighbour search using cosine distance,
# fitted on the rows of the sparse grid.
model = NearestNeighbors(algorithm="brute", metric="cosine").fit(matrix)

def predict(anime_name=None, n_recommendations=5):
    """Print the anime whose rating rows are nearest to one anime's row.

    Uses the notebook-level ``pivottable`` (rows labelled by anime name) and
    the fitted ``model``; nothing is returned, the result is printed.

    Parameters
    ----------
    anime_name : optional
        A label from ``pivottable.index``. When omitted, a row is drawn with
        ``np.random.choice``, so the output varies between calls unless the
        NumPy global seed is fixed.
    n_recommendations : int, default 5
        Number of neighbours to print.

    Raises
    ------
    ValueError
        If ``n_recommendations`` is smaller than 1.
    KeyError
        If ``anime_name`` is given but is not a label of ``pivottable.index``.
    """
    if n_recommendations < 1:
        raise ValueError("n_recommendations must be at least 1")

    if anime_name is None:
        row = np.random.choice(pivottable.shape[0])
    else:
        if anime_name not in pivottable.index:
            raise KeyError("{0!r} is not a row of pivottable".format(anime_name))
        row = pivottable.index.get_loc(anime_name)

    query = pivottable.iloc[row, :].values.reshape(1, -1)

    # The query row is part of the fitted data, so request one extra neighbour
    # to make room for it, but never more neighbours than there are rows.
    n_neighbors = min(n_recommendations + 1, pivottable.shape[0])
    distance, suggestions = model.kneighbors(query, n_neighbors=n_neighbors)
    distance = distance.ravel()
    suggestions = suggestions.ravel()

    # Exclude the query by its row position instead of assuming it is always
    # returned first (rows at equal distance can be ordered either way).
    neighbours = [
        (dist, idx) for dist, idx in zip(distance, suggestions) if idx != row
    ][:n_recommendations]

    print('Recommendations for {0}:\n'.format(pivottable.index[row]))
    for rank, (dist, idx) in enumerate(neighbours, start=1):
        print('{0}: {1}, with distance of {2}:'.format(rank, pivottable.index[idx], dist))


# Demo call: no arguments, so a random anime is used and five neighbours are printed.
predict()

Recommendations for Ansatsu Kyoushitsu:

1: Ansatsu Kyoushitsu 2nd Season, with distance of 0.15730932756257865:
2: Boku no Hero Academia, with distance of 0.36219278468446714:
3: One Punch Man, with distance of 0.3728888275011487:
4: No Game No Life, with distance of 0.3854779686010765:
5: Noragami, with distance of 0.39195573425933306:
