Let's make  a hybrid recommendation system for anime

In [1]:
import pandas as pd
import numpy as np
from html import unescape
import warnings

warnings.filterwarnings('ignore')

Load the datasets

In [2]:
df1 = pd.read_csv("anime.csv")
df2 = pd.read_csv("rating.csv")

In [3]:
df1.head(3)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262


In [4]:
df2.head(3)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1


In [5]:
#remove html entity codes from name
df1['name'] = df1['name'].apply(lambda x: unescape(x))

# Content Based Recommendation

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

df1.dropna(subset=['genre'], inplace=True)

tfidf = TfidfVectorizer(stop_words='english')

#make a tfidf matrix based on genre to calculate cosine similarity
tfidf_matrix = tfidf.fit_transform(df1['genre'])

In [7]:
tfidf_matrix

<12232x46 sparse matrix of type '<class 'numpy.float64'>'
	with 40418 stored elements in Compressed Sparse Row format>

In [8]:
from sklearn.metrics.pairwise import linear_kernel

#calculate the cosine similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [9]:
cosine_sim

array([[1.        , 0.14778251, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.14778251, 1.        , 0.17849957, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.17849957, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ]])

In [10]:
# Make a Series with equal rows as cosine similarity matrix
# to search the animes with name
indices = pd.Series(df1.index, index=df1['name'])

In [11]:
indices

name
Kimi no Na wa.                                            0
Fullmetal Alchemist: Brotherhood                          1
Gintama°                                                  2
Steins;Gate                                               3
Gintama'                                                  4
                                                      ...  
Toushindai My Lover: Minami tai Mecha-Minami          12289
Under World                                           12290
Violence Gekiga David no Hoshi                        12291
Violence Gekiga Shin David no Hoshi: Inma Densetsu    12292
Yasuji no Pornorama: Yacchimae!!                      12293
Length: 12232, dtype: int64

In [12]:
def content_recommender(name):
    # Get the index of anime
    idx = indices[name]


    sim_scores = list(enumerate(cosine_sim[idx]))
    

    # Sort based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    

    sim_scores = sim_scores[1:11]


    anime_indices = [i[0] for i in sim_scores]

    return df1['name'].iloc[anime_indices]

In [13]:
content_recommender('Gintama')

4                                                 Gintama'
8        Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...
9                                      Gintama': Enchousen
12                                                 Gintama
63             Gintama: Yorinuki Gintama-san on Theater 2D
65                  Gintama Movie: Shinyaku Benizakura-hen
216                       Gintama: Shinyaku Benizakura-hen
306                       Gintama: Jump Festa 2014 Special
10896                                       Gintama (2017)
380      Gintama: Nanigoto mo Saiyo ga Kanjin nano de T...
Name: name, dtype: object

# Collaborative Filtering

In [14]:
#drop all animes that are not rated
df2.drop(df2[df2['rating']==-1].index, inplace=True)
df2.reset_index()

Unnamed: 0,index,user_id,anime_id,rating
0,47,1,8074,10
1,81,1,11617,10
2,83,1,11757,10
3,101,1,15451,10
4,153,2,11771,10
...,...,...,...,...
6337236,7813732,73515,16512,7
6337237,7813733,73515,17187,9
6337238,7813734,73515,22145,10
6337239,7813735,73516,790,9


In [15]:
grp = df2.groupby('user_id')

# drop all users who have rated less than 20 anime
flt = grp.filter(lambda x: x['rating'].count()>20)

In [16]:
df = pd.merge(df1, flt, on="anime_id", how="inner")

In [17]:
# drop ova music and special
df.drop(df[(df['type']=='OVA') | (df['type']=='Special') | (df['type']=='Music')].index, inplace=True)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5108690 entries, 0 to 6151804
Data columns (total 9 columns):
 #   Column    Dtype  
---  ------    -----  
 0   anime_id  int64  
 1   name      object 
 2   genre     object 
 3   type      object 
 4   episodes  object 
 5   rating_x  float64
 6   members   int64  
 7   user_id   int64  
 8   rating_y  int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 389.8+ MB


In [19]:
# change datatype to reduce memory usuage

df = df.astype({
    'anime_id': int,
    'user_id': int,
    'rating_y': float
})

In [20]:
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating_x,members,user_id,rating_y
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,99,5.0
1,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,152,10.0
2,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,244,10.0
3,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,271,10.0
4,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,322,10.0
...,...,...,...,...,...,...,...,...,...
6143743,5101,Doutei Kawaiya,Hentai,ONA,1,5.67,787,73188,5.0
6149873,30663,Docchi mo Maid,"Action, Comedy, Ecchi, Yuri",ONA,1,5.15,223,12431,3.0
6149874,30663,Docchi mo Maid,"Action, Comedy, Ecchi, Yuri",ONA,1,5.15,223,53698,4.0
6151803,11141,Blue Seagull,"Action, Hentai",Movie,1,4.60,337,54819,2.0


In [22]:
# selecting only 3M rows because my PC doesnot support all the computation

to_pivot = df.head(3000000)

In [23]:
# Make a pivot table
pvt = pd.pivot_table(to_pivot, values='rating_y', index='name', columns='user_id')

In [24]:
to_pivot.set_index('name', inplace=True)

In [25]:
# drop rows that are all na
pvt.dropna(how='all', inplace=True)

In [26]:
# normalize the rating
pvt = pvt.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)

In [27]:
pvt.replace(np.NaN, 0, inplace=True)

In [28]:
pvt

user_id,3,5,7,11,12,14,17,18,19,21,...,73499,73500,73501,73502,73503,73504,73507,73510,73513,73515
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Bungaku Shoujo"" Movie",0.0,0.000000,0.0,0.0,0.0,0.018634,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.240856,0.000000,0.000000,0.000000,0.0,0.0,0.0
91 Days,0.0,0.000000,0.0,0.0,0.0,0.000000,0.001024,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.143882,0.000000,0.000000,0.0,0.0,0.0
AKB0048: Next Stage,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.226757,0.000000,0.000000,0.000000,0.0,0.0,0.0
Aa! Megami-sama! Movie,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.253410,0.000000,-0.079923,0.000000,0.0,0.0,0.0
Aa! Megami-sama!: Sorezore no Tsubasa,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.276097,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ef: A Tale of Melodies.,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.182820,0.0,0.0,0.071709,0.000000,0.000000,0.000000,0.0,0.0,0.0
ef: A Tale of Memories.,0.0,0.000000,0.0,0.0,0.0,0.000000,-0.036349,0.0,0.0,0.0,...,0.185873,0.0,0.0,0.185873,0.000000,0.000000,0.000000,0.0,0.0,0.0
xxxHOLiC,0.0,-0.694271,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.194618,-0.027604,0.000000,0.194618,0.0,0.0,0.0
xxxHOLiC Kei,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.175341,-0.046881,0.000000,0.064230,0.0,0.0,0.0


In [29]:
from sklearn.metrics.pairwise import cosine_similarity

# calculate cosine similarity as it gave more accuracy than other algorithm
sim_cos = pd.DataFrame(cosine_similarity(pvt, pvt), index=pvt.index, columns=pvt.index)

In [30]:
def get_similar_anime_coll(anime_name):
    if anime_name not in pvt.index:
        return None, None
    else:
        sim_animes = sim_cos.sort_values(by=anime_name, ascending=False).index[1:11]
        sim_score = sim_cos.sort_values(by=anime_name, ascending=False).loc[:, anime_name].tolist()[1:11]
        return sim_animes, sim_score

In [31]:
def predict_rating_coll(user_id, anime_name, max_neighbor=10):
    animes, scores = get_similar_anime_coll(anime_name)
    anime_arr = np.array([x for x in animes])
    sim_arr = np.array([x for x in scores])
    
    # select only the anime that has already rated by user x
    filtering = pvt[user_id].loc[anime_arr] != 0

    # calculate the predicted score
    score = np.dot(sim_arr[filtering][:max_neighbor], pvt[user_id].loc[anime_arr[filtering][:max_neighbor]]) / np.sum(sim_arr[filtering][:max_neighbor])
    
    return score

In [32]:
def get_recommendation_coll(user_id, n_anime=10):
    predicted_rating = np.array([])
    

    for _anime in pvt.index:
        predicted_rating = np.append(predicted_rating, predict_rating_coll(user_id, _anime))
    
    # don't recommend something that user has already rated
    temp = pd.DataFrame({'predicted':predicted_rating, 'name':pvt.index})
    filtering = (pvt[user_id] == 0.0)
    temp = temp.loc[filtering.values].sort_values(by='predicted', ascending=False)
    
    
    return temp[1:11]

# Hybrid Filtering

In [33]:
def recommend_user(user_id, name):
    rslt = list(set(get_recommendation_coll(user_id)['name'].tolist() + content_recommender(name).tolist()))
    return rslt

In [34]:
recommend_user(25, 'Naruto')

['Byousoku 5 Centimeter',
 'Kyutai Panic Adventure!',
 'Naruto: Shippuuden Movie 4 - The Lost Tower',
 'Naruto: Shippuuden',
 'Nisemonogatari',
 'Naruto Shippuuden: Sunny Side Battle',
 'Naruto Soyokazeden Movie: Naruto to Mashin to Mitsu no Onegai Dattebayo!!',
 'Gintama',
 'Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono',
 'Gekkan Shoujo Nozaki-kun',
 'Saenai Heroine no Sodatekata',
 'Naruto',
 'Sakigake!! Cromartie Koukou',
 'Suzumiya Haruhi no Yuuutsu',
 'Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi',
 'Tsuritama',
 'Naruto x UT',
 'Yahari Ore no Seishun Love Comedy wa Machigatteiru. Zoku',
 'Naruto: Shippuuden Movie 6 - Road to Ninja',
 'Hanbun no Tsuki ga Noboru Sora']

In [35]:
def hybrid_recommend(name):
    rslt = list(set(get_similar_anime_coll(name)[0].tolist() + content_recommender(name).tolist()))
    return rslt

In [36]:
hybrid_recommend('Dragon Ball Z')

['Dragon Ball GT: Goku Gaiden! Yuuki no Akashi wa Suushinchuu',
 'Saint Seiya',
 'InuYasha',
 'Digimon Adventure',
 'Dragon Ball Z Movie 15: Fukkatsu no F',
 'Dragon Ball Kai',
 'Dragon Ball Z: Summer Vacation Special',
 'Naruto',
 'Dragon Ball Z: Atsumare! Gokuu World',
 'Dragon Ball',
 'Bleach',
 'Dragon Ball Z Movie 12: Fukkatsu no Fusion!! Gokuu to Vegeta',
 'Dragon Ball Z Movie 14: Kami to Kami',
 'Yuu☆Yuu☆Hakusho',
 'Sword Art Online',
 'Dragon Ball Z Movie 11: Super Senshi Gekiha!! Katsu no wa Ore da',
 'Dragon Ball Super',
 'Dragon Ball Kai (2014)']

In [41]:
# since we are only using 3M rows in this method we cannot get all the results
get_similar_anime_coll('One Punch Man')[0]

Index(['Boku no Hero Academia', 'Shokugeki no Souma',
       'Boku dake ga Inai Machi', 'No Game No Life', 'Overlord',
       'Noragami Aragoto', 'Kiseijuu: Sei no Kakuritsu', 'Tokyo Ghoul',
       'Mob Psycho 100', 'Nanatsu no Taizai'],
      dtype='object', name='name')

In [42]:
# collaborative filtering
content_recommender('One Punch Man')

754                        One Punch Man Specials
770                   One Punch Man: Road to Hero
10897                             One Punch Man 2
447               Tentai Senshi Sunred 2nd Season
580                          Tentai Senshi Sunred
4471           Tentai Senshi Sunred: Short Corner
557                                      Gungrave
7363     Sentou Yousei Shoujo Tasukete! Mave-chan
6995                   Ippatsu Hicchuu!! Devander
4785                                    Tokyo ESP
Name: name, dtype: object