## A Recommendation System for Anime, Built Using Collaborative Filtering

In [1]:
import operator
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline

In [2]:
# The datasets are loaded with pandas: `anime` holds one row of metadata per
# title and `rating` holds one row per (user_id, anime_id) rating.
anime = pd.read_csv('../input/anime.csv')
rating = pd.read_csv('../input/rating.csv')

In [3]:
# (rows, columns) of the anime table: there are just over 12,000 titles.
anime.shape

(12294, 7)

In [4]:
# (rows, columns) of the rating table: there are over 7 million rating rows.
rating.shape

(7813737, 3)

In [5]:
# The anime dataset lists every title with its features: genre, type, number of
# episodes, average rating and member count.
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [6]:
# The rating dataset contains every user rating. A rating of -1 means that the
# user has not rated the title; it is replaced by NaN in the next cell.
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [7]:
# Replace the -1 "not rated" sentinel with NaN so that it is ignored by the
# averages computed later. Series.mask builds a new float column with NaN where
# the condition holds, which avoids the `np.NaN` alias (removed in NumPy 2.0)
# and avoids writing NaN in place into an integer column.
rating['rating'] = rating['rating'].mask(rating['rating'] == -1)
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


In [8]:
# Join every rating row to its anime metadata on anime_id. Both frames have a
# `rating` column, so the one coming from the rating frame (the user's score)
# gets the suffix `_abc` to differentiate it, while the anime frame's average
# rating keeps its name.
combined = pd.merge(
    rating,
    anime,
    on='anime_id',
    suffixes=('_abc', ''),
)

In [9]:
# Number of distinct user ids present in the merged frame.
combined['user_id'].nunique()

73515

In [10]:
# The user's own score column is renamed from rating_abc to the more
# recognisable user_rating, so it is not confused with the anime's average
# `rating` column.
combined = combined.rename(columns={'rating_abc': 'user_rating'})

In [11]:
# Preview the merged frame, now with the user_rating column.
combined.head()

Unnamed: 0,user_id,anime_id,user_rating,name,genre,type,episodes,rating,members
0,1,20,,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
1,3,20,8.0,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
2,5,20,6.0,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
3,6,20,,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
4,10,20,,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297


In [12]:
# Only the user ratings are kept, not the item features, because this is a
# purely collaborative-filtering model. For computational reasons the algorithm
# is run only on rows whose user id is at most 20000.
combined = combined.loc[:, ['user_id', 'name', 'user_rating']]

combined_small = combined.loc[combined['user_id'] <= 20000]
combined_small.head()

Unnamed: 0,user_id,name,user_rating
0,1,Naruto,
1,3,Naruto,8.0
2,5,Naruto,6.0
3,6,Naruto,
4,10,Naruto,


In [13]:
# (rows, columns) of the reduced frame that is used from here on.
combined_small.shape

(2065588, 3)

In [14]:
# Build the user x anime rating matrix: one row per user_id and one column per
# anime name. Cells without a rating are NaN; if a (user_id, name) pair occurs
# more than once its ratings are averaged (pivot_table's default aggregation).
collab = pd.pivot_table(
    combined_small,
    index='user_id',
    columns='name',
    values='user_rating',
    aggfunc='mean',
)

In [15]:
# Normalise each user's ratings: subtract that user's mean rating and divide by
# that user's rating range (max - min). Some users may be more generous in
# their ratings and some less generous; normalisation puts them on a common
# scale. The row statistics skip NaN, and the arithmetic is vectorised over the
# whole frame instead of calling a Python lambda once per user.
collab_normalize = (
    collab
    .sub(collab.mean(axis=1), axis=0)
    .div(collab.max(axis=1) - collab.min(axis=1), axis=0)
)

# Unrated cells become 0. So does every cell of a user whose ratings are all
# identical, because their range is 0 and 0/0 evaluates to NaN.
collab_normalize = collab_normalize.fillna(0)

# Transpose to anime x user, then remove the users whose column is entirely
# zero, i.e. users left without any usable normalised rating.
collab_normalize = collab_normalize.T
collab_normalize = collab_normalize.loc[:, (collab_normalize != 0).any(axis=0)]

In [16]:
# The anime x user matrix is sparse, i.e. most of its entries are zero, so it is
# converted to SciPy's compressed sparse row (CSR) format, which stores only
# the non-zero values.
collab_sparse = sp.sparse.csr_matrix(collab_normalize.to_numpy())

In [17]:
# Show the sparse matrix summary: rows are anime names and columns are users,
# because the normalised frame was transposed before the conversion.
collab_sparse

<9013x17353 sparse matrix of type '<class 'numpy.float64'>'
	with 1660773 stored elements in Compressed Sparse Row format>

Cosine similarity is computed to measure how alike two anime (or two users) are, based on their normalised rating vectors.

In [None]:
# cosine_similarity compares the rows of the matrix it is given.
# collab_sparse has one row per anime, so it yields the anime-to-anime (item)
# similarity; its transpose has one row per user, so it yields the
# user-to-user similarity. Both results are dense square arrays.

item_similarity = cosine_similarity(collab_sparse, dense_output=True)

user_similarity = cosine_similarity(collab_sparse.transpose(), dense_output=True)

In [None]:
# Wrap both similarity arrays in labelled DataFrames: the item matrix is
# labelled by anime name (the rows of collab_normalize) and the user matrix by
# user id (the columns of collab_normalize).

collab_item_sim = pd.DataFrame(
    data=item_similarity,
    index=collab_normalize.index,
    columns=collab_normalize.index,
)

collab_user_sim = pd.DataFrame(
    data=user_similarity,
    index=collab_normalize.columns,
    columns=collab_normalize.columns,
)

In [None]:
# Top 20 similar shows are displayed

def similar_movies(movie_names, top_n=20, item_sim=None):
    """Print the anime most similar to ``movie_names``.

    Parameters
    ----------
    movie_names : label
        Name of the anime to look up; it must be a column of the similarity
        frame, otherwise a ``KeyError`` is raised.
    top_n : int, default 20
        Number of similar titles to print.
    item_sim : pandas.DataFrame, optional
        Square item-to-item similarity frame labelled by anime name on both
        axes. Defaults to the notebook-level ``collab_item_sim``.

    Returns
    -------
    None
        The ranking is printed, one title per line.
    """
    if item_sim is None:
        item_sim = collab_item_sim

    # Remove the query title by label instead of skipping the first sorted row:
    # the title is not guaranteed to sort first (another title can tie with it,
    # and a title with an all-zero vector has self-similarity 0).
    scores = item_sim[movie_names].drop(labels=movie_names, errors='ignore')
    # Only the one needed column is sorted; a stable sort keeps ties in their
    # original order so the output is deterministic.
    closest = scores.sort_values(ascending=False, kind='mergesort').index[:top_n]

    print('Similar anime to {} are:\n'.format(movie_names))

    for count, item in enumerate(closest, start=1):
        print('Anime {}: {}'.format(count, item))

In [None]:
# Top 10 similar users are displayed 

def similar_users(sim_user, top_n=10, user_sim=None):
    """Print the users whose tastes are most similar to ``sim_user``.

    Parameters
    ----------
    sim_user : label
        Id of the user to look up; it must be a column of the similarity
        frame, otherwise a ``KeyError`` is raised.
    top_n : int, default 10
        Number of similar users to print.
    user_sim : pandas.DataFrame, optional
        Square user-to-user similarity frame labelled by user id on both axes.
        Defaults to the notebook-level ``collab_user_sim``.

    Returns
    -------
    None
        Each similar user is printed with its similarity to two decimals.
    """
    if user_sim is None:
        user_sim = collab_user_sim

    print('Users with similar tastes:\n')

    # Sort the single relevant column once. The queried user is removed by
    # label rather than by position, since it is not guaranteed to sort first.
    closest = (
        user_sim[sim_user]
        .drop(labels=sim_user, errors='ignore')
        .sort_values(ascending=False, kind='mergesort')
        .head(top_n)
    )

    for other_user, sim in closest.items():
        print('Other users #{0}, How Similar: {1:.2f}'.format(other_user, sim))

In [None]:
# This function calculates the weighted average of similar users
# to determine a potential rating for an input user and show

def user_anime_rating(anime, user, n_neighbors=999, user_sim=None, ratings=None):
    """Predict the rating ``user`` would give ``anime``.

    The prediction is the similarity-weighted average of the ratings that the
    most similar users gave to ``anime``.

    Parameters
    ----------
    anime : label
        Column of the rating matrix (anime name) to predict.
    user : label
        Id of the user to predict for; it must be a column of the similarity
        frame, otherwise a ``KeyError`` is raised.
    n_neighbors : int, default 999
        Maximum number of most-similar users to consider.
    user_sim : pandas.DataFrame, optional
        Square user-to-user similarity frame labelled by user id on both axes.
        Defaults to the notebook-level ``collab_user_sim``.
    ratings : pandas.DataFrame, optional
        User x anime matrix of raw ratings with NaN for "not rated". Defaults
        to the notebook-level ``collab``.

    Returns
    -------
    float
        The predicted rating, or NaN when none of the considered neighbours
        has rated ``anime``.
    """
    if user_sim is None:
        user_sim = collab_user_sim
    if ratings is None:
        ratings = collab

    # Sort the single relevant column once; drop the user by label because it
    # is not guaranteed to be the first row after sorting.
    neighbours = (
        user_sim[user]
        .drop(labels=user, errors='ignore')
        .sort_values(ascending=False, kind='mergesort')
        .head(n_neighbors)
    )
    # Keep only positively similar users: a zero or negative weight would make
    # the result no longer an average of the neighbours' ratings (it could fall
    # outside the rating range or divide by a near-zero total).
    neighbours = neighbours[neighbours > 0]

    weights = np.asarray(neighbours, dtype=float)
    scores = np.asarray(ratings.loc[neighbours.index, anime], dtype=float)

    # Ignore neighbours that have not rated this anime.
    rated = ~np.isnan(scores)
    if not rated.any():
        # No usable neighbour: report "unknown" instead of dividing by zero.
        return np.nan

    return float(np.dot(scores[rated], weights[rated]) / weights[rated].sum())

In [None]:
# Print the titles most similar to 'Fate/Zero'.
similar_movies('Fate/Zero')

In [None]:
# Print the users whose tastes are most similar to those of user id 10.
similar_users(10)

In [None]:
# Estimate the rating user id 11 would give to this title, from the ratings of
# similar users.
user_anime_rating('Code Geass: Hangyaku no Lelouch', 11)