In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
import operator
%matplotlib inline

In [2]:
# Load the anime catalogue and the per-user ratings from the local input directory
anime, rating = (
    pd.read_csv(csv_path)
    for csv_path in ('./input/anime.csv', './input/rating.csv')
)

In [3]:
# Turn the -1 placeholder into NaN so it does not distort rating averages
# (presumably -1 marks "no score given" -- confirm against the dataset docs).
# Assign the result back instead of calling replace(inplace=True) on the
# column accessor: with pandas Copy-on-Write that chained in-place call
# operates on a temporary and leaves `rating` unchanged.
rating['rating'] = rating['rating'].replace(-1, np.nan)
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


In [4]:
# Restrict the catalogue to entries whose type is 'TV'
anime_TV = anime.loc[anime['type'] == 'TV']
anime_TV.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351


In [5]:
# Join every rating row to its TV show. Both tables have a `rating` column:
# the user's score picks up the '_user' suffix in the merge and is then
# renamed to `user_rating`; the catalogue's own `rating` keeps its name.
merged = (
    pd.merge(rating, anime_TV, on='anime_id', suffixes=['_user', ''])
    .rename(columns={'rating_user': 'user_rating'})
)

In [6]:
# Keep only the columns needed downstream, then limit memory use by
# restricting the data to user ids below 10000
merged = merged.loc[:, ['user_id', 'name', 'user_rating']]
merged_sub = merged.loc[merged['user_id'] < 10000]
merged_sub.head()

Unnamed: 0,user_id,name,user_rating
0,1,Naruto,
1,3,Naruto,8.0
2,5,Naruto,6.0
3,6,Naruto,
4,10,Naruto,


In [7]:
# User x show matrix of ratings (one row per user_id, one column per show name).
# `values` is passed as a list, so the resulting columns are a two-level
# ('user_rating', <name>) MultiIndex.
piv = pd.pivot_table(
    merged_sub,
    values=['user_rating'],
    index=['user_id'],
    columns=['name'],
)

In [96]:
# Flatten the ('user_rating', <name>) column MultiIndex down to the show name.
# The last level is taken by position, which is also valid on an already-flat
# index, so re-running this cell is harmless (unpacking each label as a pair
# would fail or mangle names on a second run).
piv.columns = piv.columns.get_level_values(-1).tolist()
print(piv.shape)
piv.head()

(9386, 2708)


Unnamed: 0_level_0,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,3 Choume no Tama: Uchi no Tama Shirimasenka?,30-sai no Hoken Taiiku,91 Days,...,"Zone of the Enders: Dolores, I",Zukkoke Knight: Don De La Mancha,ef: A Tale of Melodies.,ef: A Tale of Memories.,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,2.0,
7,,,,,,,,,,,...,,,,,,,,,,


In [97]:
# Normalise each user's ratings: subtract the user's mean rating and divide by
# the user's rating range (max - min). Done with vectorised row-aligned
# arithmetic instead of a Python-level apply over every user row.
user_mean = piv.mean(axis=1)
user_range = piv.max(axis=1) - piv.min(axis=1)
piv_norm = piv.sub(user_mean, axis=0).div(user_range, axis=0)

# Unrated entries (NaN) and users whose ratings are all equal (0/0 -> NaN)
# become 0; then transpose so rows are shows and columns are users.
piv_norm = piv_norm.fillna(0).T

# Drop users whose column is entirely zero -- they carry no signal.
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]

In [98]:
# Store the normalised show x user matrix in compressed sparse row (CSR) form
# for the similarity computation that follows
piv_sparse = sp.sparse.csr_matrix(piv_norm.to_numpy())

In [99]:
# Pairwise cosine similarity between rows of each input:
#   piv_sparse   (rows = shows) -> item/item similarity
#   piv_sparse.T (rows = users) -> user/user similarity
item_similarity, user_similarity = (
    cosine_similarity(matrix) for matrix in (piv_sparse, piv_sparse.T)
)

In [100]:
# Wrap the raw similarity arrays in labelled frames: shows are the rows of
# piv_norm, users are its columns
user_sim_df = pd.DataFrame(
    data=user_similarity, index=piv_norm.columns, columns=piv_norm.columns
)
item_sim_df = pd.DataFrame(
    data=item_similarity, index=piv_norm.index, columns=piv_norm.index
)

In [108]:
# Print the TV shows most similar to a given show (top 10 by default)
def top_animes(anime_name, n=10):
    """Print the ``n`` shows most similar to ``anime_name``.

    Similarities are read from the module-level ``item_sim_df`` frame.

    Args:
        anime_name: Title of the show; must be a label of ``item_sim_df``.
        n: How many similar shows to print (default 10).

    Returns:
        ``None`` after printing the ranking, or a "No data available"
        message string when ``anime_name`` is unknown.
    """
    if anime_name not in item_sim_df.columns:
        return "No data available on {}".format(anime_name)

    # Remove the queried show by label rather than assuming it sorts to
    # position 0: another show with an identical rating profile ties at the
    # top and could otherwise push the queried show into its own ranking.
    similar = item_sim_df[anime_name].drop(anime_name).sort_values(ascending=False)

    print("Similar shows to {} include:\n".format(anime_name))
    for rank, title in enumerate(similar.index[:n], start=1):
        print("No. {}: {}".format(rank, title))

In [109]:
# Example: list the shows most similar to 'Fate/Zero'
top_animes('Fate/Zero')

Similar shows to Fate/Zero include:

No. 1: Fate/Zero 2nd Season
No. 2: Fate/stay night: Unlimited Blade Works
No. 3: Fate/stay night: Unlimited Blade Works 2nd Season
No. 4: Steins;Gate
No. 5: Fullmetal Alchemist: Brotherhood
No. 6: Psycho-Pass
No. 7: Code Geass: Hangyaku no Lelouch
No. 8: Code Geass: Hangyaku no Lelouch R2
No. 9: Monogatari Series: Second Season
No. 10: Mahou Shoujo Madoka★Magica


In [110]:
# Print the users most similar to a given user (top 10 by default)
def top_users(user, n=10):
    """Print the ``n`` users most similar to ``user`` with their similarity.

    Similarities are read from the module-level ``user_sim_df`` frame.

    Args:
        user: User id; must be a label of ``user_sim_df``.
        n: How many similar users to print (default 10).

    Returns:
        ``None`` after printing, or a "No data available" message string
        when ``user`` is unknown.
    """
    if user not in user_sim_df.columns:
        return "No data available on user {}".format(user)

    print("Most Similar Users:\n")
    # Sort once, and drop the queried user by label: a user with an identical
    # rating profile ties at the top, so position 0 is not guaranteed to be
    # the queried user.
    neighbours = user_sim_df[user].drop(user).sort_values(ascending=False).head(n)
    for other_user, sim in neighbours.items():
        print("User #{0}, Similarity value: {1:.2f}".format(other_user, sim))

In [111]:
# Example: show the users most similar to user 3
top_users(3)

Most Similar Users:

User #2986, Similarity value: 0.37
User #2411, Similarity value: 0.36
User #3681, Similarity value: 0.36
User #656, Similarity value: 0.35
User #298, Similarity value: 0.34
User #3028, Similarity value: 0.34
User #8436, Similarity value: 0.33
User #2038, Similarity value: 0.33
User #2374, Similarity value: 0.33
User #4233, Similarity value: 0.33


In [114]:
# Collect the highest-rated TV shows of each similar user and return the
# shows that appear most often, together with how often they appear
def similar_user_recs(user, n_users=10, n_recs=5):
    """Return the shows most often top-rated by the users most similar to ``user``.

    For each of the ``n_users`` most similar users (from ``user_sim_df``),
    every show holding that user's maximum normalised score in ``piv_norm``
    counts once.

    Args:
        user: User id; must be a column of ``piv_norm``.
        n_users: Number of similar users to consult (default 10).
        n_recs: Number of (show, count) pairs to return (default 5).

    Returns:
        A list of ``(show_name, count)`` tuples sorted by descending count,
        or a "No data available" message string when ``user`` is unknown.
    """
    if user not in piv_norm.columns:
        return "No data available on user {}".format(user)

    # Drop the queried user by label instead of skipping position 0: a user
    # with an identical rating profile ties at the top, so the queried user
    # is not guaranteed to sort first.
    sim_users = (
        user_sim_df[user].drop(user).sort_values(ascending=False).index[:n_users]
    )

    counts = {}
    for neighbour in sim_users:
        scores = piv_norm.loc[:, neighbour]
        # A user may have several shows tied at their maximum score.
        for title in scores.index[scores == scores.max()]:
            counts[title] = counts.get(title, 0) + 1

    ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[:n_recs]

In [115]:
# Example: shows most often top-rated by the users most similar to user 3
similar_user_recs(3)

[('Boku dake ga Inai Machi', 4),
 ('Shingeki no Kyojin', 4),
 ('Steins;Gate', 4),
 ('Fullmetal Alchemist: Brotherhood', 4),
 ('Clannad: After Story', 3)]

In [130]:
# Predict a user's rating for a show as the similarity-weighted average of the
# ratings given by that user's most similar users
def predict_rating(anime_name, user, n_neighbors=999):
    """Estimate the rating ``user`` would give ``anime_name``.

    The estimate is the weighted mean of the raw ratings (from ``piv``) given
    by the ``n_neighbors`` most similar users, weighted by their similarity
    to ``user`` (from ``user_sim_df``).

    Args:
        anime_name: Show title; must be a column of ``piv``.
        user: User id; must be a label of ``user_sim_df``.
        n_neighbors: Number of most-similar users to consider (default 999).

    Returns:
        The predicted rating as a float, or ``nan`` when none of the
        considered neighbours rated the show.

    Raises:
        KeyError: If ``user`` or ``anime_name`` is unknown.
    """
    if user not in user_sim_df.columns:
        raise KeyError("No data available on user {}".format(user))
    if anime_name not in piv.columns:
        raise KeyError("No data available on {}".format(anime_name))

    # Sort once, and exclude the user by label rather than by position.
    neighbours = (
        user_sim_df[user].drop(user).sort_values(ascending=False).head(n_neighbors)
    )
    # Only positively similar users may vote: a negative similarity would act
    # as a negative weight and can push the average outside the rating scale.
    neighbours = neighbours[neighbours > 0]

    # Keep the neighbours that actually rated this show.
    ratings = piv.loc[neighbours.index, anime_name].dropna()
    weights = neighbours.loc[ratings.index]

    total_weight = weights.sum()
    if total_weight == 0:
        # Nobody usable rated the show: report "unknown" instead of dividing by zero.
        return np.nan
    return (ratings * weights).sum() / total_weight

In [138]:
# Example: estimate the rating user 3 would give 'Fate/Zero'
predict_rating('Fate/Zero', 3)

8.512946105165964