In [1]:
#Import libraries
import numpy as np
import pandas as pd
import warnings
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings('ignore')

In [2]:
anime = pd.read_csv("/Users/adarshcj/Downloads/anime1.csv")
anime.head()

Unnamed: 0,anime_id,name,score,genres,type,episodes,members
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,1251960
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1,273145
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,26,558913
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",TV,26,94683
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",TV,52,13224


In [3]:
#Select TV shows and movies
print(anime.shape)
anime = anime[(anime['type'] == 'TV') | (anime['type'] == 'Movie')]
print(anime.shape)

(17562, 7)
(8037, 7)


In [4]:
#Select popular animes(75% percentile)
m = anime['members'].quantile(0.75)
anime = anime[(anime['members'] >= m)]
anime.shape

(2010, 7)

In [5]:
rating = pd.read_csv("/Users/adarshcj/Downloads/rating.csv")
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7
3,0,570,7
4,0,2762,9


In [6]:
#Replace missing rating values with NaN
rating.loc[rating.rating == -1, 'rating'] = np.NaN
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,0,430,9.0
1,0,1004,5.0
2,0,3010,7.0
3,0,570,7.0
4,0,2762,9.0


In [7]:
#Create an anime index for names
anime_index = pd.Series(anime.index, index=anime.name)
anime_index.head()

name
Cowboy Bebop                       0
Cowboy Bebop: Tengoku no Tobira    1
Trigun                             2
Witch Hunter Robin                 3
Eyeshield 21                       5
dtype: int64

In [8]:
#Join data
joined = anime.merge(rating, how='inner', on='anime_id')
joined.head()

Unnamed: 0,anime_id,name,score,genres,type,episodes,members,user_id,rating
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,1251960,3,9.0
1,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,1251960,6,6.0
2,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,1251960,14,9.0
3,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,1251960,19,8.0
4,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,1251960,22,9.0


In [9]:
#Create a pivot table
joined = joined[['user_id', 'name', 'rating']]

#Can only use 10,000 users cause of memory error.
joined = joined[(joined['user_id'] <= 10000)]

pivot = pd.pivot_table(joined, index='user_id', columns='name', values='rating')
pivot.head()

name,"""Bungaku Shoujo"" Movie",.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,07-Ghost,100-man no Inochi no Ue ni Ore wa Tatteiru,11eyes,18if,3-gatsu no Lion,3-gatsu no Lion 2nd Season,...,Zoku Sayonara Zetsubou Sensei,Zombie-Loan,Zombieland Saga,Zutto Mae kara Suki deshita.: Kokuhaku Jikkou Iinkai,ef: A Tale of Melodies.,ef: A Tale of Memories.,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,9.0,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,8.0,9.0,


In [10]:
pivot.shape

(8673, 1954)

In [11]:
#Drop users who never rate an anime
pivot.dropna(axis=0, how='all', inplace=True)
pivot.head()

name,"""Bungaku Shoujo"" Movie",.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,07-Ghost,100-man no Inochi no Ue ni Ore wa Tatteiru,11eyes,18if,3-gatsu no Lion,3-gatsu no Lion 2nd Season,...,Zoku Sayonara Zetsubou Sensei,Zombie-Loan,Zombieland Saga,Zutto Mae kara Suki deshita.: Kokuhaku Jikkou Iinkai,ef: A Tale of Melodies.,ef: A Tale of Memories.,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,9.0,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,8.0,9.0,


In [12]:
pivot.shape

(8673, 1954)

In [13]:
#Center mean around 0(Centered Cosine)
pivot_norm = pivot.apply(lambda x: x - np.nanmean(x), axis=1)
pivot_norm.head()

name,"""Bungaku Shoujo"" Movie",.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,07-Ghost,100-man no Inochi no Ue ni Ore wa Tatteiru,11eyes,18if,3-gatsu no Lion,3-gatsu no Lion 2nd Season,...,Zoku Sayonara Zetsubou Sensei,Zombie-Loan,Zombieland Saga,Zutto Mae kara Suki deshita.: Kokuhaku Jikkou Iinkai,ef: A Tale of Melodies.,ef: A Tale of Memories.,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,1.401361,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,0.1375,1.1375,


In [14]:
# User based Collabrative Filtering
pivot_norm.fillna(0, inplace=True)
pivot_norm.head()

name,"""Bungaku Shoujo"" Movie",.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,07-Ghost,100-man no Inochi no Ue ni Ore wa Tatteiru,11eyes,18if,3-gatsu no Lion,3-gatsu no Lion 2nd Season,...,Zoku Sayonara Zetsubou Sensei,Zombie-Loan,Zombieland Saga,Zutto Mae kara Suki deshita.: Kokuhaku Jikkou Iinkai,ef: A Tale of Melodies.,ef: A Tale of Memories.,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.401361,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1375,1.1375,0.0


In [15]:
#Calculate similar users by converting them into dataframes
user_sim_df = pd.DataFrame(cosine_similarity(pivot_norm, pivot_norm), index=pivot_norm.index, columns=pivot_norm.index)
user_sim_df.head()

user_id,0,1,2,3,4,5,6,7,8,10,...,9990,9991,9992,9993,9995,9996,9997,9998,9999,10000
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.005772,0.0,-0.007149,-0.021861,0.004904,0.086745,0.010771,0.091362,0.0,...,0.012699,0.019713,0.119738,0.0,0.119656,0.000448,0.0,0.0,0.015176,0.025548
1,0.005772,1.0,0.103517,0.03669,0.056602,0.017711,0.072101,0.054966,-0.023688,0.0,...,0.01161,0.072485,0.012098,-0.035506,-0.021583,0.022336,0.038377,0.073327,0.079093,0.089206
2,0.0,0.103517,1.0,0.04299,0.125411,-0.016409,-0.005229,0.037063,0.009426,0.0,...,0.041666,0.116486,-0.006373,-0.013906,-0.012944,-0.020534,0.013796,-0.011892,0.062205,0.063557
3,-0.007149,0.03669,0.04299,1.0,0.039001,0.032756,0.046576,0.083872,0.037338,0.0,...,0.084349,0.087704,0.082848,-0.009568,0.002855,0.015323,-0.006531,0.024581,0.134855,0.065731
4,-0.021861,0.056602,0.125411,0.039001,1.0,0.064478,-0.012656,-0.000178,-0.022087,0.0,...,-0.004264,0.037178,-0.000876,0.027622,-0.049751,0.023748,0.0,0.003424,0.092057,-0.030897


In [16]:
def get_similar_user(user_id):
    if user_id not in pivot_norm.index:
        return None, None
    else:
        sim_users = user_sim_df.sort_values(by=user_id, ascending=False).index[1:]
        sim_score = user_sim_df.sort_values(by=user_id, ascending=False).loc[:, user_id].tolist()[1:]
        return sim_users, sim_score

In [17]:
users, score = get_similar_user(3)
for x,y in zip(users[:10], score[:10]):
    print("User {} with similarity of {}".format(x, y))

User 8314 with similarity of 0.3074054391455056
User 5658 with similarity of 0.27297164194902696
User 321 with similarity of 0.2721364099203867
User 469 with similarity of 0.258194229797845
User 5653 with similarity of 0.25701042191293516
User 2976 with similarity of 0.2542297079378905
User 5049 with similarity of 0.25277778944383084
User 9479 with similarity of 0.25220814088995436
User 7865 with similarity of 0.24776761257482074
User 7032 with similarity of 0.24455574017088355


In [18]:
#Getting Recommendations
def get_recommendation(user_id, n_anime=10):
    users, scores = get_similar_user(user_id)
    
    #There's no information for this user
    if users is None or score is None:
        return None
    
    #Only take 10 nearest users
    user_arr = np.array([x for x in users[:10]])
    sim_arr = np.array([x for x in scores[:10]])
    predicted_rating = np.array([])
    
    for anime_name in pivot_norm.columns:
        filtering = pivot_norm[anime_name].loc[user_arr] != 0.0  
        temp = np.dot(pivot[anime_name].loc[user_arr[filtering]], sim_arr[filtering]) / np.sum(sim_arr[filtering])
        predicted_rating = np.append(predicted_rating, temp)
    
    #Do not recommend something that user has already rated
    temp = pd.DataFrame({'predicted':predicted_rating, 'name':pivot_norm.columns})
    filtering = (pivot_norm.loc[user_id] == 0.0)
    temp = temp.loc[filtering.values].sort_values(by='predicted', ascending=False)

    #Recommend n_anime anime
    return anime.loc[anime_index.loc[temp.name[:n_anime]]]

In [19]:
get_recommendation(2)

Unnamed: 0,anime_id,name,score,genres,type,episodes,members
6607,11741,Fate/Zero 2nd Season,8.59,"Action, Supernatural, Magic, Fantasy",TV,12,808294
5856,9656,Kimi ni Todoke 2nd Season,8.01,"Slice of Life, Drama, Romance, School, Shoujo",TV,12,334486
4273,5680,K-On!,7.84,"Music, Slice of Life, Comedy, School",TV,13,776322
72,93,Mobile Suit Gundam SEED,7.79,"Action, Drama, Mecha, Military, Romance, Sci-F...",TV,50,140346
770,853,Ouran Koukou Host Club,8.2,"Comedy, Harem, Romance, School, Shoujo",TV,26,827960
7013,13601,Psycho-Pass,8.38,"Action, Sci-Fi, Police, Psychological",TV,22,1211503
187,210,Ranma ½,7.76,"Slice of Life, Comedy, Martial Arts, Fantasy, ...",TV,161,194975
1654,1818,Claymore,7.78,"Action, Adventure, Super Power, Demons, Supern...",TV,26,544135
182,205,Samurai Champloo,8.5,"Action, Adventure, Comedy, Historical, Samurai...",TV,26,892196
1415,1559,Shijou Saikyou no Deshi Kenichi,8.11,"Action, Comedy, Martial Arts, School, Shounen",TV,50,242355


In [20]:
get_recommendation(10)

Unnamed: 0,anime_id,name,score,genres,type,episodes,members
4584,6408,"""Bungaku Shoujo"" Movie",7.41,"Mystery, Drama, Romance, School",Movie,1,57895
788,873,.hack//Roots,6.91,"Adventure, Drama, Fantasy, Game, Sci-Fi",TV,26,66696
29,48,.hack//Sign,6.98,"Game, Sci-Fi, Adventure, Mystery, Magic, Fantasy",TV,26,158227
274,298,.hack//Tasogare no Udewa Densetsu,6.6,"Adventure, Comedy, Fantasy, Game, Sci-Fi, Shounen",TV,12,60523
4168,5525,07-Ghost,7.24,"Action, Demons, Fantasy, Josei, Magic, Military",TV,25,183988
16625,41380,100-man no Inochi no Ue ni Ore wa Tatteiru,6.45,"Action, Game, Drama, Fantasy, Shounen",TV,12,132579
4693,6682,11eyes,6.16,"Action, Ecchi, Super Power, Supernatural",TV,12,207294
12934,35248,18if,6.17,"Mystery, Supernatural",TV,13,46598
11046,31646,3-gatsu no Lion,8.43,"Drama, Game, Seinen, Slice of Life",TV,22,461713
12898,35180,3-gatsu no Lion 2nd Season,9.0,"Drama, Game, Seinen, Slice of Life",TV,22,266243


In [21]:
get_recommendation(20)

Unnamed: 0,anime_id,name,score,genres,type,episodes,members
5683,9253,Steins;Gate,9.11,"Thriller, Sci-Fi",TV,24,1771162
11857,33352,Violet Evergarden,8.64,"Slice of Life, Drama, Fantasy",TV,13,1039300
148,170,Slam Dunk,8.53,"Comedy, Drama, School, Shounen, Sports",TV,101,201203
4970,7311,Suzumiya Haruhi no Shoushitsu,8.65,"Comedy, Mystery, Romance, School, Sci-Fi, Supe...",Movie,1,486871
9648,27833,Durarara!!x2 Ketsu,8.1,"Action, Mystery, Supernatural",TV,12,297451
4728,6746,Durarara!!,8.18,"Action, Mystery, Supernatural",TV,24,1111256
10569,30484,Steins;Gate 0,8.51,"Sci-Fi, Thriller",TV,23,590847
6201,10408,Hotarubi no Mori e,8.38,"Drama, Romance, Shoujo, Supernatural",Movie,1,567617
11234,32105,Sousei no Onmyouji,7.32,"Action, Supernatural, Romance, Fantasy, Shounen",TV,50,375693
202,226,Elfen Lied,7.56,"Action, Horror, Psychological, Supernatural, D...",TV,13,1187921
