In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
import operator
%matplotlib inline

In [2]:
anime = pd.read_csv('anime.csv')         #from MAL datasets
rating = pd.read_csv('rating.csv')
#rating -1 for not available value - nan
rating.head()    #head returns first n rows where n=5 is default

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [3]:
anime.head()


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
merged_table = rating.merge(anime, left_on = 'anime_id', right_on = 'anime_id', suffixes= ['_user', ''])
#suffixes : 2-length sequence (tuple, list, ...)
#Suffix to apply to overlapping column names in the left and right side, respectively -- rating in both column
merged_table= merged_table[merged_table.user_id <=5000]
#to remove memory error
merged_table=merged_table[['user_id', 'name', 'rating_user']]
merged_table.head()


Unnamed: 0,user_id,name,rating_user
0,1,Naruto,-1
1,3,Naruto,8
2,5,Naruto,6
3,6,Naruto,-1
4,10,Naruto,-1


In [5]:
#this is for collaborative filtering. the p_table will help in defining the similarity between users and predict.
p_table = merged_table.pivot_table(index=['user_id'], columns=['name'], values='rating_user')
p_table.head()
# Memory error while working with lage datasets
#so we need to limit the data in above code

name,&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,...,gdgd Fairies Movie: tte Iu Eiga wa Dou kana...?,iDOLM@STER Xenoglossia,iDOLM@STER Xenoglossia Specials,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,2.0,,,,,


In [6]:
#this is for collaborative filtering. the p_table will help in defining the similarity between users and predict.
p_table = merged_table.pivot_table(index=['user_id'], columns=['name'], values='rating_user')
p_table.head()
# Memory error while working with lage datasets
#so we need to limit the data in above code

name,&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,...,gdgd Fairies Movie: tte Iu Eiga wa Dou kana...?,iDOLM@STER Xenoglossia,iDOLM@STER Xenoglossia Specials,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,2.0,,,,,


In [7]:
# subtract the mean from each rating to standardize and normalize 
#All users with only one rating or who had rated everything the same will be dropped

p_norm = p_table.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)

# Drop all columns containing only zeros -- users with 0 rating
p_norm.fillna(0, inplace=True)
p_norm = p_norm.T
p_norm = p_norm.loc[:, (p_norm != 0).any(axis=0)]

# data in a sparse matrix format to be read by the following functions

p_sparse = sp.sparse.csr_matrix(p_norm.values)

#matrices show us the computed cosine similarity values between each user/user and item/item array pair.

item_similarity = cosine_similarity(p_sparse)
user_similarity = cosine_similarity(p_sparse.T)

#similarity matricies into dataframe objects:

item_sim_df = pd.DataFrame(item_similarity, index = p_norm.index, columns = p_norm.index)
user_sim_df = pd.DataFrame(user_similarity, index = p_norm.columns, columns = p_norm.columns)

In [13]:
# return the top 5 shows with the highest cosine similarity value

def top_animes(anime_name):
    count = 1
    print('Users who watch {} also like:\n'.format(anime_name))
    for item in item_sim_df.sort_values(by = anime_name, ascending = False).index[1:11]:
        print('{}'.format( item))
        count +=1  

# return the top 5 users with the highest similarity value 

def top_users(user):
    
    if user not in p_norm.columns:
        return('No data available on user {}'.format(user))
    
    print('Most Similar Users:\n')
    sim_values = user_sim_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:11]
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:11]
    zipped = zip(sim_users, sim_values,)
    for user, sim in zipped:
        print('User #{0}, Similarity value: {1:.2f}'.format(user, sim)) 

In [14]:
top_animes('Naruto')

Users who watch Naruto also like:

Bleach
Katekyo Hitman Reborn!
Fairy Tail
Groove Adventure Rave
Sword Gai
Dragon Ball Z
Beast Wars Neo
Dragon Ball GT
Yu☆Gi☆Oh!: Duel Monsters GX
Highschool of the Dead


In [15]:
top_animes('Noragami')

Users who watch Noragami also like:

Noragami Aragoto
Ano Hi Mita Hana no Namae wo Bokutachi wa Mada Shiranai.
Fate/stay night: Unlimited Blade Works
Barakamon
Haikyuu!!
Kyoukai no Kanata
Gin no Saji 2nd Season
Gekkan Shoujo Nozaki-kun
Kiseijuu: Sei no Kakuritsu
Hotarubi no Mori e


In [16]:
top_users(3)


Most Similar Users:

User #2277, Similarity value: 0.50
User #4647, Similarity value: 0.45
User #3225, Similarity value: 0.41
User #1038, Similarity value: 0.33
User #1406, Similarity value: 0.33
User #3028, Similarity value: 0.32
User #934, Similarity value: 0.31
User #4037, Similarity value: 0.31
User #4465, Similarity value: 0.25
User #3090, Similarity value: 0.24
