# 協調フィルタリング

## 参照
https://www.kaggle.com/ajmichelutti/collaborative-filtering-on-anime-data

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the anime catalogue and the per-user ratings from the local ./input directory.
anime = pd.read_csv("./input/anime.csv")
rating = pd.read_csv("./input/rating.csv")

In [3]:
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


## rating=-1は、未記入であるから、NaNに置き換える

In [4]:
# -1 marks a title the user has not scored; store it as NaN so it is treated
# as missing. The result is assigned back instead of calling
# replace(..., inplace=True) on the selected column: that form operates on an
# intermediate object and is not guaranteed to update `rating` itself.
rating["rating"] = rating["rating"].replace(-1, np.nan)

In [5]:
# Rename the per-user score column so it stays distinct from the `rating`
# column of the anime table once the two tables are merged.
rating.rename(columns={'rating': 'user_rating'}, inplace=True)

In [6]:
rating

Unnamed: 0,user_id,anime_id,user_rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,
...,...,...,...
7813732,73515,16512,7.0
7813733,73515,17187,9.0
7813734,73515,22145,10.0
7813735,73516,790,9.0


## TVだけを取り上げる

In [7]:
# Keep only catalogue rows whose `type` is 'TV', then preview the first rows.
anime_tv = anime[anime['type'] == 'TV']
anime_tv.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351


## merge

In [8]:
# Left join on anime_id: every rating row is kept, and ratings whose anime_id
# is not present in anime_tv get NaN in the catalogue columns.
merged = pd.merge(rating, anime_tv, on='anime_id', how='left')

In [9]:
merged.head()

Unnamed: 0,user_id,anime_id,user_rating,name,genre,type,episodes,rating,members
0,1,20,,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297.0
1,1,24,,School Rumble,"Comedy, Romance, School, Shounen",TV,26,8.06,178553.0
2,1,79,,Shuffle!,"Comedy, Drama, Ecchi, Fantasy, Harem, Magic, R...",TV,24,7.31,158772.0
3,1,226,,Elfen Lied,"Action, Drama, Horror, Psychological, Romance,...",TV,13,7.85,623511.0
4,1,241,,Girls Bravo: First Season,"Comedy, Ecchi, Fantasy, Harem, Romance, School",TV,11,6.69,84395.0


## user_idが10000以下の人だけ抽出

In [10]:
# Keep only the columns needed for the user x title matrix, then restrict the
# data to users with user_id <= 10000.
# NOTE(review): the subset presumably keeps the pivot table small enough to
# handle in memory -- confirm.
merged = merged[['user_id', 'name', 'user_rating']]
merged_sub = merged[merged.user_id <= 10000]

In [11]:
merged_sub.head()

Unnamed: 0,user_id,name,user_rating
0,1,Naruto,
1,1,School Rumble,
2,1,Shuffle!,
3,1,Elfen Lied,
4,1,Girls Bravo: First Season,


## pivot_table

In [12]:
# Build the user x title rating matrix (rows: user_id, columns: anime name).
# pivot_table aggregates with the mean by default, so repeated (user, name)
# pairs are averaged; pairs without a rating are NaN.
piv = merged_sub.pivot_table(index=['user_id'], columns=['name'], values='user_rating')

In [13]:
piv

name,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,3 Choume no Tama: Uchi no Tama Shirimasenka?,30-sai no Hoken Taiiku,91 Days,...,"Zone of the Enders: Dolores, I",Zukkoke Knight: Don De La Mancha,ef: A Tale of Melodies.,ef: A Tale of Memories.,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,2.0,
7,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,,,,,,,,,,,...,,,,,,,,,,
9997,,,,,6.0,,,,,,...,,,,,,,,,,
9998,,,,,,,,,,8.0,...,,,,,,,,,,
9999,,,,,,,,,,,...,,,,,,,,,,


## ピボットテーブルを正規化(ユーザーごとに平均を引き、最大値と最小値の差で割る)

In [14]:
# Normalise each user's ratings (one row per user): subtract the user's mean
# rating and divide by the user's rating range (max - min). The row-wise
# reductions skip NaN, so unrated titles stay NaN. A user whose ratings are
# all equal yields 0/0 = NaN for every rated title.
# Vectorised equivalent of applying the same formula row by row.
piv_norm = piv.sub(piv.mean(axis=1), axis=0).div(
    piv.max(axis=1) - piv.min(axis=1), axis=0
)

In [15]:
piv_norm

name,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,3 Choume no Tama: Uchi no Tama Shirimasenka?,30-sai no Hoken Taiiku,91 Days,...,"Zone of the Enders: Dolores, I",Zukkoke Knight: Don De La Mancha,ef: A Tale of Melodies.,ef: A Tale of Memories.,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,-0.251132,
7,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,,,,,,,,,,,...,,,,,,,,,,
9997,,,,,-0.519231,,,,,,...,,,,,,,,,,
9998,,,,,,,,,,-0.078431,...,,,,,,,,,,
9999,,,,,,,,,,,...,,,,,,,,,,


In [16]:
# Treat missing entries as 0, then transpose so that rows are titles and
# columns are users.
piv_norm.fillna(0, inplace=True)
piv_norm = piv_norm.T

In [17]:
piv_norm

user_id,1,2,3,5,7,8,9,10,11,12,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0
.hack//Sign,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0
.hack//Tasogare no Udewa Densetsu,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0
009-1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0
07-Ghost,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,-0.519231,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
gdgd Fairies 2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0
iDOLM@STER Xenoglossia,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.020096,0.0,0.000000,0.0,0.0,0.0
s.CRY.ed,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0
xxxHOLiC,0.0,0.0,0.0,-0.251132,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0


In [18]:
# Keep only the user columns that contain at least one non-zero value.
# NOTE(review): presumably because an all-zero user vector carries no
# information for cosine similarity -- confirm.
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]

## スパース

In [19]:
# Store the title x user matrix in SciPy's compressed sparse row format.
# NOTE(review): `import scipy as sp` alone may not load `scipy.sparse` on
# every SciPy version; here it is presumably already loaded through the
# scikit-learn import -- confirm.
piv_sparse = sp.sparse.csr_matrix(piv_norm.values)

In [20]:
# Pairwise cosine similarity between rows: the rows of piv_sparse are titles,
# the rows of its transpose are users.
item_similarity = cosine_similarity(piv_sparse)
user_similarity = cosine_similarity(piv_sparse.T)

In [21]:
# Wrap the similarity matrices in DataFrames labelled by title name
# (piv_norm.index) and by user_id (piv_norm.columns) respectively.
item_sim_df = pd.DataFrame(item_similarity, index=piv_norm.index, columns=piv_norm.index)
user_sim_df = pd.DataFrame(user_similarity, index=piv_norm.columns, columns=piv_norm.columns)

# コサイン類似度の高いbest10

## animeベース

In [22]:
def top_animes(anime_name, sim_df=None, n=10):
    """Print ``anime_name`` followed by the titles most similar to it.

    Args:
        anime_name: Title to look up; must be a column label of the
            similarity table.
        sim_df: Square title-by-title similarity DataFrame whose index and
            columns carry the same labels. Defaults to the notebook-level
            ``item_sim_df``.
        n: Maximum number of similar titles to print.

    Raises:
        KeyError: If ``anime_name`` is not a column of the similarity table.
    """
    if sim_df is None:
        sim_df = item_sim_df
    if anime_name not in sim_df.columns:
        raise KeyError('No similarity data for anime {!r}'.format(anime_name))

    print(anime_name)

    # Remove the queried title by label rather than assuming it sorts first:
    # another title with similarity 1.0 could otherwise take its place and the
    # queried title would be listed as its own neighbour.
    scores = sim_df[anime_name].drop(labels=anime_name, errors="ignore")
    # A stable sort keeps tied titles in their original table order.
    ranked = scores.sort_values(ascending=False, kind="mergesort")
    for title in ranked.index[:n]:
        print(title)

In [23]:
top_animes('Naruto')

Naruto
Bleach
Yu☆Gi☆Oh! Duel Monsters
Dragon Ball GT
Pokemon Advanced Generation
Arashi no Yoru ni: Himitsu no Tomodachi
Dragon Ball Z
Domo TV
Green Green
Lemon Angel (1988/II)
Mizuiro Jidai


## userベース

In [24]:
def top_users(user, sim_df=None, n=10):
    """Print the users most similar to ``user`` with their similarity scores.

    Args:
        user: user_id to look up.
        sim_df: Square user-by-user similarity DataFrame whose index and
            columns carry the same labels. Defaults to the notebook-level
            ``user_sim_df``.
        n: Maximum number of similar users to print.

    Returns:
        A message string when ``user`` has no column in the similarity
        table; otherwise ``None`` after printing one ``user_id score`` line
        per similar user, most similar first.
    """
    if sim_df is None:
        sim_df = user_sim_df
    if user not in sim_df.columns:
        return('No data available on user {}'.format(user))

    # Remove the queried user by label rather than assuming they sort first:
    # another user with similarity 1.0 could otherwise take that position.
    scores = sim_df[user].drop(labels=user, errors="ignore")
    # Sort once (stable, so ties keep table order) and keep the top n.
    ranked = scores.sort_values(ascending=False, kind="mergesort").head(n)
    for other_user, sim in ranked.items():
        print(other_user, sim)

In [25]:
top_users(3)

2986 0.36792907496592187
2411 0.36064331366296454
3681 0.3562237220219756
656 0.35234604079594706
298 0.34274027523186557
3028 0.3392575003360781
8436 0.3341164623323613
2038 0.33342049055849127
2374 0.3321071517278494
4233 0.329088475847572
