In [2]:
# 使用するライブラリのインポート
import numpy as np
import pandas as pd
import matplotlib as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [3]:
# Load the two source CSV files as DataFrames: the anime catalogue and the
# per-user ratings.
anime = pd.read_csv('csv/anime.csv')
ratings = pd.read_csv('csv/rating.csv')

In [4]:
# Preview the first five rows of the ratings DataFrame.
ratings.head(n=5)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [5]:
# Preview the first five rows of the anime DataFrame.
anime.head(n=5)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [6]:
# Sort the anime DataFrame by member count, largest first, and show the top ten.
anime.sort_values(by='members', ascending=False).head(10)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
40,1535,Death Note,"Mystery, Police, Psychological, Supernatural, ...",TV,37,8.71,1013917
86,16498,Shingeki no Kyojin,"Action, Drama, Fantasy, Shounen, Super Power",TV,25,8.54,896229
804,11757,Sword Art Online,"Action, Adventure, Fantasy, Game, Romance",TV,25,7.83,893100
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
159,6547,Angel Beats!,"Action, Comedy, Drama, School, Supernatural",TV,13,8.39,717796
19,1575,Code Geass: Hangyaku no Lelouch,"Action, Mecha, Military, School, Sci-Fi, Super...",TV,25,8.83,715151
841,20,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
445,10620,Mirai Nikki (TV),"Action, Mystery, Psychological, Shounen, Super...",TV,26,8.07,657190
131,4224,Toradora!,"Comedy, Romance, School, Slice of Life",TV,25,8.45,633817


In [7]:
# Summary statistics of the anime DataFrame, rounded to two decimals.
anime.describe().round(2)

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.22,6.47,18071.34
std,11455.29,1.03,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [8]:
# Summary statistics of the ratings DataFrame, rounded to two decimals.
# A rating of -1 means "the user has watched the anime but did not assign a rating".
ratings.describe().round(2)

Unnamed: 0,user_id,anime_id,rating
count,7813737.0,7813737.0,7813737.0
mean,36727.96,8909.07,6.14
std,20997.95,8883.95,3.73
min,1.0,1.0,-1.0
25%,18974.0,1240.0,6.0
50%,36791.0,6213.0,7.0
75%,54757.0,14093.0,9.0
max,73516.0,34519.0,10.0


In [9]:
# Plot a histogram of the user ratings.
# The ratings are whole numbers between -1 and 10 (see the describe() output
# above), so the bin edges are placed on the half-integers and every value
# gets its own unit-wide bar. With `bins=11` the edges land on the integers
# themselves, and because the last histogram bin is closed on both sides the
# ratings 9 and 10 are counted together in a single bar.
rating_bin_edges = np.arange(-1.5, 11.5, 1.0)  # -1.5, -0.5, ..., 10.5
ax = ratings['rating'].hist(bins=rating_bin_edges, figsize=(10, 10), color='red')
# The trailing semicolon keeps the notebook from echoing the return value.
ax.set(xlabel='rating', ylabel='number of ratings');

<matplotlib.axes._subplots.AxesSubplot at 0x1a10c2e710>

In [10]:
# Restrict the anime DataFrame to titles with more than 10000 members,
# then re-check the summary statistics.
anime = anime.loc[anime['members'] > 10000]
anime.describe().round(2)

Unnamed: 0,anime_id,rating,members
count,2990.0,2947.0,2990.0
mean,11390.69,7.4,69017.84
std,10802.35,0.64,94410.4
min,1.0,2.37,10005.0
25%,1556.25,7.02,17817.75
50%,8262.0,7.42,34583.0
75%,19364.5,7.82,79131.0
max,34451.0,9.37,1013917.0


In [11]:
# Count the missing values in each column.
anime.isna().sum()

anime_id     0
name         0
genre        2
type         4
episodes     0
rating      43
members      0
dtype: int64

In [12]:
# Remove every row that has at least one missing value.
anime = anime.dropna(axis='index', how='any')

In [13]:
# Keep only the rows whose rating is 0 or higher (this removes the -1 entries),
# then re-check the summary statistics.
ratings = ratings.loc[ratings['rating'] >= 0]
ratings.describe().round(2)

Unnamed: 0,user_id,anime_id,rating
count,6337241.0,6337241.0,6337241.0
mean,36747.91,8902.87,7.81
std,21013.4,8882.0,1.57
min,1.0,1.0,1.0
25%,18984.0,1239.0,7.0
50%,36815.0,6213.0,8.0
75%,54873.0,14075.0,9.0
max,73516.0,34475.0,10.0


In [18]:
# Join the ratings with the anime catalogue on anime_id. Both frames have a
# `rating` column, so the suffixes tell them apart: `rating_user` comes from
# ratings and `rating_average` from anime.
mergeddf = pd.merge(
    ratings,
    anime,
    how='inner',
    on='anime_id',
    suffixes=('_user', '_average'),
)
mergeddf.head(n=5)

Unnamed: 0,user_id,anime_id,rating_user,name,genre,type,episodes,rating_average,members
0,1,8074,10,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
1,3,8074,6,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
2,5,8074,2,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
3,12,8074,6,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
4,14,8074,6,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892


In [35]:
# Summary statistics of the merged DataFrame, rounded to two decimals.
mergeddf.describe().round(2)

Unnamed: 0,user_id,anime_id,rating_user,rating,members
count,5957004.0,5957004.0,5957004.0,5957004.0,5957004.0
mean,36665.8,9023.38,7.87,7.74,196030.78
std,21030.91,8920.21,1.54,0.62,191317.96
min,1.0,1.0,1.0,2.37,10005.0
25%,18911.0,1195.0,7.0,7.35,57029.0
50%,36534.0,6500.0,8.0,7.74,130689.0
75%,54851.0,14345.0,9.0,8.17,271484.0
max,73516.0,34240.0,10.0,9.37,1013917.0


In [26]:
# Keep only the three columns used below and drop repeated (user_id, name)
# pairs, keeping the first occurrence of each.
mergeddf = (
    mergeddf
    .loc[:, ['user_id', 'name', 'rating_user']]
    .drop_duplicates(subset=['user_id', 'name'])
)
mergeddf.info()
mergeddf.head(n=10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5956997 entries, 0 to 5957003
Data columns (total 3 columns):
user_id        int64
name           object
rating_user    int64
dtypes: int64(2), object(1)
memory usage: 181.8+ MB


Unnamed: 0,user_id,name,rating_user
0,1,Highschool of the Dead,10
1,3,Highschool of the Dead,6
2,5,Highschool of the Dead,2
3,12,Highschool of the Dead,6
4,14,Highschool of the Dead,6
5,17,Highschool of the Dead,7
6,24,Highschool of the Dead,7
7,27,Highschool of the Dead,9
8,29,Highschool of the Dead,2
9,30,Highschool of the Dead,8


In [24]:
# Pivot into a table with one row per title and one column per user;
# (title, user) pairs without a rating are filled with 0.
anime_pivot = (
    mergeddf
    .pivot(index='name', columns='user_id', values='rating_user')
    .fillna(0)
)
# Sparse (CSR) copy of the same values, used to fit the neighbour model.
anime_pivot_sparse = csr_matrix(anime_pivot.values)

In [25]:
# Show the first 10 rows of anime_pivot.
# (The call previously asked for 20 rows, which contradicted this description.)
anime_pivot.head(10)

user_id,1,2,3,5,7,8,9,10,11,12,...,73507,73508,73509,73510,73511,73512,73513,73514,73515,73516
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Memoire,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Movie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//G.U. Returner,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//G.U. Trilogy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Gift,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Intermezzo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Liminality,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Quantum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Roots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


ここまででデータの前処理終了。ここからk近傍法を使用してレコメンド機能を作成していく

In [40]:
# Build the scikit-learn nearest-neighbour searcher: brute-force search with
# the cosine metric, 9 neighbours by default.
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=9)

In [41]:
# Fit the neighbour model on the preprocessed sparse title-by-user matrix.
model_knn = knn.fit(X=anime_pivot_sparse)

In [42]:
# Search the titles in the dataset by keyword.
def searchanime(string, pivot=None, case=True):
    """Print the titles that contain ``string``.

    The keyword is matched literally, not as a regular expression, so
    characters such as ``(``, ``)``, ``.`` or ``+`` that occur in titles
    can be searched for as-is.

    Parameters
    ----------
    string : str
        Keyword to look for inside the titles.
    pivot : pandas.DataFrame, optional
        Table whose index holds the titles. When omitted, the notebook-level
        ``anime_pivot`` is used.
    case : bool, default True
        Whether the match is case sensitive.

    Returns
    -------
    None
        The matching titles are printed as a pandas ``Index``.
    """
    table = anime_pivot if pivot is None else pivot
    titles = table.index
    # regex=False: treat the keyword as plain text instead of a pattern.
    print(titles[titles.str.contains(string, case=case, regex=False)])

In [57]:
# List every title that contains the keyword 'Death'.
searchanime(string='Death')

Index(['Death Billiards', 'Death Note', 'Death Note Rewrite', 'Death Parade',
       'Digimon Xros Wars: Aku no Death General to Nanatsu no Oukoku',
       'Neon Genesis Evangelion: Death &amp; Rebirth'],
      dtype='object', name='name')


In [58]:
# Title of the anime to show similar works for. It is compared against the
# index of anime_pivot by the recommendation cell, so it has to match a title
# exactly.
Anime = 'Death Note'

In [59]:
# Show the 10 anime most similar to the title stored in `Anime`.
n_recommendations = 10

# Locate the row of the selected title and fail early with a readable message,
# instead of handing an empty query to the model.
title_mask = anime_pivot.index == Anime
if not title_mask.any():
    raise KeyError('{0!r} is not a title in anime_pivot'.format(Anime))
query_pos = int(title_mask.argmax())

# Ask for one extra neighbour: the selected title is itself a row of the data
# the model was fitted on, so it comes back among its own neighbours.
distance, indice = model_knn.kneighbors(
    anime_pivot.iloc[query_pos].values.reshape(1, -1),
    n_neighbors=n_recommendations + 1,
)

print('Recommendations if you like the anime {0}:\n'.format(anime_pivot.index[query_pos]))

# Remove the selected title by its row position rather than assuming it is
# always the first hit; a different title at the same distance could otherwise
# take the first slot and the selected title would be listed as a recommendation.
neighbours = [
    (pos, dist)
    for pos, dist in zip(indice.flatten(), distance.flatten())
    if pos != query_pos
][:n_recommendations]
for rank, (pos, dist) in enumerate(neighbours, start=1):
    print('{0}: {1} with distanse: {2}'.format(rank, anime_pivot.index[pos], dist))

Recommendations if you like the anime Death Note:

1: Code Geass: Hangyaku no Lelouch with distanse: 0.3801604472543014
2: Code Geass: Hangyaku no Lelouch R2 with distanse: 0.4077402835017113
3: Elfen Lied with distanse: 0.416976282982345
4: Shingeki no Kyojin with distanse: 0.42526883654659386
5: Fullmetal Alchemist: Brotherhood with distanse: 0.43496917577512295
6: Naruto with distanse: 0.44815151601300474
7: Fullmetal Alchemist with distanse: 0.4481653659916168
8: Sword Art Online with distanse: 0.4604725702561627
9: Angel Beats! with distanse: 0.48179450739930296
10: Mirai Nikki (TV) with distanse: 0.48344862764227503
