In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_pickle("../data/data.pkl")

### Popular recommend

In [3]:
popular_df = df[["name", "user_rating"]]

In [4]:
popular_df.groupby("name")["user_rating"].mean().sort_values(
    ascending=False
).reset_index().head(10)

Unnamed: 0,name,user_rating
0,Gintama°,9.449495
1,Kimi no Na wa.,9.426313
2,Ginga Eiyuu Densetsu,9.389788
3,Fullmetal Alchemist: Brotherhood,9.322741
4,Gintama&#039;,9.272552
5,Steins;Gate,9.261326
6,Gintama,9.236398
7,Hunter x Hunter (2011),9.234586
8,Gintama&#039;: Enchousen,9.202258
9,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,9.19143


### Content based filtering
- 微妙な気がするのでパス
- タイトル、ジャンルの情報でembeddingして類似度を求めるのが王道パターン
- 概要文があればいいけど、この程度の情報だけどだとシリーズものばかりが多分出てしまう

In [5]:
df = pd.read_pickle("../data/pivot_data.pkl")

In [6]:
df = df.T.reset_index()

In [7]:
df.head()

name,user_id,&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,...,ef: A Tale of Melodies. - Prologue,ef: A Tale of Memories.,ef: A Tale of Memories. - Prologue,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,8.11,0.0,0.0,0.0,0.0
4,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# reduce data due to memory limitation
df = df[0:1000]

In [9]:
sparse_df = csr_matrix(df.values)

In [10]:
# calc user similary
user_similarity = cosine_similarity(sparse_df)

In [11]:
user_similarity = pd.DataFrame(user_similarity, index=df.index, columns=df.index)

In [12]:
user_similarity.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,1.0,0.014876,0.105364,0.099098,0.090065,0.161376,0.048674,0.265726,0.056458,0.206205,...,0.064656,0.078466,0.06795,0.068065,0.075602,0.068228,0.064593,0.068359,0.064585,0.068187
1,0.014876,1.0,0.121212,0.00725,0.011835,0.066277,0.173189,0.13259,0.030559,0.068272,...,0.230053,0.235594,0.237464,0.228941,0.237255,0.229515,0.22983,0.229978,0.2298,0.237125
2,0.105364,0.121212,1.0,0.199933,0.149205,0.07478,0.030677,0.180902,0.092445,0.168593,...,0.040749,0.062603,0.045167,0.056758,0.05484,0.045226,0.041659,0.043287,0.047682,0.060956
3,0.099098,0.00725,0.199933,1.0,0.350731,0.104789,0.05637,0.040428,0.079002,0.091892,...,0.031922,0.087531,0.036718,0.048735,0.046309,0.049352,0.034637,0.033575,0.036534,0.051481
4,0.090065,0.011835,0.149205,0.350731,1.0,0.063324,0.076799,0.078495,0.161621,0.10191,...,0.051922,0.09427,0.055369,0.066911,0.067458,0.066599,0.052247,0.054969,0.054434,0.062283


In [13]:
# Extract users similar to user A
sim_user_1 = user_similarity.iloc[0, :].sort_values(ascending=False)[0:100]
sim_user_1 = list(sim_user_1.index)
sim_user_1.pop(0)

0

In [14]:
sim_df = df[df["user_id"].isin(sim_user_1)]
sim_df.drop("user_id", axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sim_df.drop("user_id", axis=1, inplace=True)


In [15]:
recommend_list = (sim_df != 0).mean().sort_values(ascending=False)
recommend_list = list(recommend_list.index)

In [16]:
recommend_list

['Death Note',
 'Sword Art Online',
 'Shingeki no Kyojin',
 'Elfen Lied',
 'Angel Beats!',
 'Code Geass: Hangyaku no Lelouch',
 'Mirai Nikki (TV)',
 'Naruto',
 'Code Geass: Hangyaku no Lelouch R2',
 'Fullmetal Alchemist: Brotherhood',
 'Highschool of the Dead',
 'Soul Eater',
 'Ao no Exorcist',
 'Steins;Gate',
 'Fullmetal Alchemist',
 'Durarara!!',
 'Deadman Wonderland',
 'Toradora!',
 'Clannad',
 'Hataraku Maou-sama!',
 'High School DxD',
 'Another',
 'Clannad: After Story',
 'Noragami',
 'Cowboy Bebop',
 'Suzumiya Haruhi no Yuuutsu',
 'Kuroshitsuji',
 'Tonari no Kaibutsu-kun',
 'Psycho-Pass',
 'Magi: The Labyrinth of Magic',
 'Neon Genesis Evangelion',
 'No Game No Life',
 'Mahou Shoujo Madoka★Magica',
 'Sen to Chihiro no Kamikakushi',
 'Tokyo Ghoul',
 'Chuunibyou demo Koi ga Shitai!',
 'Bleach',
 'Boku wa Tomodachi ga Sukunai',
 'Date A Live',
 'Danganronpa: Kibou no Gakuen to Zetsubou no Koukousei The Animation',
 'Fate/stay night',
 'Dragon Ball Z',
 'Blood Lad',
 'Fate/Zero',
 'K

In [17]:
# post processing
# remove user A's watched anime
def nonzero(row):
    nonzero_cols = row[row != 0].index.to_list()
    return nonzero_cols


user_1 = df[df["user_id"] == 1]
user_1.drop("user_id", axis=1, inplace=True)
user_1 = user_1[user_1.apply(lambda x: x.astype(bool).any(), axis=1)]
watched = user_1.apply(nonzero, axis=1)
watched = sum(watched.to_list(), [])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_1.drop("user_id", axis=1, inplace=True)


In [18]:
for w in watched:
    if w in recommend_list:
        recommend_list.remove(w)

In [19]:
# recommend
recommend_list[0:10]

['Death Note',
 'Shingeki no Kyojin',
 'Elfen Lied',
 'Angel Beats!',
 'Code Geass: Hangyaku no Lelouch',
 'Mirai Nikki (TV)',
 'Naruto',
 'Code Geass: Hangyaku no Lelouch R2',
 'Fullmetal Alchemist: Brotherhood',
 'Soul Eater']

In [20]:
watched

['High School DxD',
 'High School DxD New',
 'Highschool of the Dead',
 'Sword Art Online']

In [21]:
# お酒飲みながら書いてるから多分ミスってる