# Content Based Recommendation System: Similarity Measure- Cosine Similarity

\begin{align}
Cosine(x,y)= \frac{\sum_{i = 1}^{n}x_i y_i
}{\sqrt{\sum_{i=1}^{n}x_i^2} \sqrt{\sum_{i=1}^{n}y_i^2}}
\end{align}

## Illustration 1

In [2]:
import numpy as np
import pandas as pd

In [4]:
# Toy item profile for Illustration 1: one row per movie, one 0/1 flag per genre.
movie_titles = ['Terminator 2', 'Ant Man 2', 'GOTG Vol 2', '3 Idiots']
genre_flags = {
    'Comedy': [0, 1, 1, 1],
    'Action': [1, 1, 1, 0],
    'Scifi': [1, 1, 1, 0],
    'Romantic': [0, 1, 0, 0],
}
contoh = pd.DataFrame({'Movie': movie_titles, **genre_flags})
contoh

Unnamed: 0,Movie,Comedy,Action,Scifi,Romantic
0,Terminator 2,0,1,1,0
1,Ant Man 2,1,1,1,1
2,GOTG Vol 2,1,1,1,0
3,3 Idiots,1,0,0,0


In [5]:
# Manual cosine similarity, Ant Man 2 vs GOTG Vol 2.
# Numerator: dot product over (Comedy, Action, Scifi, Romantic).
# Denominator: product of the vector norms, sqrt(4) for Ant Man 2 and sqrt(3) for GOTG Vol 2.
(1*1+1*1+1*1+1*0)/((4**(1/2))*(3**(1/2)))

0.8660254037844387

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
# Use only the genre flags as features. A later cell adds a 'score' column to
# `contoh`; dropping it here (if present) keeps this cell correct when re-run.
features = contoh.drop(columns=['Movie', 'score'], errors='ignore')
cosine_similarity(features[contoh['Movie'] == 'Ant Man 2'],
                  features[contoh['Movie'] == 'GOTG Vol 2'])

array([[0.8660254]])

In [37]:
# Similarity of Ant Man 2 against every movie in the table (itself included).
# 'score' is dropped together with 'Movie' so that re-running this cell after the
# score column has been added does not feed the old scores back in as a feature.
features = contoh.drop(columns=['Movie', 'score'], errors='ignore')
score = cosine_similarity(features[contoh['Movie'] == 'Ant Man 2'], features)

In [42]:
# `score` holds one row with one similarity per movie; flatten it into a column.
contoh['score'] = score.ravel()

In [44]:
# Rank the movies by similarity to Ant Man 2, most similar first.
contoh.sort_values(by='score', ascending=False)[['Movie', 'score']]

Unnamed: 0,Movie,score
1,Ant Man 2,1.0
2,GOTG Vol 2,0.866025
0,Terminator 2,0.707107
3,3 Idiots,0.5


Film yang direkomendasikan untuk penonton Ant Man 2 adalah GOTG Vol 2, Terminator 2, dan 3 Idiots, berurutan dari skor cosine similarity tertinggi ke terendah.

## Illustration 2

In [46]:
# Illustration 2: same movies, but each genre carries an integer weight instead of a 0/1 flag.
# The movie title is the row index this time.
movie_titles = ['Terminator 2', 'Ant Man 2', 'GOTG Vol 2', '3 Idiots']
contoh = pd.DataFrame(
    [
        [0, 5, 3, 0],
        [4, 5, 5, 4],
        [3, 5, 5, 0],
        [4, 0, 0, 0],
    ],
    index=movie_titles,
    columns=['Comedy', 'Action', 'Scifi', 'Romantic'],
)
contoh

Unnamed: 0,Comedy,Action,Scifi,Romantic
Terminator 2,0,5,3,0
Ant Man 2,4,5,5,4
GOTG Vol 2,3,5,5,0
3 Idiots,4,0,0,0


In [55]:
# Manual cosine similarity, Ant Man 2 (4, 5, 5, 4) vs GOTG Vol 2 (3, 5, 5, 0):
# dot product divided by the product of the two Euclidean norms.
(4*3+5*5+5*5+4*0)/(((4**2+5**2+5**2+4**2)**(1/2))*((3**2+5**2+5**2)**(1/2)))

0.891371527293353

In [59]:
# Compare on the genre weights only. A later cell adds a 'score' column to
# `contoh`; dropping it here (if present) keeps this cell correct when re-run.
features = contoh.drop(columns='score', errors='ignore')
cosine_similarity(features.loc[['Ant Man 2']], features.loc[['GOTG Vol 2']])

array([[0.89137153]])

In [63]:
# Score every movie against Ant Man 2 using the genre weights only.
# Excluding an existing 'score' column makes the cell idempotent: without it, a
# second run would treat the previous scores as an extra feature.
features = contoh.drop(columns='score', errors='ignore')
contoh['score'] = cosine_similarity(features.loc[['Ant Man 2']], features).ravel()

In [66]:
# Rank the movies by similarity to Ant Man 2, most similar first.
contoh.sort_values(by='score', ascending=False)[['score']]

Unnamed: 0,score
Ant Man 2,1.0
GOTG Vol 2,0.891372
Terminator 2,0.757554
3 Idiots,0.441726


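Film yang direkomendasikan untuk penonton Ant Man 2 adalah GOTG Vol 2, Terminator 2, dan 3 Idiots, berurutan dari skor cosine similarity tertinggi ke terendah. Urutannya sama dengan Illustration 1, tetapi skornya berubah karena setiap genre kini diberi bobot.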

## Let's Try Anime Recommendation

In [70]:
# Load the anime table from anime.csv (path is relative to the working directory)
# and show its (rows, columns) shape.
df_anime= pd.read_csv('anime.csv')
df_anime.shape

(12294, 7)

In [71]:
df_anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [72]:
# Keep only the id, the title and the genre string.
df_anime = df_anime[['anime_id', 'name', 'genre']]

In [73]:
df_anime.isna().sum()

anime_id     0
name         0
genre       62
dtype: int64

In [74]:
# Discard every row that has a missing value, then re-count missing values per column.
has_all_values = df_anime.notna().all(axis=1)
df_anime = df_anime[has_all_values]
df_anime.isna().sum()

anime_id    0
name        0
genre       0
dtype: int64

In [77]:
# Renumber the rows 0..n-1 after the dropna above. The genre matrix built further
# down gets a fresh default index, and it is joined to df_anime with
# pd.concat(axis=1), which aligns rows by index label.
df_anime=df_anime.reset_index(drop=True)

In [79]:
df_anime['genre'].value_counts()

Hentai                                                         823
Comedy                                                         523
Music                                                          301
Kids                                                           199
Comedy, Slice of Life                                          179
                                                              ... 
Action, Adventure, Comedy, Military, Romance, Sci-Fi, Space      1
Ecchi, Harem, Romance                                            1
Action, Drama, Mecha, Military, School, Supernatural             1
Comedy, Drama, Ecchi, Harem, Romance                             1
Martial Arts, Seinen, Sports                                     1
Name: genre, Length: 3264, dtype: int64

In [80]:
from sklearn.feature_extraction.text import CountVectorizer

In [94]:
# Split each genre string on ', ' so multi-word genres such as 'slice of life'
# stay a single token. token_pattern=None silences the "token_pattern will not be
# used" warning that a custom tokenizer otherwise triggers.
cvr = CountVectorizer(tokenizer=lambda text: text.split(', '), token_pattern=None)
mgenre_a = cvr.fit_transform(df_anime['genre'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older installations.
if hasattr(cvr, 'get_feature_names_out'):
    feature_names = list(cvr.get_feature_names_out())
else:
    feature_names = cvr.get_feature_names()
print(len(feature_names))
print(feature_names)

43
['action', 'adventure', 'cars', 'comedy', 'dementia', 'demons', 'drama', 'ecchi', 'fantasy', 'game', 'harem', 'hentai', 'historical', 'horror', 'josei', 'kids', 'magic', 'martial arts', 'mecha', 'military', 'music', 'mystery', 'parody', 'police', 'psychological', 'romance', 'samurai', 'school', 'sci-fi', 'seinen', 'shoujo', 'shoujo ai', 'shounen', 'shounen ai', 'slice of life', 'space', 'sports', 'super power', 'supernatural', 'thriller', 'vampire', 'yaoi', 'yuri']


In [95]:
mgenre_a.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [96]:
# One row per anime, one count column per genre token.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older installations.
if hasattr(cvr, 'get_feature_names_out'):
    feature_names = list(cvr.get_feature_names_out())
else:
    feature_names = cvr.get_feature_names()
genre = pd.DataFrame(mgenre_a.toarray(), columns=feature_names)
genre.shape

(12232, 43)

In [97]:
# Put anime_id in front of the genre columns (rows are matched by index label).
# Any anime_id column left from a previous run is dropped first, so re-running the
# cell does not pile up duplicate anime_id columns.
genre = pd.concat(
    [df_anime[['anime_id']], genre.drop(columns='anime_id', errors='ignore')],
    axis=1,
)


In [98]:
genre #profile items

Unnamed: 0,anime_id,action,adventure,cars,comedy,dementia,demons,drama,ecchi,fantasy,...,shounen ai,slice of life,space,sports,super power,supernatural,thriller,vampire,yaoi,yuri
0,32281,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,5114,1,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,28977,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9253,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,9969,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12227,9316,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12228,5543,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12229,5621,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12230,6133,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Anime recommender

In [99]:
# Suppose the user likes the anime 'Naruto'.
df_anime.loc[df_anime['name'].eq('Naruto')]

Unnamed: 0,anime_id,name,genre
841,20,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P..."


In [101]:
genre[genre['anime_id']==20].drop(columns='anime_id')

Unnamed: 0,action,adventure,cars,comedy,dementia,demons,drama,ecchi,fantasy,game,...,shounen ai,slice of life,space,sports,super power,supernatural,thriller,vampire,yaoi,yuri
841,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [102]:
# Cosine similarity between the genre vector of anime_id 20 and every anime's
# genre vector; anime_id itself is an identifier, not a feature.
genre_features = genre.drop(columns='anime_id')
liked_profile = genre_features[genre['anime_id'] == 20]
score = cosine_similarity(liked_profile, genre_features)

In [104]:
score.shape

(1, 12232)

In [105]:
genre.shape

(12232, 44)

In [119]:
# Attach the similarity scores (a single row in `score`) to the titles and ids.
score_column = pd.DataFrame({'score': score.ravel()})
df_score = pd.concat([df_anime[['name', 'anime_id']], score_column], axis=1)

In [120]:
df_score

Unnamed: 0,name,anime_id,score
0,Kimi no Na wa.,32281,0.000000
1,Fullmetal Alchemist: Brotherhood,5114,0.338062
2,Gintama°,28977,0.507093
3,Steins;Gate,9253,0.000000
4,Gintama&#039;,9969,0.507093
...,...,...,...
12227,Toushindai My Lover: Minami tai Mecha-Minami,9316,0.000000
12228,Under World,5543,0.000000
12229,Violence Gekiga David no Hoshi,5621,0.000000
12230,Violence Gekiga Shin David no Hoshi: Inma Dens...,6133,0.000000


In [123]:
# Leave out the liked anime itself (id 20) and show the five highest scores.
other_anime = df_score[df_score['anime_id'] != 20]
other_anime.sort_values(by='score', ascending=False).head(5)

Unnamed: 0,name,anime_id,score
615,Naruto: Shippuuden,1735,1.0
2996,Naruto Soyokazeden Movie: Naruto to Mashin to ...,10659,1.0
486,Boruto: Naruto the Movie,28755,1.0
1343,Naruto x UT,10075,1.0
1573,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,6325,1.0


Coba buat function, di mana kita tinggal memasukkan nama/id anime dan mendapat rekomendasi anime lain yang mirip dengan anime yang kita masukkan

In [144]:
def recommend(anime_id=None, top_n=5, csv_path='anime.csv'):
    """Return the anime whose genre profile is most similar to a given anime.

    Each anime's genre string is turned into a vector of genre counts and
    compared with the chosen anime by cosine similarity.

    Parameters
    ----------
    anime_id : int, optional
        Id of the anime the user likes. When omitted, the id is read
        interactively with ``input()``, which is the original behaviour.
    top_n : int, default 5
        Number of recommendations to return.
    csv_path : str, default 'anime.csv'
        CSV file providing the ``anime_id``, ``name`` and ``genre`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``anime_id`` and ``score``: the ``top_n`` highest
        scoring rows, excluding the chosen anime itself.

    Raises
    ------
    ValueError
        If the typed id is not an integer, or if ``anime_id`` matches no row
        once rows with missing values have been dropped.
    """
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    if anime_id is None:
        anime_id = int(input("Masukkan id anime yang kalian suka: "))

    # Titles and genres, restricted to rows without missing values.
    df_anime = pd.read_csv(csv_path)
    df_anime = df_anime.loc[:, ['anime_id', 'name', 'genre']].dropna().reset_index(drop=True)

    # Fail early with a readable message instead of letting cosine_similarity
    # reject an empty query matrix.
    is_target = df_anime['anime_id'] == anime_id
    if not is_target.any():
        raise ValueError(
            f"anime_id {anime_id} not found in {csv_path} "
            "(unknown id, or the row was dropped for missing values)"
        )

    # Split on ', ' only, so multi-word genres stay one token.
    cvr = CountVectorizer(tokenizer=lambda text: text.split(', '), token_pattern=None)
    mgenre_a = cvr.fit_transform(df_anime['genre'])

    # get_feature_names() was removed in scikit-learn 1.2.
    if hasattr(cvr, 'get_feature_names_out'):
        feature_names = list(cvr.get_feature_names_out())
    else:
        feature_names = cvr.get_feature_names()
    genre = pd.DataFrame(mgenre_a.toarray(), columns=feature_names)  # item profile

    # Use the first match so the score matrix always has exactly one row.
    target_profile = genre[is_target].iloc[[0]]
    score = cosine_similarity(target_profile, genre)

    df_score = df_anime[['name', 'anime_id']].assign(score=score[0])
    return df_score[~is_target].sort_values('score', ascending=False).iloc[:top_n]


In [145]:
recommend()

Masukkan id anime yang kalian suka: 20


Unnamed: 0,name,anime_id,score
615,Naruto: Shippuuden,1735,1.0
2996,Naruto Soyokazeden Movie: Naruto to Mashin to ...,10659,1.0
486,Boruto: Naruto the Movie,28755,1.0
1343,Naruto x UT,10075,1.0
1573,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,6325,1.0


# Content Based Recommendation System: Content Based Filtering

In [146]:
# Rated movies: a 'scores' value plus a 0/1 flag per genre.
df_movies = pd.DataFrame(
    [
        ('Terminator 2', 7, 1, 1, 0, 0, 0),
        ('Interstellar', 9, 0, 1, 1, 0, 1),
        ('Ant Man 2', 8, 1, 1, 1, 1, 0),
        ('3 Idiots', 9, 0, 0, 0, 1, 1),
    ],
    columns=['movies', 'scores', 'action', 'scifi', 'adventure', 'comedy', 'drama'],
)

In [147]:
df_movies #item feature matrix

Unnamed: 0,movies,scores,action,scifi,adventure,comedy,drama
0,Terminator 2,7,1,1,0,0,0
1,Interstellar,9,0,1,1,0,1
2,Ant Man 2,8,1,1,1,1,0
3,3 Idiots,9,0,0,0,1,1


In [148]:
df_movies2= df_movies.copy()

In [151]:
# Numeric part of the table: everything except the title.
# Derived from `df_movies` (drop returns a new frame) rather than from
# `df_movies2` itself, so re-running the cell does not raise KeyError for an
# already-removed 'movies' column.
df_movies2 = df_movies.drop(columns='movies')
df_movies2

Unnamed: 0,scores,action,scifi,adventure,comedy,drama
0,7,1,1,0,0,0
1,9,0,1,1,0,1
2,8,1,1,1,1,0
3,9,0,0,0,1,1


In [154]:
# Weight every genre flag by the movie's score.
# The 0/1 flags are read from `df_movies`, which is never modified, so running
# this cell twice does not multiply by the scores a second time.
genre_cols = df_movies2.columns.drop('scores')
df_movies2[genre_cols] = df_movies[genre_cols].mul(df_movies['scores'], axis=0)

df_movies2 #item feature matrix with rating

Unnamed: 0,scores,action,scifi,adventure,comedy,drama
0,7,7,7,0,0,0
1,9,0,9,9,0,9
2,8,8,8,8,8,0
3,9,0,0,0,9,9


In [155]:
df_movies2.drop(columns='scores').sum() #total score untuk masing2 genre

action       15
scifi        24
adventure    17
comedy       17
drama        18
dtype: int64

In [156]:
df_movies2.drop(columns='scores').sum().sum() #total score keseluruhan

91

In [158]:
# Share of the total weighted score that falls on each genre (the entries sum to 1).
genre_totals = df_movies2.drop(columns='scores').sum()
user_feature_vector = genre_totals / genre_totals.sum()

In [159]:
user_feature_vector

action       0.164835
scifi        0.263736
adventure    0.186813
comedy       0.186813
drama        0.197802
dtype: float64

In [169]:
# Candidate movies to score: a 0/1 flag per genre, no rating.
df_movies_recommendation = pd.DataFrame(
    [
        ('Titanic', 1, 1, 0, 0, 0),
        ('Martian', 0, 1, 1, 0, 1),
        ('GOTG Vol 2', 1, 1, 1, 1, 0),
    ],
    columns=['movies', 'action', 'scifi', 'adventure', 'comedy', 'drama'],
)

In [170]:
df_movies_recommendation #item feature matrix

Unnamed: 0,movies,action,scifi,adventure,comedy,drama
0,Titanic,1,1,0,0,0
1,Martian,0,1,1,0,1
2,GOTG Vol 2,1,1,1,1,0


In [171]:
user_feature_vector

action       0.164835
scifi        0.263736
adventure    0.186813
comedy       0.186813
drama        0.197802
dtype: float64

In [172]:
# Scale every genre column by the user's weight for that genre, all columns at once.
genre_cols = df_movies_recommendation.columns.drop('movies')
df_movies_recommendation[genre_cols] = (
    df_movies_recommendation[genre_cols].mul(user_feature_vector[genre_cols], axis=1)
)

df_movies_recommendation

Unnamed: 0,movies,action,scifi,adventure,comedy,drama
0,Titanic,0.164835,0.263736,0.0,0.0,0.0
1,Martian,0.0,0.263736,0.186813,0.0,0.197802
2,GOTG Vol 2,0.164835,0.263736,0.186813,0.186813,0.0


In [175]:
# Predicted rating = sum of the weighted genre columns only.
# Selecting the genre columns explicitly keeps the string 'movies' column out of
# the row sum (pandas >= 2.0 raises TypeError instead of skipping it) and keeps
# an existing 'rating prediction' column from being added in on a re-run.
df_movies_recommendation['rating prediction'] = (
    df_movies_recommendation[user_feature_vector.index].sum(axis=1)
)

In [176]:
df_movies_recommendation

Unnamed: 0,movies,action,scifi,adventure,comedy,drama,rating prediction
0,Titanic,0.164835,0.263736,0.0,0.0,0.0,0.428571
1,Martian,0.0,0.263736,0.186813,0.0,0.197802,0.648352
2,GOTG Vol 2,0.164835,0.263736,0.186813,0.186813,0.0,0.802198


Untuk user ini, kita akan merekomendasikan film GOTG Vol 2, Martian, dan Titanic secara berurutan (dari rating prediction tertinggi ke terendah).

## Try with Anime Dataset

In [178]:
df_anime=pd.read_csv('anime.csv')

In [179]:
df_anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [180]:
# Keep the id, the title, the rating and the genre string.
df_anime = df_anime[['anime_id', 'name', 'rating', 'genre']]
df_anime

Unnamed: 0,anime_id,name,rating,genre
0,32281,Kimi no Na wa.,9.37,"Drama, Romance, School, Supernatural"
1,5114,Fullmetal Alchemist: Brotherhood,9.26,"Action, Adventure, Drama, Fantasy, Magic, Mili..."
2,28977,Gintama°,9.25,"Action, Comedy, Historical, Parody, Samurai, S..."
3,9253,Steins;Gate,9.17,"Sci-Fi, Thriller"
4,9969,Gintama&#039;,9.16,"Action, Comedy, Historical, Parody, Samurai, S..."
...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,4.15,Hentai
12290,5543,Under World,4.28,Hentai
12291,5621,Violence Gekiga David no Hoshi,4.88,Hentai
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,4.98,Hentai


In [181]:
# Anime the user likes.
liked_titles = ['Naruto', 'One Piece', 'Dragon Ball']
df_anime[df_anime['name'].isin(liked_titles)]

Unnamed: 0,anime_id,name,rating,genre
74,21,One Piece,8.58,"Action, Adventure, Comedy, Drama, Fantasy, Sho..."
346,223,Dragon Ball,8.16,"Adventure, Comedy, Fantasy, Martial Arts, Shou..."
841,20,Naruto,7.81,"Action, Comedy, Martial Arts, Shounen, Super P..."


In [184]:
# Candidate pool for the recommendation: every anime except the liked ones,
# without the rating column.
liked_titles = ['Naruto', 'One Piece', 'Dragon Ball']
is_liked = df_anime['name'].isin(liked_titles)
df_anime[~is_liked].drop(columns='rating')

Unnamed: 0,anime_id,name,genre
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural"
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili..."
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S..."
3,9253,Steins;Gate,"Sci-Fi, Thriller"
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S..."
...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai
12290,5543,Under World,Hentai
12291,5621,Violence Gekiga David no Hoshi,Hentai
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai


* drop na
* vectorize
* df anime yang disukai user
* cari user feature vector
* cari score untuk anime di df rekomendasi anime
* cari 10 rekomendasi anime untuk user 