In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
import re

## Jaccard Similarity

In [3]:
#importing data
anime = pd.read_csv('anime.csv')

In [4]:
#saving a copy for later
anime_orig=pd.read_csv('anime.csv')

In [5]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [6]:
#dropping 'unknown' in episodes
anime['episodes'] = anime['episodes'].replace('Unknown', np.nan)
anime = anime.dropna()

#converting to int
anime.episodes = anime.episodes.astype('int')

In [7]:
#making bins for word bag
bins = [0,25,50,100,125,150,175,200,400,600,800,1000,1200,1400,1600,1900]
labels = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
anime['binned_episodes'] = pd.cut(anime.episodes, bins=bins, labels=labels)
anime.binned_episodes = anime.binned_episodes.astype('str')

#maing bins again for word bag
bins1 = [0,0.5,1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6,6.5,7,7.5,8,8.5,9,9.5,10]
labels1 = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o']
anime['binned_ratings'] = pd.cut(anime.rating, bins=bins, labels=labels1)
anime.binned_ratings = anime.binned_ratings.astype('str')

#making bins again for word bag
bins2 = [0,200,500,1000,1500,2000,3000,4000,5000,10000,50000,100000,150000,200000,250000,300000,400000,600000,800000,1000000]
labels2 = ['aa','bb','cc','dd','ee','ff','gg','hh','ii','jj','kk','ll','mm','nn','oo','pp','qq','rr','ss']
anime['binned_members'] = pd.cut(anime.members, bins=bins2, labels=labels2)
anime['binned_members'] = anime.binned_members.astype('str')

In [8]:
#creating word bag
anime['word_bag'] = anime['binned_episodes']+ ', '+ anime['genre'] + ', ' + anime['binned_ratings'] + ', ' + anime['binned_members'] + ', ' + anime['type']

In [9]:
#function for jaccard scores
def get_jaccard_sim(str1, str2): 
    a = set(str1.lower().split(', ')) 
    b = set(str2.lower().split(', '))
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [10]:
#checking jaccard scores
get_jaccard_sim(anime.word_bag[2],anime.word_bag[4])

0.8333333333333334

In [11]:
#limiting to top 100 anime IDs for computational efficiency 
x=pd.DataFrame(columns=['anime_1','anime_2','jaccard_score'])

for i in anime.index:
    
    for j in anime.index:        
        x = x.append({'anime_1': anime.anime_id[i], 'anime_2':anime.anime_id[j], 'jaccard_score': get_jaccard_sim(anime.word_bag[i].lower(), anime.word_bag[j].lower())},ignore_index=True)
        if j>=100:
            break
    if i>=100:
        break

In [13]:
#creating the jaccard matrix
jaccard_matrix = x.pivot_table(index='anime_1',columns='anime_2',values='jaccard_score')
jaccard_matrix.head()

anime_2,1.0,19.0,44.0,164.0,170.0,199.0,245.0,263.0,431.0,457.0,...,31043.0,31240.0,31757.0,31933.0,32182.0,32281.0,32366.0,32935.0,32983.0,32995.0
anime_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,1.0,0.166667,0.176471,0.214286,0.266667,0.307692,0.357143,0.285714,0.2,0.235294,...,0.2,0.2,0.125,0.538462,0.285714,0.125,0.142857,0.266667,0.1875,0.230769
19.0,0.166667,1.0,0.105263,0.058824,0.176471,0.125,0.176471,0.266667,0.117647,0.222222,...,0.357143,0.357143,0.117647,0.166667,0.117647,0.1875,0.0625,0.176471,0.176471,0.133333
44.0,0.176471,0.105263,1.0,0.214286,0.117647,0.214286,0.117647,0.125,0.285714,0.105263,...,0.125,0.2,0.2,0.176471,0.2,0.285714,0.230769,0.1875,0.1875,0.230769
164.0,0.214286,0.058824,0.214286,1.0,0.066667,0.4,0.066667,0.071429,0.666667,0.285714,...,0.153846,0.363636,0.363636,0.214286,0.25,0.25,0.181818,0.142857,0.230769,0.181818
170.0,0.266667,0.176471,0.117647,0.066667,1.0,0.142857,0.5,0.545455,0.133333,0.111111,...,0.133333,0.214286,0.0625,0.461538,0.214286,0.214286,0.153846,0.8,0.2,0.363636


In [15]:
#function to recommend top n similar animes
def recommend_jaccard(anime_name, n, jaccard=jaccard_matrix):
    id = anime_orig.anime_id[anime_orig.name == anime_name].values[0].astype(int)
    return anime_orig[anime_orig['anime_id'].isin(list(jaccard_matrix.sort_values(by=id,ascending=False)[id].head(n+1).index))]['name'][1:]

In [16]:
#fetching top 10 most similar anime
anime_name = 'Clannad: After Story'
recommend_jaccard(anime_name, 10)

15                        Sen to Chihiro no Kamikakushi
16                              Shigatsu wa Kimi no Uso
17                        Mushishi Zoku Shou 2nd Season
27                                   Mushishi Zoku Shou
31                                Natsume Yuujinchou Go
34                               Natsume Yuujinchou Shi
46                               Natsume Yuujinchou San
56                              Zoku Natsume Yuujinchou
57    Ano Hi Mita Hana no Namae wo Bokutachi wa Mada...
94                                            Fate/Zero
Name: name, dtype: object

## Cosine Similarity

In [17]:
#cosine similarity on top 100 animes to compare with jaccard results
anime_cosine = anime.set_index('anime_id')['word_bag'].head(100)

In [18]:
anime_cosine.head()

anime_id
32281    0, Drama, Romance, School, Supernatural, a, nn...
5114     2, Action, Adventure, Drama, Fantasy, Magic, M...
28977    2, Action, Comedy, Historical, Parody, Samurai...
9253                        0, Sci-Fi, Thriller, a, rr, TV
9969     2, Action, Comedy, Historical, Parody, Samurai...
Name: word_bag, dtype: object

In [19]:
#converting to a list of strings
def create_list(string):
    return re.sub("[^\w]", " ",  string).split()

In [20]:
anime_cosine=anime_cosine.apply(create_list)

In [21]:
#converting word bag to dummy columns to obtain vectors for cosine similarity
mlb = MultiLabelBinarizer()
anime_cosine = pd.DataFrame(mlb.fit_transform(anime_cosine),columns=mlb.classes_, index=anime_cosine.index)

In [22]:
#sparse matrix with the following columns
anime_cosine.columns

Index(['0', '1', '2', '3', '4', '7', 'Action', 'Adventure', 'Ai', 'Arts',
       'Comedy', 'Demons', 'Drama', 'Ecchi', 'Fantasy', 'Fi', 'Game',
       'Historical', 'Horror', 'Josei', 'Life', 'Magic', 'Martial', 'Mecha',
       'Military', 'Movie', 'Music', 'Mystery', 'OVA', 'Parody', 'Police',
       'Power', 'Psychological', 'Romance', 'Samurai', 'School', 'Sci',
       'Seinen', 'Shoujo', 'Shounen', 'Slice', 'Space', 'Special', 'Sports',
       'Super', 'Supernatural', 'TV', 'Thriller', 'Vampire', 'a', 'jj', 'kk',
       'll', 'mm', 'nan', 'nn', 'of', 'oo', 'pp', 'qq', 'rr', 'ss'],
      dtype='object')

In [23]:
anime_cosine.head()

Unnamed: 0_level_0,0,1,2,3,4,7,Action,Adventure,Ai,Arts,...,ll,mm,nan,nn,of,oo,pp,qq,rr,ss
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
32281,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5114,0,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,1,0
28977,0,0,1,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9253,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9969,0,0,1,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [24]:
#creating cosine matrix out of the sparse matrix from above
xx=cosine_similarity(anime_cosine)
cosine_matrix = pd.DataFrame(xx, index = anime_cosine.index, columns = anime_cosine.index)

In [25]:
#function to fetch n most similar anime
def recommend_cosine(anime_name, n, cosine=cosine_matrix):
    id = anime_orig.anime_id[anime_orig.name == anime_name].values[0].astype(int)
    return anime_orig[anime_orig['anime_id'].isin(list(cosine.sort_values(by=id,ascending=False)[id].head(n+1).index))]['name'][1:]

In [26]:
recommend_jaccard(anime_name, 10)

15                        Sen to Chihiro no Kamikakushi
16                              Shigatsu wa Kimi no Uso
17                        Mushishi Zoku Shou 2nd Season
27                                   Mushishi Zoku Shou
31                                Natsume Yuujinchou Go
34                               Natsume Yuujinchou Shi
46                               Natsume Yuujinchou San
56                              Zoku Natsume Yuujinchou
57    Ano Hi Mita Hana no Namae wo Bokutachi wa Mada...
94                                            Fate/Zero
Name: name, dtype: object

In [27]:
recommend_cosine(anime_name, 10)

17                        Mushishi Zoku Shou 2nd Season
27                                   Mushishi Zoku Shou
31                                Natsume Yuujinchou Go
34                               Natsume Yuujinchou Shi
46                               Natsume Yuujinchou San
56                              Zoku Natsume Yuujinchou
57    Ano Hi Mita Hana no Namae wo Bokutachi wa Mada...
80                                           Usagi Drop
82                                       Mob Psycho 100
94                                            Fate/Zero
Name: name, dtype: object