In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Importing data files
Data from https://www.kaggle.com/yonatanrabinovich/anime-recommendations-project/data

In [3]:
anime_df = pd.read_csv("anime.csv")

In [4]:
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [6]:
rating_df = pd.read_csv("rating.csv")

In [7]:
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [8]:
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


#### Missing values in Anime_df

In [9]:
print(round(anime_df.isnull().sum().sort_values(ascending=False)/len(anime_df.index),4)*100)

rating      1.87
genre       0.50
type        0.20
anime_id    0.00
name        0.00
episodes    0.00
members     0.00
dtype: float64


#### Missing values in Rating_df

In [10]:
print(round(rating_df.isnull().sum().sort_values(ascending=False)/len(rating_df.index),4)*100)

user_id     0.0
anime_id    0.0
rating      0.0
dtype: float64


#### dropping anime with no rating

In [11]:
anime_df = anime_df[~anime_df["rating"].isnull()]

In [12]:
print(round(anime_df.isnull().sum().sort_values(ascending=False)/len(anime_df.index),4)*100)

genre       0.39
anime_id    0.00
name        0.00
type        0.00
episodes    0.00
rating      0.00
members     0.00
dtype: float64


#### Filling in na values

In [13]:
anime_df['genre'] = anime_df['genre'].fillna(anime_df['genre'].dropna().mode().values[0])

In [14]:
print(round(anime_df.isnull().sum().sort_values(ascending=False)/len(anime_df.index),4)*100)

anime_id    0.0
name        0.0
genre       0.0
type        0.0
episodes    0.0
rating      0.0
members     0.0
dtype: float64


### Feature engineering

In [15]:
rating_df['rating'] = rating_df['rating'].apply(lambda x: np.nan if x==-1 else x)
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


### Number of users in the rating dataframe

In [16]:
len(rating_df["user_id"].unique())

73515

### Making a new dataframe to perform cosine similarity

In [17]:
rated_anime = rating_df.merge(anime_df, left_on = 'anime_id', right_on = 'anime_id', suffixes= ['_user', ''])

In [18]:
rated_anime.head()

Unnamed: 0,user_id,anime_id,rating_user,name,genre,type,episodes,rating,members
0,1,20,,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
1,3,20,8.0,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
2,5,20,6.0,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
3,6,20,,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
4,10,20,,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297


#### we only require to user_id, name and rating

In [19]:
rated_anime = rated_anime[["user_id","name","rating"]]

In [20]:
rated_anime = rated_anime[rated_anime["user_id"]<=8000]

In [21]:
rated_anime.head()

Unnamed: 0,user_id,name,rating
0,1,Naruto,7.81
1,3,Naruto,7.81
2,5,Naruto,7.81
3,6,Naruto,7.81
4,10,Naruto,7.81


#### now we pivot the table with users as index and animes as columns

In [22]:
pivot = rated_anime.pivot_table(index="user_id",columns="name",values="rating")

In [23]:
pivot.head()

name,&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,...,gdgd Fairies Movie: tte Iu Eiga wa Dou kana...?,iDOLM@STER Xenoglossia,iDOLM@STER Xenoglossia Specials,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,8.11,,,,,


***

### engineering the pivot table

#### replacing NaN values with 0

In [24]:
pivot = pivot.fillna(0)

#### removing anime with no ratings

In [25]:
pivot = pivot.T

In [26]:
pivot.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,7991,7992,7993,7994,7995,7996,7997,7998,7999,8000
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Memoire,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Movie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//G.U. Returner,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//G.U. Trilogy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
pivot = pivot.loc[:,(pivot != 0).any(axis=0)]

### Fitting to similarity

In [28]:
from sklearn.metrics.pairwise import cosine_similarity
import scipy as sp

In [29]:
piv_sparse = sp.sparse.csr_matrix(pivot.values)

In [32]:
anime_similarity = cosine_similarity(piv_sparse)

#Df of anime similarities
ani_sim_df = pd.DataFrame(anime_similarity, index = pivot.index, columns = pivot.index)

In [33]:
ani_sim_df.head()

name,&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,...,gdgd Fairies Movie: tte Iu Eiga wa Dou kana...?,iDOLM@STER Xenoglossia,iDOLM@STER Xenoglossia Specials,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,1.0,0.648924,0.585662,0.101093,0.072894,0.063112,0.08573,0.083794,0.056505,0.145795,...,0.0,0.069673,0.040226,0.028587,0.185795,0.194593,0.193696,0.252794,0.236317,0.0
&quot;Bungaku Shoujo&quot; Memoire,0.648924,1.0,0.59952,0.123877,0.099248,0.070305,0.127336,0.114089,0.089922,0.133753,...,0.0,0.058211,0.044811,0.038215,0.15523,0.163902,0.166454,0.227104,0.184989,0.0
&quot;Bungaku Shoujo&quot; Movie,0.585662,0.59952,1.0,0.131854,0.112241,0.090033,0.12424,0.129025,0.111864,0.160796,...,0.0,0.042597,0.032791,0.051268,0.160923,0.166366,0.153385,0.21272,0.171814,0.0
.hack//G.U. Returner,0.101093,0.123877,0.131854,1.0,0.666246,0.558579,0.476088,0.436256,0.462283,0.3572,...,0.0,0.054411,0.041885,0.101207,0.124943,0.123549,0.115249,0.067928,0.113057,0.0
.hack//G.U. Trilogy,0.072894,0.099248,0.112241,0.666246,1.0,0.513336,0.445005,0.419425,0.451181,0.364882,...,0.0,0.058124,0.033558,0.15263,0.14208,0.130661,0.110803,0.081635,0.106564,0.0


In [36]:
ani_sim_df["Naruto"].sort_values(ascending=False)

name
Naruto                                1.000000
Death Note                            0.607310
Fullmetal Alchemist                   0.548035
Code Geass: Hangyaku no Lelouch       0.540799
Code Geass: Hangyaku no Lelouch R2    0.520423
                                        ...   
Hermes: Ai wa Kaze no Gotoku          0.000000
Hey! Bumboo                           0.000000
Hi no Tori: Uchuu-hen                 0.000000
Hibari no Yadogae                     0.000000
Hana to Mogura                        0.000000
Name: Naruto, Length: 8203, dtype: float64

In [45]:
def anime_recc():
#     while True:
    ani_name = input("What anime do you like? ")
    number = 1
    print('Recommended because you watched {}:\n'.format(ani_name))
    for anime in ani_sim_df.sort_values(by = ani_name, ascending = False).index[1:6]:
        print(f'#{number}: {anime}, {round(ani_sim_df[anime][ani_name]*100,2)}% match')
        number +=1  

In [47]:
anime_recc()

What anime do you like? Dragon Ball Z
Recommended because you watched Dragon Ball Z:

#1: Dragon Ball, 80.53% match
#2: Dragon Ball GT, 77.45% match
#3: Naruto, 51.97% match
#4: Death Note, 49.48% match
#5: Fullmetal Alchemist, 47.95% match
