In [1]:
import os # paths to file
import numpy as np # math
import pandas as pd # data processes
import warnings # warnings
import scipy as sp # pivot engineering

from sklearn.metrics.pairwise import cosine_similarity # model

pd.options.display.max_columns # default theme and settings

20

In [2]:
# setting warnings
warnings.filterwarnings("always")
warnings.filterwarnings("ignore")

In [5]:
# paths and get csvs
rating_path = "kaggle/input/anime-recommendations-database/rating.csv"
anime_path = "kaggle/input/anime-recommendations-database/anime.csv"
#rating gives user data, anime gives general info
rating_df = pd.read_csv(rating_path)
anime_df = pd.read_csv(anime_path)

print(rating_df.head())
print(anime_df.head())

   user_id  anime_id  rating
0        1        20      -1
1        1        24      -1
2        1        79      -1
3        1       226      -1
4        1       241      -1
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262 

In [6]:
# printing data info
print(f"anime set (row, col): {anime_df.shape}\n\nrating set (row, col): {rating_df.shape}")
print("Anime:\n")
print(anime_df.info())
print("\n","*"*50,"\nRating:\n")
print(rating_df.info())

anime set (row, col): (12294, 7)

rating set (row, col): (7813737, 3)
Anime:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None

 ************************************************** 
Rating:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB
None


In [7]:
# finding missing values to clear
print("Anime missing values (%):\n")
print(round(anime_df.isnull().sum().sort_values(ascending=False)/len(anime_df.index),4)*100) 
print("\n","*"*50,"\n\nRating missing values (%):\n")
print(round(rating_df.isnull().sum().sort_values(ascending=False)/len(rating_df.index),4)*100)

Anime missing values (%):

rating      1.87
genre       0.50
type        0.20
members     0.00
episodes    0.00
name        0.00
anime_id    0.00
dtype: float64

 ************************************************** 

Rating missing values (%):

rating      0.0
anime_id    0.0
user_id     0.0
dtype: float64


In [8]:
# most common type and genre
print(anime_df['type'].mode())
print(anime_df['genre'].mode()) # haha mostly hentai

0    TV
dtype: object
0    Hentai
dtype: object


In [12]:
# handling missing data
# deleting anime with 0 rating
anime_df=anime_df[~np.isnan(anime_df["rating"])]

# filling mode value for genre and type (guess we're filling it with hentai and TV huh)
anime_df['genre'] = anime_df['genre'].fillna(anime_df['genre'].dropna().mode().values[0])
anime_df['type'] = anime_df['type'].fillna(anime_df['type'].dropna().mode().values[0])

#checking if all null values are filled
print(anime_df.isnull().sum())

#replace values that have -1 rating with NaN
f = (lambda x: np.nan if x==-1 else x)
rating_df['rating'] = rating_df['rating'].apply(f)
print(rating_df.head(20))

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64
    user_id  anime_id  rating
0         1        20     NaN
1         1        24     NaN
2         1        79     NaN
3         1       226     NaN
4         1       241     NaN
5         1       355     NaN
6         1       356     NaN
7         1       442     NaN
8         1       487     NaN
9         1       846     NaN
10        1       936     NaN
11        1      1546     NaN
12        1      1692     NaN
13        1      1836     NaN
14        1      2001     NaN
15        1      2025     NaN
16        1      2144     NaN
17        1      2787     NaN
18        1      2993     NaN
19        1      3455     NaN


In [16]:
# engineering dataframes
anime_df = anime_df[anime_df['type'] == 'TV']

# merges and coordinates, suffixes adds the name of things on left and on right to rating
rated_anime = rating_df.merge(anime_df, left_on = 'anime_id', right_on = 'anime_id', suffixes = ['_user', ''])
#print(rated_anime)

# isolate these columns
rated_anime = rated_anime[['user_id', 'name', 'rating']]

# limiting 
rated_anime_10000 = rated_anime[rated_anime.user_id <= 7500]
rated_anime_10000.head()

Unnamed: 0,user_id,name,rating
0,1,Naruto,7.81
1,3,Naruto,7.81
2,5,Naruto,7.81
3,6,Naruto,7.81
4,10,Naruto,7.81


In [17]:
# pivots by user_id x name and makes the values of table ratings (easier to read)
pivot = rated_anime_10000.pivot_table(index=['user_id'], columns=['name'], values='rating')
pivot.head()

name,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,3 Choume no Tama: Uchi no Tama Shirimasenka?,30-sai no Hoken Taiiku,91 Days,...,"Zone of the Enders: Dolores, I",Zukkoke Knight: Don De La Mancha,ef: A Tale of Melodies.,ef: A Tale of Memories.,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,6.49,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,8.11,


In [37]:
# value normalization through min-max scaling
f = lambda x: (x-np.mean(x))/(np.max(x)-np.min(x))
pivot_n = pivot.apply(f, axis=1) # axis changes how lambdas functions (1 is across columns)

pivot_n.fillna(0, inplace=True) # inplace true replaces data frame
pivot_n = pivot_n.transpose() # reflects the data

#drops column with values of 0
pivot_n = pivot_n.loc[:, (pivot_n != 0).any(axis=0)]

#converts to sparse matrix format (just a matrix format where lots of 0s are there to process)
piv_sparse = sp.sparse.csr_matrix(pivot_n.values)
pivot_n.head()

user_id,1,2,3,4,5,6,7,8,10,11,...,7489,7490,7491,7492,7494,7495,7496,7497,7499,7500
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Sign,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.179964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Tasogare no Udewa Densetsu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
009-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
07-Ghost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# model based on anime similarity
anime_similarity = cosine_similarity(piv_sparse)

#convert to data frame of anime similarities
ani_sim_df = pd.DataFrame(anime_similarity, index = pivot_n.index, columns = pivot_n.index)

In [39]:
# method for anime_recommendation
def anime_recommendation(ani_name):
    number = 1
    print('Recommended because you watched {}:\n'.format(ani_name))
    for anime in ani_sim_df.sort_values(by = ani_name, ascending = False).index[1:6]:
        print(f'#{number}: {anime}, {round(ani_sim_df[anime][ani_name]*100,2)}% match')
        number +=1  

In [40]:
anime_recommendation('Dragon Ball Z')

Recommended because you watched Dragon Ball Z:

#1: Dragon Ball, 79.32% match
#2: Fullmetal Alchemist, 42.81% match
#3: Death Note, 42.6% match
#4: Code Geass: Hangyaku no Lelouch, 37.64% match
#5: Yuu☆Yuu☆Hakusho, 37.39% match


In [43]:
anime_recommendation('Haikyuu!! Second Season')

Recommended because you watched Haikyuu!! Second Season:

#1: Haikyuu!!, 67.81% match
#2: Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou, 49.78% match
#3: Kuroko no Basket 3rd Season, 47.74% match
#4: Kuroko no Basket 2nd Season, 38.09% match
#5: Diamond no Ace, 36.58% match


In [46]:
anime_recommendation('Natsume Yuujinchou San')

Recommended because you watched Natsume Yuujinchou San:

#1: Natsume Yuujinchou Shi, 87.1% match
#2: Zoku Natsume Yuujinchou, 75.94% match
#3: Natsume Yuujinchou, 65.34% match
#4: Usagi Drop, 30.21% match
#5: Mushishi, 29.1% match


In [47]:
anime_recommendation('Shigatsu wa Kimi no Uso')

Recommended because you watched Shigatsu wa Kimi no Uso:

#1: Boku dake ga Inai Machi, 48.68% match
#2: Shokugeki no Souma, 48.6% match
#3: Yahari Ore no Seishun Love Comedy wa Machigatteiru. Zoku, 47.0% match
#4: Kiseijuu: Sei no Kakuritsu, 45.96% match
#5: No Game No Life, 45.59% match
