In [133]:
import pandas as pd
import numpy as np
import html

from surprise import Reader, Dataset

# Preprocess animes

In [102]:
# Load the anime metadata table (anime_id, name, genre, type, episodes,
# rating, members) from the external data folder.
anime = pd.read_csv("../data/external/anime.csv", encoding="utf-8")

In [103]:
# Preview the first rows of the freshly loaded table.
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [104]:
# Column dtypes and non-null counts, to see which columns have missing values.
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


## decode HTML entities in name (e.g. `&#039;` → `'`)

In [105]:
# Some titles are stored HTML-escaped (e.g. "Gintama&#039;"); turn the
# entities back into the characters they stand for.
anime['name'] = anime['name'].map(html.unescape)

anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama',"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


## get genre dummies

In [106]:
# Inspect the raw genre values: each one is a single ', '-separated string of tags.
anime.genre.unique()

array(['Drama, Romance, School, Supernatural',
       'Action, Adventure, Drama, Fantasy, Magic, Military, Shounen',
       'Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen',
       ..., 'Hentai, Sports', 'Drama, Romance, School, Yuri',
       'Hentai, Slice of Life'], dtype=object)

In [107]:
# Build one 0/1 indicator column per genre tag. Columns are named
# genre_<Tag>, with spaces in the tag replaced by underscores, and are
# appended to the right of the existing columns.
genre_dummies = (
    anime['genre']
    .str.get_dummies(sep=', ')
    .add_prefix('genre_')
    .rename(columns=lambda col: col.replace(" ", "_"))
)
anime = pd.concat([anime, genre_dummies], axis=1)

## remove inappropriate genres

In [108]:
# Indicator columns of the genres that must not appear in the catalogue.
excluded_genre_cols = ['genre_Hentai', 'genre_Ecchi', 'genre_Harem']

# Keep a title only when every excluded-genre indicator is 0.
mask = anime[excluded_genre_cols].eq(0).all(axis=1)

# Filter and drop in one chained expression so `anime` is rebound to a fresh
# frame. The previous `anime[mask]` followed by `drop(..., inplace=True)`
# mutated a filtered slice, which triggers pandas' SettingWithCopyWarning.
# The raw 'genre' string is dropped too, since the indicator columns replace it.
anime = anime.loc[mask].drop(columns=['genre'] + excluded_genre_cols)

anime.head()

Unnamed: 0,anime_id,name,type,episodes,rating,members,genre_Action,genre_Adventure,genre_Cars,genre_Comedy,...,genre_Shounen_Ai,genre_Slice_of_Life,genre_Space,genre_Sports,genre_Super_Power,genre_Supernatural,genre_Thriller,genre_Vampire,genre_Yaoi,genre_Yuri
0,32281,Kimi no Na wa.,Movie,1,9.37,200630,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,TV,64,9.26,793665,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,TV,51,9.25,114262,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,TV,24,9.17,673572,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,9969,Gintama',TV,51,9.16,151266,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## anime type feature

### keep only movies or TV series

In [109]:
# Restrict the catalogue to the two formats we care about; rows with any
# other (or missing) type are discarded.
mask = anime['type'].isin(['Movie', 'TV'])
anime = anime.loc[mask]

anime.head()

Unnamed: 0,anime_id,name,type,episodes,rating,members,genre_Action,genre_Adventure,genre_Cars,genre_Comedy,...,genre_Shounen_Ai,genre_Slice_of_Life,genre_Space,genre_Sports,genre_Super_Power,genre_Supernatural,genre_Thriller,genre_Vampire,genre_Yaoi,genre_Yuri
0,32281,Kimi no Na wa.,Movie,1,9.37,200630,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,TV,64,9.26,793665,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,TV,51,9.25,114262,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,TV,24,9.17,673572,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,9969,Gintama',TV,51,9.16,151266,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### get anime type dummies

In [110]:
# One indicator column per remaining type value (type_Movie, type_TV).
# The dtype is pinned explicitly: pandas >= 2.0 makes get_dummies return
# bool columns by default, which would turn the 0/1 indicators into
# True/False and make them inconsistent with the integer genre_* columns.
# np.uint8 is what older pandas versions produced by default.
type_dummies = pd.get_dummies(anime['type'], prefix='type', dtype=np.uint8)

# Replace the categorical 'type' column with its indicator columns.
anime = pd.concat([anime.drop(columns=['type']), type_dummies], axis=1)

anime.head()

Unnamed: 0,anime_id,name,episodes,rating,members,genre_Action,genre_Adventure,genre_Cars,genre_Comedy,genre_Dementia,...,genre_Space,genre_Sports,genre_Super_Power,genre_Supernatural,genre_Thriller,genre_Vampire,genre_Yaoi,genre_Yuri,type_Movie,type_TV
0,32281,Kimi no Na wa.,1,9.37,200630,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,5114,Fullmetal Alchemist: Brotherhood,64,9.26,793665,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,28977,Gintama°,51,9.25,114262,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,9253,Steins;Gate,24,9.17,673572,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,9969,Gintama',51,9.16,151266,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


## concat additional anime metadata - release date

In [111]:
# Attach the release-date metadata (adds a 'year' column) by anime_id.
# This is an inner join: titles without a row in the release-date file
# are dropped from `anime`.
release_dates = pd.read_csv('../data/raw/anime_dates.csv')
anime = pd.merge(anime, release_dates, how='inner', on='anime_id')

anime.head()

Unnamed: 0,anime_id,name,episodes,rating,members,genre_Action,genre_Adventure,genre_Cars,genre_Comedy,genre_Dementia,...,genre_Sports,genre_Super_Power,genre_Supernatural,genre_Thriller,genre_Vampire,genre_Yaoi,genre_Yuri,type_Movie,type_TV,year
0,32281,Kimi no Na wa.,1,9.37,200630,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,2016.0
1,5114,Fullmetal Alchemist: Brotherhood,64,9.26,793665,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,2009.0
2,28977,Gintama°,51,9.25,114262,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,2015.0
3,9253,Steins;Gate,24,9.17,673572,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,2011.0
4,9969,Gintama',51,9.16,151266,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,2011.0


## set anime_id as dataframe index

In [112]:
# Use anime_id as the row label so later steps can align on it
# (DataFrame.update, .loc lookups, ratings filtering).
anime = anime.set_index('anime_id', drop=True)

anime.head()

Unnamed: 0_level_0,name,episodes,rating,members,genre_Action,genre_Adventure,genre_Cars,genre_Comedy,genre_Dementia,genre_Demons,...,genre_Sports,genre_Super_Power,genre_Supernatural,genre_Thriller,genre_Vampire,genre_Yaoi,genre_Yuri,type_Movie,type_TV,year
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
32281,Kimi no Na wa.,1,9.37,200630,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,2016.0
5114,Fullmetal Alchemist: Brotherhood,64,9.26,793665,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2009.0
28977,Gintama°,51,9.25,114262,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,2015.0
9253,Steins;Gate,24,9.17,673572,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,2011.0
9969,Gintama',51,9.16,151266,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,2011.0


## episodes feature

### concat additional anime metadata - update episodes

In [113]:
# Spot check before the update: title 1735 still has the placeholder 'Unknown'.
anime.loc[1735].episodes

'Unknown'

In [114]:
# Supplementary episode counts, indexed by anime_id so they line up with
# `anime` when passed to DataFrame.update.
episode_data = pd.read_csv('../data/raw/anime_episodes.csv', index_col='anime_id')
episode_data

Unnamed: 0_level_0,episodes
anime_id,Unnamed: 1_level_1
21,
235,
1735,500.0
21639,148.0
8687,
...,...
34076,12.0
32924,12.0
34522,12.0
34467,13.0


In [115]:
# Overwrite `episodes` in place with the supplementary values, aligned on
# anime_id. DataFrame.update only copies non-NaN values, so titles whose
# supplementary count is missing keep their current value ('Unknown').
anime.update(episode_data)

In [116]:
# Same spot check after the update: the placeholder has been replaced.
anime.loc[1735].episodes

500.0

### create new stillAiring feature based on episodes

In [117]:
# A title whose episode count is still the 'Unknown' placeholder after the
# update is flagged as still airing.
anime['stillAiring'] = anime['episodes'].eq('Unknown')

anime.head()

Unnamed: 0_level_0,name,episodes,rating,members,genre_Action,genre_Adventure,genre_Cars,genre_Comedy,genre_Dementia,genre_Demons,...,genre_Super_Power,genre_Supernatural,genre_Thriller,genre_Vampire,genre_Yaoi,genre_Yuri,type_Movie,type_TV,year,stillAiring
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
32281,Kimi no Na wa.,1,9.37,200630,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,2016.0,False
5114,Fullmetal Alchemist: Brotherhood,64,9.26,793665,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,2009.0,False
28977,Gintama°,51,9.25,114262,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,2015.0,False
9253,Steins;Gate,24,9.17,673572,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,2011.0,False
9969,Gintama',51,9.16,151266,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,2011.0,False


### cast episodes to float

In [118]:
# Turn the 'Unknown' placeholder into NaN, then convert the whole column
# (a mix of numeric strings and floats at this point) to float64.
anime['episodes'] = (
    anime['episodes']
    .replace('Unknown', np.nan)
    .astype("float64")
)

anime.head()

Unnamed: 0_level_0,name,episodes,rating,members,genre_Action,genre_Adventure,genre_Cars,genre_Comedy,genre_Dementia,genre_Demons,...,genre_Super_Power,genre_Supernatural,genre_Thriller,genre_Vampire,genre_Yaoi,genre_Yuri,type_Movie,type_TV,year,stillAiring
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
32281,Kimi no Na wa.,1.0,9.37,200630,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,2016.0,False
5114,Fullmetal Alchemist: Brotherhood,64.0,9.26,793665,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,2009.0,False
28977,Gintama°,51.0,9.25,114262,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,2015.0,False
9253,Steins;Gate,24.0,9.17,673572,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,2011.0,False
9969,Gintama',51.0,9.16,151266,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,2011.0,False


### remove outliers - anime with too many episodes

In [119]:
# Titles with more episodes than this are treated as outliers and removed.
MAX_EPISODES = 500
# Doraemon is deliberately kept as an exception to the episode cap.
DORAEMON_ID = 2471

within_cap = anime['episodes'] <= MAX_EPISODES
is_doraemon = anime.index == DORAEMON_ID
# NaN episodes means the title is still airing; those rows are kept as well.
still_airing = anime['episodes'].isna()

# .copy() gives an independent frame: later cells modify `anime` in place
# (DataFrame.update, dropna(inplace=True)), and doing that on a
# boolean-filtered slice triggers pandas' SettingWithCopyWarning.
anime = anime[within_cap | is_doraemon | still_airing].copy()

## update rating feature

In [120]:
# Spot check before the update: title 34096 has no rating yet.
anime.loc[34096].rating

nan

In [121]:
# Supplementary scores, indexed by anime_id. The 'scores' column is renamed
# to 'rating' so that DataFrame.update matches it to anime['rating'].
scores_data = (
    pd.read_csv('../data/raw/anime_scores.csv')
    .set_index('anime_id')
    .rename(columns={'scores': 'rating'})
)
scores_data

Unnamed: 0_level_0,rating
anime_id,Unnamed: 1_level_1
34502,6.26
34309,
34096,8.98
34134,7.51
25777,8.51
...,...
32222,6.19
34471,6.85
34284,7.70
34445,7.68


In [122]:
# Overwrite `rating` in place with the supplementary scores, aligned on
# anime_id. NaN scores are skipped, so existing ratings are never blanked.
anime.update(scores_data)

In [123]:
# Same spot check after the update: the missing rating has been filled in.
anime.loc[34096].rating

8.98

## remove rows without mean rating or release date

In [124]:
# Drop titles that still lack a mean rating or a release year.
# Rows with NaN in `episodes` are kept on purpose: NaN there means the title
# is still airing.
# The result is rebound instead of using inplace=True: `anime` comes from a
# boolean filter, and an in-place dropna on such a slice triggers pandas'
# SettingWithCopyWarning.
anime = anime.dropna(subset=['rating', 'year'])

anime.head()

Unnamed: 0_level_0,name,episodes,rating,members,genre_Action,genre_Adventure,genre_Cars,genre_Comedy,genre_Dementia,genre_Demons,...,genre_Super_Power,genre_Supernatural,genre_Thriller,genre_Vampire,genre_Yaoi,genre_Yuri,type_Movie,type_TV,year,stillAiring
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
32281,Kimi no Na wa.,1.0,9.37,200630,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,2016.0,False
5114,Fullmetal Alchemist: Brotherhood,64.0,9.26,793665,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,2009.0,False
28977,Gintama°,51.0,9.25,114262,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,2015.0,False
9253,Steins;Gate,24.0,9.17,673572,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,2011.0,False
9969,Gintama',51.0,9.16,151266,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,2011.0,False


# Preprocess ratings

In [125]:
# Load the user ratings table (user_id, anime_id, rating).
ratings = pd.read_csv("../data/external/rating.csv")

In [126]:
# Discard rows whose rating is the sentinel -1.
# NOTE(review): presumably -1 marks "watched but not rated" - confirm against
# the dataset description.
ratings = ratings.loc[ratings['rating'].ne(-1)]

In [127]:
# Keep only ratings for titles that survived the anime preprocessing above
# (anime is indexed by anime_id).
ratings = ratings.loc[ratings['anime_id'].isin(anime.index)]

In [128]:
# Display the filtered ratings table.
ratings

Unnamed: 0,user_id,anime_id,rating
83,1,11757,10
153,2,11771,10
156,3,20,8
157,3,154,6
158,3,170,9
...,...,...,...
7813731,73515,14345,7
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
