<h1>Explore anime dataset from Kaggle<h1>

57M ratings on over 17k anime

In [88]:
import pandas as pd
import pickle

In [89]:
df_items = pd.read_csv("/home/diego/chat-rerank/dataset/anime/anime.csv")
df_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17562 entries, 0 to 17561
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MAL_ID         17562 non-null  int64 
 1   Name           17562 non-null  object
 2   Score          17562 non-null  object
 3   Genres         17562 non-null  object
 4   English name   17562 non-null  object
 5   Japanese name  17562 non-null  object
 6   Type           17562 non-null  object
 7   Episodes       17562 non-null  object
 8   Aired          17562 non-null  object
 9   Premiered      17562 non-null  object
 10  Producers      17562 non-null  object
 11  Licensors      17562 non-null  object
 12  Studios        17562 non-null  object
 13  Source         17562 non-null  object
 14  Duration       17562 non-null  object
 15  Rating         17562 non-null  object
 16  Ranked         17562 non-null  object
 17  Popularity     17562 non-null  int64 
 18  Members        17562 non-n

In [119]:
df_ratings = pd.read_csv("/home/diego/chat-rerank/dataset/anime/rating_complete.csv")
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57633278 entries, 0 to 57633277
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 1.3 GB


<h3>Filtering the dataset<h3>

Items:
- Remove duplicates
- Remove items where "English name" is unknown
- Remove recent items (older than 2020). This is because language models aren't updated with fresh info (chatgpt up to Sept 2021)
- Remove items with unknown premiered

Ratings:
- Remove users with less than 70 and more than 300 ratings
- Remove users with less than 30 positive ratings
- Map ratings into 1-5 stars

In [95]:
df_items.sort_values(by="Popularity", ascending=True, inplace=True)
df_items.drop_duplicates(subset=["English name"], keep="first", inplace=True)
len(df_items)

2762

In [96]:
df_items = df_items[df_items["English name"] != "Unknown"]
len(df_items)

2762

In [97]:
df_items = df_items[df_items["Premiered"] != "Unknown"]
len(df_items)

2762

In [98]:
df_items["prem_year"] = df_items.apply(lambda x:int(x["Premiered"][-4:]), axis=1)
df_items.head(1)

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,prem_year
1393,1535,Death Note,8.63,"Mystery, Police, Psychological, Supernatural, ...",Death Note,デスノート,TV,37,"Oct 4, 2006 to Jun 27, 2007",Fall 2006,...,535252.0,415890.0,201522.0,68577.0,28048.0,10462.0,3692.0,2256.0,3586.0,2006


In [99]:
df_items = df_items[df_items["prem_year"] < 2021]
len(df_items)

2711

In [100]:
df_items = df_items[["MAL_ID", "English name", "Popularity", "Genres", "prem_year"]]
df_items.columns

Index(['MAL_ID', 'English name', 'Popularity', 'Genres', 'prem_year'], dtype='object')

In [101]:
df_items = df_items.rename(columns={"MAL_ID": "item_id", "English name": "item_name", "Popularity": "pop_score"})
df_items.columns

Index(['item_id', 'item_name', 'pop_score', 'Genres', 'prem_year'], dtype='object')

In [102]:
df_items["genres"] = df_items.apply(lambda x: x["Genres"].split(", "), axis=1)
del df_items["Genres"]
df_items.head(1)

Unnamed: 0,item_id,item_name,pop_score,prem_year,genres
1393,1535,Death Note,1,2006,"[Mystery, Police, Psychological, Supernatural,..."


In [103]:
len(df_items["item_id"].unique())

2711

In [104]:
out_dir = "/home/diego/chat-rerank/experiments/anime/"

In [105]:
df_items.to_csv(f"{out_dir}df_items.csv", sep=",", index=False)

In [106]:
itemid_to_name = {}
for i, n in zip(df_items["item_id"].values, df_items["item_name"].values):
    itemid_to_name[i] = n
itemname_to_id = {v: k for k, v in itemid_to_name.items()}
with open(f"{out_dir}itemid_to_name.pkl", 'wb') as fp:
    pickle.dump(itemid_to_name, fp)
with open(f"{out_dir}itemname_to_id.pkl", 'wb') as fp:
    pickle.dump(itemname_to_id, fp)

In [107]:
# prepare generes file
out_string = ""
for i, row in df_items.iterrows():
    for genre in row["genres"]:
        if row['item_name'] in itemname_to_id:
            out_string += f"{itemname_to_id[row['item_name']]}\t{genre}\n"
        else:
            print(row)

with open(f"{out_dir}genres_file.txt", "w") as text_file:
    text_file.write(out_string)

In [108]:
genres = set()
for g in df_items["genres"].values:
    genres = genres.union(g)
print(len(genres))
genres

40


{'Action',
 'Adventure',
 'Cars',
 'Comedy',
 'Dementia',
 'Demons',
 'Drama',
 'Ecchi',
 'Fantasy',
 'Game',
 'Harem',
 'Historical',
 'Horror',
 'Josei',
 'Kids',
 'Magic',
 'Martial Arts',
 'Mecha',
 'Military',
 'Music',
 'Mystery',
 'Parody',
 'Police',
 'Psychological',
 'Romance',
 'Samurai',
 'School',
 'Sci-Fi',
 'Seinen',
 'Shoujo',
 'Shoujo Ai',
 'Shounen',
 'Shounen Ai',
 'Slice of Life',
 'Space',
 'Sports',
 'Super Power',
 'Supernatural',
 'Thriller',
 'Vampire'}

In [120]:
df_ratings.rename(columns={"anime_id": "item_id"}, inplace=True)

In [121]:
df_ratings = df_ratings[df_ratings["item_id"].isin(df_items["item_id"].values)]
len(df_ratings)

35080540

In [122]:
upper_bound = 300
lower_bound = 70
to_keep = df_ratings['user_id'].value_counts()[lambda x: (x>lower_bound) & (x<upper_bound)].index.to_list()

In [123]:
df_ratings = df_ratings[df_ratings["user_id"].isin(to_keep)]
len(df_ratings)

20093374

In [124]:
ratings_mapper = {1: 1, 2:1, 3: 2, 4:2, 5: 3, 6:3, 7: 4, 8:1, 9: 5, 10:5}
df_ratings["rating"] = df_ratings.apply(lambda x:ratings_mapper[x["rating"]], axis=1)
df_ratings['rating'].value_counts()

5    6507912
1    5631614
4    4407961
3    2958053
2     587834
Name: rating, dtype: int64

In [125]:
df_ratings["is_pos"] = df_ratings.apply(lambda x:x["rating"] > 3, axis=1)
df_ratings['is_pos'].value_counts()

True     10915873
False     9177501
Name: is_pos, dtype: int64

In [126]:
df_ratings = df_ratings.groupby(["user_id", "is_pos"]).filter(lambda x: x["is_pos"].count() > 50)
len(df_ratings)

17040914

In [127]:
print(len(df_ratings["item_id"].unique()))
print(len(df_ratings["user_id"].unique()))

2627
118614


In [128]:
df_items.sort_values("prem_year", ascending=False, inplace=True)
df_items.head(10)

Unnamed: 0,item_id,item_name,pop_score,prem_year,genres
15947,40059,Golden Kamuy Season 3,1736,2020,"[Action, Adventure, Historical, Seinen]"
15803,39790,Adachi and Shimamura,1342,2020,"[Slice of Life, Romance, School, Shoujo Ai]"
16131,40392,Smile Down the Runway,1646,2020,"[Slice of Life, Drama, School, Shounen]"
15418,39184,A3! Season Spring & Summer,3660,2020,"[Slice of Life, Drama]"
16264,40623,SUPER HXEROS,1693,2020,"[Action, Comedy, Supernatural, Ecchi, School, ..."
16245,40591,Kaguya-sama:Love is War Season 2,151,2020,"[Comedy, Psychological, Romance, School, Seinen]"
15130,38790,"BOFURI:I Don't Want to Get Hurt, so I'll Max O...",516,2020,"[Action, Game, Sci-Fi, Adventure, Comedy, Fant..."
16732,41638,Bite-Choicar,16027,2020,"[Cars, Kids]"
16426,40902,Food Wars! The Fifth Plate,524,2020,"[Ecchi, School, Shounen]"
15614,39463,Gleipnir,533,2020,"[Action, Mystery, Supernatural, Ecchi, Seinen]"


In [130]:
df_items.sort_values("pop_score", ascending=False, inplace=True)
df_items.head(10)

Unnamed: 0,item_id,item_name,pop_score,prem_year,genres
16854,42144,Jing-Ju Cats 2,17353,2017,"[Action, Adventure, Comedy, Kids, Fantasy]"
17062,42660,Robocar Poli 4,17258,2015,"[Cars, Comedy, Kids]"
17058,42654,Robocar Poli 3,17229,2014,"[Cars, Comedy, Kids]"
16856,42146,Jing-Ju Cats 3,17213,2018,"[Action, Adventure, Comedy, Kids, Martial Arts..."
16059,40273,Fuwa,17171,2007,[Historical]
17086,42740,Screechers Wild!,17160,2016,"[Action, Adventure, Cars]"
16740,41667,MONKART,17143,2017,"[Action, Adventure, Fantasy, Kids]"
16731,41635,GG Bond Season 16:Racing,17111,2019,"[Sci-Fi, Cars, Space, Super Power, Kids]"
17207,42998,Seven Lucky Gods,17039,2020,"[Comedy, Kids, Supernatural]"
14581,37941,Cocomong,16903,2008,[Kids]


In [None]:
out_dir = "/home/diego/chat-reranking/dataset/anime/"
df_ratings.to_csv(f"{out_dir}ratings17M.csv")

DATASET STATISTICS:
- 17M ratings
- 2627 items
- 118614 users
- 143 ratings per user
- 6486 ratings per item
- 40 genres
- 94.5% sparsity 