<h1>Explore anime dataset from Kaggle<h1>

57M ratings on over 17k anime

In [32]:
import pandas as pd
import pickle 

In [2]:
df_items = pd.read_csv("/home/diego/chat-rerank/dataset/anime/anime.csv")
df_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17562 entries, 0 to 17561
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MAL_ID         17562 non-null  int64 
 1   Name           17562 non-null  object
 2   Score          17562 non-null  object
 3   Genres         17562 non-null  object
 4   English name   17562 non-null  object
 5   Japanese name  17562 non-null  object
 6   Type           17562 non-null  object
 7   Episodes       17562 non-null  object
 8   Aired          17562 non-null  object
 9   Premiered      17562 non-null  object
 10  Producers      17562 non-null  object
 11  Licensors      17562 non-null  object
 12  Studios        17562 non-null  object
 13  Source         17562 non-null  object
 14  Duration       17562 non-null  object
 15  Rating         17562 non-null  object
 16  Ranked         17562 non-null  object
 17  Popularity     17562 non-null  int64 
 18  Members        17562 non-n

In [3]:
df_ratings = pd.read_csv("/home/diego/chat-rerank/dataset/anime/rating_complete.csv")
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57633278 entries, 0 to 57633277
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 1.3 GB


<h3>Filtering the dataset<h3>

Items:
- Remove duplicates
- Remove items where "English name" is unknown
- Remove recent items (older than 2020). This is because language models aren't updated with fresh info (chatgpt up to Sept 2021)
- Remove items with unknown premiered

Ratings:
- Remove users with less than 70 and more than 300 ratings
- Remove users with less than 30 positive ratings
- Map ratings into 1-5 stars

In [4]:
df_items.sort_values(by="Popularity", ascending=True, inplace=True)
df_items.drop_duplicates(subset=["English name"], keep="first")
len(df_items)

17562

In [14]:
df_items = df_items[df_items["English name"] != "Unknown"]
len(df_items)

6997

In [18]:
df_items = df_items[df_items["Premiered"] != "Unknown"]
len(df_items)

2799

In [22]:
df_items["prem_year"] = df_items.apply(lambda x:int(x["Premiered"][-4:]), axis=1)
df_items.head(1)

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,prem_year
1393,1535,Death Note,8.63,"Mystery, Police, Psychological, Supernatural, ...",Death Note,デスノート,TV,37,"Oct 4, 2006 to Jun 27, 2007",Fall 2006,...,535252.0,415890.0,201522.0,68577.0,28048.0,10462.0,3692.0,2256.0,3586.0,2006


In [23]:
df_items = df_items[df_items["prem_year"] < 2021]
len(df_items)

2746

In [24]:
df_items = df_items[["MAL_ID", "English name", "Popularity", "Genres"]]
df_items.columns

Index(['MAL_ID', 'English name', 'Popularity', 'Genres'], dtype='object')

In [28]:
df_items = df_items.rename(columns={"MAL_ID": "item_id", "English name": "item_name", "Popularity": "pop_score"})
df_items.columns

Index(['item_id', 'item_name', 'pop_score', 'Genres'], dtype='object')

In [30]:
df_items["genres"] = df_items.apply(lambda x: x["Genres"].split(", "), axis=1)
del df_items["Genres"]
df_items.head(1)

Unnamed: 0,item_id,item_name,pop_score,genres
1393,1535,Death Note,1,"[Mystery, Police, Psychological, Supernatural,..."


In [31]:
len(df_items["item_id"].unique())

2746

In [38]:
out_dir = "/home/diego/chat-rerank/experiments/anime/"

In [42]:
df_items.to_csv(f"{out_dir}df_items.csv", sep=",", index=False)

In [40]:
itemid_to_name = {}
for i, n in zip(df_items["item_id"].values, df_items["item_name"].values):
    itemid_to_name[i] = n
itemname_to_id = {v: k for k, v in itemid_to_name.items()}
with open(f"{out_dir}itemid_to_name.pkl", 'wb') as fp:
    pickle.dump(itemid_to_name, fp)
with open(f"{out_dir}itemname_to_id.pkl", 'wb') as fp:
    pickle.dump(itemname_to_id, fp)

In [41]:
# prepare generes file
out_string = ""
for i, row in df_items.iterrows():
    for genre in row["genres"]:
        if row['item_name'] in itemname_to_id:
            out_string += f"{itemname_to_id[row['item_name']]}\t{genre}\n"
        else:
            print(row)

with open(f"{out_dir}genres_file.txt", "w") as text_file:
    text_file.write(out_string)

In [46]:
df_ratings.rename(columns={"anime_id": "item_id"}, inplace=True)

In [47]:
df_ratings = df_ratings[df_ratings["item_id"].isin(df_items["item_id"].values)]
len(df_ratings)

35340515