In [1]:
import os
import pandas as pd
from collections import Counter
import pickle

In [2]:
# Create the ../data directory if it is missing; an existing directory is fine.
os.makedirs(name="../data", exist_ok=True)

In [3]:
# One-time setup (disabled): uncomment the lines below to download the dataset
# with the Kaggle CLI, move the archive into ../data/, extract it there, and
# then delete the archive.
# ! kaggle datasets download -d hernan4444/anime-recommendation-database-2020
# ! mv anime-recommendation-database-2020.zip ../data/
# ! unzip ../data/anime-recommendation-database-2020.zip -d ../data/
# ! rm ../data/anime-recommendation-database-2020.zip

In [4]:
# Load the per-title metadata table.
anime_df = pd.read_csv(filepath_or_buffer="../data/anime.csv")

In [5]:
# Tally how many titles there are of each Type.
Counter(anime_df.loc[:, "Type"])

Counter({'TV': 4996,
         'OVA': 3894,
         'Movie': 3041,
         'Special': 2218,
         'ONA': 1907,
         'Music': 1469,
         'Unknown': 37})

In [6]:
# Keep only the titles whose Type is TV or OVA.
anime_df = anime_df[anime_df["Type"].isin(["TV", "OVA"])]

In [7]:
anime_df.columns

Index(['MAL_ID', 'Name', 'Score', 'Genres', 'English name', 'Japanese name',
       'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors',
       'Studios', 'Source', 'Duration', 'Rating', 'Ranked', 'Popularity',
       'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped',
       'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6',
       'Score-5', 'Score-4', 'Score-3', 'Score-2', 'Score-1'],
      dtype='object')

In [8]:
# Drop titles whose Ranked value is the "Unknown" placeholder, then convert the
# remaining values to float.
# The filter and the cast are chained through .assign() so the converted column
# is attached to a new frame rather than written into the result of a boolean
# filter, which is the chained-assignment pattern that can raise pandas'
# SettingWithCopyWarning.
anime_df = anime_df.loc[anime_df["Ranked"] != "Unknown"].assign(
    Ranked=lambda ranked_only: ranked_only["Ranked"].astype(float)
)

In [9]:
# Drop titles whose Score value is the "Unknown" placeholder, then convert the
# remaining values to float.
# The filter and the cast are chained through .assign() so the converted column
# is attached to a new frame rather than written into the result of a boolean
# filter, which is the chained-assignment pattern that can raise pandas'
# SettingWithCopyWarning.
anime_df = anime_df.loc[anime_df["Score"] != "Unknown"].assign(
    Score=lambda scored_only: scored_only["Score"].astype(float)
)

In [10]:
# Drop titles that nobody has added to their favorites (Favorites == 0).
anime_df = anime_df.loc[anime_df["Favorites"].ne(0)]

In [11]:
# Keep only the seven listed columns and give three of them
# underscore-separated names; MAL_ID becomes anime_id, the key used by the
# later merge with the ratings.
anime_df = anime_df.loc[
    :, ["MAL_ID", "Name", "English name", "Japanese name", "Aired", "Score", "Ranked"]
].rename(
    columns={
        "MAL_ID": "anime_id",
        "English name": "English_name",
        "Japanese name": "Japanese_name",
    }
)

In [12]:
# Load the user-by-title rating table.
rating_df = pd.read_csv(filepath_or_buffer="../data/rating_complete.csv")

In [13]:
# Attach the cleaned title metadata to every rating; the inner join discards
# ratings for titles that were filtered out of anime_df.
df = rating_df.merge(anime_df, how="inner", on="anime_id")

In [14]:
# Display the (rows, columns) size of the merged frame.
df.shape

(43167749, 9)

In [15]:
# Count the missing values in each column of the merged frame.
df.isna().sum()

user_id          0
anime_id         0
rating           0
Name             0
English_name     0
Japanese_name    0
Aired            0
Score            0
Ranked           0
dtype: int64

In [16]:
# The user will be asked to enter their favorite anime in the prompt, so ...
# NOTE(review): the original comment stops mid-sentence. Presumably the idea
# was to keep only high ratings (>= 8.0); the filter below is currently
# disabled — TODO confirm whether it should be applied.
# df = df[df["rating"] >= 8.0]

In [17]:
# Persist the merged ratings + metadata frame as a pickle file.
df.to_pickle(path="../data/trainset.pkl")