In [1]:
# Import libraries
import pandas as pd
import numpy as np

# Change pandas settings so we can see the all columns in the dataframe
pd.set_option('max_columns', 99)

# Preprocessing Data

In [2]:
anime_df = pd.read_csv("cleaned_anime.csv")
anime_df.head()

Unnamed: 0,anime_id,name,genre
0,28891,Haikyuu!! Second Season,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun..."
1,23273,Shigatsu wa Kimi no Uso,"['Drama', 'Music', 'Romance', 'School', 'Shoun..."
2,34599,Made in Abyss,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F..."
3,5114,Fullmetal Alchemist: Brotherhood,"['Action', 'Military', 'Adventure', 'Comedy', ..."
4,31758,Kizumonogatari III: Reiketsu-hen,"['Action', 'Mystery', 'Supernatural', 'Vampire']"


In [3]:
anime_df.dropna(subset=["genre"], inplace=True)

In [4]:
anime_df["genre"] = anime_df["genre"].str.replace("'", "").str.strip("][").str.split(", ")
type(anime_df["genre"][0])

list

In [5]:
anime_df.head()

Unnamed: 0,anime_id,name,genre
0,28891,Haikyuu!! Second Season,"[Comedy, Sports, Drama, School, Shounen]"
1,23273,Shigatsu wa Kimi no Uso,"[Drama, Music, Romance, School, Shounen]"
2,34599,Made in Abyss,"[Sci-Fi, Adventure, Mystery, Drama, Fantasy]"
3,5114,Fullmetal Alchemist: Brotherhood,"[Action, Military, Adventure, Comedy, Drama, M..."
4,31758,Kizumonogatari III: Reiketsu-hen,"[Action, Mystery, Supernatural, Vampire]"


In [6]:
# Using scikit learn's MLB package to one hot encode the genres
from sklearn.preprocessing import MultiLabelBinarizer

# Code from https://stackoverflow.com/questions/45312377/how-to-one-hot-encode-from-a-pandas-column-containing-a-list
mlb = MultiLabelBinarizer(sparse_output=True)

anime_df = anime_df.join(pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(anime_df["genre"]),
                index=anime_df.index,
                columns=mlb.classes_))

# Drop the origininal genre column
anime_df.drop("genre", axis=1, inplace=True)

In [7]:
anime_df.head()

Unnamed: 0,anime_id,name,Unnamed: 3,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,Hentai,Historical,Horror,Josei,Kids,Magic,Martial Arts,Mecha,Military,Music,Mystery,Parody,Police,Psychological,Romance,Samurai,School,Sci-Fi,Seinen,Shoujo,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,28891,Haikyuu!! Second Season,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
1,23273,Shigatsu wa Kimi no Uso,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,34599,Made in Abyss,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,5114,Fullmetal Alchemist: Brotherhood,0,1,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,31758,Kizumonogatari III: Reiketsu-hen,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0


In [8]:
rating_df = pd.read_csv("cleaned_rating.csv")
rating_df.head()

Unnamed: 0,user,anime_id,rating
0,-----noname-----,18441,2
1,-----noname-----,2025,4
2,---SnowFlake---,1535,6
3,---was-----,10110,8
4,--EYEPATCH--,35839,10


In [10]:
rating_df["user"].value_counts()

Sidewinder51    611
Stark700        537
ktulu007        482
LegendAqua      442
ggultra2764     355
               ... 
caldxm            1
liabia            1
Marii-nyan        1
KykyGhibli        1
aaronschmit       1
Name: user, Length: 47885, dtype: int64

In [16]:
import random
random.seed(5)
random.choice(rating_df["user"])

'Vaenny'

In [17]:
user_df = rating_df[rating_df["user"]=="Vaenny"]

# Reset the indexes
user_df.reset_index(drop=True, inplace=True)
# Drop the columns that are not needed
user_df = user_df.drop("user", axis=1)
user_df

Unnamed: 0,anime_id,rating
0,15583,6
1,8675,9
2,1824,8
3,23289,8
4,9253,9
5,19815,8
6,30014,7
7,37281,6
8,10417,8
9,8861,9
