In [46]:
# Import libraries
import pandas as pd
import numpy as np

# Change pandas settings so we can see the all columns in the dataframe
pd.set_option('max_columns', 99)

# Preprocessing Data

In [75]:
anime_df = pd.read_csv("cleaned_anime.csv")
anime_df.head()

Unnamed: 0,anime_id,name,genre
0,28891,Haikyuu!! Second Season,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun..."
1,23273,Shigatsu wa Kimi no Uso,"['Drama', 'Music', 'Romance', 'School', 'Shoun..."
2,34599,Made in Abyss,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F..."
3,5114,Fullmetal Alchemist: Brotherhood,"['Action', 'Military', 'Adventure', 'Comedy', ..."
4,31758,Kizumonogatari III: Reiketsu-hen,"['Action', 'Mystery', 'Supernatural', 'Vampire']"


In [76]:
anime_df = anime_df[anime_df["genre"]  != "['']"]

In [77]:
anime_df["genre"] = anime_df["genre"].str.replace("'", "").str.strip("][").str.split(", ")
type(anime_df["genre"][0])

list

In [78]:
anime_df.head()

Unnamed: 0,anime_id,name,genre
0,28891,Haikyuu!! Second Season,"[Comedy, Sports, Drama, School, Shounen]"
1,23273,Shigatsu wa Kimi no Uso,"[Drama, Music, Romance, School, Shounen]"
2,34599,Made in Abyss,"[Sci-Fi, Adventure, Mystery, Drama, Fantasy]"
3,5114,Fullmetal Alchemist: Brotherhood,"[Action, Military, Adventure, Comedy, Drama, M..."
4,31758,Kizumonogatari III: Reiketsu-hen,"[Action, Mystery, Supernatural, Vampire]"


In [79]:
# Using scikit learn's MLB package to one hot encode the genres
from sklearn.preprocessing import MultiLabelBinarizer

# Code from https://stackoverflow.com/questions/45312377/how-to-one-hot-encode-from-a-pandas-column-containing-a-list
mlb = MultiLabelBinarizer(sparse_output=True)

anime_df = anime_df.join(pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(anime_df["genre"]),
                index=anime_df.index,
                columns=mlb.classes_))

# Drop the origininal genre column
anime_df.drop("genre", axis=1, inplace=True)

In [80]:
anime_df.head()

Unnamed: 0,anime_id,name,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,Hentai,Historical,Horror,Josei,Kids,Magic,Martial Arts,Mecha,Military,Music,Mystery,Parody,Police,Psychological,Romance,Samurai,School,Sci-Fi,Seinen,Shoujo,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,28891,Haikyuu!! Second Season,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
1,23273,Shigatsu wa Kimi no Uso,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,34599,Made in Abyss,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,5114,Fullmetal Alchemist: Brotherhood,1,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,31758,Kizumonogatari III: Reiketsu-hen,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0


In [64]:
anime_df.loc[569, :]

anime_id                                        40080
name             Quanzhi Gaoshou zhi Dianfeng Rongyao
                                                    1
Action                                              0
Adventure                                           0
Cars                                                0
Comedy                                              0
Dementia                                            0
Demons                                              0
Drama                                               0
Ecchi                                               0
Fantasy                                             0
Game                                                0
Harem                                               0
Hentai                                              0
Historical                                          0
Horror                                              0
Josei                                               0
Kids                        

In [53]:
rating_df = pd.read_csv("cleaned_rating.csv")
rating_df.head()

Unnamed: 0,user,anime_id,rating
0,-----noname-----,18441,2
1,-----noname-----,2025,4
2,---SnowFlake---,1535,6
3,---was-----,10110,8
4,--EYEPATCH--,35839,10


In [54]:
rating_df["user"].value_counts()

Sidewinder51    611
Stark700        537
ktulu007        482
LegendAqua      442
ggultra2764     355
               ... 
caldxm            1
liabia            1
Marii-nyan        1
KykyGhibli        1
aaronschmit       1
Name: user, Length: 47885, dtype: int64

In [55]:
import random
random.seed(5)
random.choice(rating_df["user"])

'Vaenny'

In [56]:
user_df = rating_df[rating_df["user"]=="Vaenny"]

# Reset the indexes
user_df.reset_index(drop=True, inplace=True)
# Drop the columns that are not needed
user_df = user_df.drop("user", axis=1)
user_df

Unnamed: 0,anime_id,rating
0,15583,6
1,8675,9
2,1824,8
3,23289,8
4,9253,9
5,19815,8
6,30014,7
7,37281,6
8,10417,8
9,8861,9


In [57]:
user_df[user_df["anime_id"] == 38826]

Unnamed: 0,anime_id,rating
46,38826,9


In [58]:
user_genre_df = anime_df[anime_df["anime_id"].isin(user_df["anime_id"])]
user_genre_df.head()

Unnamed: 0,anime_id,name,Unnamed: 3,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,Hentai,Historical,Horror,Josei,Kids,Magic,Martial Arts,Mecha,Military,Music,Mystery,Parody,Police,Psychological,Romance,Samurai,School,Sci-Fi,Seinen,Shoujo,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
72,28999,Charlotte,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
197,32093,Tanaka-kun wa Itsumo Kedaruge,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
291,29803,Overlord,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
335,23289,Gekkan Shoujo Nozaki-kun,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
343,28725,Kokoro ga Sakebitagatterunda.,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [59]:
user_genre_df = user_genre_df.sort_values("anime_id")
user_genre_df.reset_index(drop=True, inplace=True)
user_genre_df.head()

Unnamed: 0,anime_id,name,Unnamed: 3,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,Hentai,Historical,Horror,Josei,Kids,Magic,Martial Arts,Mecha,Military,Music,Mystery,Parody,Police,Psychological,Romance,Samurai,School,Sci-Fi,Seinen,Shoujo,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,670,Lamune,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,811,I''s Pure,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1824,Hadashi no Gen,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3958,Kannagi,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
4,4548,Yozakura Quartet,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0


In [61]:
user_genre_df.columns

Index(['anime_id', 'name', '', 'Action', 'Adventure', 'Cars', 'Comedy',
       'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy', 'Game', 'Harem',
       'Hentai', 'Historical', 'Horror', 'Josei', 'Kids', 'Magic',
       'Martial Arts', 'Mecha', 'Military', 'Music', 'Mystery', 'Parody',
       'Police', 'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi',
       'Seinen', 'Shoujo', 'Shoujo Ai', 'Shounen', 'Shounen Ai',
       'Slice of Life', 'Space', 'Sports', 'Super Power', 'Supernatural',
       'Thriller', 'Vampire', 'Yaoi', 'Yuri'],
      dtype='object')

In [62]:
user_genre_df[""]

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
40    0
41    0
42    0
43    0
44    0
45    0
46    0
47    0
48    0
49    0
50    0
51    0
52    0
53    0
54    0
55    0
56    0
57    0
Name: , dtype: Sparse[int32, 0]

In [60]:
user_genre_matrix = user_genre_df.drop(["anime_id", "name"], axis=1)
user_genre_matrix

Unnamed: 0,Unnamed: 1,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,Hentai,Historical,Horror,Josei,Kids,Magic,Martial Arts,Mecha,Military,Music,Mystery,Parody,Police,Psychological,Romance,Samurai,School,Sci-Fi,Seinen,Shoujo,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
4,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0
5,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
7,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
