# Feature Engineering

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import OneHotEncoder

In [2]:
anime_db = pd.read_csv("../../data/raw/anime.csv")
ratings_db = pd.read_csv("../../data/raw/rating.csv")

In [3]:
anime_db.rename(columns= {"rating": "anime_avg_rating", "members": "anime_members", "episodes": "anime_episodes"}, inplace=True)
anime_db["genre"] = anime_db["genre"].str.split(", ", n= 1, expand=True)[0]
anime_db.dropna(subset=["anime_avg_rating", "genre"], inplace=True)
anime_db["anime_episodes"] = anime_db["anime_episodes"].apply(lambda x: np.nan if x == "Unknown" else int(x))

In [4]:
avg_impute_episodes = int(round(anime_db[anime_db["type"] == "TV" ]["anime_episodes"].mean(),0))
avg_impute_episodes

36

In [5]:
anime_db['anime_episodes'] = anime_db['anime_episodes'].fillna(avg_impute_episodes).astype(int)

In [6]:
anime_db.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12017 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   anime_id          12017 non-null  int64  
 1   name              12017 non-null  object 
 2   genre             12017 non-null  object 
 3   type              12017 non-null  object 
 4   anime_episodes    12017 non-null  int64  
 5   anime_avg_rating  12017 non-null  float64
 6   anime_members     12017 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 751.1+ KB


In [7]:
ratings_db_full = ratings_db[ratings_db["rating"] != -1].merge(anime_db.drop("name", axis=1), on="anime_id")
ratings_db_full

Unnamed: 0,user_id,anime_id,rating,genre,type,anime_episodes,anime_avg_rating,anime_members
0,1,8074,10,Action,TV,12,7.46,535892
1,3,8074,6,Action,TV,12,7.46,535892
2,5,8074,2,Action,TV,12,7.46,535892
3,12,8074,6,Action,TV,12,7.46,535892
4,14,8074,6,Action,TV,12,7.46,535892
...,...,...,...,...,...,...,...,...
6337141,69964,23585,7,Adventure,Special,2,6.14,138
6337142,69964,33659,6,Comedy,Special,1,5.15,444
6337143,72800,30738,4,Adventure,Movie,1,5.55,185
6337144,73135,8723,5,Comedy,OVA,4,5.84,264


In [9]:
watched_animes = ratings_db.groupby("user_id").count().rename(columns={"anime_id": "user_watched_animes"})["user_watched_animes"].reset_index()

In [10]:
aggregated_user_features = ratings_db_full[ratings_db_full["rating"] != -1 ].groupby("user_id").agg({ "rating": ["mean"], "anime_episodes": ["sum"], "genre": [pd.Series.mode]}).reset_index()

In [11]:
aggregated_user_features.columns = aggregated_user_features.columns.droplevel()
aggregated_user_features.columns = ["user_id", "user_avg_rating", "user_watched_episodes", "user_favorite_genre"]
user_features = aggregated_user_features.merge(watched_animes, on="user_id")

In [12]:
full_db = ratings_db_full.merge(user_features, how="left", on="user_id")

In [13]:
full_db["user_avg_episodes"] = full_db["user_watched_episodes"] / full_db["user_watched_animes"]
full_db

Unnamed: 0,user_id,anime_id,rating,genre,type,anime_episodes,anime_avg_rating,anime_members,user_avg_rating,user_watched_episodes,user_favorite_genre,user_watched_animes,user_avg_episodes
0,1,8074,10,Action,TV,12,7.46,535892,10.000000,61,Action,153,0.398693
1,3,8074,6,Action,TV,12,7.46,535892,7.565217,2874,Action,94,30.574468
2,5,8074,2,Action,TV,12,7.46,535892,4.355120,10047,Comedy,467,21.513919
3,12,8074,6,Action,TV,12,7.46,535892,8.818182,1121,Action,22,50.954545
4,14,8074,6,Action,TV,12,7.46,535892,7.195122,1638,Comedy,123,13.317073
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6337141,69964,23585,7,Adventure,Special,2,6.14,138,7.891859,16013,Action,825,19.409697
6337142,69964,33659,6,Comedy,Special,1,5.15,444,7.891859,16013,Action,825,19.409697
6337143,72800,30738,4,Adventure,Movie,1,5.55,185,5.930159,2121,Action,315,6.733333
6337144,73135,8723,5,Comedy,OVA,4,5.84,264,5.710953,12148,Action,986,12.320487


Unnamed: 0,user_id,anime_id,rating,genre,type,anime_episodes,avg_anime_rating,anime_members,user_avg_rating,user_watched_episodes,user_favorite_genre,user_watched_animes,user_avg_episodes
314,1000,8074,9,Action,TV,12,7.46,535892,8.341463,1374,Action,41,33.512195
32109,1000,11757,9,Action,TV,25,7.83,893100,8.341463,1374,Action,41,33.512195
75574,1000,20,9,Action,TV,220,7.81,683297,8.341463,1374,Action,41,33.512195
106193,1000,199,10,Adventure,Movie,1,8.93,466254,8.341463,1374,Action,41,33.512195
196852,1000,1535,10,Mystery,TV,37,8.71,1013917,8.341463,1374,Action,41,33.512195
258230,1000,3588,7,Action,TV,51,8.08,580184,8.341463,1374,Action,41,33.512195
696588,1000,30,9,Action,TV,26,8.32,461946,8.341463,1374,Action,41,33.512195
711536,1000,45,8,Action,TV,94,8.43,218928,8.341463,1374,Action,41,33.512195
859238,1000,269,10,Action,TV,366,7.95,624055,8.341463,1374,Action,41,33.512195
1144500,1000,4224,8,Comedy,TV,25,8.45,633817,8.341463,1374,Action,41,33.512195
