## Anime Cleaning

In [3]:
import pandas as pd
import numpy as np
import datetime

# Read the raw export; every step below reshapes this frame.
anime = pd.read_json('data/raw/anime_jikan.json')

# A table-specific id name is clearer once other id columns show up.
anime = anime.rename(columns={'mal_id': 'anime_id'})

# Keep the first row per anime_id and report how many repeats were removed.
rows_before = len(anime)
anime = anime.drop_duplicates(subset=['anime_id']).reset_index(drop=True)
print('Duplicates:', rows_before - len(anime))

# 'airing' merely mirrors status == 'Currently Airing' and 'explicit_genres'
# is always an empty list: verify both before discarding the columns.
is_currently_airing = anime['status'] == 'Currently Airing'
assert (anime['airing'] == is_currently_airing).all()
assert (anime['explicit_genres'].astype('str') == '[]').all()
anime = anime.drop(columns=['airing', 'explicit_genres'])

# Turn the textual placeholders ('Unknown' / 'None') into real missing values.
placeholder_by_column = {
    'type': 'Unknown',
    'source': 'Unknown',
    'duration': 'Unknown',
    'rating': 'None',
}
for column, placeholder in placeholder_by_column.items():
    anime[column] = anime[column].replace(placeholder, np.nan)

# Counts and years are whole numbers; the nullable Int64 dtype avoids floats
# while still allowing missing entries.
anime = anime.astype({'scored_by': 'Int64', 'episodes': 'Int64', 'year': 'Int64'})
# Better air dates from the string
def format_date(date):
    """Convert one side of an 'aired' string into an ISO-style date string.

    Parameters
    ----------
    date : str or missing value
        Either 'Mon D, YYYY' (e.g. 'Apr 5, 2009'), 'Mon YYYY' (e.g.
        'Apr 2009'), a bare four-digit year, or a placeholder
        ('Not available', '?', empty string, NaN, None).

    Returns
    -------
    str or float
        'YYYY-MM-DD', 'YYYY-MM' or 'YYYY' depending on how precise the
        input was; ``np.nan`` when the date is unknown.

    Raises
    ------
    ValueError
        If a non-placeholder string matches none of the layouts above.
    """
    # Any non-string (np.nan, float('nan'), None, pd.NA) is a missing value.
    # An identity test against np.nan would miss all but the first of these
    # and then crash on len().
    if not isinstance(date, str):
        return np.nan
    date = date.strip()
    if date in ('', 'Not available', '?'):
        return np.nan
    # The string length tells the three layouts apart: 'YYYY' has 4
    # characters, 'Mon YYYY' has 8, anything else must carry a day.
    if len(date) == 4:
        return date
    if len(date) == 8:
        return datetime.datetime.strptime(date, '%b %Y').strftime('%Y-%m')
    return datetime.datetime.strptime(date, '%b %d, %Y').strftime('%Y-%m-%d')
# The 'aired' string is either a single date or 'START to END'; split it once
# and reuse the pieces for both columns.
aired_string = anime['aired'].str['string']
aired_parts = aired_string.str.split(' to ')
anime['real_start_date'] = aired_parts.str[0].apply(format_date)
anime['real_end_date'] = aired_parts.str[1].apply(format_date)

# Entries with a single date end on the day they start. Match the same
# ' to ' separator used for the split (a bare 'to' substring could also occur
# inside other text); na=False keeps the mask boolean when the string is missing.
has_single_date = ~aired_string.str.contains(' to ', regex=False, na=False)
anime.loc[has_single_date, 'real_end_date'] = anime['real_start_date']

# Timestamp versions of the (possibly partial) date strings.
# NOTE(review): the real_* strings mix 'YYYY', 'YYYY-MM' and 'YYYY-MM-DD';
# recent pandas versions may need format='ISO8601' here - confirm against the
# pandas version in use.
anime['start_date'] = pd.to_datetime(anime['real_start_date'])
anime['end_date'] = pd.to_datetime(anime['real_end_date'])
anime = anime.drop(columns=['aired'])

# popularity == 0 is used to detect 'pending approval' entries; keep that as a flag.
anime['approved'] = anime['popularity'] != 0

# Drop rank and popularity: they order equal score / members alphabetically.
anime = anime.drop(columns=['rank', 'popularity'])

# Empty strings and the old default synopsis text count as missing.
old_default_synopsis = 'No synopsis has been added for this series yet. Click here to update this information.'
anime['synopsis'] = anime['synopsis'].replace('', np.nan).replace(old_default_synopsis, np.nan)
anime['background'] = anime['background'].replace('', np.nan)

# season / year describe the premiere (per the original note, only TV shows
# have them), so name them accordingly.
anime = anime.rename(columns={'season': 'premiered_season', 'year': 'premiered_year'})

# Flatten the broadcast dict into a day and a time column.
anime['broadcast_day'] = anime['broadcast'].str['day']
anime.loc[anime['broadcast'].str['string']=='Not scheduled once per week', 'broadcast_day'] = 'Other'
anime['broadcast_time'] = pd.to_datetime(anime['broadcast'].str['time']).dt.time
anime = anime.drop(columns=['broadcast'])

# Only keep the 'name' of each entry. Anything that is not a list (NaN, None)
# becomes an empty list; a type check is used because an identity test against
# np.nan would let other missing-value objects through to the comprehension.
for col in ['producers', 'licensors', 'studios', 'genres', 'themes', 'demographics']:
    anime[col] = anime[col].apply(
        lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else []
    )
anime['genres'] = anime['genres'].apply(sorted)

# Replace old theme names with their current equivalents.
old_themes_names = {'Police': 'Detective', 'Cars': 'Racing', 'Demons': 'Mythology', 'Game': 'Strategy Game'}
anime['themes'] = anime['themes'].apply(
    lambda themes: [old_themes_names.get(theme, theme) for theme in themes]
)

# Mark R18+ titles (not ranked): safe-for-work means neither genre is present.
anime['sfw'] = anime['genres'].apply(lambda genres: 'Hentai' not in genres and 'Erotica' not in genres)

# Simplify trailer to its URL. Pictures: default.jpg and prefixes sd, mq, hq, maxres
anime['trailer_url'] = anime['trailer'].str['url']
anime = anime.drop(columns=['trailer'])

# Simplify main_picture to the large JPG URL and blank out the site's default
# icon. Options: .jpg, t.jpg, l.jpg, .webp, t.webp, l.webp
default_image = 'https://cdn.myanimelist.net/img/sp/icon/apple-touch-icon-256.png'
anime['main_picture'] = anime['images'].str['jpg'].str['large_image_url'].replace(default_image, np.nan)
anime = anime.drop(columns=['images'])

# Final column order for the exported table.
order = ['anime_id', 'title', 'type', 'score', 'scored_by', 'status', 'episodes', 'start_date', 'end_date', 'source',
         'members', 'favorites', 'duration', 'rating', 'sfw', 'approved', 'premiered_season', 'premiered_year', 
         'real_start_date', 'real_end_date', 'broadcast_day', 'broadcast_time', 'genres', 'themes', 'demographics',
         'studios', 'producers', 'licensors',  'synopsis', 'background', 'main_picture', 'url', 'trailer_url',
         'title_english', 'title_japanese', 'title_synonyms']

anime = anime[order]

# Sort by Top Anime: the sort key is the sum of the descending ranks of score
# and scored_by (smaller is better). The key is attached with assign() instead
# of item assignment, because writing a new column onto the column-selected
# frame above can raise SettingWithCopyWarning.
top_key = anime['score'].rank(ascending=False) + anime['scored_by'].rank(ascending=False)
anime = (
    anime.assign(tmp=top_key)
    .sort_values('tmp')
    .drop(columns=['tmp'])
    .reset_index(drop=True)
)

# Save as csv
anime.to_csv('data/anime_jikan.csv', index=False)

print(anime.shape)

# Show every column when displaying frames from here on.
pd.options.display.max_columns = None
anime.head(1)

Duplicates: 2
(24695, 36)


Unnamed: 0,anime_id,title,type,score,scored_by,status,episodes,start_date,end_date,source,members,favorites,duration,rating,sfw,approved,premiered_season,premiered_year,real_start_date,real_end_date,broadcast_day,broadcast_time,genres,themes,demographics,studios,producers,licensors,synopsis,background,main_picture,url,trailer_url,title_english,title_japanese,title_synonyms
0,5114,Fullmetal Alchemist: Brotherhood,TV,9.13,1871260,Finished Airing,64,2009-04-05,2010-07-04,Manga,2931916,204618,24 min per ep,R - 17+ (violence & profanity),True,True,spring,2009,2009-04-05,2010-07-04,Sundays,17:00:00,"[Action, Adventure, Drama, Fantasy]",[Military],[Shounen],[Bones],"[Aniplex, Square Enix, Mainichi Broadcasting S...","[Funimation, Aniplex of America]",After a horrific alchemy experiment goes wrong...,,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...,https://www.youtube.com/watch?v=--IcmZkvL0Q,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,[Hagane no Renkinjutsushi: Fullmetal Alchemist...


## Load Anime

In [4]:
import pandas as pd
import ast

# Read the cleaned table back; a CSV round trip loses the richer dtypes, so
# they are restored below.
anime = pd.read_csv('data/anime_jikan.csv')

# Whole-number columns that may contain gaps use the nullable Int64 dtype.
anime = anime.astype({'scored_by': 'Int64', 'episodes': 'Int64', 'premiered_year': 'Int64'})

# Date columns become plain datetime.date objects.
for date_col in ('start_date', 'end_date'):
    parsed = pd.to_datetime(anime[date_col])
    anime[date_col] = parsed.dt.date

# Broadcast time becomes a datetime.time object.
broadcast_parsed = pd.to_datetime(anime['broadcast_time'])
anime['broadcast_time'] = broadcast_parsed.dt.time

# List-valued columns come back from the CSV as their string representation;
# evaluate the literals to get Python lists again.
list_columns = ['genres', 'themes', 'demographics', 'studios', 'producers', 'licensors', 'title_synonyms']
for list_col in list_columns:
    anime[list_col] = anime[list_col].map(ast.literal_eval)


anime.head(1)

Unnamed: 0,anime_id,title,type,score,scored_by,status,episodes,start_date,end_date,source,members,favorites,duration,rating,sfw,approved,premiered_season,premiered_year,real_start_date,real_end_date,broadcast_day,broadcast_time,genres,themes,demographics,studios,producers,licensors,synopsis,background,main_picture,url,trailer_url,title_english,title_japanese,title_synonyms
0,5114,Fullmetal Alchemist: Brotherhood,TV,9.13,1871260,Finished Airing,64,2009-04-05,2010-07-04,Manga,2931916,204618,24 min per ep,R - 17+ (violence & profanity),True,True,spring,2009,2009-04-05,2010-07-04,Sundays,17:00:00,"[Action, Adventure, Drama, Fantasy]",[Military],[Shounen],[Bones],"[Aniplex, Square Enix, Mainichi Broadcasting S...","[Funimation, Aniplex of America]",After a horrific alchemy experiment goes wrong...,,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...,https://www.youtube.com/watch?v=--IcmZkvL0Q,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,[Hagane no Renkinjutsushi: Fullmetal Alchemist...
