## People Cleaning

In [1]:
import pandas as pd
import numpy as np

# Load the raw dump and keep a single row per mal_id.
people_raw = pd.read_json('data/raw/people.json')
people_unique = people_raw.drop_duplicates(subset=['mal_id']).reset_index(drop=True)
print('Duplicates:', len(people_raw) - len(people_unique))

# Default picture URL; rows pointing at it are treated as having no picture.
placeholder_url = 'https://cdn.myanimelist.net/img/sp/icon/apple-touch-icon-256.png'

# Column layout of the cleaned table.
column_order = [
    'mal_id',
    'name',
    'given_name',
    'family_name',
    'alternate_names',
    'birthday',
    'favorites',
    'about',
    'website_url',
    'main_picture',
    'url',
]

people = (
    people_unique
    .assign(
        # Empty given names count as missing.
        given_name=lambda df: df['given_name'].replace('', np.nan),
        # A few birthdays are invalid (year 0001, 09XX); coerce them to NaT and move on.
        birthday=lambda df: pd.to_datetime(df['birthday'], errors='coerce'),
        # Keep only the plain .jpg URL (other variants: t.jpg, l.jpg, .webp, t.webp, l.webp)
        # and blank out the default picture.
        main_picture=lambda df: (
            df['images'].str['jpg'].str['image_url'].replace(placeholder_url, np.nan)
        ),
    )
    .drop(columns=['images'])
    .loc[:, column_order]
    # Most favorited first; ties broken by ascending mal_id.
    .sort_values(['favorites', 'mal_id'], ascending=[False, True])
    .reset_index(drop=True)
)

# Persist the cleaned table as CSV (without the positional index).
people.to_csv('data/people.csv', index=False)

print(people.shape)

people.head(3)

Duplicates: 50
(16943, 11)


Unnamed: 0,mal_id,name,given_name,family_name,alternate_names,birthday,favorites,about,website_url,main_picture,url
0,118,Hiroshi Kamiya,浩史,神谷,"[ヒロC, HiroC, Kamiyan]",1975-01-28 00:00:00+00:00,102842,"Birth place: Matsudo, Chiba Prefecture, Japan\...",,https://cdn.myanimelist.net/images/voiceactors...,https://myanimelist.net/people/118/Hiroshi_Kamiya
1,185,Kana Hanazawa,香菜,花澤,"[HanaKana, KanaHana]",1989-02-25 00:00:00+00:00,98551,"Hometown: Tokyo, Japan\nHeight: 156 cm\nBlood ...",http://www.hanazawakana-music.net/,https://cdn.myanimelist.net/images/voiceactors...,https://myanimelist.net/people/185/Kana_Hanazawa
2,65,Mamoru Miyano,真守,宮野,[],1983-06-08 00:00:00+00:00,86081,"Hometown: Saitama, Japan\nHeight: 182 cm\nWeig...",http://miyanomamoru.com/,https://cdn.myanimelist.net/images/voiceactors...,https://myanimelist.net/people/65/Mamoru_Miyano


## Load People

In [2]:
import pandas as pd
import ast

# Read the cleaned CSV and restore the two columns that CSV stores as plain text.
people = pd.read_csv('data/people.csv').assign(
    # alternate_names is parsed back into Python objects from its literal text form.
    alternate_names=lambda df: df['alternate_names'].apply(ast.literal_eval),
    # birthday is converted from text back to datetimes.
    birthday=lambda df: pd.to_datetime(df['birthday']),
)

people.head(3)

Unnamed: 0,mal_id,name,given_name,family_name,alternate_names,birthday,favorites,about,website_url,main_picture,url
0,118,Hiroshi Kamiya,浩史,神谷,"[ヒロC, HiroC, Kamiyan]",1975-01-28 00:00:00+00:00,102842,"Birth place: Matsudo, Chiba Prefecture, Japan\...",,https://cdn.myanimelist.net/images/voiceactors...,https://myanimelist.net/people/118/Hiroshi_Kamiya
1,185,Kana Hanazawa,香菜,花澤,"[HanaKana, KanaHana]",1989-02-25 00:00:00+00:00,98551,"Hometown: Tokyo, Japan\nHeight: 156 cm\nBlood ...",http://www.hanazawakana-music.net/,https://cdn.myanimelist.net/images/voiceactors...,https://myanimelist.net/people/185/Kana_Hanazawa
2,65,Mamoru Miyano,真守,宮野,[],1983-06-08 00:00:00+00:00,86081,"Hometown: Saitama, Japan\nHeight: 182 cm\nWeig...",http://miyanomamoru.com/,https://cdn.myanimelist.net/images/voiceactors...,https://myanimelist.net/people/65/Mamoru_Miyano
