## Manga Cleaning

In [1]:
import pandas as pd
import numpy as np

# Load the raw manga records from JSON.
manga = pd.read_json('data/raw/manga.json')

# Drop Duplicates: keep the first row seen for every mal_id and report how many rows went away.
rows_before = len(manga)
manga = manga.drop_duplicates(subset=['mal_id'])
manga = manga.reset_index(drop=True)
print('Duplicates:', rows_before - len(manga))

# Remove useless columns. Each one is checked to be redundant before it is dropped:
#   publishing      -> True exactly when status == 'Publishing'
#   scored          -> same value as score (or both missing)
#   explicit_genres -> always an empty list
is_publishing = manga['status'] == 'Publishing'
assert (manga['publishing'] == is_publishing).all()

same_score = manga['score'] == manga['scored']
both_missing = manga['score'].isna() & manga['scored'].isna()
assert (same_score | both_missing).all()

assert (manga['explicit_genres'].astype('str') == '[]').all()

manga = manga.drop(columns=['publishing', 'scored', 'explicit_genres'])

# Avoid unnecessary floats: store these columns with the nullable integer dtype.
manga = manga.astype({'chapters': 'Int64', 'volumes': 'Int64', 'rank': 'Int64'})

# Simplify published dates: pull the 'from' / 'to' entries out of the nested
# 'published' column into two plain date columns, then discard the nested column.
for target, key in (('published_from', 'from'), ('published_to', 'to')):
    manga[target] = pd.to_datetime(manga['published'].str[key]).dt.date
manga = manga.drop(columns=['published'])

# A popularity of 0 is used as the marker for 'pending approval' mangas.
# rank and popularity themselves are then dropped, as they sort equal
# score / members alphabetically.
manga = (
    manga
    .assign(pending_approval=manga['popularity'] == 0)
    .drop(columns=['rank', 'popularity'])
)

# Missing synopsis and background: placeholder texts become real missing values.
synopsis_placeholders = ['', 'N/A', 'None.', '...', '.']
background_placeholders = ['', 'N/A']

manga['synopsis'] = manga['synopsis'].replace(synopsis_placeholders, np.nan)
manga['background'] = manga['background'].replace(background_placeholders, np.nan)

def _is_missing(value):
    """Return True when a cell holds no data (None or any float NaN).

    An identity test against ``np.nan`` only recognises that one object; a
    missing cell may also be ``None`` or a different NaN float, so the check
    is done by value instead (NaN is the only float that differs from itself).
    """
    return value is None or (isinstance(value, float) and value != value)


# Authors: keep (id, name) pairs; a missing cell becomes an empty list.
manga['authors'] = manga['authors'].apply(
    lambda entries: [] if _is_missing(entries)
    else [(author['mal_id'], author['name']) for author in entries]
)

# Only keep names; a missing cell becomes an empty list.
for col in ['serializations', 'genres', 'themes', 'demographics']:
    manga[col] = manga[col].apply(
        lambda entries: [] if _is_missing(entries) else [entry['name'] for entry in entries]
    )

# Replace old Serialization name
manga['serializations'] = manga['serializations'].apply(
    lambda names: ['Asuka' if name == 'Asuka (Monthly)' else name for name in names]
)

# R18+ definition by MAL (not ranked): flagged when the genre list contains Hentai or Erotica.
manga['nsfw'] = manga['genres'].apply(
    lambda names: any(genre in names for genre in ('Hentai', 'Erotica'))
)

# Simplify main_picture, delete default. Options: .jpg, t.jpg, l.jpg, .webp, t.webp, l.webp
# Only the plain .jpg URL is kept; the site-wide default icon counts as "no picture".
default_image = 'https://cdn.myanimelist.net/img/sp/icon/apple-touch-icon-256.png'
jpg_urls = manga['images'].str['jpg'].str['image_url']
manga['main_picture'] = jpg_urls.replace(default_image, np.nan)
manga = manga.drop(columns=['images'])

# Missing strings: per column, the placeholder texts that stand for "no value".
placeholders = {
    'type': [''],
    'synopsis': ['n/a', 'None'],
    'title_english': ['N/A'],
    'title_japanese': [''],
}
for column, blanks in placeholders.items():
    manga[column] = manga[column].replace(blanks, np.nan)

# Better order
order = [
    'mal_id', 'title', 'type', 'score', 'scored_by', 'status', 'volumes', 'chapters',
    'published_from', 'published_to', 'members', 'favorites', 'nsfw', 'pending_approval',
    'genres', 'themes', 'demographics', 'authors', 'serializations',
    'synopsis', 'background', 'main_picture', 'url',
    'title_english', 'title_japanese', 'title_synonyms',
]

# Sort by Top Manga: the sort key is the sum of the descending ranks of score and
# scored_by. It only lives in a helper column for the duration of the sort.
manga = (
    manga[order]
    .assign(tmp=lambda df: df['score'].rank(ascending=False) + df['scored_by'].rank(ascending=False))
    .sort_values('tmp')
    .drop(columns=['tmp'])
    .reset_index(drop=True)
)

# Save as csv (the positional index is not written).
manga.to_csv('data/manga.csv', index=False)

# Report the final (rows, columns) size.
n_rows, n_cols = manga.shape
print((n_rows, n_cols))

# Show every column when previewing the first row.
pd.options.display.max_columns = None
manga.head(1)

Duplicates: 8
(66371, 26)


Unnamed: 0,mal_id,title,type,score,scored_by,status,volumes,chapters,published_from,published_to,members,favorites,nsfw,pending_approval,genres,themes,demographics,authors,serializations,synopsis,background,main_picture,url,title_english,title_japanese,title_synonyms
0,2,Berserk,Manga,9.45,267095.0,Publishing,,,1989-08-25,NaT,548371,103266,False,False,"[Action, Adventure, Award Winning, Drama, Fant...","[Gore, Military, Mythology, Psychological]",[Seinen],"[(1868, Miura, Kentarou), (49592, Studio Gaga)]",[Young Animal],"Guts, a former mercenary now known as the ""Bla...",Berserk won the Award for Excellence at the si...,https://cdn.myanimelist.net/images/manga/1/157...,https://myanimelist.net/manga/2/Berserk,Berserk,ベルセルク,[Berserk: The Prototype]


## Load Manga

In [2]:
import pandas as pd
import ast

# Read the cleaned table back. Only the empty field is treated as missing:
# pandas' default NA tokens ('NA', 'None', 'NULL', 'nan', 'n/a', ...) would otherwise
# silently turn a title or text that is literally one of those words into NaN.
# Real missing values are written to the CSV as empty fields, so nothing is lost.
manga = pd.read_csv('data/manga.csv', keep_default_na=False, na_values=[''])

# Dates are stored as text in the CSV; restore them as date objects.
columns_dtype_datetime = ['published_from', 'published_to']
for col in columns_dtype_datetime:
    manga[col] = pd.to_datetime(manga[col]).dt.date

# Columns with missing values are parsed as floats; restore the nullable integer dtype.
columns_dtype_Int64 = ['volumes', 'chapters']
for col in columns_dtype_Int64:
    manga[col] = manga[col].astype('Int64')

# List columns are stored as their Python repr; parse them back into real lists.
# ast.literal_eval only evaluates literals, it does not execute code.
columns_dtype_list = ['genres', 'themes', 'demographics', 'authors', 'serializations', 'title_synonyms']
for col in columns_dtype_list:
    manga[col] = manga[col].apply(ast.literal_eval)

pd.options.display.max_columns = None
manga.head(1)

Unnamed: 0,mal_id,title,type,score,scored_by,status,volumes,chapters,published_from,published_to,members,favorites,nsfw,pending_approval,genres,themes,demographics,authors,serializations,synopsis,background,main_picture,url,title_english,title_japanese,title_synonyms
0,2,Berserk,Manga,9.45,267095.0,Publishing,,,1989-08-25,NaT,548371,103266,False,False,"[Action, Adventure, Award Winning, Drama, Fant...","[Gore, Military, Mythology, Psychological]",[Seinen],"[(1868, Miura, Kentarou), (49592, Studio Gaga)]",[Young Animal],"Guts, a former mercenary now known as the ""Bla...",Berserk won the Award for Excellence at the si...,https://cdn.myanimelist.net/images/manga/1/157...,https://myanimelist.net/manga/2/Berserk,Berserk,ベルセルク,[Berserk: The Prototype]
