## Manga Cleaning

In [3]:
import pandas as pd
import numpy as np
import datetime

# Load the raw dump; 'mal_id' becomes 'manga_id' so the key stays
# unambiguous once other kinds of ids sit next to it.
manga = pd.read_json('data/raw/manga_jikan.json').rename(columns={'mal_id': 'manga_id'})

# De-duplicate on the id (first occurrence wins) and report how many rows went away.
old_size = len(manga)
manga = manga.drop_duplicates(subset=['manga_id']).reset_index(drop=True)
print('Duplicates:', old_size - len(manga))

# Three columns carry no extra information; prove it before dropping them:
#   - 'publishing' is just status == 'Publishing'
#   - 'scored' repeats 'score' (equal, or both missing)
#   - 'explicit_genres' is an empty list on every row
status_is_publishing = manga['status'] == 'Publishing'
assert (manga['publishing'] == status_is_publishing).all()
both_scores_missing = manga['score'].isna() & manga['scored'].isna()
assert ((manga['score'] == manga['scored']) | both_scores_missing).all()
assert (manga.explicit_genres.astype('str') == '[]').all()
manga = manga.drop(columns=['publishing', 'scored', 'explicit_genres'])

# Counts are whole numbers with gaps: nullable Int64 avoids float columns.
manga = manga.astype({name: 'Int64' for name in ['scored_by', 'chapters', 'volumes']})
# Better publication dates from the string
def format_date(date):
    """Normalise one side of a 'published' date string.

    The format is chosen from the length of the string:

    * ``'YYYY'`` (4 chars) is returned unchanged;
    * ``'M, YYYY'`` / ``'MM, YYYY'`` (7 chars, or 8 chars starting with a
      digit) becomes ``'YYYY-MM'``;
    * ``'Mon YYYY'`` (8 chars) becomes ``'YYYY-MM'``;
    * anything else is parsed as ``'Mon D, YYYY'`` and becomes ``'YYYY-MM-DD'``.

    Parameters
    ----------
    date : str or missing value
        A single date piece, or a missing marker (``np.nan``, ``None``, ...).

    Returns
    -------
    str or float
        The normalised string, or ``np.nan`` when the input is missing,
        empty, ``'Not available'``, ``'?'`` or has length 1 or 5.

    Raises
    ------
    ValueError
        If a non-missing string does not match the format picked for its length.
    """
    # Any non-string (np.nan, None, pd.NA, a NaN that is not the np.nan
    # singleton) is a missing value; len() on it would raise TypeError.
    if not isinstance(date, str):
        return np.nan

    size = len(date)
    # Length 0 is an empty string; lengths 1 and 5 are treated as unusable.
    if date in ('Not available', '?') or size in (0, 1, 5):
        return np.nan
    if size == 4:
        return date
    # Parentheses make the precedence of the original `or ... and` explicit.
    if size == 7 or (size == 8 and date[0].isdigit()):
        return datetime.datetime.strptime(date, '%m, %Y').strftime('%Y-%m')
    if size == 8:
        return datetime.datetime.strptime(date, '%b %Y').strftime('%Y-%m')
    return datetime.datetime.strptime(date, '%b %d, %Y').strftime('%Y-%m-%d')
# Publication dates: 'published' holds a dict whose 'string' entry looks like
# "<start> to <end>" or a single date. Split once and reuse the pieces.
published_str = manga['published'].str['string']
published_parts = published_str.str.split(' to ')
manga['real_start_date'] = published_parts.str[0].apply(format_date)
manga['real_end_date'] = published_parts.str[1].apply(format_date)
# No ' to ' separator means a single date, so the end equals the start.
# The test uses the same literal separator as the split (not the regex
# substring 'to') and counts a missing string as "no separator".
single_date = ~published_str.str.contains(' to ', regex=False, na=False)
manga.loc[single_date, 'real_end_date'] = manga['real_start_date']
# NOTE(review): the strings mix 'YYYY', 'YYYY-MM' and 'YYYY-MM-DD';
# confirm the installed pandas version parses mixed precisions here.
manga['start_date'] = pd.to_datetime(manga['real_start_date'])
manga['end_date'] = pd.to_datetime(manga['real_end_date'])
manga = manga.drop(columns=['published'])

# Use popularity=0 to detect 'pending approval' mangas
manga['approved'] = manga['popularity'] != 0

# Drop rank and popularity, as they sort equal score / members alphabetically...
manga = manga.drop(columns=['rank', 'popularity'])

# Missing synopsis and background: placeholder texts become NaN in one pass.
synopsis_placeholders = ['', 'N/A', 'None.', '...', '.', 'n/a', 'None']
manga['synopsis'] = manga['synopsis'].replace(synopsis_placeholders, np.nan)
manga['background'] = manga['background'].replace(['', 'N/A'], np.nan)


def _is_missing(value):
    """True for None and for any float NaN (not only the np.nan singleton)."""
    return value is None or (isinstance(value, float) and value != value)


# Only keep names; a missing cell becomes an empty list.
for col in ['serializations', 'genres', 'themes', 'demographics']:
    manga[col] = manga[col].apply(lambda x: [] if _is_missing(x) else [dic['name'] for dic in x])
manga['genres'] = manga['genres'].apply(sorted)

# Authors: (id, name)
manga['authors'] = manga['authors'].apply(
    lambda x: [] if _is_missing(x) else [(dic['mal_id'], dic['name']) for dic in x])

# Replace old Serialization name
manga['serializations'] = manga['serializations'].apply(
    lambda x: ['Asuka' if s == 'Asuka (Monthly)' else s for s in x])

# Mark R18+ Titles (not ranked)
manga['sfw'] = manga['genres'].apply(lambda x: 'Hentai' not in x and 'Erotica' not in x)

# Simplify main_picture, delete default. Options: .jpg, t.jpg, l.jpg, .webp, t.webp, l.webp
default_image = 'https://cdn.myanimelist.net/img/sp/icon/apple-touch-icon-256.png'
manga['main_picture'] = manga['images'].str['jpg'].str['large_image_url'].replace(default_image, np.nan)
manga = manga.drop(columns=['images'])

# Missing strings
manga['type'] = manga['type'].replace('', np.nan)
manga['title_english'] = manga['title_english'].replace('N/A', np.nan)
# Strip before blanking so whitespace-only titles also become NaN.
manga['title_japanese'] = manga['title_japanese'].str.strip().replace('', np.nan)

# Better order
order = ['manga_id', 'title', 'type', 'score', 'scored_by', 'status', 'volumes', 'chapters', 'start_date', 'end_date',
         'members', 'favorites', 'sfw', 'approved', 'real_start_date', 'real_end_date', 'genres', 'themes', 'demographics',
         'authors', 'serializations', 'synopsis', 'background', 'main_picture', 'url', 'title_english', 'title_japanese', 'title_synonyms']

# Reorder, then sort by Top Manga: the sum of the descending ranks of score
# and number of votes, smallest first. The helper column lives only inside
# the chain, so nothing is assigned onto a column subset.
manga = (
    manga[order]
    .assign(_top_rank=lambda df: df['score'].rank(ascending=False) + df['scored_by'].rank(ascending=False))
    .sort_values('_top_rank')
    .drop(columns=['_top_rank'])
    .reset_index(drop=True)
)

# Save as csv
manga.to_csv('data/manga_jikan.csv', index=False)

print(manga.shape)

pd.options.display.max_columns = None
manga.head(1)

Duplicates: 8
(66388, 28)


Unnamed: 0,manga_id,title,type,score,scored_by,status,volumes,chapters,start_date,end_date,members,favorites,sfw,approved,real_start_date,real_end_date,genres,themes,demographics,authors,serializations,synopsis,background,main_picture,url,title_english,title_japanese,title_synonyms
0,2,Berserk,Manga,9.45,268669,Publishing,,,1989-08-25,NaT,551177,103805,True,True,1989-08-25,,"[Action, Adventure, Award Winning, Drama, Fant...","[Gore, Military, Mythology, Psychological]",[Seinen],"[(1868, Miura, Kentarou), (49592, Studio Gaga)]",[Young Animal],"Guts, a former mercenary now known as the ""Bla...",Berserk won the Award for Excellence at the si...,https://cdn.myanimelist.net/images/manga/1/157...,https://myanimelist.net/manga/2/Berserk,Berserk,ベルセルク,[Berserk: The Prototype]


## Load Manga

In [5]:
import pandas as pd
import ast

# Reload the cleaned table and restore the dtypes that CSV cannot carry.
manga = pd.read_csv('data/manga_jikan.csv')

# One parser per column: plain dates, nullable integers, and Python
# literals (lists / tuples) that were written out as text.
column_parsers = {
    **dict.fromkeys(['start_date', 'end_date'],
                    lambda values: pd.to_datetime(values).dt.date),
    **dict.fromkeys(['scored_by', 'volumes', 'chapters'],
                    lambda values: values.astype('Int64')),
    **dict.fromkeys(['genres', 'themes', 'demographics', 'authors', 'serializations', 'title_synonyms'],
                    lambda values: values.apply(ast.literal_eval)),
}
for col, parse in column_parsers.items():
    manga[col] = parse(manga[col])

# Show every column of the preview row.
pd.options.display.max_columns = None
manga.head(1)

Unnamed: 0,manga_id,title,type,score,scored_by,status,volumes,chapters,start_date,end_date,members,favorites,sfw,approved,real_start_date,real_end_date,genres,themes,demographics,authors,serializations,synopsis,background,main_picture,url,title_english,title_japanese,title_synonyms
0,2,Berserk,Manga,9.45,268669,Publishing,,,1989-08-25,NaT,551177,103805,True,True,1989-08-25,,"[Action, Adventure, Award Winning, Drama, Fant...","[Gore, Military, Mythology, Psychological]",[Seinen],"[(1868, Miura, Kentarou), (49592, Studio Gaga)]",[Young Animal],"Guts, a former mercenary now known as the ""Bla...",Berserk won the Award for Excellence at the si...,https://cdn.myanimelist.net/images/manga/1/157...,https://myanimelist.net/manga/2/Berserk,Berserk,ベルセルク,[Berserk: The Prototype]


### Authors Format??