## Manga cleaning

In [1]:
import pandas as pd
import numpy as np
import datetime

# Read the raw MyAnimeList dump
manga = pd.read_json('data/raw/manga_mal.json')

# Duplicated ids are normally absent, but they can occur (the website itself shows some).
# NOTE: per the original author, the rows dropped here are real losses, so report the count.
old_size = len(manga)
manga = manga.drop_duplicates(subset=['id']).reset_index(drop=True)
number_duplicates = old_size - len(manga)
if number_duplicates > 0:
    print('Duplicates:', number_duplicates)

# Adopt the shorter column names used by the Jikan API
manga = manga.rename(columns={
    'id': 'manga_id',
    'media_type': 'type',
    'mean': 'score',
    'num_list_users': 'members',
    'num_scoring_users': 'scored_by',
    'num_favorites': 'favorites',
    'num_volumes': 'volumes',
    'num_chapters': 'chapters',
})

# A count of 0 is a false zero: store it as missing, in a nullable integer
# column so the remaining counts do not turn into floats
for col in ['volumes', 'chapters']:
    manga[col] = manga[col].replace(0, np.nan).astype('Int64')

# Keep the dates as originally given in real_*: parsing pads partial dates
# with a false day 1 / month January (i.e. 2005 -> 2005-1-1).
# The original columns are then converted to Timestamps.
for col in ['start_date', 'end_date']:
    manga['real_' + col] = manga[col]
    manga[col] = pd.to_datetime(manga[col])

# popularity == 0 is used to detect 'pending approval' mangas
manga['approved'] = manga['popularity'].ne(0)

# Only keep names. A missing cell is detected by type (None or a float NaN)
# rather than by identity with np.nan, which only matches that one object and
# would let any other NaN reach the comprehension and raise a TypeError.
manga['genres'] = manga['genres'].apply(
    lambda x: [] if x is None or isinstance(x, float) else [dic['name'] for dic in x])

genres = {'Action', 'Adventure', 'Avant Garde', 'Award Winning', 'Boys Love',  'Comedy', 'Drama', 'Ecchi', 'Erotica', 'Fantasy',
'Girls Love', 'Gourmet', 'Hentai', 'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Slice of Life', 'Sports', 'Supernatural', 'Suspense'}

themes = {'Adult Cast', 'Anthropomorphic', 'CGDCT', 'Childcare', 'Combat Sports', 'Crossdressing', 'Delinquents', 'Detective', 'Educational',
'Gag Humor', 'Gore', 'Harem', 'High Stakes Game', 'Historical', 'Idols (Female)', 'Idols (Male)', 'Isekai', 'Iyashikei', 'Love Polygon',
'Magical Sex Shift', 'Mahou Shoujo', 'Martial Arts', 'Mecha', 'Medical', 'Military', 'Music', 'Mythology', 'Organized Crime', 'Otaku Culture',
'Parody', 'Performing Arts', 'Pets', 'Psychological', 'Racing', 'Reincarnation', 'Reverse Harem', 'Romantic Subtext', 'Samurai', 'School',
'Showbiz', 'Space', 'Strategy Game', 'Super Power', 'Survival', 'Team Sports', 'Time Travel', 'Vampire', 'Video Game', 'Visual Arts',
'Workplace'} | {'Memoir', 'Villainess'}

demographics = {'Josei', 'Kids', 'Seinen', 'Shoujo', 'Shounen'}

# Any label that belongs to none of the three sets is dropped by the split
# below; report such labels instead of losing them silently.
unclassified = {name for names in manga['genres'] for name in names} - genres - themes - demographics
if unclassified:
    print('Unclassified genre labels:', sorted(unclassified))

# Split genres, themes and demographics (the raw field mixes all three).
# 'genres' is overwritten last because the other two columns are derived from it.
manga['themes'] = manga['genres'].apply(lambda x: [t for t in x if t in themes])
manga['demographics'] = manga['genres'].apply(lambda x: [t for t in x if t in demographics])
manga['genres'] = manga['genres'].apply(lambda x: [t for t in x if t in genres])

# Authors
def author_format(authors):
    """Flatten the raw author entries of one manga.

    Parameters
    ----------
    authors : iterable of dict, None or float NaN
        Each entry is read as
        ``{'node': {'id', 'first_name', 'last_name'}, 'role': ...}``.

    Returns
    -------
    list of dict
        One ``{'id', 'first_name', 'last_name', 'role'}`` dict per entry, in
        input order; an empty list when `authors` is missing.
    """
    # A missing cell may be None or a NaN that is not the np.nan object itself,
    # so test the type instead of identity with np.nan.
    if authors is None or isinstance(authors, float):
        return []
    return [
        {
            'id': author['node']['id'],
            'first_name': author['node']['first_name'],
            'last_name': author['node']['last_name'],
            'role': author['role'],
        }
        for author in authors
    ]
manga['authors'] = manga['authors'].apply(author_format)

# Flag R18+ titles (not ranked): anything tagged Hentai or Erotica is not safe for work
manga['sfw'] = manga['genres'].apply(lambda x: not ('Hentai' in x or 'Erotica' in x))

# As in the anime version, the source nsfw field has a lot of wrong labels, so discard it
manga = manga.drop(columns=['nsfw'])

# MyAnimeList edit timestamps: parse them and treat the Unix epoch as missing
for col in ['created_at', 'updated_at']:
    manga[col] = pd.to_datetime(manga[col])
    manga[col] = manga[col].mask(manga[col] == '1970-01-01 00:00:00+0000')

# created_at does not seem to work: check that it is missing everywhere, then drop it
assert manga['created_at'].isna().all()
manga = manga.drop(columns=['created_at'])

# Make it manually: created_at_before is an upper bound for the creation time.
# It is the earliest updated_at among all entries whose manga_id is >= the
# entry's own id, or the current time when no such entry has an updated_at.
# NOTE(review): this presumably relies on ids being assigned in creation order -- confirm.
m = manga.loc[manga['updated_at'].notna(), ['manga_id', 'updated_at']].sort_values('updated_at')

# Walk the edits chronologically and keep each one that sets a new highest
# manga_id, so `data` ends up increasing in both updated_at and manga_id.
# Starting from an empty list also covers the case where no updated_at is known.
data = []
for manga_id, updated_at in zip(m['manga_id'], m['updated_at']):
    if not data or manga_id > data[-1]['manga_id']:
        data.append({'manga_id': manga_id, 'updated_at': updated_at})
# Sentinel above every int64 id. Use a timezone-aware "now": utcnow() is
# deprecated and returns a naive datetime, unlike the other timestamps here.
data.append({'manga_id': 2**63-1, 'updated_at': datetime.datetime.now(datetime.timezone.utc)})

created_at = []
manga = manga.sort_values('manga_id')
pos = 0
for manga_id in manga['manga_id']:
    # Both sequences are sorted by manga_id, so the cursor only moves forward;
    # the sentinel guarantees the loop stops.
    while manga_id > data[pos]['manga_id']:
        pos += 1
    created_at.append(data[pos]['updated_at'])

# Assigned positionally: `created_at` follows the manga_id order set just above
manga['created_at_before'] = pd.to_datetime(created_at, utc=True)

# Blank and placeholder synopses become missing values
manga['synopsis'] = manga['synopsis'].mask(manga['synopsis'].isin(['', ' ', 'N/A', 'n/a']))

# Simplify main picture: keep only the 'large' URL, with 'api-' removed from it
manga['main_picture'] = (
    manga['main_picture']
    .str['large']
    .str.replace('api-', '')
)

# Normalize alternative titles: one column per kind, then the nested field is gone
alternative_titles = manga.pop('alternative_titles')
manga['title_english'] = alternative_titles.str['en'].replace('', np.nan)
manga['title_japanese'] = alternative_titles.str['ja'].replace('', np.nan)
manga['title_synonyms'] = alternative_titles.str['synonyms'].fillna('').apply(list)

# Clean some string errors: trim the titles and turn double spaces into single ones
for col in ['title', 'title_english', 'title_japanese']:
    trimmed = manga[col].str.strip()
    manga[col] = trimmed.str.replace('  ', ' ')
manga['title_synonyms'] = manga['title_synonyms'].apply(
    lambda synonyms: [synonym.replace('  ', ' ') for synonym in synonyms])

# Final column layout
order = [
    'manga_id', 'title', 'type', 'score', 'scored_by', 'status', 'volumes', 'chapters', 'start_date', 'end_date',
    'members', 'favorites', 'sfw', 'approved', 'created_at_before', 'updated_at', 'real_start_date', 'real_end_date',
    'genres', 'themes', 'demographics', 'authors', 'synopsis', 'main_picture', 'title_english', 'title_japanese',
    'title_synonyms',
]

# Bookkeeping only: neither list is used by the visible code.
# `deleted` names source columns that are absent from `order`.
deleted = ['rank', 'popularity', 'nsfw']

# NOTE(review): presumably fields this source does not provide -- confirm
missing = ['background', 'serializations', 'url']

# Sort by Top Manga: the key is the score rank plus the scored_by rank (lower is
# better); ties go to more members, then more favorites, then the lower manga_id.
# The key only exists as a temporary column for the duration of the sort.
manga = (
    manga[order]
    .assign(tmp=lambda df: df['score'].rank(ascending=False) + df['scored_by'].rank(ascending=False))
    .sort_values(['tmp', 'members', 'favorites', 'manga_id'], ascending=[True, False, False, True])
    .drop(columns=['tmp'])
    .reset_index(drop=True)
)

# Save to csv
manga.to_csv('data/manga_mal.csv', index=False)

print(manga.shape)

pd.set_option('display.max_columns', None)
manga.head(1)

Duplicates: 10
(67273, 27)


Unnamed: 0,manga_id,title,type,score,scored_by,status,volumes,chapters,start_date,end_date,members,favorites,sfw,approved,created_at_before,updated_at,real_start_date,real_end_date,genres,themes,demographics,authors,synopsis,main_picture,title_english,title_japanese,title_synonyms
0,2,Berserk,manga,9.45,268737,currently_publishing,,,1989-08-25,NaT,551266,103820,True,True,2007-07-17 20:14:45+00:00,2022-06-23 08:30:44+00:00,1989-08-25,,"[Action, Adventure, Award Winning, Drama, Fant...","[Gore, Military, Mythology, Psychological]",[Seinen],"[{'id': 1868, 'first_name': 'Kentarou', 'last_...","Guts, a former mercenary now known as the ""Bla...",https://cdn.myanimelist.net/images/manga/1/157...,Berserk,ベルセルク,[Berserk: The Prototype]


# Load Manga

In [2]:
import pandas as pd
import ast

manga = pd.read_csv('data/manga_mal.csv')

# Restore the column types after reading the CSV: timestamps first ...
manga = manga.assign(**{
    col: pd.to_datetime(manga[col])
    for col in ['start_date', 'end_date', 'created_at_before', 'updated_at']
})

# ... then the nullable integer counts ...
manga = manga.astype({'volumes': 'Int64', 'chapters': 'Int64'})

# ... and finally the list-valued columns, parsed back from their Python literals
for col in ['genres', 'themes', 'demographics', 'authors', 'title_synonyms']:
    manga[col] = manga[col].apply(ast.literal_eval)

pd.set_option('display.max_columns', None)
manga.head(1)

Unnamed: 0,manga_id,title,type,score,scored_by,status,volumes,chapters,start_date,end_date,members,favorites,sfw,approved,created_at_before,updated_at,real_start_date,real_end_date,genres,themes,demographics,authors,synopsis,main_picture,title_english,title_japanese,title_synonyms
0,2,Berserk,manga,9.45,268737,currently_publishing,,,1989-08-25,NaT,551266,103820,True,True,2007-07-17 20:14:45+00:00,2022-06-23 08:30:44+00:00,1989-08-25,,"[Action, Adventure, Award Winning, Drama, Fant...","[Gore, Military, Mythology, Psychological]",[Seinen],"[{'id': 1868, 'first_name': 'Kentarou', 'last_...","Guts, a former mercenary now known as the ""Bla...",https://cdn.myanimelist.net/images/manga/1/157...,Berserk,ベルセルク,[Berserk: The Prototype]


### Author Format???

In [None]:
# Authors
def author_format(authors):
    """Render the raw author entries of one manga as display strings.

    Parameters
    ----------
    authors : iterable of dict, None or float NaN
        Each entry is read as
        ``{'node': {'first_name', 'last_name'}, 'role': ...}``.

    Returns
    -------
    list of str
        ``"Last, First (role)"`` per entry, or ``"Last (role)"`` when the first
        name is empty; an empty list when `authors` is missing.
    """
    # A missing cell may be None or a NaN that is not the np.nan object itself,
    # so test the type instead of identity with np.nan.
    if authors is None or isinstance(authors, float):
        return []
    output = []
    for author in authors:
        node = author['node']
        name = node['last_name']
        if node['first_name']:
            name = f"{name}, {node['first_name']}"
        # Both name forms share this suffix, so the closing parenthesis cannot
        # go missing from one of them.
        output.append(f"{name} ({author['role']})")
    return output
# NOTE(review): the author_format defined in this cell reads author['node'][...],
# i.e. the nested raw shape. The frame loaded from data/manga_mal.csv holds
# flattened author dicts ('id', 'first_name', 'last_name', 'role') without a
# 'node' key, so this presumably targets the raw frame -- confirm before running.
manga['authors']  = manga['authors'].apply(author_format)

(59950, 25)


Unnamed: 0,id,title,media_type,mean,num_scoring_users,status,num_volumes,num_chapters,start_date,end_date,num_list_users,popularity,num_favorites,rank,genres,authors,synopsis,nsfw,created_at,updated_at,main_picture_medium,main_picture_large,alternative_titles_en,alternative_titles_ja,alternative_titles_synonyms
0,23390,Shingeki no Kyojin,manga,8.57,363183,finished,34,141,2009-09-09,2021-04-09,589983,1,66937,87,"[Action, Award Winning, Drama, Gore, Military,...","[Isayama, Hajime (Story & Art)]","Hundreds of years ago, horrifying creatures wh...",white,1970-01-01 00:00:00+00:00,2022-04-18 05:10:58+00:00,https://api-cdn.myanimelist.net/images/manga/2...,https://api-cdn.myanimelist.net/images/manga/2...,Attack on Titan,進撃の巨人,[]
