## MAL API Setup

In [1]:
import requests

# Base URL that every endpoint passed to get_data is appended to.
api_url = 'https://api.myanimelist.net/v2'

# A Client ID is needed (https://myanimelist.net/apiconfig)
with open('client_id.txt', 'r') as f:
    # Strip surrounding whitespace: text files usually end with a newline, which would
    # otherwise become part of the header value sent with every request.
    CLIENT_ID = f.read().strip()

# Header sent with every API request.
headers = {'X-MAL-CLIENT-ID': CLIENT_ID}

def get_data(endpoint, params=None, timeout=60):
    """Send a GET request to the API and return the decoded JSON body.

    Parameters
    ----------
    endpoint : str
        Path appended to ``api_url`` (e.g. ``'/manga/ranking'``).
    params : dict, optional
        Query parameters. They are handed to ``requests`` so that keys and
        values are URL-encoded instead of being concatenated verbatim.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook forever.

    Returns
    -------
    The parsed JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    response = requests.get(api_url + endpoint, params=params, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()


## Scrape Manga

In [4]:
import json
import os

# Directory where each scraped ranking page is stored as its own JSON file.
scraping_save_pages = 'data/data_tmp/manga_pages'

# Create the saving directory if it doesn't exist (no separate existence check needed).
os.makedirs(scraping_save_pages, exist_ok=True)

endpoint = '/manga/ranking'
limit = 500  # Number of entries requested per page

# Fields requested for every manga; joined with commas into the 'fields' query parameter.
manga_keys = ['id', 'title', 'main_picture', 'alternative_titles', 'start_date', 'end_date', 'synopsis', 'mean', 'rank', 'popularity',
              'num_list_users', 'num_scoring_users', 'num_favorites', 'nsfw', 'genres', 'created_at', 'updated_at', 'media_type', 'status',
              'num_volumes', 'num_chapters', 'authors{first_name,last_name}']

def scrape_page(page):
    """Fetch one page of the favorites ranking and save its manga nodes to disk.

    ``page`` is the zero-based page index; the request offset is ``page*limit``.
    The 'node' entry of every returned item is written to
    ``<scraping_save_pages>/pageNNN.json`` (page number zero-padded to 3 digits).
    """
    query = {
        'ranking_type': 'favorite',
        'limit': limit,
        'offset': page*limit,
        'fields': ','.join(manga_keys),
    }
    response = get_data(endpoint, query)

    # Keep only the manga payload of each ranking entry.
    nodes = []
    for entry in response['data']:
        nodes.append(entry['node'])

    target_path = scraping_save_pages + f'/page{str(page).zfill(3)}.json'
    with open(target_path, 'w') as out_file:
        json.dump(nodes, out_file, indent=4)

In [3]:
import math

# Manga count noted on 22 June 2022, used to guess the index of the last ranking page.
previous_total_manga = 66_506
previous_last_page = math.ceil(previous_total_manga / limit) - 1

# Request the guessed last page.
last_page_query = {
    'ranking_type': 'favorite',
    'limit': limit,
    'offset': previous_last_page*limit,
    'fields': ','.join(manga_keys),
}
data = get_data(endpoint, last_page_query)

# The guess holds if the page still has entries and the API advertises no following page.
assert len(data['data']) > 0
assert 'next' not in data['paging']

last_page = previous_last_page

last_page

132

In [5]:
import tqdm
import time

# Trial run: scrape only the first page, pausing one second after the request.
for page in tqdm.tqdm(range(1)):
    scrape_page(page)
    time.sleep(1)

100%|██████████| 1/1 [00:04<00:00,  4.13s/it]


In [32]:
import tqdm
import time

# Scrape every ranking page, waiting one second between requests.
# A previous run crashed on pages 66, 82, 93, 96 and 101; they had to be scraped without
# requesting the field alternative_titles. Some manga might not have the alternative_titles
# key set correctly, and asking for it may be what makes the request fail.
failed_pages = []
for page in tqdm.trange(last_page+1):
    try:
        scrape_page(page)
    except requests.HTTPError as error:
        # Do not let one bad page abort the whole run: remember it so it can be retried.
        failed_pages.append(page)
        tqdm.tqdm.write(f'Page {page} failed: {error}')
    time.sleep(1)

if failed_pages:
    print(f'Pages to retry: {failed_pages}')

100%|██████████| 31/31 [01:37<00:00,  3.14s/it]


## Merge Files

In [6]:
import os

scraping_save_pages = 'data/data_tmp/manga_pages'

# Merge all scraped pages into one list of manga records.
# os.listdir returns entries in arbitrary order and may include non-JSON entries
# (e.g. checkpoint folders), so sort the names and keep only the .json files.
# The zero-padded page numbers make the alphabetical order match the page order.
data = []
for file_name in sorted(os.listdir(scraping_save_pages)):
    if not file_name.endswith('.json'):
        continue
    file_path = os.path.join(scraping_save_pages, file_name)
    with open(file_path, 'r') as f:
        data.extend(json.load(f))

len(data)

500

In [7]:
# Persist the merged records as one indented JSON document.
raw_json_text = json.dumps(data, indent=4)
with open('data/data_tmp/manga_raw.json', 'w') as raw_file:
    raw_file.write(raw_json_text)

## Manga cleaning

In [8]:
import pandas as pd
import numpy as np
import json

# Read the merged scrape back from disk.
with open('data/data_tmp/manga_raw.json', 'r') as raw_file:
    data = json.loads(raw_file.read())

# Build a DataFrame from the records, using '_' as separator for nested keys.
manga = pd.json_normalize(data, sep='_')

In [10]:
import pandas as pd
import numpy as np
import json

# Load the merged scrape and build a DataFrame, using '_' as separator for nested keys.
with open('data/data_tmp/manga_raw.json', 'r') as f:
    data = json.load(f)

manga = pd.json_normalize(data, sep='_')

# Remove the 12 entries duplicated in the first data gathering (avoid in the next iteration)
manga = manga.drop_duplicates(subset=['id']).reset_index(drop=True)

# Use Timestamps
manga['start_date'] = pd.to_datetime(manga['start_date'])
manga['end_date'] = pd.to_datetime(manga['end_date'])

# Avoid floats and zeroes: a 0 is replaced by a missing value and the column is stored
# as nullable integer (the original note says zeroes mark nsfw entries - TODO confirm)
for col in ['rank', 'popularity', 'num_volumes', 'num_chapters']:
    manga[col] = manga[col].replace(0, np.nan).astype('Int64')

# Only keep names (anything that is not a list, e.g. a missing value, becomes an empty list)
manga['genres'] = manga['genres'].apply(lambda x: [dic['name'] for dic in x] if isinstance(x, list) else [])

# TODO: Missing author names!!! The reduction below keeps only the author id and role.
#manga['authors'] = manga['authors'].apply(lambda x: [{'author_id': dic['node']['id'], 'role': dic['role']} for dic in x] if not x is np.nan else [])

# MyAnimeList edits (drop the timezone information, keeping naive timestamps)
manga['created_at'] = pd.to_datetime(manga['created_at']).dt.tz_convert(None)
manga['updated_at'] = pd.to_datetime(manga['updated_at']).dt.tz_convert(None)

# Avoid empty strings and placeholder values
manga.loc[manga['synopsis'].isin(['', ' ', 'N/A', 'n/a']), 'synopsis'] = np.nan
manga.loc[manga['alternative_titles_en'].isin(['', 'N/A']), 'alternative_titles_en'] = np.nan
manga['alternative_titles_ja'] = manga['alternative_titles_ja'].replace('', np.nan)
# Missing synonyms become '' first, so that list('') yields an empty list
manga['alternative_titles_synonyms'] = manga['alternative_titles_synonyms'].fillna('').apply(list)


order = ['id', 'title', 'media_type', 'mean', 'num_scoring_users',                          # 10 most important attributes,
         'status', 'num_volumes', 'num_chapters', 'start_date', 'end_date',                 # appearing first on kaggle

         'num_list_users', 'popularity', 'num_favorites', 'rank',                           # Other important attributes

         'genres', 'authors',                                                               # Multivalued attributes
         'synopsis', 'nsfw', 'created_at', 'updated_at',                                    # Description, MyAnimeList edits

         'main_picture_medium', 'main_picture_large',                                       # Media data
         'alternative_titles_en', 'alternative_titles_ja', 'alternative_titles_synonyms']   # Other titles


manga = manga[order]

# Save to csv
manga.to_csv('data/manga.csv', index=False)

pd.options.display.max_columns = None
manga.head(1)

Unnamed: 0,id,title,media_type,mean,num_scoring_users,status,num_volumes,num_chapters,start_date,end_date,num_list_users,popularity,num_favorites,rank,genres,authors,synopsis,nsfw,created_at,updated_at,main_picture_medium,main_picture_large,alternative_titles_en,alternative_titles_ja,alternative_titles_synonyms
0,2,Berserk,manga,9.45,262200,currently_publishing,,,1989-08-25,NaT,539414,2,101608,1,"[Action, Adventure, Award Winning, Drama, Fant...","[{'node': {'id': 1868, 'first_name': 'Kentarou...","Guts, a former mercenary now known as the ""Bla...",white,1970-01-01,2022-06-23 08:30:44,https://api-cdn.myanimelist.net/images/manga/1...,https://api-cdn.myanimelist.net/images/manga/1...,Berserk,ベルセルク,[Berserk: The Prototype]


In [29]:
# Rows whose first listed author has an empty first name.
first_author_first_name = manga['authors'].str[0].str['node'].str['first_name']
manga[first_author_first_name == '']

Unnamed: 0,id,title,media_type,mean,num_scoring_users,status,num_volumes,num_chapters,start_date,end_date,num_list_users,popularity,num_favorites,rank,genres,authors,synopsis,nsfw,created_at,updated_at,main_picture_medium,main_picture_large,alternative_titles_en,alternative_titles_ja,alternative_titles_synonyms
8,121496,Solo Leveling,manhwa,8.73,202671,finished,,,2018-03-04,2021-12-29,357574,9,34884,44.0,"[Action, Adventure, Fantasy]","[{'node': {'id': 49667, 'first_name': '', 'las...","Ten years ago, ""the Gate"" appeared and connect...",white,1970-01-01,2022-04-18 05:00:27,https://api-cdn.myanimelist.net/images/manga/3...,https://api-cdn.myanimelist.net/images/manga/3...,Solo Leveling,나 혼자만 레벨업,"[Na Honjaman Level Up, I Level Up Alone]"
45,24692,Noragami,manga,8.42,55748,currently_publishing,,,2010-12-23,NaT,162542,43,11469,171.0,"[Action, Fantasy, Mythology, Shounen]","[{'node': {'id': 6644, 'first_name': '', 'last...",As a relatively unknown minor deity without an...,white,1970-01-01,2022-04-18 04:23:59,https://api-cdn.myanimelist.net/images/manga/3...,https://api-cdn.myanimelist.net/images/manga/3...,Noragami: Stray God,ノラガミ,[Stray Gods]
47,122663,Tower of God,manhwa,8.44,66459,currently_publishing,,,2010-07-05,NaT,154787,47,11262,156.0,"[Action, Adventure, Drama, Fantasy, Mystery]","[{'node': {'id': 49934, 'first_name': '', 'las...",Twenty-Fifth Bam had been alone his whole life...,white,1970-01-01,2022-04-18 00:52:48,https://api-cdn.myanimelist.net/images/manga/2...,https://api-cdn.myanimelist.net/images/manga/2...,Tower of God,신의 탑,[Sin-ui Tap]
77,110737,"Ijiranaide, Nagatoro-san",manga,7.69,64061,currently_publishing,,,2017-11-07,NaT,149128,49,7041,1578.0,"[Comedy, Romantic Subtext, School, Slice of Li...","[{'node': {'id': 27461, 'first_name': '', 'las...",High schooler Hayase Nagatoro loves to spend h...,white,1970-01-01,2022-05-26 14:33:32,https://api-cdn.myanimelist.net/images/manga/3...,https://api-cdn.myanimelist.net/images/manga/3...,"Don't Toy With Me, Miss Nagatoro",イジらないで、長瀞さん,"[Please don't bully me, Nagatoro]"
90,9,Tsubasa: RESERVoir CHRoNiCLE,manga,8.31,35475,finished,28.0,233.0,2003-05-21,2009-10-07,74009,157,6036,265.0,"[Action, Adventure, Drama, Fantasy, Shounen]","[{'node': {'id': 1877, 'first_name': '', 'last...",Warmhearted Syaoran has always been friends wi...,white,1970-01-01,2022-04-18 01:26:37,https://api-cdn.myanimelist.net/images/manga/3...,https://api-cdn.myanimelist.net/images/manga/3...,Tsubasa: RESERVoir CHRoNiCLE,ツバサ -RESERVoir CHRoNiCLE-,[TRC]
102,70261,Mushoku Tensei: Isekai Ittara Honki Dasu,light_novel,8.75,25046,currently_publishing,,,2014-01-23,NaT,62709,197,5217,40.0,"[Fantasy, Isekai, Reincarnation]","[{'node': {'id': 30675, 'first_name': '', 'las...",Killed while saving a stranger from a traffic ...,white,1970-01-01,2022-04-18 00:30:13,https://api-cdn.myanimelist.net/images/manga/3...,https://api-cdn.myanimelist.net/images/manga/3...,Mushoku Tensei: Jobless Reincarnation,無職転生 ～異世界行ったら本気だす～,[]
107,89087,Wotaku ni Koi wa Muzukashii,manga,8.36,35594,finished,11.0,85.0,2015-04-30,2021-07-16,108945,87,5047,218.0,"[Adult Cast, Comedy, Otaku Culture, Romance, W...","[{'node': {'id': 37903, 'first_name': '', 'las...",Narumi Momose is a petite and cute young woman...,white,1970-01-01,2022-04-17 23:51:24,https://api-cdn.myanimelist.net/images/manga/3...,https://api-cdn.myanimelist.net/images/manga/3...,Wotakoi: Love Is Hard for Otaku,ヲタクに恋は難しい,[It's Difficult to Love an Otaku]
111,60783,Mob Psycho 100,manga,8.6,38124,finished,16.0,109.0,2012-04-18,2017-12-22,96593,106,4976,77.0,"[Action, Award Winning, Comedy, Super Power, S...","[{'node': {'id': 16993, 'first_name': '', 'las...","Shigeo ""Mob"" Kageyama is an eighth-grade stude...",white,1970-01-01,2022-04-18 01:50:09,https://api-cdn.myanimelist.net/images/manga/2...,https://api-cdn.myanimelist.net/images/manga/2...,Mob Psycho 100,モブサイコ100,[One Hundred Mob Psycho]
118,10,xxxHOLiC,manga,8.38,30455,finished,19.0,213.0,2003-02-24,2011-02-09,74685,156,4770,199.0,"[Comedy, Drama, Mystery, Seinen, Supernatural]","[{'node': {'id': 1877, 'first_name': '', 'last...","Living alone after his parents passed away, Ki...",white,1970-01-01,2022-04-18 03:10:24,https://api-cdn.myanimelist.net/images/manga/3...,https://api-cdn.myanimelist.net/images/manga/3...,xxxHOLiC,xxxHOLiC,"[xxxHolic Rou, xxxHolic Cage]"
121,50145,Citrus,manga,7.58,36143,finished,10.0,50.0,2012-11-17,2018-08-18,78669,149,4733,,"[Drama, Erotica, Girls Love, School]","[{'node': {'id': 20176, 'first_name': '', 'las...",Yuzu Aihara is a girl who loves fashion and is...,gray,1970-01-01,2022-04-18 05:00:50,https://api-cdn.myanimelist.net/images/manga/3...,https://api-cdn.myanimelist.net/images/manga/3...,Citrus,citrus,[]


In [28]:
# How often each first name occurs among the first listed authors.
first_authors = manga['authors'].str[0]
first_names = first_authors.str['node'].str['first_name']
first_names.value_counts()

            47
Hirohiko     8
Hiro         6
Yuki         6
Kentarou     5
            ..
Sorata       1
Kamome       1
Hiroaki      1
Aya          1
Kanna        1
Name: authors, Length: 306, dtype: int64

In [15]:
# Inspect the raw authors structure of the row labelled 0.
manga['authors'].loc[0]

[{'node': {'id': 1868, 'first_name': 'Kentarou', 'last_name': 'Miura'},
  'role': 'Story & Art'},
 {'node': {'id': 49592, 'first_name': '', 'last_name': 'Studio Gaga'},
  'role': 'Art'}]

# Load Manga

In [151]:
from ast import literal_eval
import pandas as pd

# Read the cleaned dataset and restore the dtypes that a CSV file cannot carry.
manga = pd.read_csv('data/manga.csv')

columns_dtype_datetime = ['start_date', 'end_date', 'created_at', 'updated_at']
for col in columns_dtype_datetime:
    manga[col] = pd.to_datetime(manga[col])

# Nullable integers, so that missing values do not force a float dtype
columns_dtype_Int64 = ['num_volumes', 'num_chapters', 'popularity', 'rank']
for col in columns_dtype_Int64:
    manga[col] = manga[col].astype('Int64')

# List-valued columns are stored as text and parsed back with literal_eval.
# An empty cell is read as NaN (not a string), which literal_eval cannot parse,
# so any non-string cell is turned into an empty list instead.
columns_dtype_list = ['genres', 'authors', 'alternative_titles_synonyms']
for col in columns_dtype_list:
    manga[col] = manga[col].apply(lambda cell: literal_eval(cell) if isinstance(cell, str) else [])

manga.head(1)

Unnamed: 0,id,title,media_type,mean,num_scoring_users,status,num_volumes,num_chapters,start_date,end_date,num_list_users,popularity,num_favorites,rank,genres,authors,synopsis,nsfw,created_at,updated_at,main_picture_medium,main_picture_large,alternative_titles_en,alternative_titles_ja,alternative_titles_synonyms
0,2,Berserk,manga,9.45,261724,on_hiatus,41,380,1989-08-25,2021-09-10,538522,2,101419,1,"[Action, Adventure, Award Winning, Drama, Fant...","[{'author_id': 1868, 'role': 'Story & Art'}]","Guts, a former mercenary now known as the ""Bla...",white,1970-01-01,2022-06-08 23:10:04,https://api-cdn.myanimelist.net/images/manga/1...,https://api-cdn.myanimelist.net/images/manga/1...,Berserk,ベルセルク,[Berserk: The Prototype]
