## MAL API Setup

In [28]:
import requests

# Base URL of the MyAnimeList API; endpoint paths are appended to it.
api_url = 'https://api.myanimelist.net/v2'

# A Client ID is needed (https://myanimelist.net/apiconfig)
# The file content is stripped because text files usually end with a newline,
# and a newline must not end up inside the HTTP header value below.
with open('client_id.txt', 'r') as f:
    CLIENT_ID = f.read().strip()

# Header sent with every API request to identify the client.
headers = {'X-MAL-CLIENT-ID': CLIENT_ID}

def get_data(endpoint, params=None):
    """Send a GET request to the API and return the decoded JSON response.

    Parameters
    ----------
    endpoint : str
        Path appended to ``api_url`` (e.g. ``'/manga/ranking'``).
    params : dict, optional
        Query-string parameters. They are handed to ``requests`` so that keys
        and values are URL-encoded properly instead of being concatenated raw.

    Returns
    -------
    The parsed JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not answer within the timeout.
    """
    url = api_url + endpoint
    # A timeout keeps a stalled connection from hanging a long scraping run forever.
    response = requests.get(url, headers=headers, params=params, timeout=30)
    response.raise_for_status()
    return response.json()


## Scrape Manga

In [29]:
import json
import os

# Directory where every scraped ranking page is saved as its own JSON file.
scraping_save_pages = 'data/data_tmp/manga_pages'

# Create the saving directory if it doesn't exist (no error if it already does).
os.makedirs(scraping_save_pages, exist_ok=True)

endpoint = '/manga/ranking'
limit = 500  # Entries requested per page; also used to compute each page's offset

# Fields requested for every manga entry.
manga_keys = ['id', 'title', 'main_picture', 'alternative_titles', 'start_date', 'end_date', 'synopsis', 'mean', 'rank', 'popularity',
              'num_list_users', 'num_scoring_users', 'num_favorites', 'nsfw', 'genres', 'created_at', 'updated_at', 'media_type', 'status',
              'num_volumes', 'num_chapters', 'authors']

def scrape_page(page):
    """Fetch one page of the 'favorite' ranking and save its entries to disk.

    The page number is turned into an offset of ``page*limit`` entries. Only
    the ``'node'`` part of each returned entry is kept, and the resulting list
    is written to ``page<NNN>.json`` inside ``scraping_save_pages``.
    """
    query = {
        'ranking_type': 'favorite',
        'limit': limit,
        'offset': page*limit,
        'fields': ','.join(manga_keys),
    }
    response = get_data(endpoint, query)

    # Each ranking entry wraps the manga itself under the 'node' key.
    nodes = []
    for entry in response['data']:
        nodes.append(entry['node'])

    out_path = scraping_save_pages + f'/page{str(page).zfill(3)}.json'
    with open(out_path, 'w') as out:
        json.dump(nodes, out, indent=4)

In [3]:
import math

# Manga count as of 21 June 2022 5 PM CEST
previous_total_manga = 66_493
# Zero-based index of the page that holds the final entries for that count.
previous_last_page = math.ceil(previous_total_manga / limit) - 1

# Request that page again and check that it is still the last one:
# it must contain entries and must not link to a following page.
data = get_data(
    endpoint,
    {
        'ranking_type': 'favorite',
        'limit': limit,
        'offset': previous_last_page*limit,
        'fields': ','.join(manga_keys),
    },
)
assert len(data['data']) > 0
assert 'next' not in data['paging']

last_page = previous_last_page

last_page

132

In [32]:
import tqdm
import time

# Crashed in pages 66, 82, 93, 96 and 101: they had to be scraped without
# requesting the alternative_titles field. Some manga might not have the
# alternative_titles key set correctly, and asking for it makes the request crash?
for page in tqdm.tqdm(range(last_page + 1)):
    scrape_page(page)
    time.sleep(1)  # Pause between consecutive requests

100%|██████████| 31/31 [01:37<00:00,  3.14s/it]


## Merge Files

In [48]:
import os

scraping_save_pages = 'data/data_tmp/manga_pages'

# Merge every saved page into a single list of manga records.
# os.listdir returns names in arbitrary order, so they are sorted to make the
# merge deterministic (the zero-padded page numbers sort in page order).
# Anything that is not a JSON file is skipped.
data = []
for file_name in sorted(os.listdir(scraping_save_pages)):
    if not file_name.endswith('.json'):
        continue
    file_path = os.path.join(scraping_save_pages, file_name)
    with open(file_path, 'r') as f:
        page_entries = json.load(f)
    data.extend(page_entries)

len(data)

66493

In [49]:
# Save the merged list of manga records as one raw JSON file.
with open('data/data_tmp/manga_raw.json', 'w') as raw_file:
    json.dump(data, raw_file, indent=4)

## Manga cleaning

In [138]:
import pandas as pd
import numpy as np
import json

with open('data/data_tmp/manga_raw.json', 'r') as f:
    data = json.load(f)

# Flatten nested objects into columns whose names are joined with '_'
manga = pd.json_normalize(data, sep='_')

# Remove the 12 entries duplicated in the first data gathering
manga = manga.drop_duplicates(subset=['id']).reset_index(drop=True)

# Use Timestamps
manga['start_date'] = pd.to_datetime(manga['start_date'])
manga['end_date'] = pd.to_datetime(manga['end_date'])

# Avoid floats and zeroes marking nsfw
manga['rank'] = manga['rank'].replace(0, np.nan).astype('Int64')
manga['popularity'] = manga['popularity'].replace(0, np.nan).astype('Int64')
# TODO: decide whether num_volumes / num_chapters need the same treatment (0 volumes, 0 chapters?)

# Only keep names
# isinstance is used instead of an identity check against np.nan so that any
# kind of missing value (NaN, None, ...) becomes an empty list.
manga['genres'] = manga['genres'].apply(
    lambda x: [dic['name'] for dic in x] if isinstance(x, list) else [])

# Only the author id and role are kept. Missing author names!!!
manga['authors'] = manga['authors'].apply(
    lambda x: [{'author_id': dic['node']['id'], 'role': dic['role']} for dic in x] if isinstance(x, list) else [])

# MyAnimeList edits (timezone information is dropped)
# NOTE(review): the row displayed below shows created_at as 1970-01-01, so some
# values look like placeholders - confirm before relying on this column.
manga['created_at'] = pd.to_datetime(manga['created_at']).dt.tz_convert(None)
manga['updated_at'] = pd.to_datetime(manga['updated_at']).dt.tz_convert(None)

# Avoid empty string
manga['alternative_titles_en'] = manga['alternative_titles_en'].replace('', np.nan)
manga['alternative_titles_ja'] = manga['alternative_titles_ja'].replace('', np.nan)
# Missing synonyms become an empty list
manga['alternative_titles_synonyms'] = manga['alternative_titles_synonyms'].apply(
    lambda x: list(x) if isinstance(x, list) else [])


order = ['id', 'title', 'media_type', 'mean', 'num_scoring_users',                          # 10 Most important attributes,
         'status', 'num_volumes', 'num_chapters', 'start_date', 'end_date',                 # appearing first on kaggle

         'num_list_users', 'popularity', 'num_favorites', 'rank',                           # Other important attributes

         'genres', 'authors',                                                               # Multivalued attributes
         'synopsis', 'nsfw', 'created_at', 'updated_at',                                    # Description, MyAnimeList edits

         'main_picture_medium', 'main_picture_large',                                       # Media data
         'alternative_titles_en', 'alternative_titles_ja', 'alternative_titles_synonyms']   # Other titles


# Keep only the columns listed above, in that order
manga = manga[order]

# Save to csv
manga.to_csv('data/manga.csv', index=False)

# Show every column when displaying the frame
pd.options.display.max_columns = None
manga.head(1)

Unnamed: 0,id,title,media_type,mean,num_scoring_users,status,num_volumes,num_chapters,start_date,end_date,num_list_users,popularity,num_favorites,rank,genres,authors,synopsis,nsfw,created_at,updated_at,main_picture_medium,main_picture_large,alternative_titles_en,alternative_titles_ja,alternative_titles_synonyms
0,2,Berserk,manga,9.45,261724,on_hiatus,41,380,1989-08-25,2021-09-10,538522,2,101419,1,"[Action, Adventure, Award Winning, Drama, Fant...","[{'author_id': 1868, 'role': 'Story & Art'}]","Guts, a former mercenary now known as the ""Bla...",white,1970-01-01,2022-06-08 23:10:04,https://api-cdn.myanimelist.net/images/manga/1...,https://api-cdn.myanimelist.net/images/manga/1...,Berserk,ベルセルク,[Berserk: The Prototype]
