## Scrape the MAL Database

### MAL API Setup

In [1]:
import requests

# Base URL that every endpoint passed to the API helper is appended to.
api_url = 'https://api.myanimelist.net/v2'

# A Client ID is needed (https://myanimelist.net/apiconfig); it is read from
# the local file client_id.txt.
with open('client_id.txt', 'r') as f:
    # Strip surrounding whitespace: a trailing newline left by an editor would
    # otherwise become part of the header value sent with every request.
    CLIENT_ID = f.read().strip()

# Header sent with every API request to identify the client.
headers = {'X-MAL-CLIENT-ID': CLIENT_ID}

def get_data(endpoint, params=None, timeout=60):
    """Send a GET request to the API and return the decoded JSON body.

    Args:
        endpoint: Path appended to ``api_url`` (e.g. ``'/anime/ranking'``).
        params: Optional dict of query-string parameters.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled connection
            would hang the whole scrape.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(
        api_url + endpoint,
        headers=headers,
        params=params,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

### Scrape Ranking

In [2]:
import json

def scrape_ranking_page(database, ranking_type, page, fields, save_directory, length):
    """Download one ranking page (500 entries) and save its nodes as JSON.

    Args:
        database: Database segment of the endpoint (``/{database}/ranking``).
        ranking_type: Value sent as the ``ranking_type`` query parameter.
        page: Zero-based page index; the request offset is ``page * 500``.
        fields: Comma-separated field list sent as the ``fields`` parameter.
        save_directory: Directory the page file is written into.
        length: Width the page number is zero-padded to in the file name.
    """
    endpoint = f'/{database}/ranking'
    params = {'ranking_type': ranking_type, 'limit': 500, 'offset': page*500, 'fields': fields}
    try:
        data = get_data(endpoint, params)
    except Exception:
        # Any request failure falls back to manga_crash. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt/SystemExit
        # still stop the scrape instead of triggering the fallback.
        data = manga_crash(endpoint, params)

    # Keep only the 'node' payload of each ranking entry.
    entries = [entry['node'] for entry in data['data']]
    with open(save_directory + f'/page{str(page).zfill(length)}.json', 'w') as f:
        json.dump(entries, f, indent=4)

In [3]:
import datetime
import tqdm
import time
import os

def scrape_ranking(database='anime', ranking_type='favorite'):
    """Scrape every page of a ranking and merge the pages into one JSON file.

    Pages are written to ``data/raw/tmp_{database}_mal`` and then merged into
    ``data/raw/{database}_mal.json``. The output path depends only on
    ``database``, so scraping another ranking type of the same database
    overwrites the previous result.

    Args:
        database: Database to scrape; must be a key of ``keys``.
        ranking_type: Ranking type requested from the API.
    """
    base_directory = 'data/raw'
    save_file_path = base_directory + f'/{database}_mal.json'
    tmp_directory = base_directory + f'/tmp_{database}_mal'
    # The temporary directory survives previous runs, so tolerate it existing.
    os.makedirs(tmp_directory, exist_ok=True)

    fields = ','.join(keys[database])
    last_page = get_last_page(database, ranking_type)
    length = len(str(last_page))

    # Drop page files left over from an earlier run so that pages which are
    # not rewritten this time cannot leak into the merged output.
    for leftover in os.listdir(tmp_directory):
        if leftover.startswith('page') and leftover.endswith('.json'):
            os.remove(os.path.join(tmp_directory, leftover))

    start = datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S")
    print('Scraped at:', start)
    for page in tqdm.trange(last_page+1):
        scrape_ranking_page(database, ranking_type, page, fields, tmp_directory, length)
        # Pause between requests.
        time.sleep(1)

    merge_anime(tmp_directory, save_file_path)

In [4]:
import math

# Hard-coded number of entries per (database, ranking_type) pair; used to
# derive the index of the last 500-entry page.
_RANKING_ENTRY_COUNTS = {
    ('anime', 'favorite'): 24_189,
    ('manga', 'bypopularity'): 62_697,
    ('manga', 'favorite'): 70_303,
}


def get_last_page(database, ranking_type):
    """Return the zero-based index of the last 500-entry ranking page.

    The index is derived from a hard-coded entry count and then checked
    against the API: the computed page must contain data and must not
    advertise a following page.

    Args:
        database: Database name, e.g. ``'anime'`` or ``'manga'``.
        ranking_type: Ranking type, e.g. ``'favorite'``.

    Returns:
        The index of the last page.

    Raises:
        ValueError: If no entry count is known for the given combination.
        AssertionError: If the API response shows the hard-coded count is
            out of date.
    """
    try:
        number_entries = _RANKING_ENTRY_COUNTS[(database, ranking_type)]
    except KeyError:
        raise ValueError(
            f'No known entry count for database={database!r}, '
            f'ranking_type={ranking_type!r}'
        ) from None

    last_page = math.ceil(number_entries / 500) - 1

    # Test that it's still correct. Explicit raises are used instead of
    # assert statements so the checks still run under `python -O`.
    params = {'ranking_type': ranking_type, 'limit': 500, 'offset': last_page*500}
    data = get_data(f'/{database}/ranking', params)
    if not data['data']:
        raise AssertionError(f'Page {last_page} is empty: the entry count is too high')
    if 'next' in data['paging']:
        raise AssertionError(f'Page {last_page} is not the last one: the entry count is too low')

    return last_page

### Keys

In [5]:
# Fields requested for both databases.
common_keys = [
    'id',
    'title',
    'main_picture',
    'alternative_titles',
    'start_date',
    'end_date',
    'synopsis',
    'mean',
    'rank',
    'popularity',
    'num_list_users',
    'num_scoring_users',
    'num_favorites',
    'nsfw',
    'genres',
    'created_at',
    'updated_at',
    'media_type',
    'status',
]

# Fields requested only for anime, appended after the common ones.
anime_keys = common_keys + [
    'num_episodes',
    'start_season',
    'broadcast',
    'source',
    'average_episode_duration',
    'rating',
    'studios',
]

# Fields requested only for manga, appended after the common ones.
manga_keys = common_keys + [
    'num_volumes',
    'num_chapters',
    'authors{id,first_name,last_name}',
]

# Field list to request, looked up by database name.
keys = dict(anime=anime_keys, manga=manga_keys)

### Merge Files

In [6]:
import shutil

def merge_anime(tmp_directory, save_file_path):
    """Concatenate every JSON page file in a directory into one JSON file.

    Args:
        tmp_directory: Directory containing the page files, each holding a
            JSON list.
        save_file_path: Path of the merged JSON file to write.
    """
    data = []
    # os.listdir returns names in arbitrary order; sorting the zero-padded
    # page names keeps the merged entries in page order.
    for file_name in sorted(os.listdir(tmp_directory)):
        # Ignore anything that is not a page file (stray files, subfolders).
        if not file_name.endswith('.json'):
            continue
        file_path = os.path.join(tmp_directory, file_name)
        with open(file_path, 'r') as f:
            data.extend(json.load(f))

    with open(save_file_path, 'w') as f:
        json.dump(data, f, indent=4)

    # The temporary directory is not removed here; the cleanup call is disabled.
    #shutil.rmtree(tmp_directory)

### Manga Alternative Titles Crash

In [9]:
# Manga ids that are skipped when alternative_titles are re-requested;
# presumably the entries whose alternative_titles make the request fail.
known_fails = [116770, 144472, 115838, 143751, 146583, 148716]

def manga_crash(endpoint, params):
    """Fetch a ranking page whose normal request failed.

    The page is first requested again without ``alternative_titles``. If it
    contains ids from ``known_fails``, the alternative titles of all other
    entries are then fetched in separate requests covering the gaps between
    those ids and merged back into the page data. If no known id is present,
    the page is returned without alternative titles.

    Args:
        endpoint: API endpoint of the failed request.
        params: Query parameters of the failed request; must contain
            ``offset``, ``limit`` and ``fields``. The dict is not modified.

    Returns:
        The page data in the same shape as ``get_data`` returns it.
    """
    first_offset = params['offset']
    page = first_offset // params['limit']
    print(f'Crashed at page {page}')

    # Work on a copy so the caller's dict is left untouched. The field is
    # dropped by name, so it is removed even when it is the last one listed.
    page_params = dict(params)
    page_params['fields'] = ','.join(
        field for field in params['fields'].split(',') if field != 'alternative_titles'
    )
    data = get_data(endpoint, page_params)

    ids = [manga['node']['id'] for manga in data['data']]
    # Position of each id on the page (first occurrence wins).
    position = {}
    for index, manga_id in enumerate(ids):
        position.setdefault(manga_id, index)

    failing_ids = set(known_fails)
    present_fails = [manga_id for manga_id in ids if manga_id in failing_ids]

    if not present_fails:
        print('Fail unknown...')
        return data

    print('Fails:', present_fails)

    # Page-relative boundaries: one before the first entry, every failing
    # entry, and one past the last entry. They are sorted by position so the
    # gaps stay valid when several known fails share a page.
    boundaries = [-1, *sorted(position[manga_id] for manga_id in present_fails), len(ids)]

    titles_params = dict(page_params, fields='alternative_titles')
    for previous, following in zip(boundaries, boundaries[1:]):
        gap = following - previous - 1
        if gap <= 0:
            # Adjacent failing entries (or one at the page edge): nothing to fetch.
            continue
        titles_params['offset'] = first_offset + previous + 1
        titles_params['limit'] = gap
        data_short = get_data(endpoint, titles_params)
        for manga in data_short['data']:
            node = manga['node']
            index = position.get(node['id'])
            # Skip entries that are not on the page or came back without titles.
            if index is not None and 'alternative_titles' in node:
                data['data'][index]['node']['alternative_titles'] = node['alternative_titles']
        time.sleep(1)

    return data

## Actual Scraping

In [7]:
# Scrape the anime database with ranking_type 'favorite'; the merged result is
# written to data/raw/anime_mal.json.
scrape_ranking('anime', 'favorite')

Scraped at: 2022-07-23 20.55.04


100%|██████████| 49/49 [02:26<00:00,  2.99s/it]


In [8]:
# Scrape the manga database with ranking_type 'bypopularity'; the merged result
# is written to data/raw/manga_mal.json.
scrape_ranking('manga', 'bypopularity')

Scraped at: 2022-07-23 20.59.08


100%|██████████| 120/120 [08:09<00:00,  4.08s/it]


In [8]:
# Scrape the manga database with ranking_type 'favorite'.
# NOTE(review): the output path depends only on the database name, so this run
# writes to the same data/raw/manga_mal.json as the 'bypopularity' run.
scrape_ranking('manga', 'favorite')

Scraped at: 2022-07-25 10.57.44


 49%|████▉     | 66/135 [04:27<04:42,  4.10s/it]

Crashed at page 66
Fails: [116770]


 61%|██████▏   | 83/135 [05:39<03:37,  4.19s/it]

Crashed at page 83
Fails: [144472]


 70%|██████▉   | 94/135 [06:31<02:58,  4.34s/it]

Crashed at page 94
Fails: [115838]


 72%|███████▏  | 97/135 [06:47<03:02,  4.81s/it]

Crashed at page 97
Fails: [143751]


 76%|███████▌  | 102/135 [07:13<02:34,  4.67s/it]

Crashed at page 102
Fails: [146583]


 95%|█████████▍| 128/135 [09:07<00:29,  4.18s/it]

Crashed at page 128
Fail unknown...


100%|██████████| 135/135 [09:33<00:00,  4.25s/it]


- Page 128 failed for an unknown reason (no ID from `known_fails` was on it), so 499 `alternative_titles` were lost on that page.