## Scrape the Jikan Database

In [1]:
import requests
import json

# Base URL of the Jikan REST API (v4); endpoint paths such as '/anime' are appended to it.
api_url = 'https://api.jikan.moe/v4'

def scrape_page(endpoint, page, file_path, timeout=30):
    """Download one page of a Jikan listing endpoint and save its records as JSON.

    Args:
        endpoint: Path appended to ``api_url``, e.g. ``'/anime'``.
        page: Page number, sent as the ``page`` query parameter.
        file_path: Destination file; it is overwritten if it already exists.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang a long-running scrape forever.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
        KeyError: If the response body has no ``'data'`` entry.
    """
    # Let requests build the query string instead of concatenating '?page=...',
    # which would produce a malformed URL if the endpoint already had a query.
    response = requests.get(api_url + endpoint, params={'page': page}, timeout=timeout)
    response.raise_for_status()
    payload = response.json()

    # Only the record list is stored; the pagination metadata is dropped.
    with open(file_path, 'w') as f:
        json.dump(payload['data'], f, indent=4)

In [2]:
import datetime
import tqdm
import time
import os

# Minimum number of seconds between the start of consecutive page requests.
# A spacing of 1.15 s crashed the scrape (presumably API rate limiting - confirm),
# so keep this at 1.2 s or above.
wait = 1.2

def scrape_jikan_db(database):
    """Scrape every page of a Jikan listing and merge them into one JSON file.

    Each page is written to its own file inside a temporary directory
    (``data/raw/tmp_<database>_jikan``), then ``merge_files`` combines the
    pages and removes that directory.

    Args:
        database: Name of the listing endpoint without the leading slash,
            e.g. ``'anime'`` or ``'manga'``.

    Raises:
        requests.HTTPError: If the pagination request or any page request
            returns a 4xx/5xx status.
    """
    import shutil  # local import: only needed here to reset the temp directory

    tmp_directory = f'data/raw/tmp_{database}_jikan'
    # A previous run that crashed leaves this directory behind. Start from a
    # clean slate instead of failing with FileExistsError or merging stale pages.
    shutil.rmtree(tmp_directory, ignore_errors=True)
    os.makedirs(tmp_directory)

    # Ask the API how many pages exist; fail loudly on an error response
    # rather than hitting an opaque KeyError on the missing 'pagination' key.
    first_response = requests.get(api_url + '/' + database, timeout=30)
    first_response.raise_for_status()
    last_page = first_response.json()['pagination']['last_visible_page']
    # Width used to zero-pad page numbers so file names sort in page order.
    length = len(str(last_page))

    scraped_at = datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S")
    print('Scraped at:', scraped_at)

    # The pagination request counts as a request too; pace it like the others.
    time.sleep(wait)

    for page in tqdm.trange(1, last_page + 1):
        request_started = time.perf_counter()
        scrape_page('/' + database, page, tmp_directory + f'/page{str(page).zfill(length)}.json')
        elapsed = time.perf_counter() - request_started
        # Sleep only for whatever is left of the `wait` window after the request.
        time.sleep(max(0, wait - elapsed))

    merge_files(tmp_directory, database)

In [3]:
import shutil

def merge_files(tmp_directory, database):
    """Concatenate all per-page JSON files into ``data/raw/<database>_jikan.json``.

    Every file in ``tmp_directory`` must contain a JSON list. The lists are
    joined in file-name order, written to the output file, and
    ``tmp_directory`` is then deleted.

    Args:
        tmp_directory: Directory holding the per-page JSON files.
        database: Name used to build the output file name.
    """
    merged = []
    # os.listdir returns entries in arbitrary order; sort so the records keep
    # page order (page file names are zero-padded, so name order == page order).
    for file_name in sorted(os.listdir(tmp_directory)):
        file_path = os.path.join(tmp_directory, file_name)
        with open(file_path, 'r') as f:
            page_records = json.load(f)
        merged.extend(page_records)

    with open(f'data/raw/{database}_jikan.json', 'w') as f:
        json.dump(merged, f, indent=4)

    # Remove the page files only after the merged file has been written.
    shutil.rmtree(tmp_directory)

## Actual Scraping

### Anime

In [4]:
# Full crawl of the anime listing; the recorded run fetched 988 pages in about 20 minutes.
scrape_jikan_db('anime')

Scraped at: 2022-07-23 20.48.32


100%|██████████| 988/988 [19:54<00:00,  1.21s/it]
100%|██████████| 988/988 [00:08<00:00, 114.55it/s]


### Manga

In [4]:
# Full crawl of the manga listing; the recorded run fetched 2656 pages in about 53 minutes.
scrape_jikan_db('manga')

Scraped at: 2022-07-25 10.57.49


100%|██████████| 2656/2656 [53:22<00:00,  1.21s/it]
