## Scrape the Jikan Database

In [1]:
import requests
import json

# Base URL of the Jikan REST API (v4); endpoint paths such as '/anime' are appended to it.
api_url = 'https://api.jikan.moe/v4'

def scrape_page(endpoint, page, file_path, timeout=30):
    """Download one page of a paginated Jikan endpoint and save its records as JSON.

    Parameters
    ----------
    endpoint : str
        Path appended to ``api_url``, including the leading slash (e.g. ``'/anime'``).
    page : int
        Page number, sent as the ``page`` query parameter.
    file_path : str
        Destination file; it is overwritten if it already exists.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30). Without
        a timeout a stalled connection would block the scrape indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    KeyError
        If the decoded response has no ``'data'`` key.
    """
    # Let requests build the query string instead of concatenating it by hand.
    response = requests.get(api_url + endpoint, params={'page': page}, timeout=timeout)
    response.raise_for_status()

    # Only the 'data' payload is stored; the rest of the response body is dropped.
    records = response.json()['data']
    with open(file_path, 'w') as f:
        json.dump(records, f, indent=4)

In [2]:
import datetime
import tqdm
import time
import os

# Minimum time between the start of two consecutive page requests.
# An earlier attempt with 1.15 s crashed (presumably the API's rate limit — TODO confirm).
wait = 1.2 # seconds

def scrape_jikan_db(database):
    """Download every page of a Jikan collection into ``data/raw/<database>/``.

    The number of pages is read from the ``pagination.last_visible_page``
    field of the collection's first response. Each page is then fetched with
    ``scrape_page`` and written to ``page<NN>.json``, where the page number is
    zero-padded to the width of the last page number. Consecutive requests
    are started at least ``wait`` seconds apart.

    Parameters
    ----------
    database : str
        Name of the collection, used both as the endpoint (``'/' + database``)
        and as the output directory name.

    Raises
    ------
    requests.HTTPError
        If the initial pagination request returns an error status.
    """
    directory_path = f'data/raw/{database}'
    # exist_ok replaces the check-then-create pattern and is safe on re-runs.
    os.makedirs(directory_path, exist_ok=True)

    # Check the status explicitly: otherwise an error response would only
    # surface as an unrelated KeyError on 'pagination'. The timeout keeps a
    # stalled connection from blocking the notebook forever.
    first_response = requests.get(api_url + '/' + database, timeout=30)
    first_response.raise_for_status()
    last_page = first_response.json()['pagination']['last_visible_page']
    length = len(str(last_page))

    print('Started:', datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    for page in tqdm.trange(1, last_page + 1):
        start = time.perf_counter()
        scrape_page('/' + database, page, directory_path + f'/page{str(page).zfill(length)}.json')
        end = time.perf_counter()
        # Sleep only for whatever part of the `wait` window the request did not use.
        time.sleep(max(0, start + wait - end))

    print('Finished:', datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

## Merge Files

In [3]:
import shutil

def merge_files(database):
    """Merge the per-page JSON files of a collection into one JSON file.

    Reads every ``*.json`` file in ``data/raw/<database>/`` in sorted name
    order, concatenates their contents into one list, writes that list to
    ``data/raw/<database>.json`` and finally deletes the per-page directory.

    Parameters
    ----------
    database : str
        Name of the collection; selects both the input directory and the
        output file name.
    """
    directory_path = f'data/raw/{database}'

    # os.listdir returns entries in arbitrary order, so sort explicitly to
    # get a deterministic merge (zero-padded page names sort in page order).
    # Non-JSON entries such as hidden files are skipped instead of crashing.
    file_names = sorted(
        name for name in os.listdir(directory_path) if name.endswith('.json')
    )

    data = []
    for file_name in tqdm.tqdm(file_names):
        with open(os.path.join(directory_path, file_name), 'r') as f:
            data.extend(json.load(f))

    with open(f'data/raw/{database}.json', 'w') as f:
        json.dump(data, f, indent=4)

    # The per-page files are removed only after the merged file has been written.
    shutil.rmtree(directory_path)

## Actual Scraping

### Anime

In [4]:
# Download all /anime pages into data/raw/anime/ (986 pages, about 20 min in the recorded run).
scrape_jikan_db('anime')

Started: 2022-07-17 19:07:25


100%|██████████| 986/986 [19:55<00:00,  1.21s/it]

Finished: 2022-07-17 19:27:20





In [5]:
# Combine the per-page files into data/raw/anime.json and delete data/raw/anime/.
merge_files('anime')

100%|██████████| 986/986 [00:07<00:00, 134.38it/s]


### Manga

In [6]:
# Download all /manga pages into data/raw/manga/ (2656 pages, about 54 min in the recorded run).
scrape_jikan_db('manga')

Started: 2022-07-17 19:28:13


100%|██████████| 2656/2656 [53:30<00:00,  1.21s/it]

Finished: 2022-07-17 20:21:44





In [7]:
# Combine the per-page files into data/raw/manga.json and delete data/raw/manga/.
merge_files('manga')

100%|██████████| 2656/2656 [00:20<00:00, 131.17it/s]


### People

In [8]:
# Download all /people pages into data/raw/people/ (680 pages, about 14 min in the recorded run).
scrape_jikan_db('people')

Started: 2022-07-17 20:22:39


100%|██████████| 680/680 [13:42<00:00,  1.21s/it]

Finished: 2022-07-17 20:36:21





In [9]:
# Combine the per-page files into data/raw/people.json and delete data/raw/people/.
merge_files('people')

100%|██████████| 680/680 [00:04<00:00, 150.79it/s]


### Characters

In [10]:
# Download all /characters pages into data/raw/characters/ (5845 pages, about 2 h in the recorded run).
scrape_jikan_db('characters')

Started: 2022-07-17 20:36:34


100%|██████████| 5845/5845 [1:57:36<00:00,  1.21s/it]

Finished: 2022-07-17 22:34:10





In [11]:
# Combine the per-page files into data/raw/characters.json and delete data/raw/characters/.
merge_files('characters')

100%|██████████| 5845/5845 [00:38<00:00, 152.88it/s]


- Total wall-clock time for all four collections: 2022-07-17, 19:07:25 to 22:34:10 (about 3 h 27 min)