## Scrape Anime

In [4]:
import json
import os

import requests

# Directory that receives one JSON file per scraped listing page.
anime_path = 'data/raw/anime_jikan'

# Anime listing endpoint; the page number is passed as a query parameter.
url = 'https://api.jikan.moe/v4/anime'

def scrape_page(url, page, timeout=30):
    """Download one page of the anime listing and save its records to disk.

    The ``data`` entry of the JSON response is written to
    ``<anime_path>/pageNNN.json`` (page number zero-padded to three digits).

    Parameters
    ----------
    url : str
        Listing endpoint, without a query string.
    page : int
        Page number to request.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on a stalled connection.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    # Let requests build the query string instead of concatenating it by hand.
    response = requests.get(url, params={'page': page}, timeout=timeout)
    response.raise_for_status()
    payload = response.json()

    # Create the target directory on first use so a fresh checkout can run the scrape.
    os.makedirs(anime_path, exist_ok=True)
    page_file = os.path.join(anime_path, f'page{str(page).zfill(3)}.json')
    with open(page_file, 'w') as f:
        json.dump(payload['data'], f, indent=4)

In [5]:
# Ask the listing endpoint (default page) how many pages exist in total.
first_response = requests.get(url, timeout=30)
# Fail with the HTTP error itself rather than a KeyError on an error payload.
first_response.raise_for_status()
last_page = first_response.json()['pagination']['last_visible_page']
last_page

984

In [7]:
import tqdm
import time

# Minimum spacing between the start of two consecutive requests, in seconds
# (presumably chosen to stay under the API rate limit -- TODO confirm).
wait = 1.2

page_numbers = tqdm.trange(1, last_page + 1)
for page in page_numbers:
    started_at = time.perf_counter()
    scrape_page(url, page)
    finished_at = time.perf_counter()
    # Sleep only for the part of the interval the request itself did not use.
    remaining = started_at + wait - finished_at
    time.sleep(max(0, remaining))

100%|██████████| 984/984 [19:49<00:00,  1.21s/it]


## Merge Files

In [8]:
import os

# Directory holding the per-page JSON dumps to merge.
scraping_save_pages = 'data/raw/anime_jikan'

# Collect the records of every saved page into one flat list.
data = []
# os.listdir returns entries in arbitrary order; sorting keeps the merge reproducible.
for file_name in sorted(os.listdir(scraping_save_pages)):
    # Skip anything that is not a JSON dump (hidden files, checkpoint folders, ...).
    if not file_name.endswith('.json'):
        continue
    file_path = os.path.join(scraping_save_pages, file_name)
    with open(file_path, 'r') as f:
        page_records = json.load(f)
    data.extend(page_records)

len(data)

24595

In [9]:
# Keep the first record seen for each mal_id and report every later duplicate.
ids = set()
data_unique = []
for entry in data:
    mal_id = entry['mal_id']
    if mal_id in ids:
        print(mal_id, 'repeteated!')
        continue
    ids.add(mal_id)
    data_unique.append(entry)

len(data_unique)

50684 repeteated!
50684 repeteated!


24593

In [11]:
# Write the de-duplicated records to a single pretty-printed JSON file.
with open('data/raw/anime_jikan.json', mode='w') as merged_file:
    json.dump(data_unique, merged_file, indent=4)

## Anime Cleaning

In [26]:
import pandas as pd
import numpy as np
import json

# Read the merged records back from disk.
with open('data/raw/anime_jikan.json', 'r') as merged_file:
    data = json.load(merged_file)

anime = pd.DataFrame(data)

# Order rows by the sum of their score rank and member-count rank (highest
# score / most members first); the helper column only exists for the sort.
anime = (
    anime
    .assign(tmp=anime['score'].rank(ascending=False) + anime['members'].rank(ascending=False))
    .sort_values('tmp')
    .drop(columns=['tmp'])
)

# Only keep names
def _entry_names(entries):
    """Return the 'name' of every dict in ``entries``.

    Cells that do not hold a list (NaN from a missing key, None from a JSON
    null, ...) yield an empty list. The previous ``x is np.nan`` identity test
    only recognised the NaN singleton and raised TypeError on anything else.
    """
    if not isinstance(entries, list):
        return []
    return [entry['name'] for entry in entries]

for col in ['genres', 'themes', 'demographics', 'studios', 'producers', 'licensors']:
    anime[col] = anime[col].apply(_entry_names)

# Drop useless column
# Guard: the column may only be discarded if it is an empty list on every row.
explicit_genres_as_text = anime['explicit_genres'].astype('str')
assert (explicit_genres_as_text == '[]').all()
anime = anime.drop(columns=['explicit_genres'])

# Save the url alone
# Pull the 'url' entry out of each trailer dict into its own column.
anime['trailer_url'] = anime['trailer'].str.get('url')

# Persist the cleaned table as CSV, without the index column.
cleaned_csv_path = 'data/anime_jikan.csv'
anime.to_csv(cleaned_csv_path, index=False)

print(anime.shape)

# Lift the column display cap so the one-row preview shows every field.
pd.set_option('display.max_columns', None)
anime.head(n=1)

(24593, 34)


Unnamed: 0,mal_id,url,images,trailer,title,title_english,title_japanese,title_synonyms,type,source,episodes,status,airing,aired,duration,rating,score,scored_by,rank,popularity,members,favorites,synopsis,background,season,year,broadcast,producers,licensors,studios,genres,themes,demographics,trailer_url
3962,5114,https://myanimelist.net/anime/5114/Fullmetal_A...,{'jpg': {'image_url': 'https://cdn.myanimelist...,"{'youtube_id': '--IcmZkvL0Q', 'url': 'https://...",Fullmetal Alchemist: Brotherhood,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,[Hagane no Renkinjutsushi: Fullmetal Alchemist...,TV,Manga,64.0,Finished Airing,False,"{'from': '2009-04-05T00:00:00+00:00', 'to': '2...",24 min per ep,R - 17+ (violence & profanity),9.13,1865027.0,2.0,3,2922248,204044,After a horrific alchemy experiment goes wrong...,,spring,2009.0,"{'day': 'Sundays', 'time': '17:00', 'timezone'...","[Aniplex, Square Enix, Mainichi Broadcasting S...","[Funimation, Aniplex of America]",[Bones],"[Action, Adventure, Drama, Fantasy]",[Military],[Shounen],https://www.youtube.com/watch?v=--IcmZkvL0Q


## Load Anime

In [20]:
import pandas as pd

# Reload the cleaned table written by the cleaning step.
# NOTE(review): list- and dict-valued columns were serialized as text by
# to_csv, so they presumably come back as plain strings -- confirm before use.
anime_csv_path = 'data/anime_jikan.csv'
anime = pd.read_csv(anime_csv_path)

anime.head(n=1)

Unnamed: 0,mal_id,url,images,trailer,title,title_english,title_japanese,title_synonyms,type,source,episodes,status,airing,aired,duration,rating,score,scored_by,rank,popularity,members,favorites,synopsis,background,season,year,broadcast,producers,licensors,studios,genres,explicit_genres,themes,demographics
0,5114,https://myanimelist.net/anime/5114/Fullmetal_A...,{'jpg': {'image_url': 'https://cdn.myanimelist...,"{'youtube_id': '--IcmZkvL0Q', 'url': 'https://...",Fullmetal Alchemist: Brotherhood,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,['Hagane no Renkinjutsushi: Fullmetal Alchemis...,TV,Manga,64.0,Finished Airing,False,"{'from': '2009-04-05T00:00:00+00:00', 'to': '2...",24 min per ep,R - 17+ (violence & profanity),9.13,1865027.0,2.0,3,2922248,204044,After a horrific alchemy experiment goes wrong...,,spring,2009.0,"{'day': 'Sundays', 'time': '17:00', 'timezone'...","[{'mal_id': 17, 'type': 'anime', 'name': 'Anip...","[{'mal_id': 102, 'type': 'anime', 'name': 'Fun...","[{'mal_id': 4, 'type': 'anime', 'name': 'Bones...","[{'mal_id': 1, 'type': 'anime', 'name': 'Actio...",[],"[{'mal_id': 38, 'type': 'anime', 'name': 'Mili...","[{'mal_id': 27, 'type': 'anime', 'name': 'Shou..."
