## MAL API Setup

In [1]:
import requests

# Base URL of the MyAnimeList API; endpoint paths are appended to it.
api_url = 'https://api.myanimelist.net/v2'

# A Client ID is needed (https://myanimelist.net/apiconfig)
with open('client_id.txt', 'r') as f:
    # Strip surrounding whitespace: text files usually end with a newline,
    # and a newline inside a header value makes the request invalid.
    CLIENT_ID = f.read().strip()

# Header sent with every API request to identify the client.
headers = {'X-MAL-CLIENT-ID': CLIENT_ID}

def get_data(endpoint, params=None, timeout=30):
    """Send a GET request to the API and return the decoded JSON body.

    Args:
        endpoint: Path appended to ``api_url`` (e.g. ``'/anime/ranking'``).
        params: Optional dict of query-string parameters.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests would wait indefinitely on a stalled connection.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    url = api_url + endpoint
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()

## Scrape Anime

In [2]:
import json
import os

# Directory in which every scraped ranking page is stored as its own JSON file.
scraping_save_pages = 'data/data_tmp/anime_pages'

# Create the saving directory (including parents) if it doesn't exist.
# exist_ok=True replaces the separate existence check, so there is no gap
# between checking and creating.
os.makedirs(scraping_save_pages, exist_ok=True)

endpoint = '/anime/ranking'
limit = 500  # Number of entries requested per page

# Fields requested for every anime (joined with ',' into the 'fields' parameter).
anime_keys = ['id', 'title', 'main_picture', 'alternative_titles', 'start_date', 'end_date', 'synopsis', 'mean', 'rank', 'popularity',
              'num_list_users', 'num_scoring_users', 'num_favorites', 'nsfw', 'genres', 'created_at', 'updated_at', 'media_type', 'status',
              'num_episodes', 'start_season', 'broadcast', 'source', 'average_episode_duration', 'rating', 'studios']

def scrape_page(page):
    """Download one page of the popularity ranking and save it to disk.

    Requests ``limit`` entries starting at offset ``page*limit``, keeps the
    'node' object of each returned item and writes the resulting list as
    indented JSON to a per-page file inside ``scraping_save_pages``.

    Args:
        page: Zero-based page index.
    """
    query = {
        'ranking_type': 'bypopularity',
        'limit': limit,
        'offset': page*limit,
        'fields': ','.join(anime_keys),
    }
    response = get_data(endpoint, query)

    # Each item wraps the actual anime record under the 'node' key.
    nodes = []
    for entry in response['data']:
        nodes.append(entry['node'])

    # The page number is zero-padded to two digits in the file name.
    target = scraping_save_pages + f'/page{str(page).zfill(2)}.json'
    with open(target, 'w') as out:
        json.dump(nodes, out, indent=4)

In [3]:
import math

# Number of anime as of 3 July 2022
previous_total_anime = 20_602
# Zero-based index of the page that holds the last of those entries
previous_last_page = math.ceil(previous_total_anime / limit) - 1

# Request that page again to check it is still the last one.
data = get_data(
    endpoint,
    {
        'ranking_type': 'bypopularity',
        'limit': limit,
        'offset': previous_last_page*limit,
        'fields': ','.join(anime_keys),
    },
)
# The page must contain entries and must not link to a following page.
assert len(data['data']) > 0
assert 'next' not in data['paging']

last_page = previous_last_page

last_page

41

In [4]:
import tqdm
import time

# Scrape pages 0..last_page with a progress bar, sleeping one second after each page.
for page in tqdm.tqdm(range(last_page + 1)):
    scrape_page(page)
    time.sleep(1)

100%|██████████| 42/42 [02:48<00:00,  4.02s/it]


## Merge Files

In [5]:
import os

import json  # Used below; imported here so the cell also runs on a fresh kernel

scraping_save_pages = 'data/data_tmp/anime_pages'

# os.listdir returns names in arbitrary order, so sort them: the zero-padded
# page numbers in the file names then restore the scraping order.
# Only *.json files are read, so stray files in the directory are skipped
# instead of crashing json.load.
page_files = sorted(
    name for name in os.listdir(scraping_save_pages) if name.endswith('.json')
)

# Concatenate the anime lists of all pages into one list.
data = []
for file_name in page_files:
    file_path = os.path.join(scraping_save_pages, file_name)
    with open(file_path, 'r') as f:
        data.extend(json.load(f))

len(data)

20602

In [6]:
# Write the merged list of anime records to a single indented JSON file.
with open('data/data_tmp/anime_raw.json', mode='w') as raw_file:
    json.dump(data, raw_file, indent=4)

## Anime Cleaning

In [13]:
import pandas as pd
import numpy as np
import json

def _names_only(entries):
    """Return the 'name' of every dict in ``entries``, or [] if it is not a list.

    Missing values arrive as NaN rather than as a list. Testing the type is
    more robust than an identity test against ``np.nan``, which only matches
    that one NaN object.
    """
    if not isinstance(entries, list):
        return []
    return [entry['name'] for entry in entries]


with open('data/data_tmp/anime_raw.json', 'r') as f:
    data = json.load(f)

# Flatten nested objects into columns whose names are joined with '_'
anime = pd.json_normalize(data, sep='_')

# Use Timestamps
anime['start_date'] = pd.to_datetime(anime['start_date'])
anime['end_date'] = pd.to_datetime(anime['end_date'])

# Avoid false zeroes and unnecessary floats
anime['num_episodes'] = anime['num_episodes'].replace(0, np.nan).astype('Int64')
anime['rank'] = anime['rank'].astype('Int64')

# Use Timedelta (a duration of 0 seconds is treated as missing)
anime['average_episode_duration'] = pd.to_timedelta(anime['average_episode_duration'].replace(0, np.nan), unit='s')

# Avoid floats for the year; keep only the time of day of the broadcast start
anime['start_season_year'] = anime['start_season_year'].astype('Int64')
anime['broadcast_start_time'] = pd.to_datetime(anime['broadcast_start_time']).dt.time

# Only keep names
anime['genres'] = anime['genres'].apply(_names_only)
anime['studios'] = anime['studios'].apply(_names_only)

# MyAnimeList edits
anime['created_at'] = pd.to_datetime(anime['created_at'])
anime['updated_at'] = pd.to_datetime(anime['updated_at'])

# Avoid empty string
anime['synopsis'] = anime['synopsis'].replace('', np.nan)
anime['alternative_titles_en'] = anime['alternative_titles_en'].replace('', np.nan)
anime['alternative_titles_ja'] = anime['alternative_titles_ja'].replace('', np.nan)
                      

# Column order of the exported table, grouped by theme.
order = [
    # 10 most important attributes, appearing first on kaggle
    'id', 'title', 'media_type', 'mean', 'num_scoring_users',
    'status', 'num_episodes', 'start_date', 'end_date', 'source',

    # Other important attributes
    'num_list_users', 'popularity', 'num_favorites', 'rank',
    'average_episode_duration', 'rating', 'start_season_year',
    'start_season_season', 'broadcast_day_of_the_week', 'broadcast_start_time',

    # Multivalued attributes
    'genres', 'studios',

    # Description, MyAnimeList edits
    'synopsis', 'nsfw', 'created_at', 'updated_at',

    # Media data
    'main_picture_medium', 'main_picture_large',

    # Other titles
    'alternative_titles_en', 'alternative_titles_ja', 'alternative_titles_synonyms',
]

# Keep only the listed columns, in that order.
anime = anime.loc[:, order]

# Save to csv
anime.to_csv('data/anime.csv', index=False)

# Show every column when displaying the frame.
pd.options.display.max_columns = None
anime.head(1)

Unnamed: 0,id,title,media_type,mean,num_scoring_users,status,num_episodes,start_date,end_date,source,num_list_users,popularity,num_favorites,rank,average_episode_duration,rating,start_season_year,start_season_season,broadcast_day_of_the_week,broadcast_start_time,genres,studios,synopsis,nsfw,created_at,updated_at,main_picture_medium,main_picture_large,alternative_titles_en,alternative_titles_ja,alternative_titles_synonyms
0,16498,Shingeki no Kyojin,tv,8.53,2469488,finished_airing,25,2013-04-07,2013-09-29,manga,3452617,1,153103,103,0 days 00:24:00,r,2013,spring,sunday,01:58:00,"[Action, Drama, Gore, Military, Shounen, Survi...",[Wit Studio],"Centuries ago, mankind was slaughtered to near...",white,2012-12-05 12:03:21+00:00,2022-04-18 05:00:25+00:00,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,Attack on Titan,進撃の巨人,"[AoT, SnK]"


In [15]:
anime.shape

(20602, 31)

## Load Anime

In [19]:
from ast import literal_eval
import pandas as pd

def _parse_list(cell):
    """Turn the CSV text of a list column back into a Python list.

    An empty CSV cell is read back as NaN (a float), on which literal_eval
    raises ValueError, so anything that is not a string becomes [].
    """
    return literal_eval(cell) if isinstance(cell, str) else []


anime = pd.read_csv('data/anime.csv')

# The CSV stores everything as text/numbers; restore the richer dtypes below.
columns_dtype_datetime = ['start_date', 'end_date', 'created_at', 'updated_at']
for col in columns_dtype_datetime:
    anime[col] = pd.to_datetime(anime[col])

# Nullable integers, so missing values do not force a float column
columns_dtype_Int64 = ['num_episodes', 'popularity', 'rank', 'start_season_year']
for col in columns_dtype_Int64:
    anime[col] = anime[col].astype('Int64')

# Lists were written to the CSV as their string representation
columns_dtype_list = ['genres', 'studios', 'alternative_titles_synonyms']
for col in columns_dtype_list:
    anime[col] = anime[col].apply(_parse_list)

anime['broadcast_start_time'] = pd.to_datetime(anime['broadcast_start_time']).dt.time   # Time of day

anime['average_episode_duration'] = pd.to_timedelta(anime['average_episode_duration'])  # Duration

anime.head(1)

Unnamed: 0,id,title,media_type,mean,num_scoring_users,status,num_episodes,start_date,end_date,source,num_list_users,popularity,num_favorites,rank,average_episode_duration,rating,start_season_year,start_season_season,broadcast_day_of_the_week,broadcast_start_time,genres,studios,synopsis,nsfw,created_at,updated_at,main_picture_medium,main_picture_large,alternative_titles_en,alternative_titles_ja,alternative_titles_synonyms
0,16498,Shingeki no Kyojin,tv,8.53,2469488,finished_airing,25,2013-04-07,2013-09-29,manga,3452617,1,153103,103,0 days 00:24:00,r,2013,spring,sunday,01:58:00,"[Action, Drama, Gore, Military, Shounen, Survi...",[Wit Studio],"Centuries ago, mankind was slaughtered to near...",white,2012-12-05 12:03:21+00:00,2022-04-18 05:00:25+00:00,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,Attack on Titan,進撃の巨人,"[AoT, SnK]"
