## MAL API Setup

In [1]:
import requests

# Base URL of the MyAnimeList API; endpoints are appended to it.
api_url = 'https://api.myanimelist.net/v2'

# A Client ID is needed (https://myanimelist.net/apiconfig)
# It is read from a local file so the credential never appears in the notebook.
with open('client_id.txt', 'r') as f:
    # Strip surrounding whitespace: editors usually end the file with a newline,
    # which would otherwise become part of the header value.
    CLIENT_ID = f.read().strip()

# Header sent with every API request.
headers = {'X-MAL-CLIENT-ID': CLIENT_ID}

def get_data(endpoint, params=None, timeout=30):
    """Send a GET request to the API and return the decoded JSON body.

    Parameters
    ----------
    endpoint : str
        Path appended to ``api_url`` (e.g. ``'/manga/ranking'``).
    params : dict, optional
        Query parameters. They are handed to ``requests`` so that keys and
        values are URL-encoded instead of being concatenated verbatim.
    timeout : float, optional
        Seconds to wait for the server before giving up; without it a stalled
        connection would block the notebook indefinitely.

    Returns
    -------
    The parsed JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    response = requests.get(api_url + endpoint, params=params, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()


## Scrape Manga

In [74]:
import json
import time
import os

# Directory where every scraped ranking page is stored as its own JSON file.
scraping_save_pages = 'data/data_tmp/manga_pages'

# Create the saving directory (and its parents) if it doesn't exist.
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(scraping_save_pages, exist_ok=True)

# Ranking endpoint and number of entries requested per page.
endpoint = '/manga/ranking'
limit = 500

# Fields requested for every manga.
manga_keys = ['id', 'title', 'main_picture', 'alternative_titles', 'start_date', 'end_date', 'synopsis', 'mean', 'rank', 'popularity',
              'num_list_users', 'num_scoring_users', 'num_favorites', 'nsfw', 'genres', 'created_at', 'updated_at', 'media_type', 'status',
              'num_volumes', 'num_chapters', 'authors{id,first_name,last_name}']

# Same fields without 'alternative_titles'; used as a fallback when the full request fails.
short_keys = [key for key in manga_keys if key != 'alternative_titles']

# Manga ids that are skipped when 'alternative_titles' is requested separately in the fallback.
known_fails = [116770, 144472, 115838, 143751, 146583]

# Filled by scrape_page with the alternative titles recovered through the fallback.
afegits = []

def scrape_page(page):
    """Download one page of the manga ranking and save its entries as JSON.

    The page is first requested with every field in ``manga_keys``. If that
    request fails, it is retried with ``short_keys`` (no ``alternative_titles``)
    and the alternative titles are then fetched separately, in consecutive
    sub-ranges that skip the positions of the ids listed in ``known_fails``.

    Parameters
    ----------
    page : int
        Zero-based page number; the request offset is ``page * limit``.

    Side effects
    ------------
    Writes ``page<NNN>.json`` into ``scraping_save_pages`` and appends every
    recovered ``(id, alternative_titles)`` pair to ``afegits``.
    """
    offset = page * limit
    params = {'ranking_type': 'favorite', 'limit': limit, 'offset': offset, 'fields': ','.join(manga_keys)}
    try:
        data = get_data(endpoint, params)
    except requests.RequestException:
        # Only request failures trigger the fallback; a bare `except` would also
        # swallow KeyboardInterrupt and make the loop impossible to stop.
        print('Yikes')
        params['fields'] = ','.join(short_keys)
        data = get_data(endpoint, params)

        # Position of every id inside this page (first occurrence wins).
        position_by_id = {}
        for position, manga in enumerate(data['data']):
            position_by_id.setdefault(manga['node']['id'], position)

        # Absolute offsets of the failing entries, in ascending order. Sorting
        # matters: known_fails is not ordered by rank, and unsorted boundaries
        # would yield negative limits when two failures share a page.
        failing = sorted(offset + position_by_id[fail] for fail in known_fails if fail in position_by_id)
        problems = [offset - 1] + failing + [offset + limit]

        alternative_titles = []
        params['fields'] = 'alternative_titles'
        for previous, current in zip(problems, problems[1:]):
            span = current - previous - 1
            if span <= 0:
                # Failure at the first/last position or two adjacent failures:
                # there is nothing in between to request.
                continue
            params['offset'] = previous + 1
            params['limit'] = span
            data_short = get_data(endpoint, params)
            alternative_titles.extend((manga['node']['id'], manga['node']['alternative_titles']) for manga in data_short['data'])
            time.sleep(1)

        for manga_id, alt_tit in alternative_titles:
            # Record each recovered pair once (not the whole list on every iteration).
            afegits.append((manga_id, alt_tit))
            # The ranking is live, so an id may have left this page between
            # requests; only patch entries that are actually present.
            if manga_id in position_by_id:
                data['data'][position_by_id[manga_id]]['node']['alternative_titles'] = alt_tit

    useful = [manga['node'] for manga in data['data']]
    with open(os.path.join(scraping_save_pages, f'page{str(page).zfill(3)}.json'), 'w') as f:
        json.dump(useful, f, indent=4)

In [76]:
import math

# 25 June 2022
previous_total_manga = 66_605
# Zero-based index of the page that holds the last entry
# (integer form of ceil(total / limit) - 1).
previous_last_page = (previous_total_manga - 1) // limit

# Request that page again: it must still contain entries and must not link to a
# following page, otherwise the total recorded above is out of date.
data = get_data(
    endpoint,
    dict(
        ranking_type='favorite',
        limit=limit,
        offset=previous_last_page * limit,
        fields=','.join(manga_keys),
    ),
)
assert len(data['data']) > 0
assert 'next' not in data['paging']

last_page = previous_last_page

last_page

133

In [77]:
import tqdm
import time

# Scrape pages 0..last_page in order with a progress bar, waiting one second
# after each page (presumably to go easy on the API).
for page in tqdm.tqdm(range(last_page + 1)):
    scrape_page(page)
    time.sleep(1)

 49%|████▉     | 66/134 [04:30<04:31,  3.99s/it]

Yikes


 61%|██████    | 82/134 [05:40<03:38,  4.21s/it]

Yikes


 69%|██████▉   | 93/134 [06:31<02:54,  4.27s/it]

Yikes


 72%|███████▏  | 96/134 [06:50<03:13,  5.08s/it]

Yikes


 75%|███████▌  | 101/134 [07:16<02:32,  4.62s/it]

Yikes


100%|██████████| 134/134 [09:39<00:00,  4.33s/it]


## Merge Files

In [87]:
import json
import os

scraping_save_pages = 'data/data_tmp/manga_pages'

# os.listdir returns names in arbitrary order and may include non-page entries
# (e.g. checkpoint folders). Keep only the JSON pages and sort them so the
# merged list follows the page order on every run.
page_files = sorted(name for name in os.listdir(scraping_save_pages) if name.endswith('.json'))

# Concatenate the entries of every page into one flat list.
data = []
for file_name in page_files:
    file_path = os.path.join(scraping_save_pages, file_name)
    with open(file_path, 'r') as f:
        data.extend(json.load(f))

len(data)

66605

In [88]:
# Persist the merged entries as one indented JSON file for the cleaning step.
with open('data/data_tmp/manga_raw.json', 'w') as raw_file:
    json.dump(data, raw_file, indent=4)

## Manga cleaning

In [99]:
import pandas as pd
import numpy as np
import json

# Load the merged raw dump.
with open('data/data_tmp/manga_raw.json', 'r') as raw_file:
    data = json.load(raw_file)

# Flatten nested objects into columns joined with '_' (e.g. main_picture_medium).
manga = pd.json_normalize(data, sep='_')

# Remove the 4 duplicated entries in the data; API issues? It happened twice between page 6 and 7...
manga = manga.drop_duplicates(subset=['id']).reset_index(drop=True)

# Use Timestamps
for date_col in ('start_date', 'end_date'):
    manga[date_col] = pd.to_datetime(manga[date_col])

# Avoid floats and zeroes marking nsfw: zeroes become missing values and the
# columns use the nullable integer dtype.
for count_col in ('rank', 'popularity', 'num_volumes', 'num_chapters'):
    manga[count_col] = manga[count_col].replace(0, np.nan).astype('Int64')

# Only keep names
manga['genres'] = manga['genres'].apply(lambda x: [] if x is np.nan else [dic['name'] for dic in x])

# Authors
def author_format(authors):
    """Format the raw author entries of one manga as display strings.

    Each entry becomes ``"Last, First (Role)"``. When one of the name parts is
    empty only the other one is used, e.g. ``"Studio Gaga (Art)"``.

    Parameters
    ----------
    authors : iterable of dict, or a missing value
        Entries with a ``'node'`` dict (``'first_name'``, ``'last_name'``) and
        a ``'role'``. Missing values (``None`` or NaN) mean "no authors".

    Returns
    -------
    list of str
        One formatted string per author; empty when there are none.
    """
    # Missing cells hold NaN (a float). Checking the type is more reliable than
    # an identity comparison against np.nan.
    if authors is None or isinstance(authors, float):
        return []
    output = []
    for author in authors:
        node = author['node']
        # Join only the name parts that are present, so no stray ", " appears.
        name = ', '.join(part for part in (node['last_name'], node['first_name']) if part)
        output.append(f"{name} ({author['role']})")
    return output
# Turn each manga's raw author entries into a list of display strings.
manga['authors'] = manga['authors'].apply(author_format)

# MyAnimeList edits: parse the timestamps and drop the timezone information.
for edit_col in ('created_at', 'updated_at'):
    manga[edit_col] = pd.to_datetime(manga[edit_col]).dt.tz_convert(None)

# Avoid empty string: placeholder texts are replaced by missing values.
blank_synopsis = manga['synopsis'].isin(['', ' ', 'N/A', 'n/a'])
manga.loc[blank_synopsis, 'synopsis'] = np.nan
blank_title_en = manga['alternative_titles_en'].isin(['', 'N/A'])
manga.loc[blank_title_en, 'alternative_titles_en'] = np.nan
manga['alternative_titles_ja'] = manga['alternative_titles_ja'].replace('', np.nan)
# Missing synonyms become '' first, so that list('') gives an empty list.
manga['alternative_titles_synonyms'] = manga['alternative_titles_synonyms'].fillna('').apply(list)


order = [
    # 10 most important attributes, appearing first on kaggle
    'id', 'title', 'media_type', 'mean', 'num_scoring_users',
    'status', 'num_volumes', 'num_chapters', 'start_date', 'end_date',
    # Other important attributes
    'num_list_users', 'popularity', 'num_favorites', 'rank',
    # Multivalued attributes
    'genres', 'authors',
    # Description, MyAnimeList edits
    'synopsis', 'nsfw', 'created_at', 'updated_at',
    # Media data
    'main_picture_medium', 'main_picture_large',
    # Other titles
    'alternative_titles_en', 'alternative_titles_ja', 'alternative_titles_synonyms',
]


# Keep only the listed columns, in that order.
manga = manga[order]

# Save to csv
manga.to_csv('data/manga.csv', index=False)

# Show every column when displaying the frame.
pd.options.display.max_columns = None
manga.head(1)

Unnamed: 0,id,title,media_type,mean,num_scoring_users,status,num_volumes,num_chapters,start_date,end_date,num_list_users,popularity,num_favorites,rank,genres,authors,synopsis,nsfw,created_at,updated_at,main_picture_medium,main_picture_large,alternative_titles_en,alternative_titles_ja,alternative_titles_synonyms
0,2,Berserk,manga,9.45,262620,currently_publishing,,,1989-08-25,NaT,540237,2,101786,1,"[Action, Adventure, Award Winning, Drama, Fant...","[Miura, Kentarou (Story & Art), Studio Gaga (Art]","Guts, a former mercenary now known as the ""Bla...",white,1970-01-01,2022-06-23 08:30:44,https://api-cdn.myanimelist.net/images/manga/1...,https://api-cdn.myanimelist.net/images/manga/1...,Berserk,ベルセルク,[Berserk: The Prototype]


## Load Manga

In [100]:
from ast import literal_eval
import pandas as pd

# Read the cleaned CSV; the richer dtypes are restored below, one group of columns at a time.
manga = pd.read_csv('data/manga.csv')

# Date and timestamp columns back to datetimes.
columns_dtype_datetime = ['start_date', 'end_date', 'created_at', 'updated_at']
manga = manga.assign(**{name: pd.to_datetime(manga[name]) for name in columns_dtype_datetime})

# Counts with missing values back to the nullable integer dtype.
columns_dtype_Int64 = ['num_volumes', 'num_chapters', 'popularity', 'rank']
manga = manga.astype({name: 'Int64' for name in columns_dtype_Int64})

# List columns were written as their text representation; parse them back into lists.
columns_dtype_list = ['genres', 'authors', 'alternative_titles_synonyms']
manga = manga.assign(**{name: manga[name].map(literal_eval) for name in columns_dtype_list})

manga.head(1)

Unnamed: 0,id,title,media_type,mean,num_scoring_users,status,num_volumes,num_chapters,start_date,end_date,num_list_users,popularity,num_favorites,rank,genres,authors,synopsis,nsfw,created_at,updated_at,main_picture_medium,main_picture_large,alternative_titles_en,alternative_titles_ja,alternative_titles_synonyms
0,2,Berserk,manga,9.45,262620,currently_publishing,,,1989-08-25,NaT,540237,2,101786,1,"[Action, Adventure, Award Winning, Drama, Fant...","[Miura, Kentarou (Story & Art), Studio Gaga (Art]","Guts, a former mercenary now known as the ""Bla...",white,1970-01-01,2022-06-23 08:30:44,https://api-cdn.myanimelist.net/images/manga/1...,https://api-cdn.myanimelist.net/images/manga/1...,Berserk,ベルセルク,[Berserk: The Prototype]
