## Scrape Manga

In [1]:
import mal_api
import json
import os

# Directory where each scraped ranking page is stored as its own JSON file.
scraping_save_pages = 'data/raw/manga'

# exist_ok=True creates the directory only when it is missing, with no separate existence check.
os.makedirs(scraping_save_pages, exist_ok=True)

# Endpoint path handed to mal_api.get_data, and the number of entries requested per page.
endpoint = '/manga/ranking'
limit = 500

# Fields requested for every manga; they are comma-joined into the 'fields' request parameter.
manga_keys = ['id', 'title', 'main_picture', 'alternative_titles', 'start_date', 'end_date', 'synopsis', 'mean', 'rank', 'popularity',
              'num_list_users', 'num_scoring_users', 'num_favorites', 'nsfw', 'genres', 'created_at', 'updated_at', 'media_type', 'status',
              'num_volumes', 'num_chapters', 'authors{id,first_name,last_name}']

def scrape_page(page):
    """Fetch one page of the popularity ranking and store it as a JSON file.

    `page` is the zero-based page index; the request offset is page*limit.
    The result is written inside scraping_save_pages as 'pageNNN.json',
    with the page index zero-padded to three digits.
    """
    query = {
        'ranking_type': 'bypopularity',
        'limit': limit,
        'offset': page*limit,
        'fields': ','.join(manga_keys),
    }
    response = mal_api.get_data(endpoint, query)

    # Only the 'node' value of each returned entry is saved.
    nodes = []
    for entry in response['data']:
        nodes.append(entry['node'])

    page_path = scraping_save_pages + f'/page{str(page).zfill(3)}.json'
    with open(page_path, 'w') as page_file:
        json.dump(nodes, page_file, indent=4)

In [2]:
import math

# 12 July 2022
previous_total_manga = 59_767
# Zero-based index of the page that held the final entries at that total.
previous_last_page = math.ceil(previous_total_manga / limit) - 1

# Request that page again: it must still contain entries and must not link to a next page.
last_page_params = {
    'ranking_type': 'bypopularity',
    'limit': limit,
    'offset': previous_last_page*limit,
    'fields': ','.join(manga_keys),
}
data = mal_api.get_data(endpoint, last_page_params)
assert len(data['data']) > 0
assert 'next' not in data['paging']

last_page = previous_last_page

last_page

119

In [3]:
import tqdm
import time

# Scrape pages 0 through last_page, sleeping one second after each request.
page_numbers = range(last_page + 1)
for page in tqdm.tqdm(page_numbers):
    scrape_page(page)
    time.sleep(1)

100%|██████████| 120/120 [08:13<00:00,  4.11s/it]


## Merge Files

In [4]:
import os

scraping_save_pages = 'data/raw/manga'

# Merge the per-page files into one list.
# os.listdir returns names in arbitrary order, so the names are sorted: with the
# zero-padded page numbers in the file names this yields page order, and the merged
# result is the same on every run and platform.
# Entries that are not .json files are skipped so stray files cannot break the merge.
data = []
for file_name in sorted(os.listdir(scraping_save_pages)):
    if not file_name.endswith('.json'):
        continue
    file_path = os.path.join(scraping_save_pages, file_name)
    with open(file_path, 'r') as f:
        data.extend(json.load(f))

len(data)

59767

In [5]:
# Write the merged list to a single JSON file, indented by 4 spaces.
merged_path = 'data/raw/manga.json'
with open(merged_path, 'w') as merged_file:
    json.dump(data, merged_file, indent=4)

## Manga cleaning

In [8]:
import pandas as pd
import numpy as np
import json

with open('data/raw/manga.json', 'r') as f:
    data = json.load(f)

# Flatten nested objects into columns whose names are joined with '_'.
manga = pd.json_normalize(data, sep='_')

# Use Timestamps
manga['start_date'] = pd.to_datetime(manga['start_date'])
manga['end_date'] = pd.to_datetime(manga['end_date'])

# Avoid false zeroes and unnecessary floats: 0 is turned into a missing value,
# and the nullable Int64 dtype keeps the remaining values as integers.
manga['num_volumes'] = manga['num_volumes'].replace(0, np.nan).astype('Int64')
manga['num_chapters'] = manga['num_chapters'].replace(0, np.nan).astype('Int64')
manga['rank'] = manga['rank'].astype('Int64')

# Only keep names. Anything that is not a list (i.e. a missing value) becomes an
# empty list; checking the type does not rely on the value being the np.nan object itself.
manga['genres'] = manga['genres'].apply(lambda x: [dic['name'] for dic in x] if isinstance(x, list) else [])

# Authors
def author_format(authors):
    """Format raw author entries as display strings.

    Each entry is a dict with a 'node' (holding 'first_name' and 'last_name')
    and a 'role'. An entry becomes "Last, First (Role)", or "Last (Role)" when
    the first name is empty.

    Returns a list of those strings, or an empty list when `authors` is not a
    list (for example the NaN that stands for a missing value).
    """
    if not isinstance(authors, list):
        return []
    output = []
    for author in authors:
        person = author['node']
        if person['first_name']:
            output.append(f"{person['last_name']}, {person['first_name']} ({author['role']})")
        else:
            # The role is wrapped in a complete pair of parentheses in both branches.
            output.append(f"{person['last_name']} ({author['role']})")
    return output
# Replace each raw author list with its formatted strings.
manga['authors'] = manga['authors'].apply(author_format)

# MyAnimeList edits
for edit_col in ('created_at', 'updated_at'):
    manga[edit_col] = pd.to_datetime(manga[edit_col])

# Avoid empty string: placeholder texts are replaced by missing values.
blank_synopsis = manga['synopsis'].isin(['', ' ', 'N/A', 'n/a'])
manga.loc[blank_synopsis, 'synopsis'] = np.nan

blank_english = manga['alternative_titles_en'] == ''
manga.loc[blank_english, 'alternative_titles_en'] = np.nan

titles_ja = manga['alternative_titles_ja']
manga['alternative_titles_ja'] = titles_ja.replace('', np.nan)

# Missing synonyms are filled with '' first, which list() turns into an empty list.
synonyms = manga['alternative_titles_synonyms'].fillna('')
manga['alternative_titles_synonyms'] = synonyms.apply(list)


# Column order of the saved file.
order = [
    # 10 most important attributes, appearing first on kaggle
    'id', 'title', 'media_type', 'mean', 'num_scoring_users',
    'status', 'num_volumes', 'num_chapters', 'start_date', 'end_date',

    # Other important attributes
    'num_list_users', 'popularity', 'num_favorites', 'rank',

    # Multivalued attributes
    'genres', 'authors',

    # Description, MyAnimeList edits
    'synopsis', 'nsfw', 'created_at', 'updated_at',

    # Media data
    'main_picture_medium', 'main_picture_large',

    # Other titles
    'alternative_titles_en', 'alternative_titles_ja', 'alternative_titles_synonyms',
]

manga = manga.loc[:, order]

# Save to csv
manga.to_csv('data/manga.csv', index=False)

print(manga.shape)

pd.set_option('display.max_columns', None)
manga.head(1)

(59767, 25)


Unnamed: 0,id,title,media_type,mean,num_scoring_users,status,num_volumes,num_chapters,start_date,end_date,num_list_users,popularity,num_favorites,rank,genres,authors,synopsis,nsfw,created_at,updated_at,main_picture_medium,main_picture_large,alternative_titles_en,alternative_titles_ja,alternative_titles_synonyms
0,23390,Shingeki no Kyojin,manga,8.57,361834,finished,34,141,2009-09-09,2021-04-09,588155,1,66856,87,"[Action, Award Winning, Drama, Gore, Military,...","[Isayama, Hajime (Story & Art)]","Hundreds of years ago, horrifying creatures wh...",white,1970-01-01 00:00:00+00:00,2022-04-18 05:10:58+00:00,https://api-cdn.myanimelist.net/images/manga/2...,https://api-cdn.myanimelist.net/images/manga/2...,Attack on Titan,進撃の巨人,[]


### Ranking

(needs to be removed! maybe when I add Jikan?)

7 July 2022

The ranking has no concept of unpublished titles

- 1 to 15,851: sorted by score
- 15,852 to 41,085: sorted by id... (= created_at probably)
  
- 41,086 to 46,997: sorted by score
- 46,998 to 59,642: sorted by id... (= created_at probably)


Not Ranked <--> Hentai or Erotica (the only exception, 92349, was temporarily wrong (not ranked) because it was in not_yet_aired)

nsfw is a bit weirder

# Load Manga

In [9]:
from ast import literal_eval
import pandas as pd

manga = pd.read_csv('data/manga.csv')

# read_csv is called without date parsing, so these columns are converted explicitly.
columns_dtype_datetime = ['start_date', 'end_date', 'created_at', 'updated_at']
manga = manga.assign(**{name: pd.to_datetime(manga[name]) for name in columns_dtype_datetime})

# Nullable integers for the count and position columns.
columns_dtype_Int64 = ['num_volumes', 'num_chapters', 'popularity', 'rank']
manga = manga.astype({name: 'Int64' for name in columns_dtype_Int64})

# These columns hold the text form of Python lists; literal_eval rebuilds the lists.
columns_dtype_list = ['genres', 'authors', 'alternative_titles_synonyms']
for col in columns_dtype_list:
    manga[col] = manga[col].map(literal_eval)

pd.set_option('display.max_columns', None)
manga.head(1)

Unnamed: 0,id,title,media_type,mean,num_scoring_users,status,num_volumes,num_chapters,start_date,end_date,num_list_users,popularity,num_favorites,rank,genres,authors,synopsis,nsfw,created_at,updated_at,main_picture_medium,main_picture_large,alternative_titles_en,alternative_titles_ja,alternative_titles_synonyms
0,23390,Shingeki no Kyojin,manga,8.57,361834,finished,34,141,2009-09-09,2021-04-09,588155,1,66856,87,"[Action, Award Winning, Drama, Gore, Military,...","[Isayama, Hajime (Story & Art)]","Hundreds of years ago, horrifying creatures wh...",white,1970-01-01 00:00:00+00:00,2022-04-18 05:10:58+00:00,https://api-cdn.myanimelist.net/images/manga/2...,https://api-cdn.myanimelist.net/images/manga/2...,Attack on Titan,進撃の巨人,[]
