# Fetch Data

Fetches all Anilist data needed for practicing data visualization and machine learning.

In [5]:
import csv
import json
import os
import traceback
from datetime import datetime
from anilist_fetch import fetch as anilist

# setup
# Directory (relative to the working directory) that receives every CSV this notebook writes.
# exist_ok=True replaces the check-then-create pattern: no race, and the cell is safe to re-run.
data_dir = os.path.abspath('data')
os.makedirs(data_dir, exist_ok=True)

# Date stamp (YYYYMMDD) embedded in the output file names of this run.
today = datetime.today().strftime('%Y%m%d')

## Fetch User Data

In [6]:
ANILIST_USER_ID = 247578
user = anilist.get_user(ANILIST_USER_ID)
print(user['user'])

# write user anime entries to CSV
# newline='' hands line-ending control to the csv module; without it every row is
# followed by a blank line on Windows. Plain 'w' is enough since the file is never read back here.
user_csv_path = os.path.join(data_dir, f"user-{ANILIST_USER_ID}-{today}.csv")
with open(user_csv_path, 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['media_id', 'status', 'score', 'progress', 'completedAt'])

    for media_list in user['lists']:
        for entry in media_list['entries']:
            data = [entry['media']['id'], entry['status'], entry['score'], entry['progress'], None]

            # tolerate a missing/None completedAt instead of failing on the subscript
            completed_at = entry['completedAt'] or {}

            # only a fully specified date is exported (MM-DD-YYYY); partial dates stay empty
            if completed_at.get('year') and completed_at.get('month') and completed_at.get('day'):
                data[4] = f"{completed_at['month']:02d}-{completed_at['day']:02d}-{completed_at['year']}"

            writer.writerow(data)

{'name': 'barrettotte', 'createdAt': 1552085204, 'statistics': {'anime': {'count': 605, 'meanScore': 68.93, 'standardDeviation': 15.9, 'minutesWatched': 198339, 'episodesWatched': 8119}}}


## Fetch All Anime Entries

Fetching everything takes around 10-15 minutes while staying under Anilist's API rate limit.

In [7]:
# Target file for the raw download; the date stamp keeps runs from different days apart.
raw_csv_path = os.path.join(data_dir, f"anime-{today}-raw.csv")

# The helper's return value is reported below as the number of downloaded entries.
anime_count = anilist.download_anime_range(raw_csv_path)
print(f"\n{anime_count} entries downloaded")

Downloading page 332 => Entries 16551-16600
16551 entries downloaded


## Clean/Wrangle Anime Data

This probably should have been done during the initial fetch... but let's call it "low tier data wrangling" practice :)

In [8]:
# File names (joined onto data_dir where they are used) for the raw download of this
# run and for the cleaned file derived from it.
csv_raw = f'anime-{today}-raw.csv'
csv_anime = f'anime-{today}-clean.csv'

# Column names of the cleaned CSV, in output order.
anime_header = [
    # identity and titles
    'id',
    'title_english',
    'title_romaji',
    'title_native',
    # classification and description
    'type',
    'format',
    'status',
    'description',
    # airing window
    'start_date',
    'end_date',
    'season',
    'season_year',
    # size
    'episodes',
    'duration_mins',
    # origin, genres and scores
    'country_of_origin',
    'genres',
    'average_score',
    'mean_score',
    'popularity',
    'source',
    # related entities
    'tags',
    'studios',
]

def _season_from_start_date(start_date: str) -> str:
    """Return the season name for the month in ``start_date[5:7]``, or '' if there is no usable month."""
    try:
        month = int(start_date[5:7])
    except ValueError:
        return ''  # date has no month part (e.g. year only): do not invent a season

    if 9 <= month <= 11:
        return 'FALL'
    if 6 <= month <= 8:
        return 'SUMMER'
    if 3 <= month <= 5:
        return 'SPRING'
    if month in (12, 1, 2):
        return 'WINTER'
    return ''  # out-of-range month such as 00


def clean_row(row: list) -> list:
    """Convert one raw anime CSV row, in place, to the column layout of ``anime_header``.

    NOTE(review): the raw layout is assumed to be the ``anime_header`` columns plus
    ``seasonInt`` at index 12 and ``nextAiringEpisode`` at index 21, as implied by the
    two ``pop`` calls below -- confirm against the raw CSV header written by the fetcher.

    Steps:
      * a MOVIE without an end date gets its start date as end date;
      * a missing season is derived from the start date's month (left empty when the
        month cannot be read);
      * a missing season year is taken from the first four characters of the start date;
      * ``seasonInt`` is dropped;
      * a missing episode count is filled from ``nextAiringEpisode['episode']``
        (``None`` when that cell is empty or ``null``), then that column is dropped;
      * non-empty tags and studios cells are replaced by the list of their ``name`` values.

    Args:
        row: raw CSV row (list of strings). It is mutated.

    Returns:
        The same list object, now two columns shorter.

    Raises:
        IndexError: if the row has fewer columns than the raw layout.
        json.JSONDecodeError: if a non-empty JSON cell is malformed.
    """
    if not row[9] and row[5] == 'MOVIE':
        row[9] = row[8] # end_date = start_date

    # add missing seasons
    if not row[10]:
        row[10] = _season_from_start_date(row[8])

    if not row[11]:
        row[11] = row[8][0:4] # year of start date

    row.pop(12) # seasonInt unreliable, removed

    # After the pop above, episodes is at index 12 (index 13 is duration_mins).
    # nextAiringEpisode is not needed once episodes is filled, so take it out first.
    next_airing = row.pop(20)
    if not row[12]:
        airing = json.loads(next_airing) if next_airing else None
        row[12] = airing['episode'] if airing else None

    # With nextAiringEpisode gone, tags is at index 20 and studios at index 21;
    # each cell is parsed from its own column. Empty cells are left untouched.
    if row[20]:
        row[20] = [tag['name'] for tag in json.loads(row[20])]
    if row[21]:
        row[21] = [studio['name'] for studio in json.loads(row[21])]
    return row

def include_row(row: list) -> bool:
    """Decide whether a raw CSV row should be kept.

    A row is kept when its format (index 5) is 'TV' or 'MOVIE' and its start date
    (index 8) is non-empty. Rows too short to have a start-date column -- such as the
    empty list csv.reader yields for a blank line -- are rejected instead of raising.

    Returns:
        A real bool, as annotated (not the start-date string itself).
    """
    if len(row) <= 8:
        return False
    return row[5] in ('TV', 'MOVIE') and bool(row[8])

# clean each included row of the raw CSV and write it to the clean CSV
raw_path = os.path.join(data_dir, csv_raw)
clean_path = os.path.join(data_dir, csv_anime)

# newline='' on both files, as the csv module expects: newlines embedded in quoted
# fields are read back intact, and no blank rows are written on Windows.
# The raw file is opened first so a missing input does not leave behind an empty output file.
with open(raw_path, 'r', encoding='utf-8', newline='') as rf, \
        open(clean_path, 'w', encoding='utf-8', newline='') as of:
    anime_reader = csv.reader(rf)
    anime_writer = csv.writer(of)

    next(anime_reader, None) # skip header (tolerates an empty raw file)
    anime_writer.writerow(anime_header)

    for row in anime_reader:
        # csv.reader yields [] for blank lines; skip those before filtering
        if row and include_row(row):
            anime_writer.writerow(clean_row(row))
