# Clean

Clean up the raw AniList data and remove anything that is not needed for training.

## Setup

In [288]:
import csv
import json
import os
import pandas as pd
from pandas import DataFrame
from datetime import datetime

# All input and output files live in ./data, resolved to an absolute path.
data_dir = os.path.abspath('data')

# The snapshot date is pinned so the notebook always reads the same export.
# To process a fresh export instead: today = datetime.today().strftime('%Y%m%d')
today = datetime.strptime('2022-09-27', '%Y-%m-%d').strftime('%Y%m%d')

# Raw exports are named <kind>-<YYYYMMDD>-raw.csv.
anime_raw_path = os.path.join(data_dir, f'anime-{today}-raw.csv')
user_raw_path = os.path.join(data_dir, f'user-{today}-raw.csv')

# parse_dates takes column positions, so it depends on the column order of the export.
anime_df = pd.read_csv(anime_raw_path, parse_dates=[8, 9]).sort_values('id')
user_df = pd.read_csv(user_raw_path, parse_dates=[4]).sort_values('media_id')

print('user', len(user_df), 'rows')
print('anime', len(anime_df), 'rows')

user 760 rows
anime 16586 rows


## Check Missing Data

In [289]:
def get_missing(df: DataFrame) -> DataFrame:
    """Report how much of each column of ``df`` is null.

    Returns a frame indexed by column name with a single ``percent_missing``
    column, sorted from most to least missing. The null fraction is rounded
    to four decimals before being scaled to a percentage, and only columns
    whose resulting percentage is greater than zero are listed.
    """
    null_share = df.isnull().mean().round(4).mul(100)
    ranked = null_share.sort_values(ascending=False)
    report = DataFrame(data=ranked, columns=['percent_missing'])
    has_gaps = report['percent_missing'] > 0
    return report[has_gaps]

def get_missing_user(df: DataFrame) -> DataFrame:
    """Report missing data for user entries with status COMPLETED or CURRENT.

    Rows with any other status are left out of the calculation; the result
    has the same layout as ``get_missing``.
    """
    tracked = df['status'].isin(['COMPLETED', 'CURRENT'])
    return get_missing(df[tracked])

def get_missing_anime(df: DataFrame) -> DataFrame:
    """Report missing anime data, counting empty lists as missing.

    Side effect: the ``genres``, ``tags`` and ``studios`` columns of ``df``
    are rewritten in place, with every value equal to the literal string
    ``'[]'`` replaced by a missing value. The caller's frame is therefore
    changed by this call.
    """
    for list_col in ('genres', 'tags', 'studios'):
        # '[]' is a serialized empty list: present in the file, but no data
        df[list_col] = df[list_col].mask(df[list_col] == '[]', None)
    return get_missing(df)

# Summarise the gaps in the data as loaded.
# Note: get_missing_anime also blanks out '[]' entries in anime_df itself.
user_gaps = get_missing_user(user_df)
print(f"Missing User Data:\n{user_gaps}\n")
anime_gaps = get_missing_anime(anime_df)
print(f"Missing Anime Data:\n{anime_gaps}")

Missing User Data:
             percent_missing
completedAt            15.22

Missing Anime Data:
                     percent_missing
next_airing_episode            98.89
title_english                  53.69
studios                        45.88
season                         34.73
season_year                    34.73
season_int                     33.44
average_score                  32.56
tags                           23.81
source                         13.79
genres                         13.64
description                     6.71
end_date                        5.67
duration_mins                   5.13
mean_score                      4.12
episodes                        3.80
start_date                      2.45
title_native                    1.44
format                          0.33
status                          0.01


## Clean User Data

In [290]:
def clean_user_data(df: DataFrame) -> DataFrame:
    """Return the user list reduced to rows and columns usable downstream.

    Keeps only entries whose ``status`` is CURRENT or COMPLETED and removes
    the ``completedAt`` and ``progress`` columns when they are present.

    The input frame is not modified; a new frame is returned.
    """
    watched = df['status'].isin(['CURRENT', 'COMPLETED'])
    # boolean indexing followed by drop() builds a new frame, so the caller's
    # data keeps all of its rows and columns
    return df[watched].drop(['completedAt', 'progress'], axis=1, errors='ignore')

# Clean the user list and save it next to the raw export.
user_df = clean_user_data(user_df)
user_clean_path = os.path.join(data_dir, f'user-{today}-clean.csv')
user_df.to_csv(user_clean_path, index=False)

## Clean Anime Data

In [291]:
def fill_season(dt: datetime) -> 'str | None':
    """Map a date to its season name.

    March-May is 'SPRING', June-August 'SUMMER', September-November 'FALL',
    and December-February 'WINTER'.

    Returns None when ``dt`` is missing (None or NaT), so that a missing
    date is not silently labelled 'WINTER'.
    """
    if pd.isna(dt):
        return None

    month = dt.month
    if 3 <= month <= 5:
        return 'SPRING'
    if 6 <= month <= 8:
        return 'SUMMER'
    if 9 <= month <= 11:
        return 'FALL'
    return 'WINTER'

def extract_names(s: str) -> 'str | None':
    """Reduce a JSON array of objects to a JSON array of upper-cased names.

    ``s`` is a JSON document holding a list of objects that each have a
    ``name`` key; the result is the JSON encoding of those names, upper-cased
    and in the original order.

    Returns None when ``s`` is empty or is not text at all (for example None
    or the float NaN that pandas uses for a missing cell).
    """
    # NaN is truthy, so a plain truthiness test would let it reach json.loads
    if not isinstance(s, (str, bytes, bytearray)) or not s:
        return None
    return json.dumps([entry['name'].upper() for entry in json.loads(s)])

def extract_episodes(s: str) -> str:
    """Return the ``episode`` value from a JSON object string.

    The value is returned exactly as decoded from JSON; it is not converted.
    NOTE(review): the ``str`` return annotation is inherited; the decoded
    value is presumably a number -- confirm against the data.

    Returns None when ``s`` is empty or is not text at all (for example None
    or the float NaN that pandas uses for a missing cell).
    """
    # NaN is truthy, so a plain truthiness test would let it reach json.loads
    if not isinstance(s, (str, bytes, bytearray)) or not s:
        return None
    return json.loads(s)['episode']

def clean_anime_data(df: DataFrame) -> DataFrame:
    """Return the anime catalogue reduced to complete, usable rows.

    Steps, in order:

    1. Remove columns that are not used downstream.
    2. Keep rows whose ``status`` is FINISHED or RELEASING and whose
       ``format`` is TV, MOVIE, SPECIAL, ONA or OVA.
    3. Fill gaps: ``episodes`` from ``next_airing_episode``, ``source`` with
       'OTHER', ``season`` and ``season_year`` from ``start_date``.
    4. Normalise ``studios`` and ``tags`` to JSON arrays of upper-cased
       names and upper-case ``genres``. A serialized empty list ('[]') is
       treated as missing.
    5. Drop rows still missing ``episodes``, ``season``, ``season_year``,
       ``tags``, ``genres`` or ``studios``, remove helper columns, and cast
       ``episodes`` and ``season_year`` to int.

    The input frame is not modified; a new frame is returned.
    """
    # drop unusable columns (drop() returns a new frame)
    df = df.drop([
        'title_english', 'title_romaji', 'title_native', 'type', 'description', 'popularity',
        'mean_score', 'average_score', 'season_int', 'end_date', 'duration_mins', 'country'
    ], axis=1, errors='ignore')

    # drop unusable rows; copy so the assignments below never touch the input
    usable_status = df['status'].isin(['FINISHED', 'RELEASING'])
    usable_format = df['format'].isin(['TV', 'MOVIE', 'SPECIAL', 'ONA', 'OVA'])
    df = df[usable_status & usable_format].copy()

    # fix remaining data
    upcoming = df['next_airing_episode'][df['next_airing_episode'].notna()]
    df['episodes'] = df['episodes'].fillna(upcoming.apply(extract_episodes))
    df['source'] = df['source'].fillna('OTHER')

    # a missing start date cannot tell us the season or the year, so leave
    # those gaps open instead of guessing
    start = df['start_date']
    df['season'] = df['season'].fillna(
        start.apply(lambda d: fill_season(d) if pd.notna(d) else None))
    df['season_year'] = df['season_year'].fillna(
        start.apply(lambda d: d.year if pd.notna(d) else None))

    # '[]' is an empty list, i.e. no data: blank it out here so the result
    # does not depend on an earlier call having done so; na_action='ignore'
    # keeps missing cells away from the converters
    for list_col in ('studios', 'tags'):
        non_empty = df[list_col].mask(df[list_col] == '[]')
        df[list_col] = non_empty.map(extract_names, na_action='ignore')
    genres = df['genres'].mask(df['genres'] == '[]')
    df['genres'] = genres.map(lambda g: g.upper(), na_action='ignore')

    # drop unusable rows
    df = df.dropna(subset=['episodes', 'season', 'season_year', 'tags', 'genres', 'studios'])

    # drop vestigial data
    df = df.drop(['next_airing_episode', 'start_date', 'status', 'media_id'], axis=1, errors='ignore')

    # fix up data types (safe now that the gaps are gone)
    return df.astype({'episodes': int, 'season_year': int})

# Clean the anime catalogue and save it next to the raw export.
anime_df = clean_anime_data(anime_df)
anime_clean_path = os.path.join(data_dir, f'anime-{today}-clean.csv')
anime_df.to_csv(anime_clean_path, index=False)

## Verify Missing Data Resolved

In [292]:
# Re-run both reports on the cleaned frames.
user_gaps = get_missing_user(user_df)
print(f"Missing User Data:\n{user_gaps}\n")
anime_gaps = get_missing_anime(anime_df)
print(f"Missing Anime Data:\n{anime_gaps}")

Missing User Data:
Empty DataFrame
Columns: [percent_missing]
Index: []

Missing Anime Data:
Empty DataFrame
Columns: [percent_missing]
Index: []


# Combine Anime and User Data

In [293]:
# Attach the anime details to every CURRENT/COMPLETED entry on the user list.
# The inner join discards entries whose anime is not in anime_df.
watched_df = user_df[user_df['status'].isin(['CURRENT','COMPLETED'])]
enriched_df = pd.merge(watched_df, anime_df, left_on='media_id', right_on='id', how='inner')
enriched_df.drop(['status','media_id'], axis=1, inplace=True)

# ensure no unwatched anime included
enriched_df.drop(enriched_df[enriched_df['score'] == 0].index, axis=0, inplace=True)
print(f"Missing Data:\n{get_missing(enriched_df)}\n")

enriched_df.to_csv(os.path.join(data_dir, f'user-{today}-enriched.csv'), index=False)

print(f'{enriched_df.shape[0]}', 'rows written')

Missing Data):
Empty DataFrame
Columns: [percent_missing]
Index: []

556 rows written


## Encode Data For Training

In [294]:
# binary encode a column
#
# Note: this could have been achieved with scikit-learn's preprocessing
# module, but it was not clear how it would handle the JSON array columns,
# and there were no easy answers.
#
def binary_encode(df: DataFrame, col: str) -> DataFrame:
    """Replace ``col`` with one indicator column per distinct value.

    ``col`` may hold plain category values or JSON arrays of values. Each
    distinct value becomes a column named ``<col>_<value>`` that is set for
    the rows containing that value.

    The result has one row per ``id`` (rows sharing an ``id`` are merged by
    taking the maximum of every column), is ordered by ``id``, and has ``id``
    as its first column. ``df`` must therefore contain an ``id`` column.

    The input frame is not modified; a new frame is returned.
    """
    # a fresh positional index guarantees unique labels for the alignment below
    df = df.reset_index(drop=True)

    values = df[col]
    try:
        values = values.apply(json.loads)  # deserialize JSON arrays
    except ValueError:
        # not JSON: the column already holds plain category values
        pass

    # one row per (row, value) pair, turned into indicator columns ...
    indicators = pd.get_dummies(values.explode(), prefix=col, prefix_sep='_')
    # ... then folded back to exactly one row per original row
    indicators = indicators.groupby(level=0).max()

    encoded = pd.concat([df.drop(columns=[col]), indicators], axis=1)
    return encoded.groupby('id').max().reset_index()  # squash rows

# Start from the enriched frame; every pass below swaps one categorical
# column for its indicator columns.
encoded_df = enriched_df

cols = ['format', 'season', 'source', 'studios', 'genres', 'tags']
for col in cols:
    encoded_df = binary_encode(encoded_df, col)

# the id was only needed to squash rows during encoding
encoded_df = encoded_df.drop(columns=['id'])

encoded_path = os.path.join(data_dir, f'user-{today}-encoded.csv')
encoded_df.to_csv(encoded_path, index=False)
print(encoded_df.shape)

(556, 426)
