In [None]:
import time

import pandas as pd
import numpy as np

import sys
if ".." not in sys.path:
    sys.path.insert(0, "..")

from src.dataset.download_data import get_anime_data, get_anime_episodes
from src.dataset.preprocessing import preprocess_anime_data

In [None]:
# Load the external anime table that every later cell reads from.
anime = pd.read_csv(
    filepath_or_buffer="../data/external/anime.csv",
    encoding="utf-8",
)

In [None]:
# Preview the first five rows of the loaded table.
anime.head(n=5)

# Gather more anime metadata: English title and release year

In [None]:
# Download the English title and release year of every anime in `anime`
# through get_anime_data, pausing between calls to throttle the requests.
# Fills `anime_ids`, `titles` and `release_dates`, which the export cells use.
var_names = ['title_english', 'year']

# `anime` is read without an index column, so its index only holds row
# positions (0..N-1). Query by the real identifiers in the `anime_id` column
# when it exists; fall back to the index for frames already indexed by id.
if 'anime_id' in anime.columns:
    ids_to_fetch = anime['anime_id'].values
else:
    ids_to_fetch = anime.index.values

anime_ids = []
titles = []
release_dates = []

request_counter = 0
total_requests = len(ids_to_fetch)

requests_per_minute = 0
requests_per_second = 0

max_requests_per_second = 2
max_requests_per_minute = 59

for anime_id in ids_to_fetch:

    # After every `max_requests_per_second` calls, pause before continuing.
    if requests_per_second == max_requests_per_second:
        time.sleep(2.5)
        requests_per_second = 0
    # After every `max_requests_per_minute` calls, add a longer pause.
    if requests_per_minute == max_requests_per_minute:
        time.sleep(5)
        requests_per_minute = 0

    data = get_anime_data(anime_id, var_names)
    # An empty result means nothing was returned for this id.
    title, year = data if len(data) > 0 else (None, None)

    anime_ids.append(int(anime_id))
    titles.append(title)
    # Missing years are stored as NaN so the column stays numeric.
    release_dates.append(np.nan if year is None else year)

    requests_per_second += 1
    requests_per_minute += 1

    request_counter += 1
    print(f"{request_counter} out of {total_requests} requests completed")

In [None]:
# Assemble the downloaded ids, English titles and years into one table
# and write it to the raw data folder.
metadata_columns = {
    'anime_id': anime_ids,
    'en_title': titles,
    'year': release_dates,
}
df = pd.DataFrame(metadata_columns)
df.to_csv('../data/raw/anime_metadata.csv', index=False)

In [None]:
# Write the id / release-year pairs to their own CSV file.
dates_columns = {
    'anime_id': anime_ids,
    'year': release_dates,
}
dates_df = pd.DataFrame(dates_columns)
dates_df.to_csv('../data/raw/anime_dates.csv', index=False)

# Update the episode count for anime that are no longer airing

In [None]:
# Re-query the episode count for every row of `anime` whose `episodes` value
# is the string 'Unknown', pausing between calls to throttle the requests.
# Fills `anime_ids` and `episodes`, which the export cell uses.
var_names = ['episodes']

df = anime[anime.episodes == 'Unknown']

# Filtering keeps the original index labels, and `anime` is read without an
# index column, so the index only holds row positions. Query by the real
# identifiers in the `anime_id` column when it exists; fall back to the index
# for frames already indexed by id.
if 'anime_id' in df.columns:
    ids_to_fetch = df['anime_id'].values
else:
    ids_to_fetch = df.index.values

anime_ids = []
episodes = []

request_counter = 0
total_requests = len(ids_to_fetch)

requests_per_minute = 0
requests_per_second = 0

max_requests_per_second = 2
max_requests_per_minute = 59

for anime_id in ids_to_fetch:

    # After every `max_requests_per_second` calls, pause before continuing.
    if requests_per_second == max_requests_per_second:
        time.sleep(2.5)
        requests_per_second = 0
    # After every `max_requests_per_minute` calls, add a longer pause.
    if requests_per_minute == max_requests_per_minute:
        time.sleep(5)
        requests_per_minute = 0

    data = get_anime_data(anime_id, var_names)
    # An empty result means nothing was returned for this id.
    nb_eps = data[0] if len(data) > 0 else None

    anime_ids.append(int(anime_id))
    # Missing episode counts are stored as NaN.
    episodes.append(np.nan if nb_eps is None else nb_eps)

    requests_per_second += 1
    requests_per_minute += 1

    request_counter += 1
    print(f"{request_counter} out of {total_requests} requests completed")

In [None]:
# Write the refreshed episode counts, keyed by anime id, to CSV.
episodes_columns = {
    'anime_id': anime_ids,
    'episodes': episodes,
}
episodes_df = pd.DataFrame(episodes_columns)
episodes_df.to_csv('../data/raw/anime_episodes.csv', index=False)

# Update the score for anime that had no rating because they weren't airing yet

In [None]:
# Re-query the score for every row of `anime` whose `rating` is missing,
# pausing between calls to throttle the requests.
# Fills `anime_ids` and `scores`, which the export cell uses.
var_names = ['score']

df = anime[anime.rating.isna()]

# Filtering keeps the original index labels, and `anime` is read without an
# index column, so the index only holds row positions. Query by the real
# identifiers in the `anime_id` column when it exists; fall back to the index
# for frames already indexed by id.
if 'anime_id' in df.columns:
    ids_to_fetch = df['anime_id'].values
else:
    ids_to_fetch = df.index.values

anime_ids = []
scores = []

request_counter = 0
total_requests = len(ids_to_fetch)

requests_per_minute = 0
requests_per_second = 0

max_requests_per_second = 2
max_requests_per_minute = 59

for anime_id in ids_to_fetch:

    # After every `max_requests_per_second` calls, pause before continuing.
    if requests_per_second == max_requests_per_second:
        time.sleep(2.5)
        requests_per_second = 0
    # After every `max_requests_per_minute` calls, add a longer pause.
    if requests_per_minute == max_requests_per_minute:
        time.sleep(5)
        requests_per_minute = 0

    data = get_anime_data(anime_id, var_names)
    # An empty result means nothing was returned for this id.
    score = data[0] if len(data) > 0 else None

    anime_ids.append(int(anime_id))
    # Missing scores are stored as NaN so the column stays numeric.
    scores.append(np.nan if score is None else score)

    requests_per_second += 1
    requests_per_minute += 1

    request_counter += 1
    print(f"{request_counter} out of {total_requests} requests completed")

In [None]:
# Write the refreshed scores, keyed by anime id, to CSV.
scores_columns = {
    'anime_id': anime_ids,
    'scores': scores,
}
scores_df = pd.DataFrame(scores_columns)
scores_df.to_csv('../data/raw/anime_scores.csv', index=False)