In [None]:
import gzip
import pandas as pd
import json

from functools import reduce

from src.db import (
    get_table,
)

In [None]:
from toolz import merge, merge_with, reduceby, dissoc, compose, curry

@curry
def add_dict_namespace(namespace, d):
  """Wrap every value of ``d`` in a one-key dict keyed by ``namespace``.

  The function is wrapped in ``toolz.curry``, so it may be called with
  ``namespace`` alone to obtain a callable that still expects ``d``.

  Example: ``add_dict_namespace('ns', {1: 'a'})`` gives ``{1: {'ns': 'a'}}``.
  """
  wrapped = {}
  for outer_key, value in d.items():
    wrapped[outer_key] = {namespace: value}
  return wrapped

def to_dict_simple(df):
  """Convert ``df`` to ``{index_label: {column: value}}`` (one entry per row)."""
  rows_by_index = df.to_dict(orient='index')
  return rows_by_index


def to_dict_nested(namespace, df):
  """Convert ``df`` to ``{index_label: {namespace: {column: value}}}``.

  Each row becomes a column->value dict (see ``to_dict_simple``), which is
  then nested under the single key ``namespace``.
  """
  rows_by_index = to_dict_simple(df)
  return add_dict_namespace(namespace, rows_by_index)


def to_dict_reduceby(key, df):
  """Group the rows of ``df`` by the value of column ``key``.

  Args:
    key: name of the column that identifies the group.
    df: DataFrame holding ``key`` plus the columns to keep.

  Returns:
    A dict mapping each distinct ``key`` value (in order of first
    appearance) to the list of its rows, in row order. Each row is a
    column->value dict with the ``key`` entry removed. A frame without
    rows yields ``{}``.

  Raises:
    KeyError: if a row has no ``key`` entry.
  """
  grouped = {}
  for record in df.to_dict(orient='records'):
    # Append in place: rebuilding the list with ``acc + [row]`` for every
    # row would copy the growing group each time (quadratic per group).
    grouped.setdefault(record[key], []).append(
      {column: value for column, value in record.items() if column != key}
    )
  return grouped


def to_dict_reduceby_multiple(namespace, key, *dfs):
  """Group several frames by ``key`` and pool their rows per key value.

  Every frame in ``dfs`` is grouped with ``to_dict_reduceby``; for each key
  value the per-frame row lists are concatenated in argument order, and the
  combined list is nested under ``namespace``:
  ``{key_value: {namespace: [row, ...]}}``.
  """
  grouped_per_frame = [to_dict_reduceby(key, df) for df in dfs]
  pooled = merge_with(
    # merge_with hands over the list of per-frame row lists for one key.
    lambda row_lists: [row for rows in row_lists for row in rows],
    *grouped_per_frame
  )
  return add_dict_namespace(namespace, pooled)


def merge_with_merge(*dicts):
  """Merge dicts of dicts one level deep.

  For every key found in any of ``dicts``, the inner dicts stored under that
  key are combined with ``toolz.merge``. A key that is present in only some
  of the inputs still appears in the result.
  """
  return merge_with(merge, *dicts)


def get_dict_animedb_from_dfs(itemType, df_main, *dfs_relations):
  """Build the list of JSON documents for one item type.

  Args:
    itemType: item kind; relation frames are grouped by the column
      ``f'{itemType}_id'``.
    df_main: one row per item, with an ``id`` column, the flat columns listed
      in ``simple_cols`` and any number of extra columns.
    *dfs_relations: frames whose rows are grouped by ``f'{itemType}_id'``.

  Returns:
    A list with one dict per id. Flat columns (and ``id``) sit at the top
    level, every other ``df_main`` column goes under ``'paragraphs'`` and the
    grouped relation rows under ``'relations'``. An id without relation rows
    has no ``'relations'`` key.
  """
  idx_col = 'id'
  simple_cols = ['itemType', 'imgUrl', 'imgAlt', 'name', 'cumweight']

  # drop=False keeps 'id' inside each document as well as using it as the key.
  flat_frame = df_main[[idx_col] + simple_cols].set_index(idx_col, drop=False)
  flat_fields = to_dict_simple(flat_frame)

  paragraph_frame = df_main.drop(columns=simple_cols).set_index(idx_col)
  paragraph_fields = to_dict_nested('paragraphs', paragraph_frame)

  relation_fields = to_dict_reduceby_multiple(
    'relations',
    f'{itemType}_id',
    *dfs_relations
  )

  documents_by_id = merge_with_merge(flat_fields, paragraph_fields, relation_fields)
  return list(documents_by_id.values())


def get_dict_animedb_full(*document_list):
  """Concatenate several document lists into one flat list.

  Args:
    *document_list: any number of iterables of documents.

  Returns:
    A new list holding every document in argument order. Despite the
    function name the result is a list, not a dict. Calling with no
    arguments returns ``[]``.
  """
  # A flat comprehension handles the zero-argument case (a reduce without an
  # initial value raises TypeError on an empty sequence) and copies each
  # document reference only once.
  return [document for documents in document_list for document in documents]


def fillna_default(df):
    """Fill missing values in place: '' for object columns, 0 for float64 columns.

    ``df`` itself is modified and also returned. Columns of any other dtype
    are left untouched.
    """
    for dtype_name, fill_value in (('object', ''), ('float64', 0)):
        selected = df.select_dtypes(dtype_name)
        df[selected.columns] = selected.fillna(fill_value)
    return df


def compress_data(data):
  """Serialise ``data`` as JSON (``indent=1``), encode as UTF-8 and gzip it.

  Returns the compressed payload as ``bytes``.
  """
  json_text = json.dumps(data, indent=1)
  raw_bytes = json_text.encode('utf-8')
  return gzip.compress(raw_bytes)

In [None]:
# Database identifier handed to every get_table call in this notebook.
# NOTE(review): presumably a SQLite file path relative to the working
# directory — confirm against src.db.get_table.
db = 'animedb.sqlite'

In [None]:
# Load the source tables.
# NOTE(review): the optional third get_table argument is presumably the list
# of columns to read — confirm in src.db.

# --- entity tables ---------------------------------------------------------
df_anime = get_table(
    'anime',
    db,
    ['anime_id', 'image_webp', 'status', 'scored_by', 'rank', 'rating', 'type'],
)

df_anime_title = get_table('anime_title', db)

df_character = get_table(
    'character',
    db,
    ['character_id', 'name', 'images_webp_image_url'],
)

df_voiceactor = get_table(
    'voiceactor',
    db,
    ['voiceactor_id', 'name', 'image_url', 'favorites'],
)

df_staff = get_table(
    'staff',
    db,
    ['staff_id', 'image_url', 'name'],
)

# --- link tables (read without a column list) ------------------------------
df_anime_character = get_table('anime_character', db)

df_character_voiceactor = get_table('character_voiceactor', db)

df_anime_staff = get_table('anime_staff', db)

In [None]:
# Keep anime that have at least one score and are neither rated 'Rx - Hentai'
# nor of type 'Music', join the 'Default' row of the title table and order by
# popularity (scored_by descending, title as tie-break).
# Debugging aid: add `& df_anime['anime_id'].isin([5081, 16498])` to the mask
# to restrict the run to two titles.
df_anime_f = (
    df_anime[
        ~df_anime['rating'].isin(['Rx - Hentai'])
        & (df_anime['scored_by'] > 0)
        & (df_anime['type'] != 'Music')
    ]
    # The anime 'type' is dropped before merging so that the title table's own
    # 'type' column does not become part of the (implicit, shared-column) key.
    .drop(columns='type')
    .merge(df_anime_title[df_anime_title['type'] == 'Default'])
    .sort_values(['scored_by', 'title'], ascending=[False, True])
    .drop(['type', 'rating'], axis=1)
)

# Integer counts for the output; a missing value becomes 0.
df_anime_f['scored_by'] = df_anime_f['scored_by'].fillna(0).astype(int)
df_anime_f['rank'] = df_anime_f['rank'].fillna(0).astype(int)

# Each anime's share of the total 'scored_by' count over the filtered set.
anime_total_scores = df_anime_f['scored_by'].sum()
df_anime_f['anime_prob'] = df_anime_f['scored_by'] / anime_total_scores


In [None]:
# Join characters to the anime they appear in; merging with df_anime_f keeps
# only anime that passed the earlier filter (merge defaults: inner join on the
# shared column names).
# NOTE(review): 'role' and 'favorites' are not among the columns requested for
# df_character, so they presumably come from the anime_character table — confirm.
df_character_anime_f = df_character\
    .merge(df_anime_character)\
    .merge(df_anime_f[['anime_id', 'scored_by', 'title', 'anime_prob']])\
    .drop('role', axis=1)

# Keep only the part of the image URL before the first '?'.
df_character_anime_f['images_webp_image_url'] = df_character_anime_f['images_webp_image_url'].str.split('?').str[0]

# One row per character: after sorting by favorites (descending) then name,
# drop_duplicates keeps the first row found for each character_id.
df_character_f = df_character_anime_f\
    .sort_values(['favorites', 'name'], ascending=[False, True])\
    .drop(['anime_id', 'scored_by', 'title'], axis=1)\
    .drop_duplicates(subset='character_id')
# character_prob = favorites / total favorites over the de-duplicated characters;
# it is blended with the anime's probability using fixed weights 0.3 / 0.7.
character_total_scores = df_character_f['favorites'].sum()
df_character_anime_f['character_prob'] = df_character_anime_f['favorites'] / character_total_scores
df_character_anime_f['character_anime_prob'] = 0.3*df_character_anime_f['anime_prob'] + 0.7*df_character_anime_f['character_prob']
# Add 'cumweight': for each character take its highest character_anime_prob
# row, order those rows by descending score and take the running sum.
# concat(axis=1) aligns that Series with df_character_f on the character_id index.
df_character_f = pd.concat([
  df_character_f.set_index('character_id'),
  df_character_anime_f\
    .sort_values('character_anime_prob', ascending=False)\
    .drop_duplicates('character_id')\
    .set_index('character_id')['character_anime_prob']\
    .cumsum()\
    .rename('cumweight')
], axis=1).reset_index()
df_character_f = df_character_f.sort_values('cumweight')
  

# Relation pairs anime -> character, ordered per anime by favorites
# (descending) then name.
df_anime_character_ff = df_character_anime_f\
    .sort_values(['anime_id', 'favorites', 'name'], ascending=[True, False, True])\
    [['anime_id', 'character_id']]\
    .drop_duplicates()
# Relation pairs character -> anime, ordered per character by scored_by
# (descending) then title.
df_character_anime_ff = df_character_anime_f\
    .sort_values(['character_id', 'scored_by', 'title'], ascending=[True, False, True])\
    [['character_id', 'anime_id']]\
    .drop_duplicates()

In [None]:
# Voice actors joined to the characters they voice. Only characters present in
# df_character_f survive the (default inner) merge; the character's own
# 'favorites' / 'name' are renamed so they do not clash with the voice actor's.
df_voiceactor_character_f = (
    df_voiceactor
    .merge(df_character_voiceactor)
    .merge(
        df_character_f[['character_id', 'favorites', 'name']].rename(
            columns={'favorites': 'favorites_c', 'name': 'name_c'}
        )
    )
)

# One row per voice actor, most-favorited first (name as tie-break).
df_voiceactor_f = (
    df_voiceactor_character_f
    .sort_values(['favorites', 'name'], ascending=[False, True])
    .drop(columns=['character_id', 'favorites_c', 'name_c'])
    .drop_duplicates(subset='voiceactor_id')
)

# Relation pairs character -> voice actor, ordered per character by the voice
# actor's favorites (descending) then name.
df_character_voiceactor_ff = (
    df_voiceactor_character_f
    .sort_values(['character_id', 'favorites', 'name'], ascending=[True, False, True])
    [['character_id', 'voiceactor_id']]
    .drop_duplicates()
)

# Relation pairs voice actor -> character, ordered per voice actor by the
# character's favorites (descending) then name.
df_voiceactor_character_ff = (
    df_voiceactor_character_f
    .sort_values(['voiceactor_id', 'favorites_c', 'name_c'], ascending=[True, False, True])
    [['voiceactor_id', 'character_id']]
    .drop_duplicates()
)

In [None]:
# Staff joined to the filtered anime they worked on (default inner merges).
df_staff_anime_f = (
    df_staff
    .merge(df_anime_staff)
    .merge(df_anime_f[['anime_id', 'scored_by', 'title']])
)

# Keep only positions that start with one of the listed prefixes and strip the
# query string from the image URL.
# - na=False treats a missing position as "no match" instead of producing a
#   mask with missing values that cannot be used for filtering.
# - .assign builds a new frame, so no column is set on a filtered slice
#   (which would trigger SettingWithCopyWarning).
df_staff_anime_f = (
    df_staff_anime_f
    .loc[
        lambda frame: frame['position'].str.contains(
            '^(?:Theme Song|Producer|Planning|Original)',
            regex=True,
            na=False,
        )
    ]
    .assign(image_url=lambda frame: frame['image_url'].str.split('?').str[0])
)

# One row per staff member: the row belonging to their most-scored anime
# (name and position as tie-breaks).
df_staff_f = (
    df_staff_anime_f
    .sort_values(['scored_by', 'name', 'position'], ascending=[False, True, True])
    .drop(columns=['anime_id', 'scored_by', 'title'])
    .drop_duplicates(subset='staff_id')
)

# Relation pairs anime -> staff, ordered per anime by staff name.
df_anime_staff_ff = (
    df_staff_anime_f
    .sort_values(['anime_id', 'name'], ascending=[True, True])
    [['anime_id', 'staff_id']]
    .drop_duplicates()
)

# Relation pairs staff -> anime, ordered per staff member by scored_by
# (descending) then title.
df_staff_anime_ff = (
    df_staff_anime_f
    .sort_values(['staff_id', 'scored_by', 'title'], ascending=[True, False, True])
    [['staff_id', 'anime_id']]
    .drop_duplicates()
)

In [None]:
# Anime in output shape: generic id/imgUrl/imgAlt/name fields plus the
# anime-specific columns. 'cumweight' is the running sum of scored_by // 10 in
# the current row order.
df_anime_fff = (
  df_anime_f
  .rename(columns={'anime_id': 'id', 'image_webp': 'imgUrl'})
  .assign(
    itemType='anime',
    imgAlt=lambda frame: frame['title'],
    name=lambda frame: frame['title'],
    cumweight=lambda frame: (frame['scored_by'] // 10).cumsum(),
  )
  [[
    'id', 'itemType', 'imgUrl', 'imgAlt', 'name', 'cumweight',
    'status', 'rank', 'scored_by',
  ]]
)

# Relation rows of an anime: the related item's id plus its item type.
df_anime_character_fff = (
  df_anime_character_ff
  .rename(columns={'character_id': 'id'})
  .assign(itemType='character')
)

df_anime_staff_fff = (
  df_anime_staff_ff
  .rename(columns={'staff_id': 'id'})
  .assign(itemType='staff')
)

In [None]:
# Characters in output shape; 'cumweight' is already present on df_character_f.
df_character_fff = (
  df_character_f
  .rename(columns={'character_id': 'id', 'images_webp_image_url': 'imgUrl'})
  .assign(
    itemType='character',
    imgAlt=lambda frame: frame['name'],
  )
  [['id', 'itemType', 'imgUrl', 'imgAlt', 'name', 'cumweight', 'favorites']]
)

# Relation rows of a character: the related item's id plus its item type.
df_character_anime_fff = (
  df_character_anime_ff
  .rename(columns={'anime_id': 'id'})
  .assign(itemType='anime')
)

df_character_voiceactor_fff = (
  df_character_voiceactor_ff
  .rename(columns={'voiceactor_id': 'id'})
  .assign(itemType='voiceactor')
)

In [None]:
# Voice actors in output shape. 'cumweight' is the running sum of
# (1 + favorites) in the current row order, so it grows by at least 1 per row.
df_voiceactor_fff = (
  df_voiceactor_f
  .rename(columns={'voiceactor_id': 'id', 'image_url': 'imgUrl'})
  .assign(
    itemType='voiceactor',
    imgAlt=lambda frame: frame['name'],
    cumweight=lambda frame: (1 + frame['favorites']).cumsum(),
  )
  [['id', 'itemType', 'imgUrl', 'imgAlt', 'name', 'cumweight', 'favorites']]
)

# Relation rows of a voice actor: the character's id plus its item type.
df_voiceactor_character_fff = (
  df_voiceactor_character_ff
  .rename(columns={'character_id': 'id'})
  .assign(itemType='character')
)

In [None]:
# Staff in output shape. 'cumweight' is simply the 1-based row position, i.e.
# every staff member carries the same weight of 1.
df_staff_fff = (
  df_staff_f
  .rename(columns={'staff_id': 'id', 'image_url': 'imgUrl'})
  .assign(
    itemType='staff',
    imgAlt=lambda frame: frame['name'],
    cumweight=lambda frame: range(1, 1 + len(frame)),
  )
  [['id', 'itemType', 'imgUrl', 'imgAlt', 'name', 'cumweight', 'position']]
)

# Relation rows of a staff member: the anime's id plus its item type.
df_staff_anime_fff = (
  df_staff_anime_ff
  .rename(columns={'anime_id': 'id'})
  .assign(itemType='anime')
)

In [None]:
# Build the documents of every item type and concatenate them in the order
# anime, character, voiceactor, staff. Each tuple is
# (itemType, main frame, relation frames...).
dict_db = get_dict_animedb_full(*[
  get_dict_animedb_from_dfs(item_type, df_main, *dfs_relations)
  for item_type, df_main, *dfs_relations in (
    ('anime', df_anime_fff, df_anime_character_fff, df_anime_staff_fff),
    ('character', df_character_fff, df_character_voiceactor_fff, df_character_anime_fff),
    ('voiceactor', df_voiceactor_fff, df_voiceactor_character_fff),
    ('staff', df_staff_fff, df_staff_anime_fff),
  )
])

In [None]:
# Write the gzip-compressed JSON dump; binary mode because compress_data
# returns bytes.
with open('animu_erdos/public/animedb.json.gz', mode='wb') as f:
    f.write(
        compress_data(dict_db)
    )

In [None]:
# NOTE(review): `a` is not defined anywhere in the visible notebook, so this
# cell raises NameError and stops a "Run All" before the uncompressed dump in
# the next cell — presumably a deliberate stop marker or leftover debugging.
# Confirm, then replace it with an explicit guard or delete the cell.
a

In [None]:
# Write the same documents as plain, uncompressed JSON (no indentation).
with open('animu_erdos/public/animedb.json', mode='w') as f:
    json.dump(dict_db, fp=f)