In [1]:
import json
from urllib.parse import quote

import numpy as np
import pandas as pd
import sqlalchemy as sql
from tqdm import tqdm

In [2]:
# Read the database login from the shared credentials file.
with open('../tools/credentials.json') as file:
    credentials = json.load(file)

# Unpack the two fields needed to build the connection string.
dblogin = credentials["dblogin"]
username, password = dblogin["username"], dblogin["password"]

In [3]:
# Percent-encode the login fields before embedding them in the URL: a raw '@', ':',
# '/' or '%' in the username or password would otherwise be parsed as URL structure
# and corrupt the connection string.
db_user = quote(username, safe="")
db_password = quote(password, safe="")

db_string = f"postgresql://{db_user}:{db_password}@192.168.0.3:5432/animeplanet"
db = sql.create_engine(db_string)

In [4]:
# Disabled one-off export: selects rated TV entries from the `watch_list` table, keeps the
# last row per (anime_url, username) pair, and writes ../data/watch_list_raw.csv.
# NOTE(review): presumably left commented out so the notebook can run from the compressed
# export without database access — confirm before deleting.
# query = f"""
#         SELECT title, year, avg, status, eps, times_watched, rating, anime_url, username
#         FROM watch_list
#         WHERE rating IS NOT NULL
#         AND year IS NOT NULL
#         AND eps IS NOT NULL
#         AND avg IS NOT NULL
#         AND "type" = 'TV'
#         AND status IN ('Watched', 'Dropped', 'Watching', 'Stalled');
#         """

# df = pd.read_sql(sql.text(query), db)
# df = df.drop_duplicates(['anime_url', 'username'], keep='last', ignore_index=True)
# df.to_csv('../data/watch_list_raw.csv', index=False)

In [5]:
# %%bash
# # Disabled: compresses the raw CSV export into watch_list_raw.csv.xz.
# # NOTE(review): `rm` without -f reports an error when no archive exists yet —
# # consider `rm -f` before re-enabling.
# cd ../data
# rm watch_list_raw.csv.xz
# xz -vT0 watch_list_raw.csv

In [6]:
# Load the xz-compressed raw watch-list export.
raw_csv_path = '../data/watch_list_raw.csv.xz'
df = pd.read_csv(raw_csv_path)

In [7]:
# Preview the freshly loaded frame.
df

Unnamed: 0,title,year,avg,status,eps,times_watched,rating,anime_url,username
0,Day Break Illusion: il sole penetra le illusioni,2013,2.90,Watched,13,1.0,3.0,https://www.anime-planet.com/anime/day-break-i...,Ruth
1,Dog Days,2011,3.50,Watched,13,1.0,3.5,https://www.anime-planet.com/anime/dog-days,Ruth
2,Dog Days',2012,3.70,Watched,13,1.0,3.0,https://www.anime-planet.com/anime/dog-days-2,Ruth
3,Fantasista Doll,2013,2.47,Watched,12,1.0,2.5,https://www.anime-planet.com/anime/fantasista-...,Ruth
4,Fate/Kaleid Liner Prisma Illya,2013,3.54,Watched,10,1.0,3.5,https://www.anime-planet.com/anime/fate-kaleid...,Ruth
...,...,...,...,...,...,...,...,...,...
9205777,The Ancient Magus' Bride,2017,4.39,Watched,24,2.0,5.0,https://www.anime-planet.com/anime/the-ancient...,Rutendo
9205778,The Disastrous Life of Saiki K.,2016,4.47,Watched,120,2.0,5.0,https://www.anime-planet.com/anime/the-disastr...,Rutendo
9205779,The Seven Deadly Sins,2014,4.35,Watched,24,1.0,5.0,https://www.anime-planet.com/anime/the-seven-d...,Rutendo
9205780,The Seven Deadly Sins: Revival of The Commandm...,2018,4.37,Watched,24,1.0,5.0,https://www.anime-planet.com/anime/the-seven-d...,Rutendo


### Dealing with missing values

In [8]:
# Number of missing values in each column.
df.isna().sum()

title                 0
year                  0
avg                   0
status                0
eps                   0
times_watched    992993
rating                0
anime_url             0
username              0
dtype: int64

In [9]:
# A missing watch count is recorded as zero.
watch_count_missing = df['times_watched'].isna()
df.loc[watch_count_missing, 'times_watched'] = 0

In [10]:
# Preview the frame after filling `times_watched`.
df

Unnamed: 0,title,year,avg,status,eps,times_watched,rating,anime_url,username
0,Day Break Illusion: il sole penetra le illusioni,2013,2.90,Watched,13,1.0,3.0,https://www.anime-planet.com/anime/day-break-i...,Ruth
1,Dog Days,2011,3.50,Watched,13,1.0,3.5,https://www.anime-planet.com/anime/dog-days,Ruth
2,Dog Days',2012,3.70,Watched,13,1.0,3.0,https://www.anime-planet.com/anime/dog-days-2,Ruth
3,Fantasista Doll,2013,2.47,Watched,12,1.0,2.5,https://www.anime-planet.com/anime/fantasista-...,Ruth
4,Fate/Kaleid Liner Prisma Illya,2013,3.54,Watched,10,1.0,3.5,https://www.anime-planet.com/anime/fate-kaleid...,Ruth
...,...,...,...,...,...,...,...,...,...
9205777,The Ancient Magus' Bride,2017,4.39,Watched,24,2.0,5.0,https://www.anime-planet.com/anime/the-ancient...,Rutendo
9205778,The Disastrous Life of Saiki K.,2016,4.47,Watched,120,2.0,5.0,https://www.anime-planet.com/anime/the-disastr...,Rutendo
9205779,The Seven Deadly Sins,2014,4.35,Watched,24,1.0,5.0,https://www.anime-planet.com/anime/the-seven-d...,Rutendo
9205780,The Seven Deadly Sins: Revival of The Commandm...,2018,4.37,Watched,24,1.0,5.0,https://www.anime-planet.com/anime/the-seven-d...,Rutendo


### Fixing column errors & changing data types

In [11]:
# Per-column memory footprint in units of 10**6 bytes, counting string contents (deep=True).
df.memory_usage(deep=True).div(10**6)

Index               0.000128
title             714.344322
year               73.646256
avg                73.646256
status            589.524658
eps                73.646256
times_watched      73.646256
rating             73.646256
anime_url        1024.809379
username          607.012562
dtype: float64

In [12]:
# Total memory footprint in units of 10**6 bytes, before any dtype changes.
sum(df.memory_usage(deep=True).div(10**6))

3303.922329

In [13]:
# Column dtypes as loaded from the CSV.
df.dtypes

title             object
year               int64
avg              float64
status            object
eps                int64
times_watched    float64
rating           float64
anime_url         object
username          object
dtype: object

In [14]:
# List the distinct titles to spot malformed entries.
df['title'].unique()

array(['Day Break Illusion: il sole penetra le illusioni', 'Dog Days',
       "Dog Days'", ..., 'The [email\xa0protected]TER',
       'The [email\xa0protected]STER: Cinderella Girls',
       'The [email\xa0protected]ER: Xenoglossia'], dtype=object)

In [15]:
# Distinct titles containing the '[email\xa0protected]' placeholder (\xa0 is a non-breaking space).
# Matched as a literal substring (regex=False): the previous pattern relied on '\[' and '\]'
# inside a non-raw string, which are invalid escape sequences and warn on current Python.
df.loc[df['title'].str.contains('[email\xa0protected]', regex=False), 'title'].unique()

array(['The [email\xa0protected]',
       'The [email\xa0protected]: Xenoglossia',
       'The [email\xa0protected]: Cinderella Girls',
       'The [email\xa0protected]: Cinderella Girls Second Series',
       '[email\xa0protected]',
       'The [email\xa0protected] SideM: Wake Atte Mini!',
       'The [email\xa0protected] SideM',
       'The [email\xa0protected]: Cinderella Girls Theater',
       'The [email\xa0protected]: Cinderella Girls Theater 2nd Season',
       'The [email\xa0protected]: Cinderella Girls Theater 3rd Season',
       'The [email\xa0protected]: Cinderella Girls Theater Climax Season',
       'The iDO[email\xa0protected]', '[email\xa0protected]i',
       'The [email\xa0protected]R', '[email\xa0protected]anbaranai',
       'The iD[email\xa0protected]: Cinderella Girls Second Series',
       'The IDOLM[email\xa0protected]: Cinderella Girls Theater',
       'The [email\xa0protected]TER: Cinderella Girls Second Series',
       'Sas[email\xa0protected]', '[email\xa0prote

In [16]:
# Distinct URLs of the rows whose title contains the '[email\xa0protected]' placeholder.
# Matched as a literal substring (regex=False): the previous pattern relied on '\[' and '\]'
# inside a non-raw string, which are invalid escape sequences and warn on current Python.
df.loc[df['title'].str.contains('[email\xa0protected]', regex=False), 'anime_url'].unique()

array(['https://www.anime-planet.com/anime/the-idolmaster',
       'https://www.anime-planet.com/anime/the-idolmster-xenoglossia',
       'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls',
       'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-second-series',
       'https://www.anime-planet.com/anime/sasami-san-at-ganbaranai',
       'https://www.anime-planet.com/anime/the-idolmaster-side-m-wake-atte-mini',
       'https://www.anime-planet.com/anime/the-idolmaster-side-m',
       'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-theater',
       'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-theater-2nd-season',
       'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-theater-3rd-season',
       'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-theater-climax-season'],
      dtype=object)

In [17]:
# Correct titles for the shows whose scraped title was garbled, keyed by show URL.
# Every key shares the same site prefix, so only the slug is listed per entry.
ANIME_URL_PREFIX = 'https://www.anime-planet.com/anime/'

url_title_map = {
    ANIME_URL_PREFIX + slug: title
    for slug, title in (
        ('the-idolmaster', 'The iDOLM@STER'),
        ('sasami-san-at-ganbaranai', 'Sasami-san@Ganbaranai'),
        ('the-idolmster-xenoglossia', 'The iDOLM@STER: Xenoglossia'),
        ('the-idolmaster-cinderella-girls', 'The iDOLM@STER: Cinderella Girls'),
        ('the-idolmaster-cinderella-girls-second-series',
         'The iDOLM@STER: Cinderella Girls Second Series'),
        ('the-idolmaster-cinderella-girls-theater',
         'The IDOLM@STER: Cinderella Girls Theater'),
        ('the-idolmaster-side-m', 'The iDOLM@STER SideM'),
        ('the-idolmaster-cinderella-girls-theater-2nd-season',
         'The iDOLM@STER: Cinderella Girls Theater 2nd Season'),
        ('the-idolmaster-cinderella-girls-theater-3rd-season',
         'The iDOLM@STER: Cinderella Girls Theater 3rd Season'),
        ('the-idolmaster-cinderella-girls-theater-climax-season',
         'The iDOLM@STER: Cinderella Girls Theater Climax Season'),
        ('the-idolmaster-side-m-wake-atte-mini',
         'The iDOLM@STER SideM: Wake Atte Mini!'),
    )
}

In [18]:
# Replace every title containing the '[email\xa0protected]' placeholder with the correct
# title looked up from the row's URL.
# NOTE(review): the placeholder is presumably a scraping artifact where the '@' in the
# title was treated as an e-mail address — confirm against the scraper.
#
# The mask is computed once and matched as a literal substring (regex=False); the previous
# pattern relied on '\[' / '\]' in a non-raw string, which are invalid escape sequences.
is_garbled = df['title'].str.contains('[email\xa0protected]', regex=False)
restored_titles = df.loc[is_garbled, 'anime_url'].map(url_title_map)

# A URL missing from url_title_map maps to NaN; keep the existing title in that case
# rather than silently blanking it.
df.loc[is_garbled, 'title'] = restored_titles.fillna(df.loc[is_garbled, 'title'])

In [19]:
# Number of distinct titles after the repair.
df['title'].nunique()

4489

In [20]:
# Store titles as a categorical: few distinct values repeated across many rows.
df['title'] = df['title'].astype(pd.CategoricalDtype())

In [21]:
# Represent the year as an ordered categorical whose categories are the years present,
# in ascending order.
year_as_int = df['year'].astype('int')
year_levels = sorted(year_as_int.unique())
df['year'] = pd.Categorical(year_as_int, categories=year_levels, ordered=True)

In [22]:
# Downcast the average rating from float64 to float32.
df['avg'] = df['avg'].astype(np.float32)

In [23]:
# Watch status as an ordered categorical, from 'Dropped' (lowest) to 'Watched' (highest).
status_dtype = pd.CategoricalDtype(
    categories=['Dropped', 'Stalled', 'Watching', 'Watched'],
    ordered=True,
)
df['status'] = df['status'].astype(status_dtype)

In [24]:
# Episode counts are stored as unsigned 16-bit integers.
df['eps'] = df['eps'].astype(np.uint16)

In [25]:
# Watch counts are stored as unsigned 16-bit integers (NaNs were filled with 0 earlier).
df['times_watched'] = df['times_watched'].astype(np.uint16)

In [26]:
# Downcast the user rating from float64 to float32.
df['rating'] = df['rating'].astype(np.float32)

In [27]:
# Store show URLs as a categorical.
df['anime_url'] = df['anime_url'].astype(pd.CategoricalDtype())

In [28]:
# Store usernames as a categorical.
df['username'] = df['username'].astype(pd.CategoricalDtype())

In [29]:
# Per-column memory footprint in units of 10**6 bytes after the dtype changes.
df.memory_usage(deep=True).div(10**6)

Index             0.000128
title            18.894795
year              9.208374
avg              36.823128
status            9.206211
eps              18.411564
times_watched    18.411564
rating           36.823128
anime_url        19.046868
username         48.811591
dtype: float64

In [30]:
# Total memory footprint in units of 10**6 bytes after the dtype changes.
sum(df.memory_usage(deep=True).div(10**6))

215.63735099999997

In [31]:
# Preview the frame after the dtype conversions.
df

Unnamed: 0,title,year,avg,status,eps,times_watched,rating,anime_url,username
0,Day Break Illusion: il sole penetra le illusioni,2013,2.90,Watched,13,1,3.0,https://www.anime-planet.com/anime/day-break-i...,Ruth
1,Dog Days,2011,3.50,Watched,13,1,3.5,https://www.anime-planet.com/anime/dog-days,Ruth
2,Dog Days',2012,3.70,Watched,13,1,3.0,https://www.anime-planet.com/anime/dog-days-2,Ruth
3,Fantasista Doll,2013,2.47,Watched,12,1,2.5,https://www.anime-planet.com/anime/fantasista-...,Ruth
4,Fate/Kaleid Liner Prisma Illya,2013,3.54,Watched,10,1,3.5,https://www.anime-planet.com/anime/fate-kaleid...,Ruth
...,...,...,...,...,...,...,...,...,...
9205777,The Ancient Magus' Bride,2017,4.39,Watched,24,2,5.0,https://www.anime-planet.com/anime/the-ancient...,Rutendo
9205778,The Disastrous Life of Saiki K.,2016,4.47,Watched,120,2,5.0,https://www.anime-planet.com/anime/the-disastr...,Rutendo
9205779,The Seven Deadly Sins,2014,4.35,Watched,24,1,5.0,https://www.anime-planet.com/anime/the-seven-d...,Rutendo
9205780,The Seven Deadly Sins: Revival of The Commandm...,2018,4.37,Watched,24,1,5.0,https://www.anime-planet.com/anime/the-seven-d...,Rutendo


### Total Episodes of a show

In [32]:
# Per-show episode total, taken as the largest `eps` value recorded for each URL.
# NOTE(review): assumes at least one row per show carries the full episode count — confirm.
#
# observed=True limits the grouping to URLs actually present in the frame. `anime_url` is
# categorical by this point, and grouping a categorical key without `observed=` includes
# unobserved categories and raises a FutureWarning on recent pandas.
url_total_eps_dict = df.groupby('anime_url', observed=True)['eps'].max().to_dict()

In [33]:
# Attach each show's episode total to its rows, stored as unsigned 16-bit integers.
total_eps_by_row = df['anime_url'].map(url_total_eps_dict)
df['total_eps'] = total_eps_by_row.astype(np.uint16)

In [34]:
# Reorder the columns so `total_eps` sits directly after `eps`.
column_order = [
    'title', 'year', 'avg', 'status', 'eps', 'total_eps',
    'times_watched', 'rating', 'anime_url', 'username',
]
df = df.loc[:, column_order]

In [35]:
# Preview the frame with the new `total_eps` column in place.
df

Unnamed: 0,title,year,avg,status,eps,total_eps,times_watched,rating,anime_url,username
0,Day Break Illusion: il sole penetra le illusioni,2013,2.90,Watched,13,13,1,3.0,https://www.anime-planet.com/anime/day-break-i...,Ruth
1,Dog Days,2011,3.50,Watched,13,13,1,3.5,https://www.anime-planet.com/anime/dog-days,Ruth
2,Dog Days',2012,3.70,Watched,13,13,1,3.0,https://www.anime-planet.com/anime/dog-days-2,Ruth
3,Fantasista Doll,2013,2.47,Watched,12,12,1,2.5,https://www.anime-planet.com/anime/fantasista-...,Ruth
4,Fate/Kaleid Liner Prisma Illya,2013,3.54,Watched,10,10,1,3.5,https://www.anime-planet.com/anime/fate-kaleid...,Ruth
...,...,...,...,...,...,...,...,...,...,...
9205777,The Ancient Magus' Bride,2017,4.39,Watched,24,24,2,5.0,https://www.anime-planet.com/anime/the-ancient...,Rutendo
9205778,The Disastrous Life of Saiki K.,2016,4.47,Watched,120,120,2,5.0,https://www.anime-planet.com/anime/the-disastr...,Rutendo
9205779,The Seven Deadly Sins,2014,4.35,Watched,24,24,1,5.0,https://www.anime-planet.com/anime/the-seven-d...,Rutendo
9205780,The Seven Deadly Sins: Revival of The Commandm...,2018,4.37,Watched,24,24,1,5.0,https://www.anime-planet.com/anime/the-seven-d...,Rutendo


In [36]:
# Final column dtypes.
df.dtypes

title            category
year             category
avg               float32
status           category
eps                uint16
total_eps          uint16
times_watched      uint16
rating            float32
anime_url        category
username         category
dtype: object

In [37]:
# Total memory footprint in units of 10**6 bytes, including the added `total_eps` column.
sum(df.memory_usage(deep=True).div(10**6))

234.04891499999997

### Save Data

In [38]:
# Write the cleaned frame as CSV, omitting the index.
clean_csv_path = '../data/watch_list_clean.csv'
df.to_csv(clean_csv_path, index=False)

In [39]:
# Also write a pickle of the cleaned frame.
clean_pickle_path = '../data/watch_list_clean.pkl'
df.to_pickle(clean_pickle_path)

In [40]:
%%bash
# Compress the cleaned CSV and pickle exports with xz (-T0: use all cores, -v: progress).
# Stop at the first failure so a failed `cd` cannot make rm/xz run in the wrong directory.
set -euo pipefail
cd ../data

# Remove any archive left by a previous run before compressing.
# -f: a missing archive (first run) is not an error.
rm -f watch_list_clean.csv.xz
xz -vT0 watch_list_clean.csv

rm -f watch_list_clean.pkl.xz
xz -vT0 watch_list_clean.pkl

watch_list_clean.csv: 65.2 MiB / 1,018.5 MiB = 0.064, 27 MiB/s, 0:37
watch_list_clean.pkl: 40.8 MiB / 212.4 MiB = 0.192, 9.4 MiB/s, 0:22
