In [1]:
import json
from urllib.parse import quote

import numpy as np
import pandas as pd
import sqlalchemy as sql
from tqdm import tqdm

In [2]:
# Read the database login from a JSON file kept outside the notebook so that the
# username and password never appear in the notebook source.
# The encoding is given explicitly: JSON is UTF-8, while open() would otherwise
# fall back to the platform's locale encoding.
with open('../tools/credentials.json', encoding='utf-8') as credentials_file:
    credentials = json.load(credentials_file)

username = credentials["dblogin"]["username"]
password = credentials["dblogin"]["password"]

In [3]:
# Build the PostgreSQL connection URL. The credentials are percent-encoded so that
# characters such as '@', ':' or '/' inside them are not parsed as URL delimiters.
db_string = (
    f"postgresql://{quote(username, safe='')}:{quote(password, safe='')}"
    "@192.168.0.3:5432/animeplanet"
)
db = sql.create_engine(db_string)

In [4]:
# query = f"""
#         SELECT title, year, avg, status, eps, times_watched, rating, anime_url, username
#         FROM watch_list
#         WHERE rating IS NOT NULL
#         AND year IS NOT NULL
#         AND eps IS NOT NULL
#         AND avg IS NOT NULL
#         AND "type" = 'TV'
#         AND status IN ('Watched', 'Dropped', 'Watching', 'Stalled');
#         """

# df = pd.read_sql(sql.text(query), db)
# df = df.drop_duplicates(['anime_url', 'username'], keep='last', ignore_index=True)
# df.to_csv('../data/watch_list_raw.csv', index=False)

In [5]:
# %%bash
# cd ../data
# rm watch_list_raw.csv.xz
# xz -vT0 watch_list_raw.csv

In [6]:
# Load the xz-compressed raw export and shorten the 'anime_url' column name to 'url'.
df = (
    pd.read_csv('../data/watch_list_raw.csv.xz')
    .rename(columns={'anime_url': 'url'})
)

### Dealing with missing values

In [7]:
# Count the missing values in each column.
df.isna().sum()

title                 0
year                  0
avg                   0
status                0
eps                   0
times_watched    992993
rating                0
url                   0
username              0
dtype: int64

In [8]:
# times_watched is the only column with missing values; a missing count is stored as 0.
df['times_watched'] = df['times_watched'].fillna(0)

### Fixing column errors & changing data types

In [9]:
# Per-column memory in units of 10**6 bytes; deep=True also counts the contents
# of object (string) columns rather than just the pointers to them.
mem_usage = df.memory_usage(deep=True).div(10**6)
mem_usage

Index               0.000128
title             714.344322
year               73.646256
avg                73.646256
status            589.524658
eps                73.646256
times_watched      73.646256
rating             73.646256
url              1024.809379
username          607.012562
dtype: float64

In [10]:
# Total across all columns and the index, in the same units as mem_usage (bytes / 10**6).
sum(mem_usage)

3303.922329

In [11]:
# Show the dtype pandas inferred for each column when reading the CSV.
df.dtypes

title             object
year               int64
avg              float64
status            object
eps                int64
times_watched    float64
rating           float64
url               object
username          object
dtype: object

In [12]:
# List the distinct titles to spot malformed values.
df['title'].unique()

array(['Day Break Illusion: il sole penetra le illusioni', 'Dog Days',
       "Dog Days'", ..., 'The [email\xa0protected]TER',
       'The [email\xa0protected]STER: Cinderella Girls',
       'The [email\xa0protected]ER: Xenoglossia'], dtype=object)

In [13]:
# Distinct titles that contain the literal '[email\xa0protected]' placeholder
# (\xa0 is a non-breaking space) -- presumably left by e-mail obfuscation when the
# titles were scraped. regex=False matches the text as-is, so the brackets need no
# escaping and the invalid '\[' string escape is avoided.
df.loc[df['title'].str.contains('[email\xa0protected]', regex=False), 'title'].unique()

array(['The [email\xa0protected]',
       'The [email\xa0protected]: Xenoglossia',
       'The [email\xa0protected]: Cinderella Girls',
       'The [email\xa0protected]: Cinderella Girls Second Series',
       '[email\xa0protected]',
       'The [email\xa0protected] SideM: Wake Atte Mini!',
       'The [email\xa0protected] SideM',
       'The [email\xa0protected]: Cinderella Girls Theater',
       'The [email\xa0protected]: Cinderella Girls Theater 2nd Season',
       'The [email\xa0protected]: Cinderella Girls Theater 3rd Season',
       'The [email\xa0protected]: Cinderella Girls Theater Climax Season',
       'The iDO[email\xa0protected]', '[email\xa0protected]i',
       'The [email\xa0protected]R', '[email\xa0protected]anbaranai',
       'The iD[email\xa0protected]: Cinderella Girls Second Series',
       'The IDOLM[email\xa0protected]: Cinderella Girls Theater',
       'The [email\xa0protected]TER: Cinderella Girls Second Series',
       'Sas[email\xa0protected]', '[email\xa0prote

In [14]:
# Distinct URLs of the rows whose title contains the '[email\xa0protected]'
# placeholder; the URL is intact and identifies the anime unambiguously.
# regex=False matches the placeholder as plain text, avoiding the invalid '\[' escape.
df.loc[df['title'].str.contains('[email\xa0protected]', regex=False), 'url'].unique()

array(['https://www.anime-planet.com/anime/the-idolmaster',
       'https://www.anime-planet.com/anime/the-idolmster-xenoglossia',
       'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls',
       'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-second-series',
       'https://www.anime-planet.com/anime/sasami-san-at-ganbaranai',
       'https://www.anime-planet.com/anime/the-idolmaster-side-m-wake-atte-mini',
       'https://www.anime-planet.com/anime/the-idolmaster-side-m',
       'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-theater',
       'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-theater-2nd-season',
       'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-theater-3rd-season',
       'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-theater-climax-season'],
      dtype=object)

In [15]:
# Clean title for every anime URL whose scraped title contains the
# '[email protected]' placeholder. Each key is the full anime-planet URL,
# assembled here from its slug; each value is the title to store for that URL.
url_title_map = {
    f'https://www.anime-planet.com/anime/{slug}': title
    for slug, title in [
        ('the-idolmaster', 'The iDOLM@STER'),
        ('sasami-san-at-ganbaranai', 'Sasami-san@Ganbaranai'),
        ('the-idolmster-xenoglossia', 'The iDOLM@STER: Xenoglossia'),
        ('the-idolmaster-cinderella-girls', 'The iDOLM@STER: Cinderella Girls'),
        ('the-idolmaster-cinderella-girls-second-series',
         'The iDOLM@STER: Cinderella Girls Second Series'),
        ('the-idolmaster-cinderella-girls-theater',
         'The IDOLM@STER: Cinderella Girls Theater'),
        ('the-idolmaster-side-m', 'The iDOLM@STER SideM'),
        ('the-idolmaster-cinderella-girls-theater-2nd-season',
         'The iDOLM@STER: Cinderella Girls Theater 2nd Season'),
        ('the-idolmaster-cinderella-girls-theater-3rd-season',
         'The iDOLM@STER: Cinderella Girls Theater 3rd Season'),
        ('the-idolmaster-cinderella-girls-theater-climax-season',
         'The iDOLM@STER: Cinderella Girls Theater Climax Season'),
        ('the-idolmaster-side-m-wake-atte-mini',
         'The iDOLM@STER SideM: Wake Atte Mini!'),
    ]
}

In [16]:
# Replace each mangled title with the clean one looked up from the row's URL.
# The mask is computed once and matched as plain text (regex=False), which avoids
# the invalid '\[' string escape. Rows whose URL is missing from url_title_map keep
# their current title instead of being overwritten with NaN, so the check that
# follows will still list them.
mangled = df['title'].str.contains('[email\xa0protected]', regex=False)
df.loc[mangled, 'title'] = (
    df.loc[mangled, 'url'].map(url_title_map).fillna(df.loc[mangled, 'title'])
)

In [17]:
# Sanity check: no title should contain the '[email\xa0protected]' placeholder
# any more, so this is expected to be empty. The placeholder is matched as plain
# text (regex=False), avoiding the invalid '\[' escape.
df.loc[df['title'].str.contains('[email\xa0protected]', regex=False), 'title'].unique()

array([], dtype=object)

In [18]:
# Number of distinct titles in the frame.
df['title'].nunique()

4489

In [19]:
# Column names currently present in the frame.
df.columns

Index(['title', 'year', 'avg', 'status', 'eps', 'times_watched', 'rating',
       'url', 'username'],
      dtype='object')

In [20]:
# Only 4489 distinct titles exist (see nunique() above), so store the column as a
# categorical rather than one Python string per row.
df['title'] = df['title'].astype('category')

In [21]:
# Store url as a categorical instead of one Python string per row.
df['url'] = df['url'].astype('category')

In [22]:
# Store username as a categorical instead of one Python string per row.
df['username'] = df['username'].astype('category')

In [23]:
# Encode status as an ordered categorical: Dropped < Stalled < Watching < Watched.
df['status'] = df['status'].astype(
    pd.CategoricalDtype(categories=['Dropped', 'Stalled', 'Watching', 'Watched'], ordered=True)
)

In [24]:
# Narrow times_watched from float64 to a 2-byte unsigned integer.
# NOTE(review): uint16 holds 0..65535 only -- confirm the column's maximum fits,
# since an out-of-range value would not survive this cast intact.
df['times_watched'] = df['times_watched'].astype('uint16')

In [25]:
# rating is already float64 (see the dtypes listing above), so this cast leaves
# the column's dtype and memory footprint unchanged.
df['rating'] = df['rating'].astype('float')

In [26]:
# Remove the avg, eps and year columns from the frame.
df = df.drop(columns=['avg', 'eps', 'year'])

In [27]:
# Per-column memory after the dtype changes, in units of 10**6 bytes.
df.memory_usage(deep=True) / 10**6

Index             0.000128
title            18.894795
status            9.206211
times_watched    18.411564
rating           73.646256
url              19.046868
username         48.811591
dtype: float64

In [28]:
# Total memory after the dtype changes, for comparison with the earlier sum(mem_usage).
sum(df.memory_usage(deep=True) / 10**6)

188.01741299999998

In [29]:
# Display the cleaned frame (pandas shows only the first and last rows).
df

Unnamed: 0,title,status,times_watched,rating,url,username
0,Day Break Illusion: il sole penetra le illusioni,Watched,1,3.0,https://www.anime-planet.com/anime/day-break-i...,Ruth
1,Dog Days,Watched,1,3.5,https://www.anime-planet.com/anime/dog-days,Ruth
2,Dog Days',Watched,1,3.0,https://www.anime-planet.com/anime/dog-days-2,Ruth
3,Fantasista Doll,Watched,1,2.5,https://www.anime-planet.com/anime/fantasista-...,Ruth
4,Fate/Kaleid Liner Prisma Illya,Watched,1,3.5,https://www.anime-planet.com/anime/fate-kaleid...,Ruth
...,...,...,...,...,...,...
9205777,The Ancient Magus' Bride,Watched,2,5.0,https://www.anime-planet.com/anime/the-ancient...,Rutendo
9205778,The Disastrous Life of Saiki K.,Watched,2,5.0,https://www.anime-planet.com/anime/the-disastr...,Rutendo
9205779,The Seven Deadly Sins,Watched,1,5.0,https://www.anime-planet.com/anime/the-seven-d...,Rutendo
9205780,The Seven Deadly Sins: Revival of The Commandm...,Watched,1,5.0,https://www.anime-planet.com/anime/the-seven-d...,Rutendo


### Save Data

In [30]:
# Save as a pickle, which stores the frame together with its dtypes
# (categoricals, uint16).
df.to_pickle('../data/watch_list_clean.pkl')

In [31]:
%%bash
# Compress the pickle in place, replacing any archive left by a previous run.
# Abort on the first error so that a failed cd cannot run rm/xz in the wrong directory.
set -euo pipefail
cd ../data

# -f: do not fail when no previous archive exists (e.g. on the first run).
rm -f watch_list_clean.pkl.xz
# -T0: use as many threads as there are cores instead of a machine-specific count.
xz -vT0 watch_list_clean.pkl

watch_list_clean.pkl: 21.3 MiB / 168.5 MiB = 0.126, 12 MiB/s, 0:14


In [32]:
# Also write a plain CSV copy, without the index column.
df.to_csv('../data/watch_list_clean.csv', index=False)

In [33]:
%%bash
# Compress the CSV in place, replacing any archive left by a previous run.
# Abort on the first error so that a failed cd cannot run rm/xz in the wrong directory.
set -euo pipefail
cd ../data

# -f: do not fail when no previous archive exists (e.g. on the first run).
rm -f watch_list_clean.csv.xz
# -T0: use as many threads as there are cores instead of a machine-specific count.
xz -vT0 watch_list_clean.csv

watch_list_clean.csv: 56.1 MiB / 878.8 MiB = 0.064, 27 MiB/s, 0:32
