In [1]:
import json
from urllib.parse import quote

import numpy as np
import pandas as pd
import sqlalchemy as sql
from tqdm import tqdm

In [26]:
# Read the database login from the local credentials file so that no secret
# is written into the notebook itself.
with open('../tools/credentials.json') as file:
    credentials = json.load(file)

dblogin = credentials["dblogin"]
username, password = dblogin["username"], dblogin["password"]

In [27]:
# Build the PostgreSQL connection URL and the engine used for the final export.
# The credentials are percent-encoded (safe='' also encodes '/') so that
# characters such as '@', ':', '/', '#' or '%' in the password cannot be
# misread as URL delimiters. str() keeps non-string JSON values working the
# same way the plain f-string interpolation did.
db_string = (
    f"postgresql://{quote(str(username), safe='')}:{quote(str(password), safe='')}"
    "@192.168.0.3:5432/animeplanet"
)
db = sql.create_engine(db_string)

In [2]:
# Load the raw anime table from the xz-compressed pickle.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
anime = pd.read_pickle('../data/anime_raw.pkl.xz')

#### `num_eps`

In [3]:
# Peek at the raw num_eps strings before they are parsed.
anime['num_eps']

0                TV (12 eps x 5 min)
1               Movie (1 ep x 5 min)
2              TV (104 eps x 23 min)
3                                Web
4              Movie (1 ep x 80 min)
                    ...             
17122                   Movie (1 ep)
17123     DVD Special (1 ep x 6 min)
17124    DVD Special (1 ep x 24 min)
17125                    TV (12 eps)
17126           Web (20 eps x 2 min)
Name: num_eps, Length: 17127, dtype: object

In [4]:
# Split strings such as "TV (12 eps x 5 min)" into the named groups
# type / num_eps / is_ongoing / duration. Everything after the type is optional.
pattern = r"""(?P<type>(?:TV\sSpecial|TV|Movie|OVA|Music\sVideo|Other|DVD\sSpecial|Web)+)(?:\s+\((?P<num_eps>\d+)(?P<is_ongoing>\+)?\seps?(?:\sx\s(?P<duration>\d+)\smin)?\))?"""
tmp = (
    anime['num_eps']
    .str.extract(pattern)
    # The is_ongoing group only captures a "+" after the episode count;
    # turn "captured / not captured" into a boolean flag.
    .assign(is_ongoing=lambda parts: parts['is_ongoing'].notna())
)

In [5]:
# Write the parsed pieces back onto the frame; the list follows the order of
# the regex's named groups, and num_eps is overwritten with the extracted count.
parsed_cols = ['type', 'num_eps', 'is_ongoing', 'duration']
anime[parsed_cols] = tmp

In [6]:
# Display the frame to check the columns produced by the num_eps parse.
anime

Unnamed: 0,title,num_eps,studio,start_end_years,season_year,rating,synopsis,tags,content_warnings,url,type,is_ongoing,duration
0,Gag Manga Biyori 2,12,Artland,2006,Summer 2006,3.583 out of 5 from 233 votes,"The lupine detective Usami-chan is back, and r...","[Comedy, Shounen, Crude, Episodic, Gag, Short ...",,https://www.anime-planet.com/anime/gag-manga-b...,TV,False,5
1,Fu Yu Nu,1,,2016,,7 needed to calculate an average,No synopsis yet - check back soon!,"[Chinese Animation, Shorts]",,https://www.anime-planet.com/anime/fu-yu-nu,Movie,False,5
2,Kijeu CSI: Gwahaksusadae,104,,2012 - 2014,,7 needed to calculate an average,No synopsis yet - check back soon!,"[Adventure, Mystery, Korean Animation]",,https://www.anime-planet.com/anime/kijeu-csi-g...,TV,False,23
3,Zuoshou Shanglan,,,TBA,,10 needed to calculate an average,No synopsis yet - check back soon!,"[Sports, Basketball, Chinese Animation]",,https://www.anime-planet.com/anime/zuoshou-sha...,Web,False,
4,Jeonsa Ryan,1,,1997,,10 needed to calculate an average,No synopsis yet - check back soon!,"[Adventure, Fantasy, Family Friendly, Korean A...",,https://www.anime-planet.com/anime/jeonsa-ryan,Movie,False,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17122,BanG Dream! Movie: Episode of Roselia - Part I...,1,SANZIGEN,2021,,3.528 out of 5 from 31 votes,No synopsis yet - check back soon!,"[Idols, Music]",,https://www.anime-planet.com/anime/bang-dream-...,Movie,False,
17123,Yu Yu Hakusho Picture Drama,1,Pierrot,2009,,3.528 out of 5 from 811 votes,No synopsis yet - check back soon!,"[Comedy, Shounen, Picture Drama, Based on a Ma...",,https://www.anime-planet.com/anime/yu-yu-hakus...,DVD Special,False,6
17124,Kannagi: If You Are a Shrine Maiden,1,A-1 Pictures,2009,,"3.528 out of 5 from 2,082 votes",One day Shino and Takako find a money clip in ...,"[Comedy, Fantasy, Shoujo, Japanese Mythology, ...",,https://www.anime-planet.com/anime/kannagi-if-...,DVD Special,False,24
17125,KADO: The Right Answer,12,Toei Animation,2017,Spring 2017,"3.527 out of 5 from 3,203 votes",Koujiro Shindo is a highly-skilled negotiator ...,"[Drama, Sci Fi, Political, CG Animation, Origi...",,https://www.anime-planet.com/anime/kado-the-ri...,TV,False,


#### `start_end_years`

In [7]:
# Trim surrounding whitespace from the year-range strings before parsing them.
years_text = anime['start_end_years']
anime['start_end_years'] = years_text.str.strip()

In [8]:
# start_year is a number or "TBA"; an optional " - <end>" part carries the end
# year, where "?" stands for an unknown end.
years_pattern = r'(?P<start_year>(?:(?:\d+)|TBA))(?:\s-\s(?P<end_year>(?:(?:\d+)|\?)))?'
year_parts = anime['start_end_years'].str.extract(years_pattern)
anime[['start_year', 'end_year']] = year_parts

In [9]:
# Use the same "TBA" marker for an unknown end year as start_year uses.
anime['end_year'] = anime['end_year'].replace('?', 'TBA')

In [10]:
# Rows without an explicit end year take their start year as the end year.
anime['end_year'] = anime['end_year'].fillna(anime['start_year'])

In [11]:
# Count the remaining nulls in the two year columns.
anime[['start_year', 'end_year']].isnull().sum()

start_year    0
end_year      0
dtype: int64

In [12]:
# The combined year string has been split into start_year / end_year; drop it.
anime = anime.drop(columns=['start_end_years'])

#### `season_year`

In [13]:
# Keep only the leading alphabetic word of season_year (e.g. "Summer" from
# "Summer 2006") as the season, then discard the source column.
season_only = anime['season_year'].str.extract(r'(?P<season>[A-Za-z]+)')
anime = anime.assign(season=season_only).drop(columns=['season_year'])

In [14]:
# Display the frame after the year and season columns have been derived.
anime

Unnamed: 0,title,num_eps,studio,rating,synopsis,tags,content_warnings,url,type,is_ongoing,duration,start_year,end_year,season
0,Gag Manga Biyori 2,12,Artland,3.583 out of 5 from 233 votes,"The lupine detective Usami-chan is back, and r...","[Comedy, Shounen, Crude, Episodic, Gag, Short ...",,https://www.anime-planet.com/anime/gag-manga-b...,TV,False,5,2006,2006,Summer
1,Fu Yu Nu,1,,7 needed to calculate an average,No synopsis yet - check back soon!,"[Chinese Animation, Shorts]",,https://www.anime-planet.com/anime/fu-yu-nu,Movie,False,5,2016,2016,
2,Kijeu CSI: Gwahaksusadae,104,,7 needed to calculate an average,No synopsis yet - check back soon!,"[Adventure, Mystery, Korean Animation]",,https://www.anime-planet.com/anime/kijeu-csi-g...,TV,False,23,2012,2014,
3,Zuoshou Shanglan,,,10 needed to calculate an average,No synopsis yet - check back soon!,"[Sports, Basketball, Chinese Animation]",,https://www.anime-planet.com/anime/zuoshou-sha...,Web,False,,TBA,TBA,
4,Jeonsa Ryan,1,,10 needed to calculate an average,No synopsis yet - check back soon!,"[Adventure, Fantasy, Family Friendly, Korean A...",,https://www.anime-planet.com/anime/jeonsa-ryan,Movie,False,80,1997,1997,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17122,BanG Dream! Movie: Episode of Roselia - Part I...,1,SANZIGEN,3.528 out of 5 from 31 votes,No synopsis yet - check back soon!,"[Idols, Music]",,https://www.anime-planet.com/anime/bang-dream-...,Movie,False,,2021,2021,
17123,Yu Yu Hakusho Picture Drama,1,Pierrot,3.528 out of 5 from 811 votes,No synopsis yet - check back soon!,"[Comedy, Shounen, Picture Drama, Based on a Ma...",,https://www.anime-planet.com/anime/yu-yu-hakus...,DVD Special,False,6,2009,2009,
17124,Kannagi: If You Are a Shrine Maiden,1,A-1 Pictures,"3.528 out of 5 from 2,082 votes",One day Shino and Takako find a money clip in ...,"[Comedy, Fantasy, Shoujo, Japanese Mythology, ...",,https://www.anime-planet.com/anime/kannagi-if-...,DVD Special,False,24,2009,2009,
17125,KADO: The Right Answer,12,Toei Animation,"3.527 out of 5 from 3,203 votes",Koujiro Shindo is a highly-skilled negotiator ...,"[Drama, Sci Fi, Political, CG Animation, Origi...",,https://www.anime-planet.com/anime/kado-the-ri...,TV,False,,2017,2017,Spring


#### `rating`

In [15]:
# Entries without enough votes read "<n> needed to calculate an average";
# blank those strings out and mark them as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
anime['rating'] = (
    anime['rating']
    .str.replace('.* needed to calculate an average', '', regex=True)
    .replace('', np.nan)
)

In [16]:
# Inspect the rating strings after the placeholder texts were replaced by NaN.
anime['rating']

0          3.583 out of 5 from 233 votes
1                                    NaN
2                                    NaN
3                                    NaN
4                                    NaN
                      ...               
17122       3.528 out of 5 from 31 votes
17123      3.528 out of 5 from 811 votes
17124    3.528 out of 5 from 2,082 votes
17125    3.527 out of 5 from 3,203 votes
17126       3.527 out of 5 from 14 votes
Name: rating, Length: 17127, dtype: object

In [17]:
# Pull the score and the vote count out of "<score> out of 5 from <n> votes".
rating_pattern = r'(?P<rating>\d\.?\d*) out of 5 from (?P<num_votes>[\d,]+) votes'
rating_parts = anime['rating'].str.extract(rating_pattern)
# Vote counts use thousands separators ("2,082"); strip them.
rating_parts['num_votes'] = rating_parts['num_votes'].str.replace(',','')
anime[['rating', 'num_votes']] = rating_parts

In [18]:
# Display the frame with rating and num_votes split into separate columns.
anime

Unnamed: 0,title,num_eps,studio,rating,synopsis,tags,content_warnings,url,type,is_ongoing,duration,start_year,end_year,season,num_votes
0,Gag Manga Biyori 2,12,Artland,3.583,"The lupine detective Usami-chan is back, and r...","[Comedy, Shounen, Crude, Episodic, Gag, Short ...",,https://www.anime-planet.com/anime/gag-manga-b...,TV,False,5,2006,2006,Summer,233
1,Fu Yu Nu,1,,,No synopsis yet - check back soon!,"[Chinese Animation, Shorts]",,https://www.anime-planet.com/anime/fu-yu-nu,Movie,False,5,2016,2016,,
2,Kijeu CSI: Gwahaksusadae,104,,,No synopsis yet - check back soon!,"[Adventure, Mystery, Korean Animation]",,https://www.anime-planet.com/anime/kijeu-csi-g...,TV,False,23,2012,2014,,
3,Zuoshou Shanglan,,,,No synopsis yet - check back soon!,"[Sports, Basketball, Chinese Animation]",,https://www.anime-planet.com/anime/zuoshou-sha...,Web,False,,TBA,TBA,,
4,Jeonsa Ryan,1,,,No synopsis yet - check back soon!,"[Adventure, Fantasy, Family Friendly, Korean A...",,https://www.anime-planet.com/anime/jeonsa-ryan,Movie,False,80,1997,1997,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17122,BanG Dream! Movie: Episode of Roselia - Part I...,1,SANZIGEN,3.528,No synopsis yet - check back soon!,"[Idols, Music]",,https://www.anime-planet.com/anime/bang-dream-...,Movie,False,,2021,2021,,31
17123,Yu Yu Hakusho Picture Drama,1,Pierrot,3.528,No synopsis yet - check back soon!,"[Comedy, Shounen, Picture Drama, Based on a Ma...",,https://www.anime-planet.com/anime/yu-yu-hakus...,DVD Special,False,6,2009,2009,,811
17124,Kannagi: If You Are a Shrine Maiden,1,A-1 Pictures,3.528,One day Shino and Takako find a money clip in ...,"[Comedy, Fantasy, Shoujo, Japanese Mythology, ...",,https://www.anime-planet.com/anime/kannagi-if-...,DVD Special,False,24,2009,2009,,2082
17125,KADO: The Right Answer,12,Toei Animation,3.527,Koujiro Shindo is a highly-skilled negotiator ...,"[Drama, Sci Fi, Political, CG Animation, Origi...",,https://www.anime-planet.com/anime/kado-the-ri...,TV,False,,2017,2017,Spring,3203


In [19]:
# duration still holds the regex-extracted text at this point, so cast to
# float before taking the maximum.
anime['duration'].astype('float').max()

235.0

In [20]:
# 'TBA' start years become missing values so the column can be cast to float.
# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
anime['start_year'] = anime['start_year'].replace('TBA', np.nan).astype('float')

In [21]:
# 'TBA' end years become missing values so the column can be cast to float.
# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
anime['end_year'] = anime['end_year'].replace('TBA', np.nan).astype('float')

In [22]:
# Store season as an ordered categorical with the categories in this fixed order.
season_order = ['Winter', 'Spring', 'Summer', 'Fall']
anime['season'] = pd.Categorical(anime['season'], categories=season_order, ordered=True)

#### Change datatypes

In [23]:
# Target dtype for every column kept in the cleaned table. The key order also
# defines the column order of the result; columns not listed here are dropped.
dtypes_dict = dict(
    title='string',
    type='category',
    num_eps='float',
    is_ongoing='bool',
    duration='float',
    studio='category',
    start_year='category',
    end_year='category',
    season='category',
    rating='float',
    num_votes='float',
    synopsis='string',
    tags='object',
    content_warnings='object',
    url='string',
)
anime = anime[list(dtypes_dict)].astype(dtypes_dict)

In [24]:
# Check the dtype of the season column after the astype call.
anime['season'].dtype

CategoricalDtype(categories=['Winter', 'Spring', 'Summer', 'Fall'], ordered=True)

In [28]:
# Persist the cleaned table in three places: a CSV file, a pickle file, and
# the 'anime' table of the database behind `db`.
anime.to_csv('../data/anime.csv.xz', index=False)
anime.to_pickle('../data/anime.pkl.xz')
# if_exists='replace' overwrites an existing 'anime' table; method='multi'
# passes multiple rows per INSERT statement.
anime.to_sql('anime', db, if_exists='replace', index=False, method='multi')