In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
import sqlalchemy as sql

In [2]:
with open('../tools/credentials.json') as file:
    credentials = json.load(file)
    
username = credentials["dblogin"]["username"]
password = credentials["dblogin"]["password"]

In [3]:
db_string = f"postgresql://{username}:{password}@192.168.0.3:5432/animeplanet"
db = sql.create_engine(db_string)

In [4]:
# query = f"""
#         SELECT title, year, avg, status, eps, times_watched, rating, anime_url, username
#         FROM watch_list
#         WHERE rating IS NOT NULL
#         AND year IS NOT NULL
#         AND eps IS NOT NULL
#         AND avg IS NOT NULL
#         AND "type" = 'TV'
#         AND status IN ('Watched', 'Dropped', 'Watching', 'Stalled');
#         """

# df = pd.read_sql(sql.text(query), db)
# df = df.drop_duplicates(['anime_url', 'username'], keep='last', ignore_index=True)
# df.to_csv('../data/watch_list_raw.csv', index=False)

In [5]:
# %%bash
# cd ../data
# rm watch_list_raw.csv.xz
# xz -vT0 watch_list_raw.csv

### Cleaning Anime Data

In [6]:
anime = pd.read_pickle('../data/anime_raw.pkl.xz')

#### `num_eps`

In [7]:
anime['num_eps']

0                TV (12 eps x 5 min)
1               Movie (1 ep x 5 min)
2              TV (104 eps x 23 min)
3                                Web
4              Movie (1 ep x 80 min)
                    ...             
17122                   Movie (1 ep)
17123     DVD Special (1 ep x 6 min)
17124    DVD Special (1 ep x 24 min)
17125                    TV (12 eps)
17126           Web (20 eps x 2 min)
Name: num_eps, Length: 17127, dtype: object

In [8]:
pattern = r"""(?P<type>(?:TV\sSpecial|TV|Movie|OVA|Music\sVideo|Other|DVD\sSpecial|Web)+)(?:\s+\((?P<num_eps>\d+)(?P<is_ongoing>\+)?\seps?(?:\sx\s(?P<duration>\d+)\smin)?\))?"""
tmp = anime['num_eps'].str.extract(pattern)
tmp['is_ongoing'] = tmp['is_ongoing'].notnull()

In [9]:
anime[['type', 'num_eps', 'is_ongoing', 'duration']] = tmp

In [10]:
anime

Unnamed: 0,title,num_eps,studio,start_end_years,season_year,rating,synopsis,tags,content_warnings,url,type,is_ongoing,duration
0,Gag Manga Biyori 2,12,Artland,2006,Summer 2006,3.583 out of 5 from 233 votes,"The lupine detective Usami-chan is back, and r...","[Comedy, Shounen, Crude, Episodic, Gag, Short ...",,https://www.anime-planet.com/anime/gag-manga-b...,TV,False,5
1,Fu Yu Nu,1,,2016,,7 needed to calculate an average,No synopsis yet - check back soon!,"[Chinese Animation, Shorts]",,https://www.anime-planet.com/anime/fu-yu-nu,Movie,False,5
2,Kijeu CSI: Gwahaksusadae,104,,2012 - 2014,,7 needed to calculate an average,No synopsis yet - check back soon!,"[Adventure, Mystery, Korean Animation]",,https://www.anime-planet.com/anime/kijeu-csi-g...,TV,False,23
3,Zuoshou Shanglan,,,TBA,,10 needed to calculate an average,No synopsis yet - check back soon!,"[Sports, Basketball, Chinese Animation]",,https://www.anime-planet.com/anime/zuoshou-sha...,Web,False,
4,Jeonsa Ryan,1,,1997,,10 needed to calculate an average,No synopsis yet - check back soon!,"[Adventure, Fantasy, Family Friendly, Korean A...",,https://www.anime-planet.com/anime/jeonsa-ryan,Movie,False,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17122,BanG Dream! Movie: Episode of Roselia - Part I...,1,SANZIGEN,2021,,3.528 out of 5 from 31 votes,No synopsis yet - check back soon!,"[Idols, Music]",,https://www.anime-planet.com/anime/bang-dream-...,Movie,False,
17123,Yu Yu Hakusho Picture Drama,1,Pierrot,2009,,3.528 out of 5 from 811 votes,No synopsis yet - check back soon!,"[Comedy, Shounen, Picture Drama, Based on a Ma...",,https://www.anime-planet.com/anime/yu-yu-hakus...,DVD Special,False,6
17124,Kannagi: If You Are a Shrine Maiden,1,A-1 Pictures,2009,,"3.528 out of 5 from 2,082 votes",One day Shino and Takako find a money clip in ...,"[Comedy, Fantasy, Shoujo, Japanese Mythology, ...",,https://www.anime-planet.com/anime/kannagi-if-...,DVD Special,False,24
17125,KADO: The Right Answer,12,Toei Animation,2017,Spring 2017,"3.527 out of 5 from 3,203 votes",Koujiro Shindo is a highly-skilled negotiator ...,"[Drama, Sci Fi, Political, CG Animation, Origi...",,https://www.anime-planet.com/anime/kado-the-ri...,TV,False,


#### `start_end_years`

In [11]:
anime['start_end_years'] = anime['start_end_years'].str.strip()

In [12]:
anime[['start_year', 'end_year']] = \
    anime['start_end_years'].str.extract(r'(?P<start_year>(?:(?:\d+)|TBA))(?:\s-\s(?P<end_year>(?:(?:\d+)|\?)))?')

In [13]:
anime['end_year'] = anime['end_year'].replace({'?': 'TBA'})

In [14]:
anime.loc[anime['end_year'].isnull(), 'end_year'] = \
    anime.loc[anime['end_year'].isnull(), 'start_year']

In [15]:
anime[['start_year', 'end_year']].isnull().sum()

start_year    0
end_year      0
dtype: int64

In [16]:
del anime['start_end_years']

#### `season_year`

In [17]:
anime['season'] = anime['season_year'].str.extract(r'(?P<season>[A-Za-z]+)')
anime = anime.drop(columns=['season_year'])

In [18]:
anime

Unnamed: 0,title,num_eps,studio,rating,synopsis,tags,content_warnings,url,type,is_ongoing,duration,start_year,end_year,season
0,Gag Manga Biyori 2,12,Artland,3.583 out of 5 from 233 votes,"The lupine detective Usami-chan is back, and r...","[Comedy, Shounen, Crude, Episodic, Gag, Short ...",,https://www.anime-planet.com/anime/gag-manga-b...,TV,False,5,2006,2006,Summer
1,Fu Yu Nu,1,,7 needed to calculate an average,No synopsis yet - check back soon!,"[Chinese Animation, Shorts]",,https://www.anime-planet.com/anime/fu-yu-nu,Movie,False,5,2016,2016,
2,Kijeu CSI: Gwahaksusadae,104,,7 needed to calculate an average,No synopsis yet - check back soon!,"[Adventure, Mystery, Korean Animation]",,https://www.anime-planet.com/anime/kijeu-csi-g...,TV,False,23,2012,2014,
3,Zuoshou Shanglan,,,10 needed to calculate an average,No synopsis yet - check back soon!,"[Sports, Basketball, Chinese Animation]",,https://www.anime-planet.com/anime/zuoshou-sha...,Web,False,,TBA,TBA,
4,Jeonsa Ryan,1,,10 needed to calculate an average,No synopsis yet - check back soon!,"[Adventure, Fantasy, Family Friendly, Korean A...",,https://www.anime-planet.com/anime/jeonsa-ryan,Movie,False,80,1997,1997,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17122,BanG Dream! Movie: Episode of Roselia - Part I...,1,SANZIGEN,3.528 out of 5 from 31 votes,No synopsis yet - check back soon!,"[Idols, Music]",,https://www.anime-planet.com/anime/bang-dream-...,Movie,False,,2021,2021,
17123,Yu Yu Hakusho Picture Drama,1,Pierrot,3.528 out of 5 from 811 votes,No synopsis yet - check back soon!,"[Comedy, Shounen, Picture Drama, Based on a Ma...",,https://www.anime-planet.com/anime/yu-yu-hakus...,DVD Special,False,6,2009,2009,
17124,Kannagi: If You Are a Shrine Maiden,1,A-1 Pictures,"3.528 out of 5 from 2,082 votes",One day Shino and Takako find a money clip in ...,"[Comedy, Fantasy, Shoujo, Japanese Mythology, ...",,https://www.anime-planet.com/anime/kannagi-if-...,DVD Special,False,24,2009,2009,
17125,KADO: The Right Answer,12,Toei Animation,"3.527 out of 5 from 3,203 votes",Koujiro Shindo is a highly-skilled negotiator ...,"[Drama, Sci Fi, Political, CG Animation, Origi...",,https://www.anime-planet.com/anime/kado-the-ri...,TV,False,,2017,2017,Spring


#### `rating`

In [19]:
anime['rating'] = anime['rating'].str.replace('.* needed to calculate an average', '', regex=True).replace('', np.NaN)

In [20]:
anime['rating']

0          3.583 out of 5 from 233 votes
1                                    NaN
2                                    NaN
3                                    NaN
4                                    NaN
                      ...               
17122       3.528 out of 5 from 31 votes
17123      3.528 out of 5 from 811 votes
17124    3.528 out of 5 from 2,082 votes
17125    3.527 out of 5 from 3,203 votes
17126       3.527 out of 5 from 14 votes
Name: rating, Length: 17127, dtype: object

In [21]:
anime[['rating', 'num_votes']] = anime['rating'].str.extract(r'(?P<rating>\d\.?\d*) out of 5 from (?P<num_votes>[\d,]+) votes')
anime['num_votes'] = anime['num_votes'].str.replace(',','')

In [22]:
anime

Unnamed: 0,title,num_eps,studio,rating,synopsis,tags,content_warnings,url,type,is_ongoing,duration,start_year,end_year,season,num_votes
0,Gag Manga Biyori 2,12,Artland,3.583,"The lupine detective Usami-chan is back, and r...","[Comedy, Shounen, Crude, Episodic, Gag, Short ...",,https://www.anime-planet.com/anime/gag-manga-b...,TV,False,5,2006,2006,Summer,233
1,Fu Yu Nu,1,,,No synopsis yet - check back soon!,"[Chinese Animation, Shorts]",,https://www.anime-planet.com/anime/fu-yu-nu,Movie,False,5,2016,2016,,
2,Kijeu CSI: Gwahaksusadae,104,,,No synopsis yet - check back soon!,"[Adventure, Mystery, Korean Animation]",,https://www.anime-planet.com/anime/kijeu-csi-g...,TV,False,23,2012,2014,,
3,Zuoshou Shanglan,,,,No synopsis yet - check back soon!,"[Sports, Basketball, Chinese Animation]",,https://www.anime-planet.com/anime/zuoshou-sha...,Web,False,,TBA,TBA,,
4,Jeonsa Ryan,1,,,No synopsis yet - check back soon!,"[Adventure, Fantasy, Family Friendly, Korean A...",,https://www.anime-planet.com/anime/jeonsa-ryan,Movie,False,80,1997,1997,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17122,BanG Dream! Movie: Episode of Roselia - Part I...,1,SANZIGEN,3.528,No synopsis yet - check back soon!,"[Idols, Music]",,https://www.anime-planet.com/anime/bang-dream-...,Movie,False,,2021,2021,,31
17123,Yu Yu Hakusho Picture Drama,1,Pierrot,3.528,No synopsis yet - check back soon!,"[Comedy, Shounen, Picture Drama, Based on a Ma...",,https://www.anime-planet.com/anime/yu-yu-hakus...,DVD Special,False,6,2009,2009,,811
17124,Kannagi: If You Are a Shrine Maiden,1,A-1 Pictures,3.528,One day Shino and Takako find a money clip in ...,"[Comedy, Fantasy, Shoujo, Japanese Mythology, ...",,https://www.anime-planet.com/anime/kannagi-if-...,DVD Special,False,24,2009,2009,,2082
17125,KADO: The Right Answer,12,Toei Animation,3.527,Koujiro Shindo is a highly-skilled negotiator ...,"[Drama, Sci Fi, Political, CG Animation, Origi...",,https://www.anime-planet.com/anime/kado-the-ri...,TV,False,,2017,2017,Spring,3203


In [23]:
anime['duration'].astype('float').max()

235.0

#### Change datatypes

In [24]:
dtypes_dict = \
{
    'title': 'string',
    'type': 'category',
    'num_eps': 'float32',
    'is_ongoing': 'bool',
    'duration': 'float32',
    'studio': 'category',
    'start_year': 'category',
    'end_year': 'category',
    'season': 'category',
    'rating': 'float32',
    'num_votes': 'float32',
    'synopsis': 'string',
    'tags': 'object',
    'content_warnings': 'object',
    'url': 'string'
}
anime = anime.astype(dtypes_dict)[list(dtypes_dict.keys())]

In [25]:
anime

Unnamed: 0,title,type,num_eps,is_ongoing,duration,studio,start_year,end_year,season,rating,num_votes,synopsis,tags,content_warnings,url
0,Gag Manga Biyori 2,TV,12.0,False,5.0,Artland,2006,2006,Summer,3.583,233.0,"The lupine detective Usami-chan is back, and r...","[Comedy, Shounen, Crude, Episodic, Gag, Short ...",,https://www.anime-planet.com/anime/gag-manga-b...
1,Fu Yu Nu,Movie,1.0,False,5.0,,2016,2016,,,,No synopsis yet - check back soon!,"[Chinese Animation, Shorts]",,https://www.anime-planet.com/anime/fu-yu-nu
2,Kijeu CSI: Gwahaksusadae,TV,104.0,False,23.0,,2012,2014,,,,No synopsis yet - check back soon!,"[Adventure, Mystery, Korean Animation]",,https://www.anime-planet.com/anime/kijeu-csi-g...
3,Zuoshou Shanglan,Web,,False,,,TBA,TBA,,,,No synopsis yet - check back soon!,"[Sports, Basketball, Chinese Animation]",,https://www.anime-planet.com/anime/zuoshou-sha...
4,Jeonsa Ryan,Movie,1.0,False,80.0,,1997,1997,,,,No synopsis yet - check back soon!,"[Adventure, Fantasy, Family Friendly, Korean A...",,https://www.anime-planet.com/anime/jeonsa-ryan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17122,BanG Dream! Movie: Episode of Roselia - Part I...,Movie,1.0,False,,SANZIGEN,2021,2021,,3.528,31.0,No synopsis yet - check back soon!,"[Idols, Music]",,https://www.anime-planet.com/anime/bang-dream-...
17123,Yu Yu Hakusho Picture Drama,DVD Special,1.0,False,6.0,Pierrot,2009,2009,,3.528,811.0,No synopsis yet - check back soon!,"[Comedy, Shounen, Picture Drama, Based on a Ma...",,https://www.anime-planet.com/anime/yu-yu-hakus...
17124,Kannagi: If You Are a Shrine Maiden,DVD Special,1.0,False,24.0,A-1 Pictures,2009,2009,,3.528,2082.0,One day Shino and Takako find a money clip in ...,"[Comedy, Fantasy, Shoujo, Japanese Mythology, ...",,https://www.anime-planet.com/anime/kannagi-if-...
17125,KADO: The Right Answer,TV,12.0,False,,Toei Animation,2017,2017,Spring,3.527,3203.0,Koujiro Shindo is a highly-skilled negotiator ...,"[Drama, Sci Fi, Political, CG Animation, Origi...",,https://www.anime-planet.com/anime/kado-the-ri...


In [26]:
anime.to_csv('../data/anime.csv.xz', index=False)
anime.to_pickle('../data/anime.pkl.xz')
anime.to_sql('anime', db, if_exists='replace', index=False, method='multi')

### Merging & Additional Cleaning

In [27]:
watch_list = pd.read_csv('../data/watch_list_raw.csv.xz')
anime = pd.read_pickle('../data/anime.pkl.xz')

watch_list = watch_list.rename(columns={'rating': 'user_rating', 'anime_url':'url'})
anime = anime.rename(columns={'rating': 'avg_rating', 'ongoing': 'is_ongoing'})

In [28]:
watch_list

Unnamed: 0,title,year,avg,status,eps,times_watched,user_rating,url,username
0,Day Break Illusion: il sole penetra le illusioni,2013,2.90,Watched,13,1.0,3.0,https://www.anime-planet.com/anime/day-break-i...,Ruth
1,Dog Days,2011,3.50,Watched,13,1.0,3.5,https://www.anime-planet.com/anime/dog-days,Ruth
2,Dog Days',2012,3.70,Watched,13,1.0,3.0,https://www.anime-planet.com/anime/dog-days-2,Ruth
3,Fantasista Doll,2013,2.47,Watched,12,1.0,2.5,https://www.anime-planet.com/anime/fantasista-...,Ruth
4,Fate/Kaleid Liner Prisma Illya,2013,3.54,Watched,10,1.0,3.5,https://www.anime-planet.com/anime/fate-kaleid...,Ruth
...,...,...,...,...,...,...,...,...,...
9205777,The Ancient Magus' Bride,2017,4.39,Watched,24,2.0,5.0,https://www.anime-planet.com/anime/the-ancient...,Rutendo
9205778,The Disastrous Life of Saiki K.,2016,4.47,Watched,120,2.0,5.0,https://www.anime-planet.com/anime/the-disastr...,Rutendo
9205779,The Seven Deadly Sins,2014,4.35,Watched,24,1.0,5.0,https://www.anime-planet.com/anime/the-seven-d...,Rutendo
9205780,The Seven Deadly Sins: Revival of The Commandm...,2018,4.37,Watched,24,1.0,5.0,https://www.anime-planet.com/anime/the-seven-d...,Rutendo


In [29]:
anime

Unnamed: 0,title,type,num_eps,is_ongoing,duration,studio,start_year,end_year,season,avg_rating,num_votes,synopsis,tags,content_warnings,url
0,Gag Manga Biyori 2,TV,12.0,False,5.0,Artland,2006,2006,Summer,3.583,233.0,"The lupine detective Usami-chan is back, and r...","[Comedy, Shounen, Crude, Episodic, Gag, Short ...",,https://www.anime-planet.com/anime/gag-manga-b...
1,Fu Yu Nu,Movie,1.0,False,5.0,,2016,2016,,,,No synopsis yet - check back soon!,"[Chinese Animation, Shorts]",,https://www.anime-planet.com/anime/fu-yu-nu
2,Kijeu CSI: Gwahaksusadae,TV,104.0,False,23.0,,2012,2014,,,,No synopsis yet - check back soon!,"[Adventure, Mystery, Korean Animation]",,https://www.anime-planet.com/anime/kijeu-csi-g...
3,Zuoshou Shanglan,Web,,False,,,TBA,TBA,,,,No synopsis yet - check back soon!,"[Sports, Basketball, Chinese Animation]",,https://www.anime-planet.com/anime/zuoshou-sha...
4,Jeonsa Ryan,Movie,1.0,False,80.0,,1997,1997,,,,No synopsis yet - check back soon!,"[Adventure, Fantasy, Family Friendly, Korean A...",,https://www.anime-planet.com/anime/jeonsa-ryan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17122,BanG Dream! Movie: Episode of Roselia - Part I...,Movie,1.0,False,,SANZIGEN,2021,2021,,3.528,31.0,No synopsis yet - check back soon!,"[Idols, Music]",,https://www.anime-planet.com/anime/bang-dream-...
17123,Yu Yu Hakusho Picture Drama,DVD Special,1.0,False,6.0,Pierrot,2009,2009,,3.528,811.0,No synopsis yet - check back soon!,"[Comedy, Shounen, Picture Drama, Based on a Ma...",,https://www.anime-planet.com/anime/yu-yu-hakus...
17124,Kannagi: If You Are a Shrine Maiden,DVD Special,1.0,False,24.0,A-1 Pictures,2009,2009,,3.528,2082.0,One day Shino and Takako find a money clip in ...,"[Comedy, Fantasy, Shoujo, Japanese Mythology, ...",,https://www.anime-planet.com/anime/kannagi-if-...
17125,KADO: The Right Answer,TV,12.0,False,,Toei Animation,2017,2017,Spring,3.527,3203.0,Koujiro Shindo is a highly-skilled negotiator ...,"[Drama, Sci Fi, Political, CG Animation, Origi...",,https://www.anime-planet.com/anime/kado-the-ri...


In [30]:
left_df = watch_list[['title', 'url', 'username', 'status', 'times_watched', 'user_rating']]
df = left_df.merge(anime.drop(columns=['title', 'type']), how='left', on='url')

In [31]:
df

Unnamed: 0,title,url,username,status,times_watched,user_rating,num_eps,is_ongoing,duration,studio,start_year,end_year,season,avg_rating,num_votes,synopsis,tags,content_warnings
0,Day Break Illusion: il sole penetra le illusioni,https://www.anime-planet.com/anime/day-break-i...,Ruth,Watched,1.0,3.0,13.0,False,,AIC,2013,2013,Summer,3.158,1980.0,Akari has always known two things: she’s a ski...,"[Drama, Fantasy, Horror, Magical Girl, Contemp...",
1,Dog Days,https://www.anime-planet.com/anime/dog-days,Ruth,Watched,1.0,3.5,13.0,False,,Seven Arcs,2011,2011,Spring,3.524,9447.0,"In the magical land of Flonyard, animal-eared ...","[Action, Adventure, Ecchi, Fantasy, Animal Cha...",
2,Dog Days',https://www.anime-planet.com/anime/dog-days-2,Ruth,Watched,1.0,3.0,13.0,False,,Seven Arcs,2012,2012,Summer,3.651,5345.0,Three months have passed since Shinku returned...,"[Action, Adventure, Ecchi, Fantasy, Animal Cha...",
3,Fantasista Doll,https://www.anime-planet.com/anime/fantasista-...,Ruth,Watched,1.0,2.5,12.0,False,,Hoods Entertainment,2013,2013,Summer,2.918,1206.0,Uzume Uno was on her way to class one day when...,"[Magical Girl, Sci Fi, Seinen, Slice of Life, ...",
4,Fate/Kaleid Liner Prisma Illya,https://www.anime-planet.com/anime/fate-kaleid...,Ruth,Watched,1.0,3.5,10.0,False,,SILVER LINK.,2013,2013,Summer,3.513,5655.0,Illya loves magical girls; but more than anyth...,"[Action, Fantasy, Magical Girl, Shounen, Conte...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9205777,The Ancient Magus' Bride,https://www.anime-planet.com/anime/the-ancient...,Rutendo,Watched,2.0,5.0,24.0,False,,WIT Studio,2017,2018,Fall,4.227,22637.0,Chise Hitori was a child nobody wanted. Told b...,"[Drama, Fantasy, Romance, Apprenticeship, Cont...","[Animal Abuse, Mature Themes, Physical Abuse, ..."
9205778,The Disastrous Life of Saiki K.,https://www.anime-planet.com/anime/the-disastr...,Rutendo,Watched,2.0,5.0,120.0,False,4.0,J.C.Staff,2016,2016,Summer,4.308,23850.0,Kusuo Saiki is a typical 16-year-old high scho...,"[Comedy, Shounen, Slice of Life, Breaking the ...",
9205779,The Seven Deadly Sins,https://www.anime-planet.com/anime/the-seven-d...,Rutendo,Watched,1.0,5.0,24.0,False,,A-1 Pictures,2014,2015,Fall,4.187,59396.0,The kingdom of Liones is thrown into turmoil w...,"[Action, Adventure, Fantasy, Shounen, Demons, ...","[Animal Abuse, Mature Themes, Violence]"
9205780,The Seven Deadly Sins: Revival of The Commandm...,https://www.anime-planet.com/anime/the-seven-d...,Rutendo,Watched,1.0,5.0,24.0,False,,A-1 Pictures,2018,2018,Winter,4.210,27525.0,Only six of the Seven Deadly Sins have been lo...,"[Action, Adventure, Fantasy, Shounen, Demons, ...",[Violence]


### Dealing with missing values

In [32]:
df.isnull().sum()

title                     0
url                       0
username                  0
status                    0
times_watched        992993
user_rating               0
num_eps                 148
is_ongoing              148
duration            8871208
studio                11586
start_year              148
end_year                148
season               244827
avg_rating              148
num_votes               148
synopsis                148
tags                    385
dtype: int64

In [33]:
df['times_watched'] = df['times_watched'].fillna(0)

In [34]:
df = df.loc[df['avg_rating'].notnull()].reset_index(drop=True)

In [35]:
df.isnull().sum()

title                     0
url                       0
username                  0
status                    0
times_watched             0
user_rating               0
num_eps                   0
is_ongoing                0
duration            8871060
studio                11438
start_year                0
end_year                  0
season               244679
avg_rating                0
num_votes                 0
synopsis                  0
tags                    237
dtype: int64

### Fixing column errors & changing data types

In [36]:
df.memory_usage(deep=True) / 10**6

Index                  0.000128
title                714.333238
url                 1024.793195
username             607.002817
status               589.515176
times_watched         73.645072
user_rating           73.645072
num_eps               36.822536
is_ongoing           294.819120
duration              36.822536
studio                18.471786
start_year             9.216145
end_year               9.216145
season                 9.206056
avg_rating            36.822536
num_votes             36.822536
synopsis            7943.215022
tags                1491.798256
dtype: float64

In [37]:
sum(df.memory_usage(deep=True) / 10**6)

13465.036372

In [38]:
df.dtypes

title                 object
url                   object
username              object
status                object
times_watched        float64
user_rating          float64
num_eps              float32
is_ongoing            object
duration             float32
studio              category
start_year          category
end_year            category
season              category
avg_rating           float32
num_votes            float32
synopsis              string
tags                  object
dtype: object

In [39]:
df['title'].unique()

array(['Day Break Illusion: il sole penetra le illusioni', 'Dog Days',
       "Dog Days'", ..., 'The [email\xa0protected]TER',
       'The [email\xa0protected]STER: Cinderella Girls',
       'The [email\xa0protected]ER: Xenoglossia'], dtype=object)

In [40]:
df.loc[df['title'].str.contains('\[email\xa0protected\]'), 'title'].unique()

array(['The [email\xa0protected]',
       'The [email\xa0protected]: Xenoglossia',
       'The [email\xa0protected]: Cinderella Girls',
       'The [email\xa0protected]: Cinderella Girls Second Series',
       '[email\xa0protected]',
       'The [email\xa0protected] SideM: Wake Atte Mini!',
       'The [email\xa0protected] SideM',
       'The [email\xa0protected]: Cinderella Girls Theater',
       'The [email\xa0protected]: Cinderella Girls Theater 2nd Season',
       'The [email\xa0protected]: Cinderella Girls Theater 3rd Season',
       'The [email\xa0protected]: Cinderella Girls Theater Climax Season',
       'The iDO[email\xa0protected]', '[email\xa0protected]i',
       'The [email\xa0protected]R', '[email\xa0protected]anbaranai',
       'The iD[email\xa0protected]: Cinderella Girls Second Series',
       'The IDOLM[email\xa0protected]: Cinderella Girls Theater',
       'The [email\xa0protected]TER: Cinderella Girls Second Series',
       'Sas[email\xa0protected]', '[email\xa0prote

In [41]:
df.loc[df['title'].str.contains('\[email\xa0protected\]'), 'url'].unique()

array(['https://www.anime-planet.com/anime/the-idolmaster',
       'https://www.anime-planet.com/anime/the-idolmster-xenoglossia',
       'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls',
       'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-second-series',
       'https://www.anime-planet.com/anime/sasami-san-at-ganbaranai',
       'https://www.anime-planet.com/anime/the-idolmaster-side-m-wake-atte-mini',
       'https://www.anime-planet.com/anime/the-idolmaster-side-m',
       'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-theater',
       'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-theater-2nd-season',
       'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-theater-3rd-season',
       'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-theater-climax-season'],
      dtype=object)

In [42]:
url_title_map = \
{
    'https://www.anime-planet.com/anime/the-idolmaster': 'The iDOLM@STER',
    'https://www.anime-planet.com/anime/sasami-san-at-ganbaranai': 'Sasami-san@Ganbaranai',
    'https://www.anime-planet.com/anime/the-idolmster-xenoglossia': 'The iDOLM@STER: Xenoglossia',
    'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls': 'The iDOLM@STER: Cinderella Girls',
    'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-second-series': 
                                                            'The iDOLM@STER: Cinderella Girls Second Series',
    'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-theater': 'The IDOLM@STER: Cinderella Girls Theater',
    'https://www.anime-planet.com/anime/the-idolmaster-side-m': 'The iDOLM@STER SideM',
    'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-theater-2nd-season': 
                                                            'The iDOLM@STER: Cinderella Girls Theater 2nd Season',
    'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-theater-3rd-season':
                                                            'The iDOLM@STER: Cinderella Girls Theater 3rd Season',
    'https://www.anime-planet.com/anime/the-idolmaster-cinderella-girls-theater-climax-season':
                                                            'The iDOLM@STER: Cinderella Girls Theater Climax Season',
    'https://www.anime-planet.com/anime/the-idolmaster-side-m-wake-atte-mini': 'The iDOLM@STER SideM: Wake Atte Mini!'
}

In [43]:
df.loc[df['title'].str.contains('\[email\xa0protected\]'), 'title'] = \
    df.loc[df['title'].str.contains('\[email\xa0protected\]'), 'url'].map(url_title_map)

In [44]:
df['title'].nunique()

4473

In [45]:
df.columns

Index(['title', 'url', 'username', 'status', 'times_watched', 'user_rating',
       'num_eps', 'is_ongoing', 'duration', 'studio', 'start_year', 'end_year',
       'season', 'avg_rating', 'num_votes', 'synopsis', 'tags',
      dtype='object')

In [46]:
df['title'] = df['title'].astype('category')

In [47]:
df['url'] = df['url'].astype('category')

In [48]:
df['username'] = df['username'].astype('category')

In [49]:
df['status'] = pd.Categorical(df['status'], categories=['Dropped', 'Stalled', 'Watching', 'Watched'], ordered=True)

In [50]:
df['times_watched'] = df['times_watched'].astype('uint16')

In [51]:
df['user_rating'] = df['user_rating'].astype('float32')

In [52]:
df['num_eps'] = df['num_eps'].astype('uint16')

In [53]:
df['is_ongoing'] = df['is_ongoing'].astype('bool')

In [54]:
df['duration'] = df['duration'].astype('float32')

In [55]:
df['studio'] = df['studio'].astype('category')

In [56]:
df['start_year'] = df['start_year'].replace('TBA', np.NaN).astype('float')

In [57]:
df['end_year'] = df['end_year'].replace('TBA', np.NaN).astype('float')

In [58]:
df['season'] = pd.Categorical(df['season'], categories=['Winter', 'Spring', 'Summer', 'Fall'], ordered=True)

In [59]:
df['avg_rating'] = df['avg_rating'].astype('float32')

In [60]:
df['num_votes'] = df['num_votes'].astype('uint32')

In [61]:
df['synopsis'] = df['synopsis'].astype('string')

In [62]:
df.memory_usage(deep=True) / 10**6

Index                  0.000128
title                 18.893276
url                   19.044799
username              48.810999
status                 9.206063
times_watched         18.411268
user_rating           36.822536
num_eps               18.411268
is_ongoing             9.205634
duration              36.822536
studio                18.471786
start_year            73.645072
end_year              73.645072
season                 9.206056
avg_rating            36.822536
num_votes             36.822536
synopsis            7943.215022
tags                1491.798256
dtype: float64

In [63]:
sum(df.memory_usage(deep=True) / 10**6)

10358.123843000001

In [64]:
df

Unnamed: 0,title,url,username,status,times_watched,user_rating,num_eps,is_ongoing,duration,studio,start_year,end_year,season,avg_rating,num_votes,synopsis,tags,content_warnings
0,Day Break Illusion: il sole penetra le illusioni,https://www.anime-planet.com/anime/day-break-i...,Ruth,Watched,1,3.0,13,False,,AIC,2013.0,2013.0,Summer,3.158,1980,Akari has always known two things: she’s a ski...,"[Drama, Fantasy, Horror, Magical Girl, Contemp...",
1,Dog Days,https://www.anime-planet.com/anime/dog-days,Ruth,Watched,1,3.5,13,False,,Seven Arcs,2011.0,2011.0,Spring,3.524,9447,"In the magical land of Flonyard, animal-eared ...","[Action, Adventure, Ecchi, Fantasy, Animal Cha...",
2,Dog Days',https://www.anime-planet.com/anime/dog-days-2,Ruth,Watched,1,3.0,13,False,,Seven Arcs,2012.0,2012.0,Summer,3.651,5345,Three months have passed since Shinku returned...,"[Action, Adventure, Ecchi, Fantasy, Animal Cha...",
3,Fantasista Doll,https://www.anime-planet.com/anime/fantasista-...,Ruth,Watched,1,2.5,12,False,,Hoods Entertainment,2013.0,2013.0,Summer,2.918,1206,Uzume Uno was on her way to class one day when...,"[Magical Girl, Sci Fi, Seinen, Slice of Life, ...",
4,Fate/Kaleid Liner Prisma Illya,https://www.anime-planet.com/anime/fate-kaleid...,Ruth,Watched,1,3.5,10,False,,SILVER LINK.,2013.0,2013.0,Summer,3.513,5655,Illya loves magical girls; but more than anyth...,"[Action, Fantasy, Magical Girl, Shounen, Conte...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9205629,The Ancient Magus' Bride,https://www.anime-planet.com/anime/the-ancient...,Rutendo,Watched,2,5.0,24,False,,WIT Studio,2017.0,2018.0,Fall,4.227,22637,Chise Hitori was a child nobody wanted. Told b...,"[Drama, Fantasy, Romance, Apprenticeship, Cont...","[Animal Abuse, Mature Themes, Physical Abuse, ..."
9205630,The Disastrous Life of Saiki K.,https://www.anime-planet.com/anime/the-disastr...,Rutendo,Watched,2,5.0,120,False,4.0,J.C.Staff,2016.0,2016.0,Summer,4.308,23850,Kusuo Saiki is a typical 16-year-old high scho...,"[Comedy, Shounen, Slice of Life, Breaking the ...",
9205631,The Seven Deadly Sins,https://www.anime-planet.com/anime/the-seven-d...,Rutendo,Watched,1,5.0,24,False,,A-1 Pictures,2014.0,2015.0,Fall,4.187,59396,The kingdom of Liones is thrown into turmoil w...,"[Action, Adventure, Fantasy, Shounen, Demons, ...","[Animal Abuse, Mature Themes, Violence]"
9205632,The Seven Deadly Sins: Revival of The Commandm...,https://www.anime-planet.com/anime/the-seven-d...,Rutendo,Watched,1,5.0,24,False,,A-1 Pictures,2018.0,2018.0,Winter,4.210,27525,Only six of the Seven Deadly Sins have been lo...,"[Action, Adventure, Fantasy, Shounen, Demons, ...",[Violence]


### Save Data

In [65]:
df.to_pickle('../data/watch_list_clean.pkl')

In [66]:
%%bash
cd ../data

rm watch_list_clean.pkl.xz
xz -vT14 watch_list_clean.pkl

watch_list_clean.pkl: 78.3 MiB / 544.5 MiB = 0.144, 12 MiB/s, 0:44


In [67]:
df.to_csv('../data/watch_list_clean.csv', index=False)

In [68]:
%%bash
cd ../data

rm watch_list_clean.csv.xz
xz -vT14 watch_list_clean.csv

rm: cannot remove 'watch_list_clean.csv.xz': No such file or directory
watch_list_clean.csv: 302.0 MiB / 7,567.5 MiB = 0.040, 28 MiB/s, 4:30
