# Import libraries and Setting displays

In [12]:
import pandas as pd
import numpy as np
import os
import re

import matplotlib.pyplot as plt 
import seaborn as sns

In [13]:
pd.set_option('display.max_rows', 100) 
pd.set_option('display.max_columns', None) 

# pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.max_rows', None)     # Show all rows
# pd.set_option('display.max_colwidth', None) # Show entire content of each column
pd.set_option('display.width', None)        # No limit on display width
pd.set_option('display.expand_frame_repr', False)  # Prevent wrapping

# Opening file and creating dataframe

In [14]:
# Setting paths
current_dir = os.getcwd()  # Use os.getcwd() to get the current working directory
parent_dir = os.path.dirname(current_dir)
data_dir = os.path.join(parent_dir, "data")
file_path = os.path.join(data_dir, "")

top_anime_dataset_v2_csv_path = os.path.join(data_dir, "top_anime_dataset_v2.csv")
#----------------------------------------------------------------------------------------------------
# Creating dataframes
df_anime = pd.read_csv(top_anime_dataset_v2_csv_path)

df_anime.sample(5)

Unnamed: 0,anime_id,anime_url,image_url,name,english_name,japanese_names,score,genres,themes,demographics,synopsis,type,episodes,premiered,producers,studios,source,duration,rating,rank,popularity,favorites,scored_by,members
8043,46381,https://myanimelist.net/anime/46381/Shinkansen...,https://cdn.myanimelist.net/images/anime/1381/...,Shinkansen Henkei Robo Shinkalion Z the Animation,,新幹線変形ロボ シンカリオンZ THE ANIMATION,6.51,"Action, Sci-Fi",Mecha,Kids,In order to protect the peace and safety of Ja...,TV,41.0,spring 2021,"TV Tokyo, Shogakukan-Shueisha Productions",OLM,Other,24 min per ep,PG - Children,7033.0,12412,7,302,1692
363,52578,https://myanimelist.net/anime/52578/Boku_no_Ko...,https://cdn.myanimelist.net/images/anime/1545/...,Boku no Kokoro no Yabai Yatsu,The Dangers in My Heart,僕の心のヤバイやつ,8.23,"Comedy, Romance",School,Shounen,Kyoutarou Ichikawa may look like a shy and res...,TV,12.0,spring 2023,"Dentsu, TV Asahi, Movic, Magic Capsule, Avex P...",Shin-Ei Animation,Manga,23 min per ep,PG-13 - Teens 13 or older,364.0,681,5489,188945,370756
14422,39842,https://myanimelist.net/anime/39842/Onna_Maou_...,https://cdn.myanimelist.net/images/anime/1456/...,Onna Maou Melissa no H na Boukenki,Sexy Chronicles of Demon Queen Melissa,女魔王メリッサのHな冒険記,5.66,"Fantasy, Hentai",Mythology,,,ONA,2.0,,BOMB! CUTE! BOMB!,Office No. 8,Other,15 min per ep,Rx - Hentai,,11263,6,824,2428
8844,12921,https://myanimelist.net/anime/12921/Morita-san...,https://cdn.myanimelist.net/images/anime/3/362...,Morita-san wa Mukuchi. Specials,,森田さんは無口。,6.41,"Comedy, Slice of Life",School,,"Unaired episodes of the ""Morita-san wa Mukuchi...",Special,2.0,,"DAX Production, Takeshobo",Seven,4-koma manga,3 min per ep,PG-13 - Teens 13 or older,7638.0,9100,0,2436,4728
851,2450,https://myanimelist.net/anime/2450/Crayon_Shin...,https://cdn.myanimelist.net/images/anime/1824/...,Crayon Shin-chan Movie 09: Arashi wo Yobu Mour...,,映画　クレヨンしんちゃん　嵐を呼ぶモーレツ！オトナ帝国の逆襲,7.9,"Comedy, Sci-Fi",,Seinen,Adult people all over Japan had been captivate...,Movie,1.0,,"TV Asahi, Asatsu DK",Shin-Ei Animation,Manga,1 hr 29 min,G - All Ages,829.0,6345,113,6447,11562


# Seeking for nulls

In [15]:
df_anime.columns.to_list()

['anime_id',
 'anime_url',
 'image_url',
 'name',
 'english_name',
 'japanese_names',
 'score',
 'genres',
 'themes',
 'demographics',
 'synopsis',
 'type',
 'episodes',
 'premiered',
 'producers',
 'studios',
 'source',
 'duration',
 'rating',
 'rank',
 'popularity',
 'favorites',
 'scored_by',
 'members']

In [16]:
def check_anime_nulls(df_anime):
   
    nulls = df_anime.isnull().sum()
    null_percentage = (nulls / len(df_anime) * 100).round(2)
    null_report = pd.DataFrame({'Null Count': nulls, 'Null %': null_percentage})
    print(f"Qty of rows: ({len(df_anime)})")
    print(null_report)


check_anime_nulls(df_anime)

Qty of rows: (15000)
                Null Count  Null %
anime_id                 0    0.00
anime_url                0    0.00
image_url                0    0.00
name                     0    0.00
english_name          6642   44.28
japanese_names          46    0.31
score                    0    0.00
genres                1601   10.67
themes                5077   33.85
demographics         10592   70.61
synopsis               468    3.12
type                     1    0.01
episodes               112    0.75
premiered            10316   68.77
producers             5429   36.19
studios               2379   15.86
source                   0    0.00
duration                 0    0.00
rating                  68    0.45
rank                  3080   20.53
popularity               0    0.00
favorites                0    0.00
scored_by                0    0.00
members                  0    0.00


# Looking for Duplicate values

In [17]:
def check_duplicates(df, subset=None):
    dups = df[df.duplicated(subset=subset, keep=False)]
    print(f"Duplicates ({len(dups)}):\n{dups}" if not dups.empty else "No duplicates found.")

check_duplicates(df_anime) 

Duplicates (68):
       anime_id                                          anime_url                                          image_url                                               name                                       english_name                         japanese_names  score                                             genres                               themes demographics                                           synopsis        type  episodes    premiered                                          producers                             studios       source       duration                          rating     rank  popularity  favorites  scored_by  members
30        59571  https://myanimelist.net/anime/59571/Shingeki_n...  https://cdn.myanimelist.net/images/anime/1379/...  Shingeki no Kyojin Movie: Kanketsu-hen - The L...                   Attack on Titan: The Last Attack          劇場版 進撃の巨人 完結編 THE LAST ATTACK   8.81                            Action, Drama, Suspense             Go

In [18]:
# Very quick check for more accuracy
df_anime['concat'] = df_anime['anime_id'].astype(str) + '_' + df_anime['name'].astype(str) + '_' + df_anime['popularity'].astype(str)

len(df_anime) # 15000
df_anime['concat'].nunique() # 14966

df_anime.drop('concat', axis=1, inplace=True)

## Droping duplicates 

In [19]:
def drop_duplicates(df, subset=None, inplace=False):
    df.drop_duplicates(subset=subset, keep='first', inplace=inplace)
    if not inplace:
        return df

drop_duplicates(df_anime, inplace=True)
len(df_anime)

14966

# Checking data to impute/inferred

In [20]:
# check_anime_nulls(df_anime)
df_anime.sample(7)

Unnamed: 0,anime_id,anime_url,image_url,name,english_name,japanese_names,score,genres,themes,demographics,synopsis,type,episodes,premiered,producers,studios,source,duration,rating,rank,popularity,favorites,scored_by,members
8796,21167,https://myanimelist.net/anime/21167/Escha___Lo...,https://cdn.myanimelist.net/images/anime/2/617...,Escha & Logy no Atelier: Tasogare no Sora no R...,Atelier Escha & Logy: Alchemists of the Dusk Sky,エスカ&ロジーのアトリエ ～黄昏の空の錬金術士～,6.42,Fantasy,,,"This world has gone through many Dusks, and is...",TV,12.0,spring 2014,"Sotsu, Pony Canyon",Studio Gokumi,Game,23 min per ep,PG-13 - Teens 13 or older,7576.0,2887,80,24908,64991
4450,30988,https://myanimelist.net/anime/30988/Futsuu_no_...,https://cdn.myanimelist.net/images/anime/4/782...,Futsuu no Joshikousei ga [Locodol] Yattemita. OVA,,普通の女子校生が【ろこどる】やってみた流。OVA,7.05,"Comedy, Slice of Life",Idols (Female),,A new OVA for the idol girls series Futsuu no ...,OVA,2.0,,DAX Production,feel.,4-koma manga,24 min per ep,PG-13 - Teens 13 or older,4139.0,7262,4,3808,8396
6433,9005,https://myanimelist.net/anime/9005/To_Heart_2_...,https://cdn.myanimelist.net/images/anime/8/754...,To Heart 2 Adnext,,ToHeart2 adnext,6.72,"Comedy, Romance",Harem,,Komaki Manaka daydreams about classmate Kono T...,OVA,2.0,,,Chaos Project,Visual novel,28 min per ep,PG-13 - Teens 13 or older,5785.0,8548,4,2096,5637
8039,5895,https://myanimelist.net/anime/5895/Tistou_Mido...,https://cdn.myanimelist.net/images/anime/6/688...,Tistou Midori no Oyayubi,Tistou the Green Thumb,チスト　みどりのおやゆび,6.51,,Historical,,When eight-year-old Tistou is sent home from s...,Movie,1.0,,,Production I.G,Original,1 hr 14 min,G - All Ages,7021.0,14115,2,195,1043
4775,52104,https://myanimelist.net/anime/52104/Boku_wa_Ch...,https://cdn.myanimelist.net/images/anime/1099/...,Boku wa Chiisana Succubus no Shimobe,,僕は小さな淫魔〈サキュバス〉のしもべ,6.99,"Supernatural, Hentai",,,,OVA,2.0,,Mary Jane,New Generation,Manga,15 min per ep,Rx - Hentai,,8712,37,1946,5358
5954,11161,https://myanimelist.net/anime/11161/Hoshizora_...,https://cdn.myanimelist.net/images/anime/10/30...,Hoshizora e Kakaru Hashi: Kakaru ka? Gakuensai...,,星空へ架かる橋　架かるか？学園祭に恋の橋,6.79,"Comedy, Romance, Ecchi",School,,"Shortly before the upcoming festival, a year a...",Special,1.0,,,Doga Kobo,Visual novel,26 min,R+ - Mild Nudity,5423.0,4228,6,15962,31033
3364,57648,https://myanimelist.net/anime/57648/Nihon_e_Yo...,https://cdn.myanimelist.net/images/anime/1650/...,Nihon e Youkoso Elf-san.,"Welcome to Japan, Ms. Elf!",日本へようこそエルフさん。,7.23,"Comedy, Fantasy",Isekai,,Kazuhiro Kitase's only hobby is sleeping. Ever...,TV,12.0,winter 2025,"Lantis, Toei Video, Mainichi Broadcasting Syst...",Zero-G,Light novel,23 min per ep,PG-13 - Teens 13 or older,3222.0,3203,163,7277,53154


### Column: [''premiered']:
Separate it in 2 column for better readness, one with year and the other one with season

In [21]:
df_anime['premiered'].sample(5)

# Extract the season only with regex and make a new column for it
df_anime['premiered_season'] = df_anime['premiered'].str.extract(r'(?i)(spring|summer|fall|winter)')

# Extract the year only and make a new column for it.
df_anime['premiered_year'] = df_anime['premiered'].str.extract(r'(\d{4})')

# Fill missing values with 'unknown'
df_anime['premiered_season'].fillna('unknown', inplace=True)
df_anime['premiered_year'].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_anime['premiered_season'].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_anime['premiered_year'].fillna('', inplace=True)


### Column: ['type']:

By checking, it has sense because of the duration that the Nanit has be 'TV'.

In [22]:
# df_anime['type'].unique()
# # --------------------------------------------------------------------------
# df_anime['type'].isnull().sum()
# df_anime[df_anime['type'].isin(['nan'])][['type', 'duration']].sample(5)
# # --------------------------------------------------------------------------
# null_type_df = df_anime[df_anime['type'].isnull()][['type', 'duration']]

# if len(null_type_df) > 0:
#     muestra_tipos_vacios = null_type_df.sample(min(5, len(null_type_df)))
#     print(muestra_tipos_vacios)
# else:
#     print("No rows with null 'type'.")
# --------------------------------------------------------------------------
# check_type_duration = df_anime[['type', 'duration']]
# check_type_duration.describe()
# --------------------------------------------------------------------------
df_anime['type'].fillna('TV', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_anime['type'].fillna('TV', inplace=True)


### Column: ['episodes']:

Since it has only a 0.71% of nulls, it will be inferred taking into account the types and using the median.


In [23]:
check_ona = df_anime[df_anime['type'].isin(['ONA'])][['episodes', 'type']]
check_ova = df_anime[df_anime['type'].isin(['OVA'])][['episodes', 'type']]
# --------------------------------------------------------------------------
check_ona.describe()
check_ova.describe()
# --------------------------------------------------------------------------
episodes_ova = df_anime[df_anime['type'] == 'OVA']['episodes'].dropna()
if not episodes_ova.empty:
    min_ova = episodes_ova.min()
    max_ova = episodes_ova.max()
    mean_ova = episodes_ova.mean().round(2)
    median_ova = episodes_ova.median()
    mode_ova = episodes_ova.mode().tolist()
    print(f"OVA: min: {min_ova} max: {max_ova}  mean: {mean_ova} median: {median_ova} mode: {mode_ova}")

print("___"*18)

episodes_ona = df_anime[df_anime['type'] == 'ONA']['episodes'].dropna()
if not episodes_ova.empty:
    min_ona = episodes_ova.min()
    max_ona = episodes_ova.max()
    mean_ona = episodes_ona.mean().round(2)
    median_ona = episodes_ona.median()
    mode_ona = episodes_ona.mode().tolist()
    print(f"ONA: min: {min_ona} max: {max_ona}  mean: {mean_ona} median: {median_ona} mode: {mode_ona}")
# --------------------------------------------------------------------------
# Using median to fill the nulls

df_anime['episodes'] = df_anime.groupby('type')['episodes'].transform(lambda x: x.fillna(x.median()))

OVA: min: 1.0 max: 110.0  mean: 2.61 median: 2.0 mode: [1.0]
______________________________________________________
ONA: min: 1.0 max: 110.0  mean: 15.17 median: 10.0 mode: [1.0]


### Column ['']

In [24]:
df_anime[['themes', 'synopsis', 'producers', 'studios', 'source']].sample(5)

Unnamed: 0,themes,synopsis,producers,studios,source
3902,"Music, Urban Fantasy",The United States of America has been in chaos...,"DeNA, Bandai Namco Arts","Madhouse, MAPPA",Mixed media
2874,Music,Music video for the song Obenkyou Shitoite yo ...,,,Original
4011,"Idols (Female), Music","Movie sequel of Wake Up, Girls!, announced at ...","TV Tokyo, Tatsunoko Production, AT-X, TOHO, Ul...","Ordet, Millepensee",Original
11873,,Hello Kitty has a fight with her mom. And she ...,Sanrio,,Unknown
1948,"Anthropomorphic, Educational, Medical","Due to poor lifestyle choices, a certain human...","Aniplex, Kodansha, Studio Mausu, Tokyo MX, Net...",LIDENFILMS,Manga


### Column: ['english_name'] 

- First try was to use the library googletrans to translate the column from japaneses_names to english_name. Problem was with package httpx since it was not compatible with the specific version needed to different important libraries.
- Also, due to the huge complexity of the language and the subtlety of the of titles adaptation this type of "hard translation" might not be the best or adecuate.
- By checking the column [name] it has a close similarity to the column [english_name], since [name] has 0% of nulls, it will be used to imferred the [english_name] nulls.


In [25]:
df_anime['english_name'] = df_anime['english_name'].fillna(df_anime['name'])

### Column ['demographics']
There are only 6 unique values which correspond to: 

- array(['Shounen', nan, 'Seinen', 'Shoujo', 'Josei', 'Kids, Shounen','Kids', 'Kids, Shoujo'], dtype=object)
- Young boys, Adult men, Young girls, Adult women, For children

Since there's a 70.60% of nulls, the column will be droped.

In [26]:
df_anime['demographics'].unique()
df_anime.drop(columns=['demographics'], inplace=True)

In [27]:
df_anime.sample(5)

Unnamed: 0,anime_id,anime_url,image_url,name,english_name,japanese_names,score,genres,themes,synopsis,type,episodes,premiered,producers,studios,source,duration,rating,rank,popularity,favorites,scored_by,members,premiered_season,premiered_year
7844,19825,https://myanimelist.net/anime/19825/Saikyou_Gi...,https://cdn.myanimelist.net/images/anime/1462/...,Saikyou Ginga Ultimate Zero: Battle Spirits,Saikyou Ginga Ultimate Zero: Battle Spirits,最強銀河 究極[アルティメット]ゼロ~バトルスピリッツ~,6.53,Adventure,"Space, Strategy Game","In the new, whimsical era of Battle Spirits, c...",TV,49.0,fall 2013,"Asatsu DK, Nagoya Broadcasting Network",Sunrise,Card game,23 min per ep,PG - Children,6918.0,11147,4,938,2502,fall,2013.0
7215,8062,https://myanimelist.net/anime/8062/Mai-HiME__K...,https://cdn.myanimelist.net/images/anime/11/19...,Mai-HiME: Kuro no Mai/Saigo no Bansan,Mai-HiME: Kuro no Mai/Saigo no Bansan,舞-HiME: 黒の舞／最後の晩餐,6.62,Comedy,,Mai invites Reito to go with her to the beach ...,Special,1.0,,,Sunrise,Original,4 min,PG-13 - Teens 13 or older,6368.0,8096,3,2613,6381,unknown,
8756,52755,https://myanimelist.net/anime/52755/Alita_De_S...,https://cdn.myanimelist.net/images/anime/1640/...,Alita De Shui Gian Gushi,A Bedtime Tale,阿莉塔的睡前故事,6.42,"Action, Fantasy",,The soul which looks up to the giant dragon or...,ONA,1.0,,,Studio Tumble,Original,18 min,PG-13 - Teens 13 or older,7564.0,18290,1,128,399,unknown,
13758,36166,https://myanimelist.net/anime/36166/Fireball_H...,https://cdn.myanimelist.net/images/anime/11/87...,Fireball Humorous,Fireball Humorous,ファイアボール ユーモラス,5.76,"Comedy, Sci-Fi",,,TV,3.0,fall 2017,Disney Platform Distribution,Jinnis Animation Studios,Original,2 min per ep,G - All Ages,11116.0,9805,3,1341,3778,fall,2017.0
10885,21293,https://myanimelist.net/anime/21293/Sore_Ike_A...,https://cdn.myanimelist.net/images/anime/9/562...,Sore Ike! Anpanman: Shabondama no Purun,"Anpanman: Purun, the Soap Bubble",それいけ! アンパンマン シャボン玉のプルン,6.16,"Comedy, Fantasy",,Anpanman's friends go see a bubble show perfor...,Movie,1.0,,"VAP, Tokyo Movie Shinsha",,Unknown,51 min,PG - Children,9040.0,15471,1,205,705,unknown,


## Saving the dataframe

In [28]:
clean_anime_csv_path = os.path.join(data_dir, "clean_anime.csv")
df_anime.to_csv(clean_anime_csv_path, index=False)


if os.path.exists(clean_anime_csv_path):
    print("File succesfully saved.")
    print(f"File size: {os.path.getsize(clean_anime_csv_path) / 1024:.2f} KB")
else:
    print("⚠️ Error while saving file.")

File succesfully saved.
File size: 12123.37 KB
