# Import libraries and Setting displays

In [1]:
# import pandas as pd
# from googletrans import Translator

In [None]:
import pandas as pd
import numpy as np
import os
import re

import matplotlib.pyplot as plt 
import seaborn as sns

In [3]:
pd.set_option('display.max_rows', 100) 
pd.set_option('display.max_columns', None) 

# pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.max_rows', None)     # Show all rows
# pd.set_option('display.max_colwidth', None) # Show entire content of each column
pd.set_option('display.width', None)        # No limit on display width
pd.set_option('display.expand_frame_repr', False)  # Prevent wrapping

# Opening file and creating dataframe

In [4]:
# Setting paths
current_dir = os.getcwd()  # Use os.getcwd() to get the current working directory
parent_dir = os.path.dirname(current_dir)
data_dir = os.path.join(parent_dir, "data")
file_path = os.path.join(data_dir, "")

top_anime_dataset_v2_csv_path = os.path.join(data_dir, "top_anime_dataset_v2.csv")
#----------------------------------------------------------------------------------------------------
# Creating dataframes
df_anime = pd.read_csv(top_anime_dataset_v2_csv_path)

df_anime.sample(5)

Unnamed: 0,anime_id,anime_url,image_url,name,english_name,japanese_names,score,genres,themes,demographics,synopsis,type,episodes,premiered,producers,studios,source,duration,rating,rank,popularity,favorites,scored_by,members
14175,6721,https://myanimelist.net/anime/6721/Ankoku_Cat,https://cdn.myanimelist.net/images/anime/11/86...,Ankoku Cat,,暗黒キャット,5.7,"Action, Comedy",Anthropomorphic,,"Dark Side Cat, a mysterious stray cat, dashes ...",ONA,6.0,,,Fanworks,Original,6 min per ep,G - All Ages,11365.0,10940,2,1207,2662
12628,58098,https://myanimelist.net/anime/58098/1_Year,https://cdn.myanimelist.net/images/anime/1156/...,1 Year,,1year,5.92,,Music,,Music video for 1 Year by PinocchioP feat. Hat...,Music,1.0,,,,Original,4 min,PG-13 - Teens 13 or older,,21146,0,115,220
883,36028,https://myanimelist.net/anime/36028/Golden_Kamuy,https://cdn.myanimelist.net/images/anime/1145/...,Golden Kamuy,Golden Kamuy,ゴールデンカムイ,7.88,"Action, Adventure","Adult Cast, Historical, Military",Seinen,In early 1900s Hokkaido after the Russo-Japane...,TV,12.0,spring 2018,"Magic Capsule, NBCUniversal Entertainment Japa...",Geno Studio,Manga,23 min per ep,R - 17+ (violence & profanity),865.0,585,4726,173631,419280
12361,23645,https://myanimelist.net/anime/23645/Anime_Art_...,https://cdn.myanimelist.net/images/anime/1491/...,Anime Art Video Collection: Douwa,,アニメ・アート・ビデオ・コレクション 童話,5.96,Fantasy,,Kids,A series of animated shorts based on fairy tales.,OVA,6.0,,HoriPro,"Studio 4°C, Nippon Animation",Other,13 min per ep,PG - Children,10118.0,12721,0,650,1533
13179,2095,https://myanimelist.net/anime/2095/Salamander,https://cdn.myanimelist.net/images/anime/9/271...,Salamander,,沙羅曼蛇,5.85,"Adventure, Sci-Fi",,,Three tales based on the Gradius video game se...,OVA,3.0,,Pony Canyon,Pierrot,Game,53 min per ep,R+ - Mild Nudity,10694.0,11547,1,904,2227


# Seeking for nulls

In [5]:
df_anime.columns.to_list()

['anime_id',
 'anime_url',
 'image_url',
 'name',
 'english_name',
 'japanese_names',
 'score',
 'genres',
 'themes',
 'demographics',
 'synopsis',
 'type',
 'episodes',
 'premiered',
 'producers',
 'studios',
 'source',
 'duration',
 'rating',
 'rank',
 'popularity',
 'favorites',
 'scored_by',
 'members']

In [6]:
def check_anime_nulls(df_anime):
   
    nulls = df_anime.isnull().sum()
    null_percentage = (nulls / len(df_anime) * 100).round(2)
    null_report = pd.DataFrame({'Null Count': nulls, 'Null %': null_percentage})
    print(f"Qty of rows: ({len(df_anime)})")
    print(null_report)


check_anime_nulls(df_anime)

Qty of rows: (15000)
                Null Count  Null %
anime_id                 0    0.00
anime_url                0    0.00
image_url                0    0.00
name                     0    0.00
english_name          6642   44.28
japanese_names          46    0.31
score                    0    0.00
genres                1601   10.67
themes                5077   33.85
demographics         10592   70.61
synopsis               468    3.12
type                     1    0.01
episodes               112    0.75
premiered            10316   68.77
producers             5429   36.19
studios               2379   15.86
source                   0    0.00
duration                 0    0.00
rating                  68    0.45
rank                  3080   20.53
popularity               0    0.00
favorites                0    0.00
scored_by                0    0.00
members                  0    0.00


# Looking for Duplicate values

In [7]:
def check_duplicates(df, subset=None):
    dups = df[df.duplicated(subset=subset, keep=False)]
    print(f"Duplicates ({len(dups)}):\n{dups}" if not dups.empty else "No duplicates found.")

check_duplicates(df_anime) 

Duplicates (68):
       anime_id                                          anime_url                                          image_url                                               name                                       english_name                         japanese_names  score                                             genres                               themes demographics                                           synopsis        type  episodes    premiered                                          producers                             studios       source       duration                          rating     rank  popularity  favorites  scored_by  members
30        59571  https://myanimelist.net/anime/59571/Shingeki_n...  https://cdn.myanimelist.net/images/anime/1379/...  Shingeki no Kyojin Movie: Kanketsu-hen - The L...                   Attack on Titan: The Last Attack          劇場版 進撃の巨人 完結編 THE LAST ATTACK   8.81                            Action, Drama, Suspense             Go

In [8]:
# Very quick check for more accuracy
df_anime['concat'] = df_anime['anime_id'].astype(str) + '_' + df_anime['name'].astype(str) + '_' + df_anime['popularity'].astype(str)

len(df_anime) # 15000
df_anime['concat'].nunique() # 14966

df_anime.drop('concat', axis=1, inplace=True)

## Droping duplicates 

In [9]:
def drop_duplicates(df, subset=None, inplace=False):
    df.drop_duplicates(subset=subset, keep='first', inplace=inplace)
    if not inplace:
        return df

drop_duplicates(df_anime, inplace=True)
len(df_anime)

14966

# Checking data to impute/inferred

In [19]:
# check_anime_nulls(df_anime)
df_anime.sample(7)

Unnamed: 0,anime_id,anime_url,image_url,name,english_name,japanese_names,score,genres,themes,synopsis,type,episodes,producers,studios,source,duration,rating,rank,popularity,favorites,scored_by,members,premiered_season,premiered_year
1709,45558,https://myanimelist.net/anime/45558/Fanren_Xiu...,https://cdn.myanimelist.net/images/anime/1323/...,Fanren Xiu Xian Chuan: Yan Jia Bao Dazhan,Fanren Xiu Xian Chuan: Yan Jia Bao Dazhan,凡人修仙传 特别篇 燕家堡大战,7.58,"Action, Adventure, Fantasy","Historical, Martial Arts",Episodes 18-21 of Fanren Xiu Xian Chuan.,ONA,4.0,bilibili,"Original Force, Wonder Cat Animation",Novel,18 min per ep,PG-13 - Teens 13 or older,1654.0,10256,9,1582,3256,unknown,unknown
5503,55986,https://myanimelist.net/anime/55986/Na_mo_Naki...,https://cdn.myanimelist.net/images/anime/1151/...,Na mo Naki Nanimo Kamo,Nameless Name,名もなき何もかも,6.86,,Music,Music video for the song Na mo Naki Nanimo Kam...,Music,1.0,Toei Animation,,Original,4 min,PG-13 - Teens 13 or older,,14745,1,503,878,unknown,unknown
6437,6609,https://myanimelist.net/anime/6609/Pucca_TV,https://cdn.myanimelist.net/images/anime/3/151...,Pucca (TV),Pucca (TV),짜장소녀 뿌까,6.72,"Comedy, Romance",Parody,Pucca is the young daughter of a Chinese noodl...,TV,26.0,VOOZ Character System,,Other,23 min per ep,G - All Ages,5814.0,5264,40,9856,18792,fall,2006
1728,53585,https://myanimelist.net/anime/53585/Modern_Lov...,https://cdn.myanimelist.net/images/anime/1054/...,Modern Love Tokyo: Kare ga Kanaderu Futari no ...,Modern Love Tokyo: He's Playing Our Song,モダンラブ・東京 ~彼が奏でるふたりの調べ~,7.57,Romance,,Though her high school days are long behind he...,ONA,1.0,Robot Communications,The Answer Studio,Other,30 min,PG-13 - Teens 13 or older,1681.0,9634,14,1220,3970,unknown,unknown
4220,4763,https://myanimelist.net/anime/4763/Tsuma_Shibori,https://cdn.myanimelist.net/images/anime/3/915...,Tsuma Shibori,Beautiful Sisters,妻しぼり,7.09,Hentai,,,OVA,2.0,MS Pictures,T-Rex,Visual novel,29 min per ep,Rx - Hentai,,6746,49,4006,10073,unknown,unknown
727,54915,https://myanimelist.net/anime/54915/5-toubun_n...,https://cdn.myanimelist.net/images/anime/1567/...,5-toubun no Hanayome∽,The Quintessential Quintuplets~,五等分の花嫁∽,7.96,"Comedy, Romance","Harem, School",Featuring stories from the manga not adapted i...,TV Special,2.0,,Shaft,Manga,24 min per ep,PG-13 - Teens 13 or older,720.0,2511,555,29491,82009,unknown,unknown
11936,9244,https://myanimelist.net/anime/9244/Nine__Kanke...,https://cdn.myanimelist.net/images/anime/5/256...,Nine: Kanketsu-hen,Nine: Kanketsu-hen,ナイン 完結編,6.02,"Romance, Sports","School, Team Sports",Last part of the Nine TV movies.,TV Special,1.0,Fuji TV,Group TAC,Manga,1 hr 13 min,PG-13 - Teens 13 or older,9811.0,13775,0,361,1134,unknown,unknown


### Column: [''premiered']:
Separate it in 2 column for better readness, one with year and the other one with season

In [11]:
df_anime['premiered'].sample(5)

# Extract the season only with regex and make a new column for it
df_anime['premiered_season'] = df_anime['premiered'].str.extract(r'(?i)(spring|summer|fall|winter)')

# Extract the year only and make a new column for it.
df_anime['premiered_year'] = df_anime['premiered'].str.extract(r'(\d{4})')

# Fill missing values with 'unknown'
df_anime['premiered_season'].fillna('unknown', inplace=True)
df_anime['premiered_year'].fillna('unknown', inplace=True)

# Drop column premiered
df_anime.drop(columns=['premiered'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_anime['premiered_season'].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_anime['premiered_year'].fillna('unknown', inplace=True)


### Column: ['type']:

By checking, it has sense because of the duration that the Nanit has be 'TV'.

In [12]:
# df_anime['type'].unique()
# # --------------------------------------------------------------------------
# df_anime['type'].isnull().sum()
# df_anime[df_anime['type'].isin(['nan'])][['type', 'duration']].sample(5)
# # --------------------------------------------------------------------------
# null_type_df = df_anime[df_anime['type'].isnull()][['type', 'duration']]

# if len(null_type_df) > 0:
#     muestra_tipos_vacios = null_type_df.sample(min(5, len(null_type_df)))
#     print(muestra_tipos_vacios)
# else:
#     print("No rows with null 'type'.")
# --------------------------------------------------------------------------
# check_type_duration = df_anime[['type', 'duration']]
# check_type_duration.describe()
# --------------------------------------------------------------------------
df_anime['type'].fillna('TV', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_anime['type'].fillna('TV', inplace=True)


### Column: ['episodes']:

Since it has only a 0.71% of nulls, it will be inferred taking into account the types and using the median.


In [13]:
check_ona = df_anime[df_anime['type'].isin(['ONA'])][['episodes', 'type']]
check_ova = df_anime[df_anime['type'].isin(['OVA'])][['episodes', 'type']]
# --------------------------------------------------------------------------
check_ona.describe()
check_ova.describe()
# --------------------------------------------------------------------------
episodes_ova = df_anime[df_anime['type'] == 'OVA']['episodes'].dropna()
if not episodes_ova.empty:
    min_ova = episodes_ova.min()
    max_ova = episodes_ova.max()
    mean_ova = episodes_ova.mean().round(2)
    median_ova = episodes_ova.median()
    mode_ova = episodes_ova.mode().tolist()
    print(f"OVA: min: {min_ova} max: {max_ova}  mean: {mean_ova} median: {median_ova} mode: {mode_ova}")

print("___"*18)

episodes_ona = df_anime[df_anime['type'] == 'ONA']['episodes'].dropna()
if not episodes_ova.empty:
    min_ona = episodes_ova.min()
    max_ona = episodes_ova.max()
    mean_ona = episodes_ona.mean().round(2)
    median_ona = episodes_ona.median()
    mode_ona = episodes_ona.mode().tolist()
    print(f"ONA: min: {min_ona} max: {max_ona}  mean: {mean_ona} median: {median_ona} mode: {mode_ona}")
# --------------------------------------------------------------------------
# Using median to fill the nulls

df_anime['episodes'] = df_anime.groupby('type')['episodes'].transform(lambda x: x.fillna(x.median()))

OVA: min: 1.0 max: 110.0  mean: 2.61 median: 2.0 mode: [1.0]
______________________________________________________
ONA: min: 1.0 max: 110.0  mean: 15.17 median: 10.0 mode: [1.0]


### Column ['']

In [14]:
df_anime[['themes', 'synopsis', 'producers', 'studios', 'source']].sample(5)

Unnamed: 0,themes,synopsis,producers,studios,source
14437,Video Game,"Oragon lived a lonely life all by himself, unt...",XFLAG Pictures,Marza Animation Planet,Game
8997,"Idols (Male), Music",The seven members of BTS face their old selves...,,Studio Pivote,Original
4595,Harem,"""Kusano's first errand"" - A ten minute OVA tha...",Aniplex,Seven Arcs,Manga
12276,,"In ancient times, spiritual holiness that trie...",,Yi Chen Animation,Original
9351,School,Suga Kimio finds himself hiding in the girls l...,,Zexcs,Manga


### Column: ['english_name'] 

At the moment the librarie googletrans is the most suitable but for the moment cannot be used for version issues. I'm trying to make a new special environmente but I haven't been able yet.

In [15]:
# def translate_japanese_names(df, japanese_col, english_col):
#     translator = Translator()
#     for index, row in df.iterrows():
#         if pd.isnull(row[english_col]) or not row[english_col]:
#             japanese_name = row[japanese_col]
#             if pd.notnull(japanese_name) and japanese_name:
#                 try:
#                     translation = translator.translate(japanese_name, dest='en')
#                     df.at[index, english_col] = translation.text
#                 except Exception as e:
#                     print(f"Translation error for '{japanese_name}': {e}")
#                     df.at[index, english_col] = None

# translate_japanese_names(df_anime, 'japanese_names', 'english_name')

In [16]:
df_anime['english_name'].fillna(df_anime['name'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_anime['english_name'].fillna(df_anime['name'], inplace=True)


### Column ['demographics']
There are only 6 unique values which correspond to: 

- array(['Shounen', nan, 'Seinen', 'Shoujo', 'Josei', 'Kids, Shounen','Kids', 'Kids, Shoujo'], dtype=object)
- Young boys, Adult men, Young girls, Adult women, For children

Since there's a 70.60% of nulls, the column will be droped.

In [17]:
df_anime['demographics'].unique()
df_anime.drop(columns=['demographics'], inplace=True)

## Saving the dataframe

In [18]:
clean_anime_csv_path = os.path.join(data_dir, "clean_anime.csv")
df_anime.to_csv(clean_anime_csv_path, index=False)

if os.path.exists(clean_anime_csv_path):
    print("File succesfully saved.")
    print(f" File size: {os.path.getsize('clean_anime.csv') / 1024:.2f} KB")
else:
    print("⚠️ Error while saving file.")

File succesfully saved.
 File size: 11962.61 KB
