# Import libraries and Setting displays

In [None]:
# import pandas as pd
# from googletrans import Translator

In [1]:
import pandas as pd
import numpy as np
import os
import re

import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
pd.set_option('display.max_rows', 100) 
pd.set_option('display.max_columns', None) 

# pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.max_rows', None)     # Show all rows
# pd.set_option('display.max_colwidth', None) # Show entire content of each column
pd.set_option('display.width', None)        # No limit on display width
pd.set_option('display.expand_frame_repr', False)  # Prevent wrapping

# Opening file and creating dataframe

In [3]:
# Setting paths
current_dir = os.getcwd()  # Use os.getcwd() to get the current working directory
parent_dir = os.path.dirname(current_dir)
data_dir = os.path.join(parent_dir, "data")
file_path = os.path.join(data_dir, "")

top_anime_dataset_v2_csv_path = os.path.join(data_dir, "top_anime_dataset_v2.csv")
#----------------------------------------------------------------------------------------------------
# Creating dataframes
df_anime = pd.read_csv(top_anime_dataset_v2_csv_path)

df_anime.sample(5)

Unnamed: 0,anime_id,anime_url,image_url,name,english_name,japanese_names,score,genres,themes,demographics,synopsis,type,episodes,premiered,producers,studios,source,duration,rating,rank,popularity,favorites,scored_by,members
8589,38672,https://myanimelist.net/anime/38672/Precure_Mi...,https://cdn.myanimelist.net/images/anime/1960/...,Precure Miracle Universe Movie,,映画プリキュアミラクルユニバース,6.44,Action,Mahou Shoujo,,So many mysteries☆ The grand adventure to the ...,Movie,1.0,,,Toei Animation,Original,1 hr 11 min,G - All Ages,7488.0,9928,3,1443,3629
7994,44259,https://myanimelist.net/anime/44259/Next_Color...,https://cdn.myanimelist.net/images/anime/1533/...,Next Color Planet,Next Color Planet,NEXT COLOR PLANET,6.51,,Music,,"To celebrate Suisei's 2nd Anniversary, her thi...",Music,1.0,,,,Original,4 min,G - All Ages,,15467,5,449,707
8877,925,https://myanimelist.net/anime/925/Transformers...,https://cdn.myanimelist.net/images/anime/1535/...,Transformers: The☆Headmasters,Transformers The Headmasters,トランスフォーマー ザ☆ヘッドマスターズ,6.41,"Action, Adventure, Sci-Fi","Mecha, Space",,Headmasters starts with Galvatron leading a ne...,TV,35.0,summer 1987,"Takara, Nippon Television Network",Toei Animation,Other,23 min per ep,PG-13 - Teens 13 or older,7628.0,9106,8,1966,4718
3344,44408,https://myanimelist.net/anime/44408/Long_Zu,https://cdn.myanimelist.net/images/anime/1854/...,Long Zu,Dragon Raja -The Blazing Dawn-,龙族 / 龍族 -The Blazing Dawn-,7.24,"Adventure, Fantasy",,,Lu Mingfei is a typical high school kid gettin...,ONA,16.0,,Tencent Penguin Pictures,Garden Culture,Novel,22 min per ep,PG-13 - Teens 13 or older,3133.0,3583,228,12241,43642
1039,33489,https://myanimelist.net/anime/33489/Little_Wit...,https://cdn.myanimelist.net/images/anime/1520/...,Little Witch Academia (TV),Little Witch Academia,リトルウィッチアカデミア,7.81,"Adventure, Comedy, Fantasy",School,,"""A believing heart is your magic!""—these were ...",TV,25.0,winter 2017,"Ultra Super Pictures, TOHO animation, Good Smi...",Trigger,Original,24 min per ep,G - All Ages,1005.0,336,7865,329504,669740


# Seeking for nulls

In [4]:
df_anime.columns.to_list()

['anime_id',
 'anime_url',
 'image_url',
 'name',
 'english_name',
 'japanese_names',
 'score',
 'genres',
 'themes',
 'demographics',
 'synopsis',
 'type',
 'episodes',
 'premiered',
 'producers',
 'studios',
 'source',
 'duration',
 'rating',
 'rank',
 'popularity',
 'favorites',
 'scored_by',
 'members']

In [5]:
def check_anime_nulls(df_anime):
   
    nulls = df_anime.isnull().sum()
    null_percentage = (nulls / len(df_anime) * 100).round(2)
    null_report = pd.DataFrame({'Null Count': nulls, 'Null %': null_percentage})
    print(f"Qty of rows: ({len(df_anime)})")
    print(null_report)


check_anime_nulls(df_anime)

Qty of rows: (15000)
                Null Count  Null %
anime_id                 0    0.00
anime_url                0    0.00
image_url                0    0.00
name                     0    0.00
english_name          6642   44.28
japanese_names          46    0.31
score                    0    0.00
genres                1601   10.67
themes                5077   33.85
demographics         10592   70.61
synopsis               468    3.12
type                     1    0.01
episodes               112    0.75
premiered            10316   68.77
producers             5429   36.19
studios               2379   15.86
source                   0    0.00
duration                 0    0.00
rating                  68    0.45
rank                  3080   20.53
popularity               0    0.00
favorites                0    0.00
scored_by                0    0.00
members                  0    0.00


# Looking for Duplicate values

In [6]:
def check_duplicates(df, subset=None):
    dups = df[df.duplicated(subset=subset, keep=False)]
    print(f"Duplicates ({len(dups)}):\n{dups}" if not dups.empty else "No duplicates found.")

check_duplicates(df_anime) 

Duplicates (68):
       anime_id                                          anime_url                                          image_url                                               name                                       english_name                         japanese_names  score                                             genres                               themes demographics                                           synopsis        type  episodes    premiered                                          producers                             studios       source       duration                          rating     rank  popularity  favorites  scored_by  members
30        59571  https://myanimelist.net/anime/59571/Shingeki_n...  https://cdn.myanimelist.net/images/anime/1379/...  Shingeki no Kyojin Movie: Kanketsu-hen - The L...                   Attack on Titan: The Last Attack          劇場版 進撃の巨人 完結編 THE LAST ATTACK   8.81                            Action, Drama, Suspense             Go

In [7]:
# Very quick check for more accuracy
df_anime['concat'] = df_anime['anime_id'].astype(str) + '_' + df_anime['name'].astype(str) + '_' + df_anime['popularity'].astype(str)

len(df_anime) # 15000
df_anime['concat'].nunique() # 14966

df_anime.drop('concat', axis=1, inplace=True)

## Droping duplicates 

In [8]:
def drop_duplicates(df, subset=None, inplace=False):
    df.drop_duplicates(subset=subset, keep='first', inplace=inplace)
    if not inplace:
        return df

drop_duplicates(df_anime, inplace=True)
len(df_anime)

14966

# Checking data to impute/inferred

In [27]:
# check_anime_nulls(df_anime)
df_anime.sample(7)

Unnamed: 0,anime_id,anime_url,image_url,name,english_name,japanese_names,score,genres,themes,synopsis,type,episodes,producers,studios,source,duration,rating,rank,popularity,favorites,scored_by,members,premiered_season,premiered_year
10599,50088,https://myanimelist.net/anime/50088/No2,https://cdn.myanimelist.net/images/anime/1188/...,No.2,No.2,ナンバーツー,6.2,,Music,Official music video for the song No.2 by Takuto.,Music,1.0,,,Original,4 min,R - 17+ (violence & profanity),,21117,0,116,223,unknown,unknown
13698,1385,https://myanimelist.net/anime/1385/Kurohime__S...,https://cdn.myanimelist.net/images/anime/1025/...,Kurohime: Shikkoku no Yakata,Black Widow,黒姫 -桎梏の館-,5.77,Hentai,,A camping trip takes a turn for the wild when ...,OVA,2.0,Discovery,Mook Animation,Visual novel,28 min per ep,Rx - Hentai,,9228,10,1750,4538,unknown,unknown
11104,29235,https://myanimelist.net/anime/29235/Ninja_Hatt...,https://cdn.myanimelist.net/images/anime/3/704...,Ninja Hattori-kun Plus Perman: Chounouryoku Wars,,忍者ハットリくん＋パーマン 超能力ウォーズ,6.13,Adventure,"Martial Arts, Super Power",Crossover film featuring both of Motoo Abiko's...,Movie,1.0,,Shin-Ei Animation,Unknown,52 min,PG - Children,9208.0,14327,0,382,984,unknown,unknown
13743,1644,https://myanimelist.net/anime/1644/Princess_Rouge,https://cdn.myanimelist.net/images/anime/1405/...,Princess Rouge,,プリンセス・ルージュ,5.77,"Fantasy, Romance",,"Yusuke, an orphan, literally has Rouge, the am...",OVA,2.0,"AC Create, Cosmic Ray, BEAM Entertainment, Nic...","J.C.Staff, Front Line",Original,28 min per ep,PG-13 - Teens 13 or older,11088.0,10702,4,1170,2867,unknown,unknown
7099,30812,https://myanimelist.net/anime/30812/Gyakuten_M...,https://cdn.myanimelist.net/images/anime/1019/...,Gyakuten Majo Saiban: Chijo na Majo ni Sabakar...,,逆転魔女裁判 ～痴女な魔女に裁かれちゃう～ THE ANIMATION,6.63,Hentai,,Based on the erotic game by Erectlip.,OVA,1.0,Pink Pineapple,Seven,Visual novel,25 min,Rx - Hentai,,6229,68,5323,12124,unknown,unknown
3357,1313,https://myanimelist.net/anime/1313/Digimon_Adv...,https://cdn.myanimelist.net/images/anime/8/203...,Digimon Adventure 02,Digimon Adventure 02,デジモンアドベンチャー０２,7.24,"Action, Adventure, Comedy, Fantasy",Isekai,Taichi Yagami and his friends have moved on to...,TV,50.0,"Yomiko Advertising, Fuji TV",Toei Animation,Original,25 min per ep,PG - Children,3168.0,1143,778,146320,227373,spring,2000
3780,12711,https://myanimelist.net/anime/12711/Uta_no☆Pri...,https://cdn.myanimelist.net/images/anime/12/44...,Uta no☆Prince-sama♪ Maji Love 2000%,Uta no Prince Sama 2,うたの☆プリンスさまっ♪ マジLOVE2000%,7.17,"Comedy, Romance","Idols (Male), Music, Reverse Harem, School","Entering her Master's course, Nanami Haruka is...",TV,13.0,"Showgate, King Records",A-1 Pictures,Visual novel,25 min per ep,PG-13 - Teens 13 or older,3535.0,2018,541,57147,115096,spring,2013


### Column: [''premiered']:
Separate it in 2 column for better readness, one with year and the other one with season

In [10]:
df_anime['premiered'].sample(5)

# Extract the season only with regex and make a new column for it
df_anime['premiered_season'] = df_anime['premiered'].str.extract(r'(?i)(spring|summer|fall|winter)')

# Extract the year only and make a new column for it.
df_anime['premiered_year'] = df_anime['premiered'].str.extract(r'(\d{4})')

# Fill missing values with 'unknown'
df_anime['premiered_season'].fillna('unknown', inplace=True)
df_anime['premiered_year'].fillna('unknown', inplace=True)

# Drop column premiered
df_anime.drop(columns=['premiered'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_anime['premiered_season'].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_anime['premiered_year'].fillna('unknown', inplace=True)


### Column: ['type']:

By checking, it has sense because of the duration that the Nanit has be 'TV'.

In [21]:
# df_anime['type'].unique()
# # --------------------------------------------------------------------------
# df_anime['type'].isnull().sum()
# df_anime[df_anime['type'].isin(['nan'])][['type', 'duration']].sample(5)
# # --------------------------------------------------------------------------
# null_type_df = df_anime[df_anime['type'].isnull()][['type', 'duration']]

# if len(null_type_df) > 0:
#     muestra_tipos_vacios = null_type_df.sample(min(5, len(null_type_df)))
#     print(muestra_tipos_vacios)
# else:
#     print("No rows with null 'type'.")
# --------------------------------------------------------------------------
# check_type_duration = df_anime[['type', 'duration']]
# check_type_duration.describe()
# --------------------------------------------------------------------------
df_anime['type'].fillna('TV', inplace=True)

### Column: ['episodes']:

Since it has only a 0.71% of nulls, it will be inferred taking into account the types and using the median.


In [22]:
check_ona = df_anime[df_anime['type'].isin(['ONA'])][['episodes', 'type']]
check_ova = df_anime[df_anime['type'].isin(['OVA'])][['episodes', 'type']]
# --------------------------------------------------------------------------
check_ona.describe()
check_ova.describe()
# --------------------------------------------------------------------------
episodes_ova = df_anime[df_anime['type'] == 'OVA']['episodes'].dropna()
if not episodes_ova.empty:
    min_ova = episodes_ova.min()
    max_ova = episodes_ova.max()
    mean_ova = episodes_ova.mean().round(2)
    median_ova = episodes_ova.median()
    mode_ova = episodes_ova.mode().tolist()
    print(f"OVA: min: {min_ova} max: {max_ova}  mean: {mean_ova} median: {median_ova} mode: {mode_ova}")

print("___"*18)

episodes_ona = df_anime[df_anime['type'] == 'ONA']['episodes'].dropna()
if not episodes_ova.empty:
    min_ona = episodes_ova.min()
    max_ona = episodes_ova.max()
    mean_ona = episodes_ona.mean().round(2)
    median_ona = episodes_ona.median()
    mode_ona = episodes_ona.mode().tolist()
    print(f"ONA: min: {min_ona} max: {max_ona}  mean: {mean_ona} median: {median_ona} mode: {mode_ona}")
# --------------------------------------------------------------------------
# Using median to fill the nulls

df_anime['episodes'] = df_anime.groupby('type')['episodes'].transform(lambda x: x.fillna(x.median()))

OVA: min: 1.0 max: 110.0  mean: 2.61 median: 2.0 mode: [1.0]
______________________________________________________
ONA: min: 1.0 max: 110.0  mean: 15.17 median: 10.0 mode: [1.0]


### Column ['']

In [None]:
df_anime[['themes', 'synopsis', 'producers', 'studios', 'source']].sample(5)

### Column: ['english_name'] 

At the moment the librarie googletrans is the most suitable but for the moment cannot be used for version issues. I'm trying to make a new special environmente but I haven't been able yet.

In [None]:
# def translate_japanese_names(df, japanese_col, english_col):
#     translator = Translator()
#     for index, row in df.iterrows():
#         if pd.isnull(row[english_col]) or not row[english_col]:
#             japanese_name = row[japanese_col]
#             if pd.notnull(japanese_name) and japanese_name:
#                 try:
#                     translation = translator.translate(japanese_name, dest='en')
#                     df.at[index, english_col] = translation.text
#                 except Exception as e:
#                     print(f"Translation error for '{japanese_name}': {e}")
#                     df.at[index, english_col] = None

# translate_japanese_names(df_anime, 'japanese_names', 'english_name')

In [None]:
df_anime['english_name'].fillna(df_anime['name'], inplace=True)

### Column ['demographics']
There are only 6 unique values which correspond to: 

- array(['Shounen', nan, 'Seinen', 'Shoujo', 'Josei', 'Kids, Shounen','Kids', 'Kids, Shoujo'], dtype=object)
- Young boys, Adult men, Young girls, Adult women, For children

Since there's a 70.60% of nulls, the column will be droped.

In [24]:
df_anime['demographics'].unique()
df_anime.drop(columns=['demographics'], inplace=True)

## Saving the dataframe

In [29]:
clean_anime_csv_path = os.path.join(data_dir, "clean_anime.csv")
df_anime.to_csv(clean_anime_csv_path, index=False)

if os.path.exists(clean_anime_csv_path):
    print("File succesfully saved.")
    print(f" File size: {os.path.getsize('clean_anime.csv') / 1024:.2f} KB")
else:
    print("⚠️ Error while saving file.")

File succesfully saved.
 File size: 11962.61 KB
