# <span style="color:#ffc509"> Import libraries and Setting displays </span>

In [1]:
import pandas as pd
import numpy as np
import os
import re

import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
pd.set_option('display.max_rows', 100) 
pd.set_option('display.max_columns', None) 

# pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.max_rows', None)     # Show all rows
# pd.set_option('display.max_colwidth', None) # Show entire content of each column
pd.set_option('display.width', None)        # No limit on display width
pd.set_option('display.expand_frame_repr', False)  # Prevent wrapping

# <span style="color:#ffc509">  Opening file and creating dataframe </span>

In [3]:
# Setting paths
current_dir = os.getcwd()  # Use os.getcwd() to get the current working directory
parent_dir = os.path.dirname(current_dir)
data_dir = os.path.join(parent_dir, "data")
file_path = os.path.join(data_dir, "")

top_anime_dataset_v2_csv_path = os.path.join(data_dir, "top_anime_dataset_v2.csv")
#----------------------------------------------------------------------------------------------------
# Creating dataframes
df_anime = pd.read_csv(top_anime_dataset_v2_csv_path)

df_anime.sample(5)

Unnamed: 0,anime_id,anime_url,image_url,name,english_name,japanese_names,score,genres,themes,demographics,synopsis,type,episodes,premiered,producers,studios,source,duration,rating,rank,popularity,favorites,scored_by,members
11690,18759,https://myanimelist.net/anime/18759/Captain_Movie,https://cdn.myanimelist.net/images/anime/6/728...,Captain Movie,,キャプテン 劇場版,6.06,Sports,Team Sports,Shounen,The movie focuses on a game between Sumiya 2 a...,Movie,1.0,,Nippon Television Network,Eiken,Manga,1 hr 38 min,G - All Ages,9611.0,15986,0,124,626
14514,40713,https://myanimelist.net/anime/40713/Memories_M...,https://cdn.myanimelist.net/images/anime/1021/...,Memories (Music),,Memories,5.65,,Music,,MMD music video for the song Memories by Smile...,Music,1.0,,,,Original,2 min,G - All Ages,,17109,0,269,492
10687,10202,https://myanimelist.net/anime/10202/Kamen_Ride...,https://cdn.myanimelist.net/images/anime/1248/...,Kamen Rider Den-O: Imagin Anime 3,,仮面ライダー電王: イマジンあにめ 3,6.19,"Action, Adventure, Comedy",Super Power,Kids,A third set of animated shorts featuring the p...,OVA,12.0,,animate Film,,Other,4 min per ep,G - All Ages,8885.0,14378,1,340,965
5665,21653,https://myanimelist.net/anime/21653/IS__Infini...,https://cdn.myanimelist.net/images/anime/9/675...,IS: Infinite Stratos 2 - World Purge-hen,,IS〈インフィニット・ストラトス〉2　ワールド・パージ編,6.84,"Action, Comedy, Romance, Sci-Fi, Ecchi","Harem, School",,While Ichika Orimura departs to visit Kuramoch...,OVA,1.0,,Overlap,8bit,Light novel,44 min,R+ - Mild Nudity,5134.0,2463,61,41616,84735
1766,7762,https://myanimelist.net/anime/7762/Yondemasu_y...,https://cdn.myanimelist.net/images/anime/2/246...,"Yondemasu yo, Azazel-san.",,よんでますよ、アザゼルさん。,7.56,"Comedy, Supernatural","Gag Humor, Mythology",Seinen,"The great detective Akutabe has an assistant, ...",OVA,4.0,,Fonishia,Production I.G,Manga,21 min per ep,R+ - Mild Nudity,1732.0,4107,69,13533,33345


# <span style="color:#c9083f"> Seeking for nulls </span>

In [4]:
df_anime.columns.to_list()

['anime_id',
 'anime_url',
 'image_url',
 'name',
 'english_name',
 'japanese_names',
 'score',
 'genres',
 'themes',
 'demographics',
 'synopsis',
 'type',
 'episodes',
 'premiered',
 'producers',
 'studios',
 'source',
 'duration',
 'rating',
 'rank',
 'popularity',
 'favorites',
 'scored_by',
 'members']

In [5]:
def check_anime_nulls(df_anime):
   
    nulls = df_anime.isnull().sum()
    null_percentage = (nulls / len(df_anime) * 100).round(2)
    null_report = pd.DataFrame({'Null Count': nulls, 'Null %': null_percentage})
    print(f"Qty of rows: ({len(df_anime)})")
    print(null_report)


check_anime_nulls(df_anime)

Qty of rows: (15000)
                Null Count  Null %
anime_id                 0    0.00
anime_url                0    0.00
image_url                0    0.00
name                     0    0.00
english_name          6642   44.28
japanese_names          46    0.31
score                    0    0.00
genres                1601   10.67
themes                5077   33.85
demographics         10592   70.61
synopsis               468    3.12
type                     1    0.01
episodes               112    0.75
premiered            10316   68.77
producers             5429   36.19
studios               2379   15.86
source                   0    0.00
duration                 0    0.00
rating                  68    0.45
rank                  3080   20.53
popularity               0    0.00
favorites                0    0.00
scored_by                0    0.00
members                  0    0.00


# <span style="color:#c9083f">  Looking for Duplicate values </span>

In [6]:
def check_duplicates(df, subset=None):
    dups = df[df.duplicated(subset=subset, keep=False)]
    pd.set_option('display.max_rows', 5) 
    print(f"Duplicates ({len(dups)}):\n{dups}" if not dups.empty else "No duplicates found.")

check_duplicates(df_anime) 

Duplicates (68):
       anime_id                                          anime_url                                          image_url                                               name                      english_name                  japanese_names  score                   genres                    themes demographics                                           synopsis   type  episodes premiered    producers studios source     duration                          rating  rank  popularity  favorites  scored_by  members
30        59571  https://myanimelist.net/anime/59571/Shingeki_n...  https://cdn.myanimelist.net/images/anime/1379/...  Shingeki no Kyojin Movie: Kanketsu-hen - The L...  Attack on Titan: The Last Attack   劇場版 進撃の巨人 完結編 THE LAST ATTACK   8.81  Action, Drama, Suspense  Gore, Military, Survival      Shounen  A compilation movie for Shingeki no Kyojin: Th...  Movie       1.0       NaN  Pony Canyon   MAPPA  Manga  2 hr 25 min  R - 17+ (violence & profanity)  32.0        4169   

In [7]:
# Very quick check for more accuracy
df_anime['concat'] = df_anime['anime_id'].astype(str) + '_' + df_anime['name'].astype(str) + '_' + df_anime['popularity'].astype(str)

len(df_anime)                # there are 15000
df_anime['concat'].nunique() # there are 14966

df_anime.drop('concat', axis=1, inplace=True)

## <span style="color:#951445">  Droping duplicates  </span>

In [8]:
def drop_duplicates(df, subset=None, inplace=False):
    df.drop_duplicates(subset=subset, keep='first', inplace=inplace)
    if not inplace:
        return df

drop_duplicates(df_anime, inplace=True)
len(df_anime)

14966

# <span style="color:#c9083f">  Checking data to impute/inferred

### Column: <span style="color:#ff6a26"> ['premiered'] </span>
Separate it in 2 column for better readness, one with year and the other one with season

In [9]:
df_anime['premiered'].sample(5)

# Extract the season only with regex and make a new column for it
df_anime['premiered_season'] = df_anime['premiered'].str.extract(r'(?i)(spring|summer|fall|winter)')

# Extract the year only and make a new column for it.
df_anime['premiered_year'] = df_anime['premiered'].str.extract(r'(\d{4})')

# Fill missing values with 'unknown'
df_anime['premiered_season'].fillna('unknown', inplace=True)
df_anime['premiered_year'].fillna('', inplace=True)

# Droping the column premiered
df_anime.drop(columns=['premiered'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_anime['premiered_season'].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_anime['premiered_year'].fillna('', inplace=True)


### Column: <span style="color:#ff6a26"> ['type'] </span>

By checking, it has sense because of the duration that the Nanit has be 'TV'.

In [10]:
# df_anime['type'].unique()
# # --------------------------------------------------------------------------
# df_anime['type'].isnull().sum()
# df_anime[df_anime['type'].isin(['nan'])][['type', 'duration']].sample(5)
# # --------------------------------------------------------------------------
# null_type_df = df_anime[df_anime['type'].isnull()][['type', 'duration']]

# if len(null_type_df) > 0:
#     muestra_tipos_vacios = null_type_df.sample(min(5, len(null_type_df)))
#     print(muestra_tipos_vacios)
# else:
#     print("No rows with null 'type'.")
# --------------------------------------------------------------------------
# check_type_duration = df_anime[['type', 'duration']]
# check_type_duration.describe()
# --------------------------------------------------------------------------
df_anime['type'].fillna('TV', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_anime['type'].fillna('TV', inplace=True)


### Column: <span style="color:#ff6a26"> ['episodes'] </span>

Since it has only a 0.71% of nulls, it will be inferred taking into account the types and using the median.


In [11]:
check_ona = df_anime[df_anime['type'].isin(['ONA'])][['episodes', 'type']]
check_ova = df_anime[df_anime['type'].isin(['OVA'])][['episodes', 'type']]
# --------------------------------------------------------------------------
check_ona.describe()
check_ova.describe()
# --------------------------------------------------------------------------
episodes_ova = df_anime[df_anime['type'] == 'OVA']['episodes'].dropna()
if not episodes_ova.empty:
    min_ova = episodes_ova.min()
    max_ova = episodes_ova.max()
    mean_ova = episodes_ova.mean().round(2)
    median_ova = episodes_ova.median()
    mode_ova = episodes_ova.mode().tolist()
    print(f"OVA: min: {min_ova} max: {max_ova}  mean: {mean_ova} median: {median_ova} mode: {mode_ova}")

print("___"*18)

episodes_ona = df_anime[df_anime['type'] == 'ONA']['episodes'].dropna()
if not episodes_ova.empty:
    min_ona = episodes_ova.min()
    max_ona = episodes_ova.max()
    mean_ona = episodes_ona.mean().round(2)
    median_ona = episodes_ona.median()
    mode_ona = episodes_ona.mode().tolist()
    print(f"ONA: min: {min_ona} max: {max_ona}  mean: {mean_ona} median: {median_ona} mode: {mode_ona}")
# --------------------------------------------------------------------------
# Using median to fill the nulls

df_anime['episodes'] = df_anime.groupby('type')['episodes'].transform(lambda x: x.fillna(x.median()))

OVA: min: 1.0 max: 110.0  mean: 2.61 median: 2.0 mode: [1.0]
______________________________________________________
ONA: min: 1.0 max: 110.0  mean: 15.17 median: 10.0 mode: [1.0]


### Column <span style="color:#ff6a26"> [['producers', 'studios', 'synopsis', 'source']] </span>

There are no more columns to be able to inferred the columns, therefore, the nulls will be fill by "unknown"

In [12]:
# Fill missing values with 'unknown'
df_anime['producers'].fillna('unknown', inplace=True)
df_anime['studios'].fillna('unknown', inplace=True)
df_anime['synopsis'].fillna('unknown', inplace=True)
df_anime['synopsis'].fillna('unknown', inplace=True)
df_anime['source'].fillna('unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_anime['producers'].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_anime['studios'].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

### Column: <span style="color:#ff6a26"> ['english_name'] </span>


- First try was to use the library googletrans to translate the column from japaneses_names to english_name. Problem was with package httpx since it was not compatible with the specific version needed to different important libraries.
- Also, due to the huge complexity of the language and the subtlety of the of titles adaptation this type of "hard translation" might not be the best or adecuate.
- By checking the column [anime_url] it can observed that the last part is like the english names, therefore, the last part of the [anime_url] will be extracted to fill in the [english_name] when null and to replace "_" by a space. Besides, [anime_url] has 0% of null.


In [13]:
def extract_and_format_name(url):
    
    if pd.isna(url):
        return None 
    
    parts = url.split('/')
    if len(parts) >= 5: # Checking url structure
        name = parts[-1]  
        formatted_name = name.replace('_', ' ') 
        return formatted_name
    else:
        return None

# Calling function
df_anime['english_name'] = df_anime['english_name'].fillna(df_anime['anime_url'].apply(extract_and_format_name))

### Column: <span style="color:#ff6a26"> ['japanese_names'] </span>

There are 0.30% (count:45) of nulls, therefore, it will be replace for unknown

In [14]:
df_anime['japanese_names'].fillna('unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_anime['japanese_names'].fillna('unknown', inplace=True)


### Column <span style="color:#ff6a26"> ['demographics'] </span>
There are only 6 unique values which correspond to: 

- array(['Shounen', nan, 'Seinen', 'Shoujo', 'Josei', 'Kids, Shounen','Kids', 'Kids, Shoujo'], dtype=object)
- Young boys, Adult men, Young girls, Adult women, For children

Since there's a 70.60% of nulls, the column will be droped.

In [15]:
df_anime['demographics'].unique()
df_anime.drop(columns=['demographics'], inplace=True)

### Column: <span style="color:#ff6a26"> ['duration'] </span>
Duration will be changed for only duration in minutes

Since The duration is a string (example: 29 min per ep, 59 min, 1 hr 37 min ) all the hours will be converted to 60 and in case added to the mins showned and also all the words will be changed so the column becomes a float. Separately, to better reflect the reality of the column, column will be renaimed to duration_in_min_and_secs.

In [16]:
df_anime[df_anime['duration'].str.contains('sec')]['duration'].sample(10)
df_anime['duration'].unique()

array(['24 min per ep', '24 min', '23 min per ep', '1 hr 44 min',
       '26 min per ep', '2 hr 10 min', '25 min per ep', '1 hr 50 min',
       '22 min per ep', '1 hr 12 min per ep', '2 hr 20 min',
       '1 hr 46 min', '2 hr 25 min', '3 min', '1 hr 23 min', '2 hr 4 min',
       '1 hr 36 min', '29 min per ep', '57 min', '1 hr 59 min',
       '2 hr 13 min', '2 hr 2 min', '1 hr 24 min', '1 hr 45 min',
       '30 min per ep', '2 hr 5 min', '1 hr 30 min', '2 hr 41 min',
       '2 hr 35 min', '47 min', '1 hr 4 min', '1 hr 57 min',
       '1 hr 56 min', '1 hr 48 min', '1 hr 21 min', '1 hr 26 min',
       '45 min', '1 hr 28 min', '1 hr 35 min', '2 hr', '1 hr 54 min',
       '1 hr 6 min per ep', '21 min', '1 hr 1 min per ep',
       '28 min per ep', '1 hr 13 min', '1 hr 36 min per ep',
       '16 min per ep', '5 min per ep', '27 min per ep', '1 hr 31 min',
       '9 min', '2 hr 15 min', '1 hr 38 min', '1 hr 55 min',
       '32 min per ep', '21 min per ep', '2 hr 1 min', '1 hr 49 min',
       '

### <span style="color:#951445"> First let's strap the spaces </dpan>

In [17]:
df_anime['duration'] = df_anime['duration'].str.lower().str.strip() # First, let's clean the spaces

### <span style="color:#951445"> Main function </span>

In [18]:
def clean_duration(duration_str):
    
    if pd.isna(duration_str) or duration_str == "Unknown":
        return None

    duration_str = duration_str.strip()

    hours = 0
    minutes = 0
    seconds = 0

    # Extract hours
    hr_match = re.search(r'(\d+)\s*hr', duration_str, flags=re.IGNORECASE)
    if hr_match:
        hours = int(hr_match.group(1))

    # Extract minutes
    min_match = re.search(r'(\d+)\s*min', duration_str, flags=re.IGNORECASE)
    if min_match:
        minutes = int(min_match.group(1))

    # Extract seconds
    sec_match = re.search(r'(\d+)\s*sec', duration_str, flags=re.IGNORECASE)
    if sec_match:
        seconds = int(sec_match.group(1))

    # Handle cases with only numbers
    if not hr_match and not min_match and not sec_match:
        num_match = re.search(r'(\d+)', duration_str)
        if num_match:
            value = int(num_match.group(1))
            if value < 60:
                seconds = value
            else:
                minutes = value

    total_minutes = hours * 60 + minutes
    return f"{total_minutes}:{seconds:02d}"

# Calling function and creating column
df_anime['duration_in_min_and_secs'] = df_anime['duration'].apply(clean_duration)

print(df_anime['duration_in_min_and_secs'].head(10))

0    24:00
1    24:00
     ...  
8    24:00
9    24:00
Name: duration_in_min_and_secs, Length: 10, dtype: object


### <span style="color:#951445"> Now let's check if the function did it's job by <u> comparing </u> it </span>

In [19]:
def compare_durations(row):
    original = row['duration']
    cleaned = row['duration_in_min_and_secs']
    if pd.isna(original) or pd.isna(cleaned):
        return False
    return original != cleaned

diff_rows = df_anime[df_anime.apply(compare_durations, axis=1)]

pd.set_option('display.max_rows', None) 
print(diff_rows[['duration', 'duration_in_min_and_secs']])

                 duration duration_in_min_and_secs
0           24 min per ep                    24:00
1           24 min per ep                    24:00
2           24 min per ep                    24:00
3                  24 min                    24:00
4           23 min per ep                    23:00
5           24 min per ep                    24:00
6             1 hr 44 min                   104:00
7           23 min per ep                    23:00
8           24 min per ep                    24:00
9           24 min per ep                    24:00
10          26 min per ep                    26:00
11          24 min per ep                    24:00
12          24 min per ep                    24:00
13          23 min per ep                    23:00
14          23 min per ep                    23:00
15          24 min per ep                    24:00
16          24 min per ep                    24:00
17            2 hr 10 min                   130:00
18          25 min per ep      

## <span style="color:#951445">  Droping original column  </span>

In [20]:
# # Drop the original duration column if needed
df_anime.drop('duration', axis=1, inplace=True)

### Column <span style="color:#ff6a26"> [themes] </span>

It has 33.85% of null

By checking models to see if inferred is possible:

- KNN = moderatly ok, there are some themes were is ok an another were it is not at all good.
- Random Forest = worst than KNN
- Logistic regression = worst than KNN but better than Random Forest
- Gradient Boosting = it gave an error 

It will be replaced by unknown

In [21]:
# Fill missing values with 'unknown'
df_anime['themes'].fillna('unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_anime['themes'].fillna('unknown', inplace=True)


### Column <span style="color:#ff6a26"> [genres] </span>

It has 10.67% of null

By checking models to inferred:

- KNN: not good enough
- Random Forest:  not good either

It will be replaced by unknown

In [22]:
df_anime['genres'].nunique()
df_anime['genres'].sample(5)
df_anime[['name', 'themes', 'synopsis', 'genres']].sample(5)

# Fill missing values with 'unknown'
df_anime['genres'].fillna('unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_anime['genres'].fillna('unknown', inplace=True)


## <span style="color:#ffc509">  Analyzing the qty left of nulls and "unknown" </span>

In [23]:
def analizar_nulos_y_unknown(df):
   
    total_rows = len(df)
    resultados = []

    for columna in df.columns:
        null_count = df[columna].isnull().sum()
        null_percentage = ((null_count / total_rows) * 100).round(2)

        unknown_count = (df[columna] == 'unknown').sum()
        unknown_percentage = ((unknown_count / total_rows) * 100).round(2)

        resultados.append({
            'Columna': columna,
            'Null Count': null_count,
            'Null %': null_percentage,
            'Unknown Count': unknown_count,
            'Unknown %': unknown_percentage
        })

    return pd.DataFrame(resultados)

# Ejemplo de uso (asumiendo que tienes tu DataFrame llamado df_anime)
resultados_analisis = analizar_nulos_y_unknown(df_anime)

# Imprimir el DataFrame con los resultados
print(f"Qty of rows: ({len(df_anime)})")
print(resultados_analisis)

Qty of rows: (14966)
                     Columna  Null Count  Null %  Unknown Count  Unknown %
0                   anime_id           0    0.00              0       0.00
1                  anime_url           0    0.00              0       0.00
2                  image_url           0    0.00              0       0.00
3                       name           0    0.00              0       0.00
4               english_name           0    0.00              0       0.00
5             japanese_names           0    0.00             45       0.30
6                      score           0    0.00              0       0.00
7                     genres           0    0.00           1594      10.65
8                     themes           0    0.00           5066      33.85
9                   synopsis           0    0.00            466       3.11
10                      type           0    0.00              0       0.00
11                  episodes           0    0.00              0       0.00
12  

# <span style="color:#ffc509"> Saving the dataframe </span>

In [24]:
clean_anime_csv_path = os.path.join(data_dir, "clean_anime.csv")
df_anime.to_csv(clean_anime_csv_path, index=False)


if os.path.exists(clean_anime_csv_path):
    print("File succesfully saved.")
    print(f"File size: {os.path.getsize(clean_anime_csv_path) / 1024:.2f} KB")
else:
    print("⚠️ Error while saving file.")

File succesfully saved.
File size: 12081.60 KB
