# Import libraries and Setting displays

In [1]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt 
import seaborn as sns

In [33]:
pd.set_option('display.max_rows', 20) 
pd.set_option('display.max_columns', None) 

# pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.max_rows', None)     # Show all rows
# pd.set_option('display.max_colwidth', None) # Show entire content of each column
pd.set_option('display.width', None)        # No limit on display width
pd.set_option('display.expand_frame_repr', False)  # Prevent wrapping

# Opening file and creating dataframe

In [35]:
# Setting paths
current_dir = os.getcwd()  # Use os.getcwd() to get the current working directory
parent_dir = os.path.dirname(current_dir)
data_dir = os.path.join(parent_dir, "data")
file_path = os.path.join(data_dir, "")

top_anime_dataset_v2_csv_path = os.path.join(data_dir, "top_anime_dataset_v2.csv")
#----------------------------------------------------------------------------------------------------
# Creating dataframes
df_anime = pd.read_csv(top_anime_dataset_v2_csv_path)

df_anime.sample(5)

Unnamed: 0,anime_id,anime_url,image_url,name,english_name,japanese_names,score,genres,themes,demographics,synopsis,type,episodes,premiered,producers,studios,source,duration,rating,rank,popularity,favorites,scored_by,members
2253,587,https://myanimelist.net/anime/587/Hanbun_no_Ts...,https://cdn.myanimelist.net/images/anime/10/75...,Hanbun no Tsuki ga Noboru Sora,Looking Up At The Half-Moon,半分の月がのぼる空,7.45,"Drama, Romance",,,"After contracting hepatitis A, Ezaki Yuuichi h...",TV,6.0,winter 2006,"Pony Canyon, Dream Force",Group TAC,Light novel,24 min per ep,PG-13 - Teens 13 or older,2180.0,2178,494,49559,102947
2773,1010,https://myanimelist.net/anime/1010/Ranma_½__Ch...,https://cdn.myanimelist.net/images/anime/1985/...,Ranma ½: Chou Musabetsu Kessen! Ranma Team vs....,Ranma ½: Team Ranma vs. The Legendary Phoenix,らんま１／２ 超無差別決戦！ 乱馬チームVS伝説の鳳凰,7.34,"Action, Adventure, Comedy, Supernatural",Martial Arts,Shounen,"Kuno purchases a strange egg, believing that t...",Movie,1.0,,Fuji TV,Studio Deen,Manga,31 min,PG-13 - Teens 13 or older,2630.0,5342,7,8794,18064
6057,58272,https://myanimelist.net/anime/58272/Boku_no_Ts...,https://cdn.myanimelist.net/images/anime/1101/...,Boku no Tsuma wa Kanjou ga Nai,My Wife Has No Emotion,僕の妻は感情がない,6.77,"Comedy, Romance, Sci-Fi",,Seinen,Takuma Kosugi is an office worker who does not...,TV,12.0,summer 2024,"TMS Entertainment, Mainichi Broadcasting Syste...",Tezuka Productions,Manga,23 min per ep,PG-13 - Teens 13 or older,5479.0,3072,219,21378,57674
3267,4618,https://myanimelist.net/anime/4618/RideBack,https://cdn.myanimelist.net/images/anime/1834/...,RideBack,Ride Back,ライドバック,7.25,"Action, Drama, Sci-Fi","Mecha, School",Seinen,"In the future, an organization called the GGP ...",TV,12.0,winter 2009,,Madhouse,Manga,24 min per ep,PG-13 - Teens 13 or older,3094.0,3338,133,21570,49449
9312,52244,https://myanimelist.net/anime/52244/Korashime_...,https://cdn.myanimelist.net/images/anime/1133/...,Korashime 2: Kyouikuteki Depaga Shidou,,懲らしめ2～狂育的デパガ指導～,6.35,Hentai,,,,OVA,4.0,,,PoRO,Visual novel,24 min per ep,Rx - Hentai,,10200,7,1022,3325


# Seeking for nulls

In [32]:
df_anime.columns.to_list()

['anime_url',
 'image_url',
 'name',
 'english_name',
 'japanese_names',
 'score',
 'genres',
 'themes',
 'demographics',
 'synopsis',
 'type',
 'episodes',
 'premiered',
 'producers',
 'studios',
 'source',
 'duration',
 'rating',
 'rank',
 'popularity',
 'favorites',
 'scored_by',
 'members']

In [36]:
def check_anime_nulls(df_anime):
    """
    Prints the null count and percentage for each column in df_anime.
    """
    nulls = df_anime.isnull().sum()
    null_percentage = (nulls / len(df_anime) * 100).round(2)
    null_report = pd.DataFrame({'Null Count': nulls, 'Null %': null_percentage})
    print(f"Qty of rows: ({len(df_anime)})")
    print(null_report)


check_anime_nulls(df_anime)

Qty of rows: (15000)
              Null Count  Null %
anime_id               0    0.00
anime_url              0    0.00
image_url              0    0.00
name                   0    0.00
english_name        6642   44.28
...                  ...     ...
rank                3080   20.53
popularity             0    0.00
favorites              0    0.00
scored_by              0    0.00
members                0    0.00

[24 rows x 2 columns]


# Looking for Duplicate values

In [38]:
def check_duplicates(df, subset=None):
    dups = df[df.duplicated(subset=subset, keep=False)]
    print(f"Duplicates ({len(dups)}):\n{dups}" if not dups.empty else "No duplicates found.")

check_duplicates(df_anime) 

Duplicates (68):
       anime_id                                          anime_url                                          image_url                                               name                                    english_name                         japanese_names  score                                             genres                    themes demographics                                           synopsis   type  episodes    premiered             producers                             studios    source       duration                          rating     rank  popularity  favorites  scored_by  members
30        59571  https://myanimelist.net/anime/59571/Shingeki_n...  https://cdn.myanimelist.net/images/anime/1379/...  Shingeki no Kyojin Movie: Kanketsu-hen - The L...                Attack on Titan: The Last Attack          劇場版 進撃の巨人 完結編 THE LAST ATTACK   8.81                            Action, Drama, Suspense  Gore, Military, Survival      Shounen  A compilation movie for Shin

In [47]:
# Very quick check for more accuracy
df_anime['concat'] = df_anime['anime_id'].astype(str) + '_' + df_anime['name'].astype(str) + '_' + df_anime['popularity'].astype(str)

len(df_anime) # 15000
df_anime['concat'].nunique() # 14966

df_anime.drop('concat', axis=1, inplace=True)

## Droping duplicates 

In [51]:
def drop_duplicates(df, subset=None, inplace=False):
    df.drop_duplicates(subset=subset, keep='first', inplace=inplace)
    if not inplace:
        return df

drop_duplicates(df_anime, inplace=True)
len(df_anime)

14966