# Imports

In [31]:
import kaggle
import kagglehub
import numpy as np
import pandas as pd
from IPython.display import display

path = kagglehub.dataset_download("svanoo/myanimelist-dataset")
print("Path to dataset files:", path)

Path to dataset files: C:\Users\User\.cache\kagglehub\datasets\svanoo\myanimelist-dataset\versions\2


# Dataset analysis and cleaning

In [61]:
#Collumns that we are interested in
cols_to_use = ['anime_id', 'title', 'synopsis', 'genres' , 'score']
#Read the file while only taking certain collumns
full_df=pd.read_csv(r"C:\Users\User\.cache\kagglehub\datasets\svanoo\myanimelist-dataset\versions\2\anime.csv",
               sep='\t',
               usecols=cols_to_use)
print("Original Dataset")
print("Shape of the Dataset:",full_df.shape)
#Display fiest five columns of the dataset
display(full_df)


Original Dataset
Shape of the Dataset: (13379, 5)


Unnamed: 0,anime_id,title,synopsis,genres,score
0,2366,Touma Kishinden Oni,Shuramaru is hated and feared by the villagers...,Supernatural,
1,4940,Sabaku no Kaizoku! Captain Kuppa,"Sometime in the future, the world was complete...",Action|Adventure,
2,50285,On Air Dekinai!,"Set in 2014, the anime follows the adventures ...",Comedy,
3,3975,Uchi no 3 Shimai,The daily life of the Motsumoto family. The th...,Comedy|Slice of Life,
4,36036,Running Man,"The Soul Tree, the great source of our race. ...",Action|Adventure|Kids,
...,...,...,...,...,...
13374,32188,Steins;Gate: Kyoukaimenjou no Missing Link - D...,"Having reached his emotional breaking point, R...",Sci-Fi|Suspense,8.24
13375,31324,Grisaia no Meikyuu: Caprice no Mayu 0 - Takizo...,Special bundled with the Blu-ray/DVD volume of .,Ecchi,7.27
13376,31283,Bikini Warriors Special,"According to the official Hobby Japan website,...",Comedy|Fantasy|Ecchi|Parody,5.36
13377,33142,Re:Zero kara Hajimeru Break Time,A series of comedic shorts featuring chibi ver...,Adventure|Comedy|Fantasy|Parody,6.66


In [62]:
#Finds rows with missing synopsis
missing_synopsis = full_df[full_df['synopsis'].isnull() | (full_df['synopsis'] == ".")]
print("Finding rows with missing synopsis")
display(missing_synopsis)
print("Number of missing synopsis:",len(missing_synopsis))

#Removes the rows with missing synopsis
df_cleaned = full_df[~(full_df['synopsis'].isnull() | (full_df['synopsis'] == "."))]
print("Cleaned df")
display(df_cleaned)


Finding rows with missing synopsis


Unnamed: 0,anime_id,title,synopsis,genres,score
6,49065,Shuimu Xuetang 2nd Season,.,Comedy|Kids,
7,48335,Canimals,.,Comedy|Kids,
18,51119,Grisaia: Phantom Trigger the Animation (TV),.,Action|School,
19,34928,Akindo Sei no Little Peso,.,Comedy|Sci-Fi,
21,45596,Kindan Joshi,.,Comedy|Romance,
...,...,...,...,...,...
12681,9563,Hidamari Sketch x ☆☆☆ Specials,.,Comedy|Slice of Life|School|Seinen,7.82
12694,38154,3-gatsu no Lion: Ugoku! Nya Shogi,.,Game,6.06
12802,30533,Gundam Build Fighters Try: Island Wars,.,Action|Sci-Fi|Mecha,6.77
12889,38810,Bleach: Gotei 13 Omake,.,Action|Adventure,6.56


Number of missing synopsis: 864
Cleaned df


Unnamed: 0,anime_id,title,synopsis,genres,score
0,2366,Touma Kishinden Oni,Shuramaru is hated and feared by the villagers...,Supernatural,
1,4940,Sabaku no Kaizoku! Captain Kuppa,"Sometime in the future, the world was complete...",Action|Adventure,
2,50285,On Air Dekinai!,"Set in 2014, the anime follows the adventures ...",Comedy,
3,3975,Uchi no 3 Shimai,The daily life of the Motsumoto family. The th...,Comedy|Slice of Life,
4,36036,Running Man,"The Soul Tree, the great source of our race. ...",Action|Adventure|Kids,
...,...,...,...,...,...
13374,32188,Steins;Gate: Kyoukaimenjou no Missing Link - D...,"Having reached his emotional breaking point, R...",Sci-Fi|Suspense,8.24
13375,31324,Grisaia no Meikyuu: Caprice no Mayu 0 - Takizo...,Special bundled with the Blu-ray/DVD volume of .,Ecchi,7.27
13376,31283,Bikini Warriors Special,"According to the official Hobby Japan website,...",Comedy|Fantasy|Ecchi|Parody,5.36
13377,33142,Re:Zero kara Hajimeru Break Time,A series of comedic shorts featuring chibi ver...,Adventure|Comedy|Fantasy|Parody,6.66


In [63]:
#Find incomplete synopsis 


# Compute number of words for each synopsis and add in another collumn
df_cleaned.loc[:,'synopsis_word_count'] = df_cleaned['synopsis'].apply(lambda x: len(x.split()))
display(df_cleaned)
min_words=8
print("Minimum number of words in a synopsis:",min_words)

short_synopsis = df_cleaned[df_cleaned['synopsis_word_count'] < min_words]
display(short_synopsis[['title', 'synopsis', 'synopsis_word_count']])

#IMPORTANT NOTE: Removing rows based on word count may remove some meaningful synopsis.
#This problem can be solved if the model takes the anime title as the input and classifies the prompt from user based on synopsis and also anime titles
#This removes the need of data cleaning due to incomplete synopsis
#But for now, we remove every synopsis below 8 words

print("Cleaned df")
df_cleaned = df_cleaned[df_cleaned['synopsis_word_count'] >= min_words]
display(df_cleaned)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.loc[:,'synopsis_word_count'] = df_cleaned['synopsis'].apply(lambda x: len(x.split()))


Unnamed: 0,anime_id,title,synopsis,genres,score,synopsis_word_count
0,2366,Touma Kishinden Oni,Shuramaru is hated and feared by the villagers...,Supernatural,,22
1,4940,Sabaku no Kaizoku! Captain Kuppa,"Sometime in the future, the world was complete...",Action|Adventure,,48
2,50285,On Air Dekinai!,"Set in 2014, the anime follows the adventures ...",Comedy,,85
3,3975,Uchi no 3 Shimai,The daily life of the Motsumoto family. The th...,Comedy|Slice of Life,,35
4,36036,Running Man,"The Soul Tree, the great source of our race. ...",Action|Adventure|Kids,,107
...,...,...,...,...,...,...
13374,32188,Steins;Gate: Kyoukaimenjou no Missing Link - D...,"Having reached his emotional breaking point, R...",Sci-Fi|Suspense,8.24,94
13375,31324,Grisaia no Meikyuu: Caprice no Mayu 0 - Takizo...,Special bundled with the Blu-ray/DVD volume of .,Ecchi,7.27,8
13376,31283,Bikini Warriors Special,"According to the official Hobby Japan website,...",Comedy|Fantasy|Ecchi|Parody,5.36,28
13377,33142,Re:Zero kara Hajimeru Break Time,A series of comedic shorts featuring chibi ver...,Adventure|Comedy|Fantasy|Parody,6.66,15


Minimum number of words in a synopsis: 8


Unnamed: 0,title,synopsis,synopsis_word_count
5,Watanuki-san Chi to,Second season of .,4
12,Dungeon ni Deai wo Motomeru no wa Machigatteir...,Fourth season of .,4
17,Maou Gakuin no Futekigousha: Shijou Saikyou no...,Second half of .,4
28,Jashin-chan Dropkick X,Third season of .,4
32,"Komi-san wa, Comyushou desu. 2nd Season",Second season of .,4
...,...,...,...
13327,Ladies versus Butlers! Tokuten Disc Music Clip,Special music clip released on DVD.,6
13328,JoJo no Kimyou na Bouken Part 5: Ougon no Kaze...,Recaps of the series.,4
13338,Sukitte Ii na yo.: Mei and Marshmallow,Specials released with the anime BD/DVD.,6
13353,IS: Infinite Stratos 2 - Infinite Wedding,Picture drama included on the Blu-ray BOX.,7


Cleaned df


Unnamed: 0,anime_id,title,synopsis,genres,score,synopsis_word_count
0,2366,Touma Kishinden Oni,Shuramaru is hated and feared by the villagers...,Supernatural,,22
1,4940,Sabaku no Kaizoku! Captain Kuppa,"Sometime in the future, the world was complete...",Action|Adventure,,48
2,50285,On Air Dekinai!,"Set in 2014, the anime follows the adventures ...",Comedy,,85
3,3975,Uchi no 3 Shimai,The daily life of the Motsumoto family. The th...,Comedy|Slice of Life,,35
4,36036,Running Man,"The Soul Tree, the great source of our race. ...",Action|Adventure|Kids,,107
...,...,...,...,...,...,...
13374,32188,Steins;Gate: Kyoukaimenjou no Missing Link - D...,"Having reached his emotional breaking point, R...",Sci-Fi|Suspense,8.24,94
13375,31324,Grisaia no Meikyuu: Caprice no Mayu 0 - Takizo...,Special bundled with the Blu-ray/DVD volume of .,Ecchi,7.27,8
13376,31283,Bikini Warriors Special,"According to the official Hobby Japan website,...",Comedy|Fantasy|Ecchi|Parody,5.36,28
13377,33142,Re:Zero kara Hajimeru Break Time,A series of comedic shorts featuring chibi ver...,Adventure|Comedy|Fantasy|Parody,6.66,15


In [69]:
#get the number of unique genres
df_genres=df_cleaned
# Split the genres and count occurrences
from collections import Counter

# Flatten the list of genres after splitting each row by '|'
genre_counter = Counter(
    genre.strip() 
    for genre_list in df_genres['genres'] 
    for genre in genre_list.split('|')
)

# Display the genre counts as a DataFrame for clarity
genre_counts_df = pd.DataFrame(genre_counter.items(), columns=['Genre', 'Count']).sort_values(by='Count', ascending=False)
display(genre_counts_df)
genre_list = genre_counts_df['Genre'].tolist()
print(genre_list)  # prints first 5 elements


Unnamed: 0,Genre,Count
3,Comedy,4426
1,Action,3075
9,Fantasy,2382
2,Adventure,2097
18,Drama,1982
6,Sci-Fi,1909
16,Romance,1625
19,Shounen,1570
4,Slice of Life,1499
27,School,1391


['Comedy', 'Action', 'Fantasy', 'Adventure', 'Drama', 'Sci-Fi', 'Romance', 'Shounen', 'Slice of Life', 'School', 'Supernatural', 'Hentai', 'Kids', 'Historical', 'Mecha', 'Seinen', 'Mystery', 'Ecchi', 'Music', 'Shoujo', 'Sports', 'Super Power', 'Parody', 'Military', 'Demons', 'Harem', 'Horror', 'Space', 'Game', 'Martial Arts', 'Psychological', 'Avant Garde', 'Police', 'Samurai', 'Boys Love', 'Suspense', 'Vampire', 'Girls Love', 'Cars', 'Josei', 'Gourmet', 'Erotica', 'Work Life', 'Award Winning']
