Database source:
- [Kaggle](https://www.kaggle.com/hernan4444/anime-recommendation-database-2020)
- [GitHub](https://github.com/Hernan4444/MyAnimeList-Database)
- [File details](https://github.com/Hernan4444/MyAnimelist-Database/tree/master/data)

# Setup and Preprocessing

In [1]:
from loguru import logger
import nltk
from nltk.corpus import stopwords
import pickle
import re
import requests
import numpy as np
import pandas as pd
import swifter
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity, sigmoid_kernel
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
pd.set_option("display.max_columns", 999)
# pd.reset_option("display.max_columns")

In [4]:
HTML_PATH = 'T:/New Download Folder/anime_html'
DATA_DIR = 'data/'

In [5]:
anime_df = pd.read_csv(DATA_DIR + 'anime.csv')
print(anime_df.shape)
anime_df.head()

(17562, 35)


Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,Producers,Licensors,Studios,Source,Duration,Rating,Ranked,Popularity,Members,Favorites,Watching,Completed,On-Hold,Dropped,Plan to Watch,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,24 min. per ep.,R - 17+ (violence & profanity),28.0,39,1251960,61971,105808,718161,71513,26678,329800,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,1 hr. 55 min.,R - 17+ (violence & profanity),159.0,518,273145,1174,4143,208333,1935,770,57964,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,24 min. per ep.,PG-13 - Teens 13 or older,266.0,201,558913,12944,29113,343492,25465,13925,146918,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,"TV Tokyo, Bandai Visual, Dentsu, Victor Entert...","Funimation, Bandai Entertainment",Sunrise,Original,25 min. per ep.,PG-13 - Teens 13 or older,2481.0,1467,94683,587,4300,46165,5121,5378,33719,2182.0,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,"TV Tokyo, Dentsu",Unknown,Toei Animation,Manga,23 min. per ep.,PG - Children,3710.0,4369,13224,18,642,7314,766,1108,3394,312.0,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0


In [5]:
anime_df.columns

Index(['MAL_ID', 'Name', 'Score', 'Genres', 'English name', 'Japanese name',
       'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors',
       'Studios', 'Source', 'Duration', 'Rating', 'Ranked', 'Popularity',
       'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped',
       'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6',
       'Score-5', 'Score-4', 'Score-3', 'Score-2', 'Score-1'],
      dtype='object')

In [None]:
# anime_df.rename(columns={'Genders': 'Genres'}, inplace=True)
# anime_df.to_csv(DATA_DIR + 'anime.csv', index=False)

In [4]:
synopsis_df = pd.read_csv(DATA_DIR + 'anime_with_synopsis.csv')
synopsis_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,synopsis,combined
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, SciFi, Space","In the year 2071, humanity has colonized sever...","Action, Adventure, Comedy, Drama, SciFi, Space..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, SciFi, Space","other day, another bounty—such is the life of ...","Action, Drama, Mystery, SciFi, Space other day..."
2,6,Trigun,8.24,"Action, SciFi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...","Action, SciFi, Adventure, Comedy, Drama, Shoun..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...,"Action, Mystery, Police, Supernatural, Drama, ..."
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...,"Adventure, Fantasy, Shounen, Supernatural It i..."


In [87]:
# synopsis_df.rename(columns={'Genders': 'Genres', 'sypnopsis': 'synopsis'}, inplace=True)
# synopsis_df.to_csv(DATA_DIR + 'anime_with_synopsis.csv', index=False)

In [93]:
synopsis_df[synopsis_df.isna().any(axis=1)]

Unnamed: 0,MAL_ID,Name,Score,Genres,synopsis
11451,34755,Kuma no Gakkou: Patissier Jackie to Ohisama no...,Unknown,"Comedy, Kids",
11469,34794,Yukai na Animal Bus,Unknown,"Comedy, Kids",
13686,38475,Yuru Camp△ Movie,Unknown,"Comedy, Slice of Life",
15025,40714,Youkai Watch Jam: Youkai Gakuen Y - N to no So...,6.28,"Comedy, Demons, Kids, Supernatural, School",
15747,42717,Kaeru no Pickles: Kimochi no Iro,Unknown,"Slice of Life, Kids",
16056,44848,Iii Icecrin,Unknown,Kids,
16110,45731,Argonavis from BanG Dream! Movie,Unknown,Music,
16120,46095,Vivy: Fluorite Eye's Song,Unknown,Sci-Fi,


In [98]:
synopsis_df.dropna(inplace=True)

In [111]:
def clean_genres(genres):
    return genres.replace(',', '')

synopsis_df.Genres.apply(clean_genres)

0               Action Adventure Comedy Drama Sci-Fi Space
1                        Action Drama Mystery Sci-Fi Space
2             Action Sci-Fi Adventure Comedy Drama Shounen
3           Action Mystery Police Supernatural Drama Magic
4                   Adventure Fantasy Shounen Supernatural
                               ...                        
16209                       Adventure Mystery Supernatural
16210                           Comedy Horror Supernatural
16211    Mystery Dementia Horror Psychological Supernat...
16212                       Adventure Slice of Life Comedy
16213                                       Action Fantasy
Name: Genres, Length: 16206, dtype: object

In [130]:
synopsis_df.Genres = synopsis_df.Genres.str.replace('Sci-Fi', 'SciFi')

In [136]:
synopsis_df['combined'] = synopsis_df['Genres'] + ' ' + synopsis_df['synopsis']

In [137]:
synopsis_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,synopsis,combined
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, SciFi, Space","In the year 2071, humanity has colonized sever...","Action, Adventure, Comedy, Drama, SciFi, Space..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, SciFi, Space","other day, another bounty—such is the life of ...","Action, Drama, Mystery, SciFi, Space other day..."
2,6,Trigun,8.24,"Action, SciFi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...","Action, SciFi, Adventure, Comedy, Drama, Shoun..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...,"Action, Mystery, Police, Supernatural, Drama, ..."
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...,"Adventure, Fantasy, Shounen, Supernatural It i..."


In [181]:
# synopsis_df.to_csv(DATA_DIR + 'anime_with_synopsis.csv', index=False)

In [138]:
synopsis_df['combined'][0]

'Action, Adventure, Comedy, Drama, SciFi, Space In the year 2071, humanity has colonized several of the planets and moons of the solar system leaving the now uninhabitable surface of planet Earth behind. The Inter Solar System Police attempts to keep peace in the galaxy, aided in part by outlaw bounty hunters, referred to as "Cowboys." The ragtag team aboard the spaceship Bebop are two such individuals. Mellow and carefree Spike Spiegel is balanced by his boisterous, pragmatic partner Jet Black as the pair makes a living chasing bounties and collecting rewards. Thrown off course by the addition of new members that they meet in their travels—Ein, a genetically engineered, highly intelligent Welsh Corgi; femme fatale Faye Valentine, an enigmatic trickster with memory loss; and the strange computer whiz kid Edward Wong—the crew embarks on thrilling adventures that unravel each member\'s dark and mysterious past little by little. Well-balanced with high density action and light-hearted com

In [4]:
synopsis_df = pd.read_csv(DATA_DIR + 'anime_with_synopsis.csv')
synopsis_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,synopsis,combined
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, SciFi, Space","In the year 2071, humanity has colonized sever...","Action, Adventure, Comedy, Drama, SciFi, Space..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, SciFi, Space","other day, another bounty—such is the life of ...","Action, Drama, Mystery, SciFi, Space other day..."
2,6,Trigun,8.24,"Action, SciFi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...","Action, SciFi, Adventure, Comedy, Drama, Shoun..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...,"Action, Mystery, Police, Supernatural, Drama, ..."
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...,"Adventure, Fantasy, Shounen, Supernatural It i..."


In [32]:
def get_anime_rows(df, anime_name):
    return df[df.Name.str.contains(anime_name, case=False, regex=False)]

In [197]:
specific_animes = get_anime_rows(synopsis_df, 'bleach')
specific_animes

Unnamed: 0,MAL_ID,Name,Score,Genres,synopsis,combined,Characters
237,269,Bleach,7.8,"Action, Adventure, Comedy, Super Power, Supern...",Ichigo Kurosaki is an ordinary high schooler—u...,"Action, Adventure, Comedy, Super Power, Supern...","KurosakiIchigo, KuchikiRukia, AbaraiRenji"
679,762,Bleach: Memories in the Rain,7.15,"Action, Adventure, Supernatural, Drama, Shounen","Kurosaki Ichigo, the temporary Shinigami(Death...","Action, Adventure, Supernatural, Drama, Shoune...","KurosakiIchigo, KuchikiRukia, InoueOrihime"
731,834,Bleach: The Sealed Sword Frenzy,6.98,"Action, Adventure, Comedy, Super Power, Supern...",Shinigami named Baishin who was sealed by Soul...,"Action, Adventure, Comedy, Super Power, Supern...","KurosakiIchigo, KuchikiRukia, Baishin"
1468,1686,Bleach Movie 1: Memories of Nobody,7.45,"Action, Adventure, Comedy, Super Power, Supern...","hen a life ends, its soul departs to its final...","Action, Adventure, Comedy, Super Power, Supern...","KurosakiIchigo, KuchikiRukia, Senna"
2431,2889,Bleach Movie 2: The DiamondDust Rebellion - Mo...,7.45,"Action, Adventure, Comedy, Super Power, Supern...",ssigned to protect a royal procession transpor...,"Action, Adventure, Comedy, Super Power, Supern...","KurosakiIchigo, HitsugayaToushirou, MatsumotoR..."
3342,4835,Bleach Movie 3: Fade to Black - Kimi no Na wo ...,7.52,"Action, Adventure, Comedy, Super Power, Supern...",fter a mysterious pair attack Rukia Kuchiki an...,"Action, Adventure, Comedy, Super Power, Supern...","KurosakiIchigo, KuchikiRukia, Shizuku"
4599,8247,Bleach Movie 4: Jigoku-hen,7.61,"Action, Adventure, Comedy, Super Power, Supern...","""Hell"" is the place where a person is sent to ...","Action, Adventure, Comedy, Super Power, Supern...","KurosakiIchigo, KuchikiRukia, AbaraiRenji"
11677,35118,Bleach KaraBuri!: Gotei Juusan Yatai Daisakusen!,6.51,"Comedy, Shounen",animated adaptation of several chapters of Col...,"Comedy, Shounen KurosakiIchigo, KuchikiRukia, ...","KurosakiIchigo, KuchikiRukia, HitsugayaToushirou"
13891,38810,Bleach: Gotei 13 Omake,6.45,"Action, Adventure",No synopsis information has been added to this...,,
15348,41467,Bleach: Sennen Kessen-hen,Unknown,"Action, Adventure, Comedy, Super Power, Supern...",No synopsis information has been added to this...,,


In [228]:
try:
    print(unique_char_names)
except:
    unique_char_names = set()

{'nemuru', 'arrester', 'pugi', 'melchiott', 'leobort', 'baka', 'toujin', 'toucan', 'riela', 'amatsu', 'shoukichi', 'takashiro', 'tarobei', 'armin', 'pour', 'rudolf', 'gotes', 'touno', 'haru', 'retsu', 'dad', 'ichimaru', 'exkaiser', 'suomi', 'fang long', 'tatewaki', 'otone', 'exon', 'shiogami', 'fumi', 'tama-chan', 'itaru', 'the spirit of the world tree', 'tiantian', 'amao', 'ogawa', 'shirahane', 'ame', 'hong', 'shinran', 'apulo', 'jacky', 'iroha', 'mabuchi', 'asagaya', 'aisha', 'toudou', 'eurasian eagle-owl', 'irvine', 'ikue', 'dokin-chan', 'morokuzu', 'takae', 'nagaya', 'shishigami', 'bei', 'yingxiong', 'diablo', 'slaine', 'nima', 'tamiko', 'vergil', 'takagi', 'meshiya', 'hasami', 'oto', 'omega', 'agahari', 'gremory', 'noel marres', 'klaus', 'satoya', 'pe teacher', 'general', 'senkawa', 'shuichi', 'tenma white', 'tamaki', 'sakurazuki', 'huntley', 'hassan', 'moritomo', 'law', 'menmen', 'yamamoto', 'buddha', 'koda', 'hiraga', 'halkara', 'oliver', 'kasamatsu', 'yumiya', 'kokage', 'j. c. 

In [401]:
def get_char_names(mal_id, top_k=5):
    # mal_id = get_anime_rows(synopsis_df, anime_name).iloc[0, 0]
    # r = requests.get(f'https://myanimelist.net/anime/{mal_id}')
    # same with requesting from URL but faster
    html_path = f'{HTML_PATH}/{mal_id}/details.html'
    with open(html_path, 'r', encoding='utf-8') as f:
        html = f.read()
    soup = BeautifulSoup(html)
    chars = soup.findAll('h3', attrs={'class': 'h3_characters_voice_actors'})
    # get only top character names to remove them from synopsis
    chars = [x.a.text.strip() for x in chars[:top_k]]
    # print(chars)
    if not chars:
        return np.nan
    # regex = '|'.join(chars).replace(', ', '|')
    # make joined names like 'JohnDoe' instead of separated
    regex = r"\W+"  # remove any non-alphanumeric
    joined_names = list(map(lambda x: re.sub(regex, '', x), chars))
    joined_names = ', '.join(joined_names)
    
    # make unique first and last names
    chars = ', '.join(chars)
    chars = chars.split(', ')
    # print(chars)
    
    # update the set of unique character names 
    #  to remove from synopsis later
    # unique_char_names.update(set(chars))
    return joined_names
    # return np.nan

In [403]:
# %%timeit
get_char_names(156)

'ShirouKamui, MonouFuuma, SumeragiSubaru, SakurazukaSeishirou, ArisugawaSorata'

In [31]:
# @logger.catch
def remove_char_names(row):
    print(row.Characters)
    if 'No synopsis information' in row.synopsis:
        # also remove unnecessary info
        row.synopsis = 'unknown'
        return row
    # print(row)
    anime_name = row.Name
    # regex = get_char_names(anime_name)
    regex = '|'.join(row.Characters.split(','))
    # print(regex)
    # don't need IGNORECASE for names
    row.synopsis = re.sub(regex, '', row.synopsis)
    return row

In [200]:
specific_animes.MAL_ID.apply(get_char_names, top_k=3)

237              KurosakiIchigo, KuchikiRukia, AbaraiRenji
679             KurosakiIchigo, KuchikiRukia, InoueOrihime
731                  KurosakiIchigo, KuchikiRukia, Baishin
1468                   KurosakiIchigo, KuchikiRukia, Senna
2431     KurosakiIchigo, HitsugayaToushirou, MatsumotoR...
3342                 KurosakiIchigo, KuchikiRukia, Shizuku
4599             KurosakiIchigo, KuchikiRukia, AbaraiRenji
11677     KurosakiIchigo, KuchikiRukia, HitsugayaToushirou
13891                                                  NaN
15348                                                  NaN
Name: MAL_ID, dtype: object

In [1]:
# unique_char_names

In [203]:
# specific_animes['Characters'] = specific_animes.Name.swifter.apply(get_char_names)
specific_animes['Characters'] = specific_animes.MAL_ID.apply(get_char_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  specific_animes['Characters'] = specific_animes.MAL_ID.apply(get_char_names)


In [67]:
# specific_animes.apply(remove_char_names, axis=1)

Unnamed: 0,MAL_ID,Name,Score,Genres,synopsis,combined
237,269,Bleach,7.8,"Action, Adventure, Comedy, Super Power, Supern...",is an ordinary high schooler—until his famil...,"Action, Adventure, Comedy, Super Power, Supern..."
679,762,Bleach: Memories in the Rain,7.15,"Action, Adventure, Supernatural, Drama, Shounen",", the temporary Shinigami(Death God) for , d...","Action, Adventure, Supernatural, Drama, Shoune..."
731,834,Bleach: The Sealed Sword Frenzy,6.98,"Action, Adventure, Comedy, Super Power, Supern...",Shinigami named Baishin who was sealed by Soul...,"Action, Adventure, Comedy, Super Power, Supern..."
1468,1686,Bleach Movie 1: Memories of Nobody,7.45,"Action, Adventure, Comedy, Super Power, Supern...","hen a life ends, its soul departs to its final...","Action, Adventure, Comedy, Super Power, Supern..."
2431,2889,Bleach Movie 2: The DiamondDust Rebellion - Mo...,7.45,"Action, Adventure, Comedy, Super Power, Supern...",ssigned to protect a royal procession transpor...,"Action, Adventure, Comedy, Super Power, Supern..."
3342,4835,Bleach Movie 3: Fade to Black - Kimi no Na wo ...,7.52,"Action, Adventure, Comedy, Super Power, Supern...",fter a mysterious pair attack and erase her ...,"Action, Adventure, Comedy, Super Power, Supern..."
4599,8247,Bleach Movie 4: Jigoku-hen,7.61,"Action, Adventure, Comedy, Super Power, Supern...","""Hell"" is the place where a person is sent to ...","Action, Adventure, Comedy, Super Power, Supern..."
11677,35118,Bleach KaraBuri!: Gotei Juusan Yatai Daisakusen!,6.51,"Comedy, Shounen",animated adaptation of several chapters of Col...,"Comedy, Shounen animated adaptation of several..."
13891,38810,Bleach: Gotei 13 Omake,6.45,"Action, Adventure",unknown,"Action, Adventure No synopsis information has ..."
15348,41467,Bleach: Sennen Kessen-hen,Unknown,"Action, Adventure, Comedy, Super Power, Supern...",unknown,"Action, Adventure, Comedy, Super Power, Supern..."


In [25]:
# ! NOTE: This takes quite awhile to finish scraping!
# synopsis_df['Characters'] = synopsis_df.MAL_ID.swifter.apply(get_char_names)
# synopsis_df['Characters'] = synopsis_df.MAL_ID.apply(get_char_names)

Pandas Apply:   0%|          | 0/16206 [00:00<?, ?it/s]

In [39]:
# synopsis_df.to_csv(DATA_DIR + 'new_synopsis.csv', index=False)

In [36]:
# if '' in unique_char_names:
#     unique_char_names.remove('')
# with open('data/unique_character_names.pkl', 'wb') as f:
#     pickle.dump(unique_char_names, f)

In [45]:
pd.Series(list(unique_char_names)).isna().sum()

0

In [43]:
synopsis_df['combined'] = synopsis_df['Genres'] + ' ' + synopsis_df['Characters'] + ' ' + synopsis_df['synopsis']

In [396]:
synopsis_df = pd.read_csv(DATA_DIR + 'new_synopsis.csv')
synopsis_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,synopsis,combined,Characters
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, SciFi, Space","In the year 2071, humanity has colonized sever...","Action, Adventure, Comedy, Drama, SciFi, Space...","SpiegelSpike, ValentineFaye, Wong Hau Pepelu T..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, SciFi, Space","other day, another bounty—such is the life of ...","Action, Drama, Mystery, SciFi, Space SpiegelSp...","SpiegelSpike, ValentineFaye, Wong Hau Pepelu T..."
2,6,Trigun,8.24,"Action, SciFi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...","Action, SciFi, Adventure, Comedy, Drama, Shoun...","Vash the Stampede, WolfwoodNicholas D., Thomps..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...,"Action, Mystery, Police, Supernatural, Drama, ...","SenaRobin, Amon, LeeMichael"
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...,"Adventure, Fantasy, Shounen, Supernatural Beet...","Beet, Kissu, Slade"


In [397]:
synopsis_df.loc[synopsis_df.MAL_ID == 156]

Unnamed: 0,MAL_ID,Name,Score,Genres,synopsis,combined,Characters
134,156,X,7.43,"Action, Super Power, Drama, Magic, Romance, Fa...",His destiny has finally arrived as the young S...,"Action, Super Power, Drama, Magic, Romance, Fa...","Ichise, Ran, YoshiiKazuho"


In [241]:
na_char_df = synopsis_df[synopsis_df.Characters.isna()].copy()
na_char_df.shape

(6008, 7)

In [265]:
print(len(na_char_df[na_char_df.Score == 'Unknown']))

3819


In [266]:
print(len(synopsis_df[synopsis_df.Score == 'Unknown']))

5116


In [272]:
synopsis_df[synopsis_df.Score == 'Unknown']

Unnamed: 0,MAL_ID,Name,Score,Genres,synopsis,combined,Characters
1347,1547,Obake no Q-tarou,Unknown,"Comedy, School, Slice of Life, Supernatural","Q-taro, a monster, is living with the Ohara fa...","Comedy, School, Slice of Life, Supernatural Q-...","Q-Tarou, OuharaShouta, U-ko"
1439,1656,PostPet Momobin,Unknown,"Comedy, Kids","omo and Komomo can deliver mail from anyone, t...","Comedy, Kids Komomo, Momo, StrongRock omo and ...","Komomo, Momo, StrongRock"
1512,1739,Shibawanko no Wa no Kokoro,Unknown,Kids,Based on a japanese children`s book by Yoshie ...,"Kids Shiba-wanko, Miike-nyanko Based on a japa...","Shiba-wanko, Miike-nyanko"
1619,1863,Silk Road Shounen Yuuto,Unknown,"Adventure, Fantasy, Historical","hen a boy Yuto visits Qinghai in China, he is ...",,
1808,2073,Hengen Taima Yakou Karura Mau! Sendai Kokeshi ...,Unknown,"Horror, Shoujo",Shoko and Maiko Ougi are apparently two ordina...,,
...,...,...,...,...,...,...,...
16201,48481,Daomu Biji Zhi Qinling Shen Shu,Unknown,"Adventure, Mystery, Supernatural",No synopsis information has been added to this...,,
16202,48483,Mieruko-chan,Unknown,"Comedy, Horror, Supernatural",ko is a typical high school student whose life...,"Comedy, Horror, Supernatural Miko ko is a typi...",Miko
16203,48488,Higurashi no Naku Koro ni Sotsu,Unknown,"Mystery, Dementia, Horror, Psychological, Supe...",Sequel to Higurashi no Naku Koro ni Gou .,,
16204,48491,Yama no Susume: Next Summit,Unknown,"Adventure, Slice of Life, Comedy",New Yama no Susume anime.,"Adventure, Slice of Life, Comedy YukimuraAoi, ...","YukimuraAoi, AobaKokona, KuraueHinata"


In [268]:
get_anime_rows(synopsis_df, 'shaman')

Unnamed: 0,MAL_ID,Name,Score,Genres,synopsis,combined,Characters
132,154,Shaman King,7.78,"Action, Adventure, Comedy, Super Power, Supern...",attle is about to begin in Tokyo: the Shaman F...,"Action, Adventure, Comedy, Super Power, Supern...","AsakuraYou, TaoRen, AsakuraHao"
1051,1182,Shamanic Princess,6.64,"Action, Fantasy, Magic, Romance","From the Guardian World, home of mages, Tiara ...","Action, Fantasy, Magic, Romance Tiara, Lena, G...","Tiara, Lena, Graham"
2968,3861,Shaman King Specials,7.29,"Action, Comedy, Supernatural, Shounen","Throughout the broadcast of Shaman King, many ...","Action, Comedy, Supernatural, Shounen AsakuraY...","AsakuraYou, TaoRen, KyouyamaAnna"
3577,5466,Mahoutsukai Tai! vs. Shamanic Princess,5.84,"Comedy, Magic, Fantasy",parody-esque epilogue to both Shamanic Princes...,"Comedy, Magic, Fantasy Tiara, TakakuraTakeo pa...","Tiara, TakakuraTakeo"
14172,39205,Oshamani,Unknown,Kids,"Fashionable, hipster, and ironic--these gruff ...",,
15550,42205,Shaman King (2021),Unknown,"Action, Adventure, Comedy, Super Power, Supern...",Shamans are extraordinary individuals with the...,"Action, Adventure, Comedy, Super Power, Supern...","AsakuraYou, TaoRen, AsakuraHao"


In [45]:
# synopsis_df.to_csv(DATA_DIR + 'new_synopsis.csv', index=False)

In [68]:
# characters_df = synopsis_df.copy()
# characters_df.Characters.dropna(inplace=True)

In [72]:
# characters_df.to_csv(DATA_DIR + 'characters.csv', index=False)

## Remove unnecessary animes

- like scores < 7, year < 1990, unknown synopsis etc.

In [12]:
ori_df = pd.read_csv(DATA_DIR + 'anime_with_synopsis.csv')
ori_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,synopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [57]:
characters_df = pd.read_csv(DATA_DIR + 'characters.csv')
characters_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,synopsis,combined,Characters
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, SciFi, Space","In the year 2071, humanity has colonized sever...","Action, Adventure, Comedy, Drama, SciFi, Space...","SpiegelSpike, ValentineFaye, Wong Hau Pepelu T..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, SciFi, Space","other day, another bounty—such is the life of ...","Action, Drama, Mystery, SciFi, Space other day...","SpiegelSpike, ValentineFaye, Wong Hau Pepelu T..."
2,6,Trigun,8.24,"Action, SciFi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...","Action, SciFi, Adventure, Comedy, Drama, Shoun...","Vash the Stampede, WolfwoodNicholas D., Thomps..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...,"Action, Mystery, Police, Supernatural, Drama, ...","SenaRobin, Amon, LeeMichael"
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...,"Adventure, Fantasy, Shounen, Supernatural It i...","Beet, Kissu, Slade"


In [58]:
characters_df[characters_df.Characters.isna()]

Unnamed: 0,MAL_ID,Name,Score,Genres,synopsis,combined,Characters


In [59]:
characters_df = characters_df[characters_df.Characters != ' ']

In [60]:
characters_df = characters_df[characters_df.Genres != 'Unknown']

In [61]:
characters_df = characters_df[characters_df.Score != 'Unknown']

In [62]:
characters_df.shape

(8895, 7)

In [63]:
characters_df.Score = characters_df.Score.astype(float)

In [64]:
characters_df = characters_df[characters_df.Score > 6.5]
characters_df.shape

(5529, 7)

In [65]:
characters_df[characters_df.Score < 7].sort_values('Score', ascending=False).head(20)

Unnamed: 0,MAL_ID,Name,Score,Genres,synopsis,combined,Characters
5106,9712,Maria†Holic Alive,6.99,"Comedy, Parody, School, Shoujo Ai",hen Kanako Miyamae transferred to a new all-gi...,"Comedy, Parody, School, Shoujo Ai hen Kanako M...","ShidouMariya, ShinoujiMatsurika, MiyamaeKanako"
12121,35868,Grisaia: Phantom Trigger The Animation,6.99,"Action, School","Following the Heath Oslo incident, the existen...","Action, School Following the Heath Oslo incide...","ShishigayaTouka, IkomaMurasaki, FukamiRena"
10769,33420,RESTART POiNTER,6.99,Music,"Official music video for IDOLiSH7's song ""REST...",Music Official music video for IDOLiSH7's song...,"NanaseRiku, OusakaSougo, YotsubaTamaki"
10233,32323,Aishen Qiaokeli-ing...,6.99,"Harem, Comedy, Romance, School","Jiang Hao Yi, a normal, high school student wh...","Harem, Comedy, Romance, School Jiang Hao Yi, a...","OuyangXueli, XiaZitong, JiangHaoyi"
324,358,"Iriya no Sora, UFO no Natsu",6.99,"Drama, Romance, SciFi",saba Naoyuki is an ordinary high school studen...,"Drama, Romance, SciFi saba Naoyuki is an ordin...","IriyaKana, AsabaNaoyuki, SuizenjiKunihiro"
8796,28617,Punch Line,6.99,"Comedy, Ecchi, SciFi, Super Power, Supernatural",fter escaping a bus hijacking with the help of...,"Comedy, Ecchi, SciFi, Super Power, Supernatura...","IridatsuYuuta, NaruginoMikatan, HikiotaniIto"
421,461,One Piece Movie 3: Chinjuu-jima no Chopper Oukoku,6.99,"Action, Adventure, Comedy, Super Power, Fantas...",The crew comes upon Crown Island where the ani...,"Action, Adventure, Comedy, Super Power, Fantas...","Monkey D.Luffy, RoronoaZoro, Sanji"
7062,19023,"Wake Up, Girls!",6.99,"Drama, Music","On Christmas 2013, the band Wake Up, Girls pla...","Drama, Music On Christmas 2013, the band Wake ...","ShimadaMayu, OkamotoMiyu, KatayamaMinami"
6493,16157,Choujigen Game Neptune The Animation,6.99,"Action, Comedy, Fantasy, Parody, SciFi, Supern...",fter years of fruitless war between the four r...,"Action, Comedy, Fantasy, Parody, SciFi, Supern...","Neptune, Noire, Blanc"
5882,12281,Rinne no Lagrange Season 2,6.99,"Action, Comedy, Mecha, SciFi",Several months have passed since Lan and Mugin...,"Action, Comedy, Mecha, SciFi Several months ha...","KyounoMadoka, Fin E Ld SiLaffinty, Muginami"


In [66]:
anime_df.head(2)

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,Producers,Licensors,Studios,Source,Duration,Rating,Ranked,Popularity,Members,Favorites,Watching,Completed,On-Hold,Dropped,Plan to Watch,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,24 min. per ep.,R - 17+ (violence & profanity),28.0,39,1251960,61971,105808,718161,71513,26678,329800,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,1 hr. 55 min.,R - 17+ (violence & profanity),159.0,518,273145,1174,4143,208333,1935,770,57964,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0


In [67]:
characters_df = pd.merge(characters_df, anime_df, on='MAL_ID', suffixes=('', '_y'))
characters_df = characters_df[['MAL_ID', 'Name', 'Type', 'Aired', 'Score', 'Ranked', 'Popularity', 'Genres', 'synopsis', 'combined', 'Characters']]

In [68]:
characters_df.Aired.str[-4:].value_counts()

2016    314
2015    304
2014    291
2017    289
2018    278
2013    278
2012    253
2011    248
2019    246
2010    233
2007    227
2008    225
2009    224
2006    202
2020    196
2005    155
2004    135
2003    132
2002    106
2001     97
1999     87
2000     77
1998     72
1996     70
1997     68
1995     67
to ?     57
1994     51
1993     44
1992     44
1991     37
1986     35
1989     35
1988     35
1987     34
1990     34
1985     31
1984     28
1982     22
1981     22
1983     21
1979     20
2021     17
1980     16
1978     15
1975     10
1976      8
1969      7
1977      7
1974      6
1971      5
1968      4
1973      2
1966      2
1972      2
1964      1
1970      1
1962      1
1961      1
Name: Aired, dtype: int64

In [69]:
# len(characters_df[characters_df.Aired.str[-4:] == 'to ?'])
print(len(characters_df[characters_df.Aired.str.contains('?', regex=False)]))
# characters_df[characters_df.Aired.str.contains('?', regex=False)]

57


In [70]:
def get_air_date(date_str):
    try:
        date = int(date_str[-4:])
    except:
        # to find the first 4 digits (i.e. year)
        date = int(re.search(r'\d{4}', date_str).group())
    return date

In [71]:
characters_df.Aired = characters_df.Aired.apply(get_air_date)

In [72]:
characters_df = characters_df[characters_df.Aired > 1990]

In [73]:
characters_df.sort_values('Ranked')

Unnamed: 0,MAL_ID,Name,Type,Aired,Score,Ranked,Popularity,Genres,synopsis,combined,Characters
2181,5114,Fullmetal Alchemist: Brotherhood,TV,2010,9.19,1.0,3,"Action, Military, Adventure, Comedy, Drama, Ma...","""In order for something to be obtained, someth...","Action, Military, Adventure, Comedy, Drama, Ma...","ElricEdward, ElricAlphonse, MustangRoy"
4725,35180,3-gatsu no Lion 2nd Season,TV,2018,9.00,10.0,545,"Drama, Game, Seinen, Slice of Life","Now in his second year of high school, Rei Kir...","Drama, Game, Seinen, Slice of Life Now in his ...","KiriyamaRei, KawamotoHinata, KawamotoAkari"
4283,31933,JoJo no Kimyou na Bouken Part 4: Diamond wa Ku...,TV,2016,8.51,100.0,163,"Action, Adventure, Comedy, Supernatural, Drama...","The year is 1999. Morioh, a normally quiet and...","Action, Adventure, Comedy, Supernatural, Drama...","KuujouJoutarou, KiraYoshikage, HigashikataJousuke"
3990,28249,Arslan Senki (TV),TV,2015,7.71,1000.0,467,"Action, Adventure, Drama, Fantasy, Historical,...",The year is 320. Under the rule of the bellige...,"Action, Adventure, Drama, Fantasy, Historical,...","Arslan, Daryun, Narsus"
1887,3603,JoJo no Kimyou na Bouken: Phantom Blood,Movie,2007,7.71,1000.0,2403,"Action, Adventure, Horror, Shounen, Vampire",adaptation of the original five volume arc of ...,"Action, Adventure, Horror, Shounen, Vampire ad...","BrandoDio, JoestarJonathan, ZeppeliWill A."
...,...,...,...,...,...,...,...,...,...,...,...
5045,37585,Yarichin☆Bitch-bu,OVA,2019,6.66,Unknown,2117,"Comedy, School, Yaoi",Takashi Toono was unsure what to expect when h...,"Comedy, School, Yaoi Takashi Toono was unsure ...","KashimaYuu, ToonoTakashi, YuriAyato"
552,747,Haru wo Daite Ita,OVA,2005,6.76,Unknown,3437,"Drama, Romance, Yaoi",The adorable and attractive main characters of...,"Drama, Romance, Yaoi The adorable and attracti...","IwakiKyousuke, KatouYouji, SawaNagisa"
5308,39761,Saezuru Tori wa Habatakanai: The Clouds Gather,Movie,2020,7.57,Unknown,3900,"Drama, Yaoi",Yashiro is the young leader of Shinseikai and ...,"Drama, Yaoi Yashiro is the young leader of Shi...","Yashiro, DoumekiChikara, KugaEishin"
2670,8634,Koisuru Boukun,OVA,2010,7.11,Unknown,2086,"Comedy, Romance, Yaoi",Tetsuhiro Morinaga is in love with his uppercl...,"Comedy, Romance, Yaoi Tetsuhiro Morinaga is in...","TatsumiSouichi, MorinagaTetsuhiro, TatsumiKanako"


In [74]:
characters_df.rename(columns={'Aired': 'Year'}, inplace=True)

In [75]:
# characters_df.to_csv(DATA_DIR + 'characters_17-May.csv', index=False)

In [314]:
characters_df = pd.read_csv(DATA_DIR + 'characters_17-May.csv')
characters_df.head()

Unnamed: 0,MAL_ID,Name,Type,Year,Score,Ranked,Popularity,Genres,synopsis,combined,Characters
0,1,Cowboy Bebop,TV,1999,8.78,28.0,39,"Action, Adventure, Comedy, Drama, SciFi, Space","In the year 2071, humanity has colonized sever...","Action, Adventure, Comedy, Drama, SciFi, Space...","SpiegelSpike, ValentineFaye, Wong Hau Pepelu T..."
1,5,Cowboy Bebop: Tengoku no Tobira,Movie,2001,8.39,159.0,518,"Action, Drama, Mystery, SciFi, Space","other day, another bounty—such is the life of ...","Action, Drama, Mystery, SciFi, Space other day...","SpiegelSpike, ValentineFaye, Wong Hau Pepelu T..."
2,6,Trigun,TV,1998,8.24,266.0,201,"Action, SciFi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...","Action, SciFi, Adventure, Comedy, Drama, Shoun...","Vash the Stampede, WolfwoodNicholas D., Thomps..."
3,7,Witch Hunter Robin,TV,2002,7.27,2481.0,1467,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...,"Action, Mystery, Police, Supernatural, Drama, ...","SenaRobin, Amon, LeeMichael"
4,8,Bouken Ou Beet,TV,2005,6.98,3710.0,4369,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...,"Adventure, Fantasy, Shounen, Supernatural It i...","Beet, Kissu, Slade"


In [13]:
characters_df.shape, ori_df.shape

((5124, 10), (16214, 5))

In [83]:
# characters_df['combined'] = characters_df['Genres'] + ' ' + characters_df['Characters'] + ' '

## Get all character names

In [409]:
characters_df.Characters = characters_df.MAL_ID.apply(get_char_names, top_k=10)

In [410]:
characters_df.Characters[:5]

0    SpiegelSpike, ValentineFaye, WongHauPepeluTivr...
1    SpiegelSpike, ValentineFaye, WongHauPepeluTivr...
2    VashtheStampede, WolfwoodNicholasD, ThompsonMi...
3    SenaRobin, Amon, LeeMichael, SakakiHaruto, Kar...
4    Beet, Kissu, Slade, Milfa, Poala, Zenon, Shagg...
Name: Characters, dtype: object

In [417]:
len(characters_df[characters_df.Characters.isna()])

9

In [465]:
characters_df.dropna(inplace=True)
characters_df.reset_index(inplace=True, drop=True)

In [470]:
# characters_df.to_csv(DATA_DIR + 'filtered_full_char_names.csv', index=False)

In [447]:
def separate_char_names(name):
    # name = 'Saber, ToosakaRin, Archer'
    name = name.replace(', ', '')
    name_list = re.findall('[A-Z]{1}[^A-Z]+', name)
    name_list = [txt.strip() for txt in name_list]
    return name_list

In [448]:
characters_df['char_name_list'] = characters_df.Characters.apply(separate_char_names)

In [449]:
characters_df.combined = characters_df.char_name_list.apply(str) + ' ' + characters_df.synopsis
characters_df.combined[0]

'[\'Spiegel\', \'Spike\', \'Valentine\', \'Faye\', \'Wong\', \'Hau\', \'Pepelu\', \'Tivrusky\', \'Edward\', \'Black\', \'Jet\', \'Ein\', \'Vicious\', \'Julia\', \'Eckener\', \'Grencia\', \'Mars\', \'Elijah\', \'Guo\', \'Vonde\', \'Oniyate\', \'Andy\', \'Mad\', \'Pierrot\'] In the year 2071, humanity has colonized several of the planets and moons of the solar system leaving the now uninhabitable surface of planet Earth behind. The Inter Solar System Police attempts to keep peace in the galaxy, aided in part by outlaw bounty hunters, referred to as "Cowboys." The ragtag team aboard the spaceship Bebop are two such individuals. Mellow and carefree Spike Spiegel is balanced by his boisterous, pragmatic partner Jet Black as the pair makes a living chasing bounties and collecting rewards. Thrown off course by the addition of new members that they meet in their travels—Ein, a genetically engineered, highly intelligent Welsh Corgi; femme fatale Faye Valentine, an enigmatic trickster with memor

In [450]:
def remove_names(text):
    # text.split(']')[0]
    idx = re.search(r']', text).span()[1] + 1
    name_list = eval(text[:idx])
    text = text[idx:]
    regex = '|'.join(name_list)
    text = re.sub(regex, '', text)
    return text

In [451]:
characters_df.combined = characters_df.combined.apply(remove_names)

In [452]:
idx = 1
display(characters_df.loc[[idx]])
print(characters_df.combined.loc[idx])

Unnamed: 0,index,MAL_ID,Name,Type,Year,Score,Ranked,Popularity,Genres,synopsis,combined,Characters,char_name_list
1,1,5,Cowboy Bebop: Tengoku no Tobira,Movie,2001,8.39,159.0,518,"Action, Drama, Mystery, SciFi, Space","other day, another bounty—such is the life of ...","other day, another bounty—such is the life of ...","SpiegelSpike, ValentineFaye, WongHauPepeluTivr...","[Spiegel, Spike, Valentine, Faye, Wong, Hau, P..."


other day, another bounty—such is the life of the often unlucky crew of the Bebop. However, this routine is interrupted when , who is chasing a fairly worthless target on Mars, witnesses an oil tanker suddenly explode, causing mass hysteria. As casualties mount due to a strange disease spreading through the smoke from the blast, a whopping three hundred million woolong price is placed on the head of the supposed perpetrator. With lives at stake and a solution to their money problems in sight, the Bebop crew springs into action. , , , and , followed closely by , split up to pursue different leads across Alba City. Through their individual investigations, they discover a cover-up scheme involving a pharmaceutical company, revealing a plot that reaches much further than the ragtag team of bounty hunters could have realized.


In [453]:
characters_df.combined[id_to_idx(269)]

"  is an ordinary high schooler—until his family is attacked by a Hollow, a corrupt spirit that seeks to devour human souls. It is then that he meets a Soul Reaper named  , who gets injured while protecting 's family from the assailant. To save his family,  accepts 's offer of taking her powers and becomes a Soul Reaper as a result. However, as  is unable to regain her powers,  is given the daunting task of hunting down the Hollows that plague their town. However, he is not alone in his fight, as he is later joined by his friends—classmates  ,  , and  —who each have their own unique abilities. As  and his comrades get used to their new duties and support each other on and off the battlefield, the young Soul Reaper soon learns that the Hollows are not the only real threat to the human world."

In [454]:
characters_df['combined'] = characters_df['Genres'] + ' ' + characters_df['Characters'] + ' ' + characters_df['combined']

In [519]:
characters_df['genre_and_char'] = characters_df['Genres'] + ' ' + characters_df['Characters']

In [455]:
characters_df.combined[id_to_idx(269)]

"Action, Adventure, Comedy, Super Power, Supernatural, Shounen KurosakiIchigo, KuchikiRukia, AbaraiRenji, InoueOrihime, IshidaUryuu, SadoYasutora, ZarakiKenpachi, HitsugayaToushirou, CiferUlquiorra, UraharaKisuke   is an ordinary high schooler—until his family is attacked by a Hollow, a corrupt spirit that seeks to devour human souls. It is then that he meets a Soul Reaper named  , who gets injured while protecting 's family from the assailant. To save his family,  accepts 's offer of taking her powers and becomes a Soul Reaper as a result. However, as  is unable to regain her powers,  is given the daunting task of hunting down the Hollows that plague their town. However, he is not alone in his fight, as he is later joined by his friends—classmates  ,  , and  —who each have their own unique abilities. As  and his comrades get used to their new duties and support each other on and off the battlefield, the young Soul Reaper soon learns that the Hollows are not the only real threat to the

In [506]:
# characters_df.to_csv(DATA_DIR + 'filtered_full_char_names.csv', index=False)

- ! OR TRY add the character names to stopwords to remove in TfidVectorizer

In [456]:
stopwords_set = set(stopwords.words('english'))
print(stopwords_set)

{'in', 'wasn', "should've", 'and', 'you', 'now', "mightn't", 'its', "needn't", 'those', 'as', 'out', 'only', "haven't", 'won', 'for', 'her', "wasn't", 're', 'to', 'ourselves', 'ain', 'own', 'on', 'if', "wouldn't", 'each', 't', "that'll", 'both', 'off', 'theirs', 'about', 'such', 'that', 'again', 'whom', 'me', 'he', 'it', 'are', "it's", 'this', 'we', 'couldn', 'but', 'more', 'doing', 'or', "hasn't", 'any', "mustn't", "doesn't", 'was', "isn't", 'other', "don't", 've', 'some', "shouldn't", 'them', 'yourselves', 'their', 'until', 'shan', 'having', 'through', 'do', 'at', 'shouldn', 'up', 'under', 'haven', "she's", 'she', 'same', 'doesn', "won't", 'when', 'does', 'hasn', 'into', 'been', 'hers', 'just', 'how', 'yours', 'll', 'before', 'few', 'what', 'mustn', 'down', 'y', 'ma', 'isn', 'be', 'were', 'him', 'during', 'am', 'itself', 'ours', 'don', 'd', 'nor', 'they', 'has', 'have', 'once', 'hadn', 'which', 'i', 'his', 'these', "hadn't", 'where', 'aren', 'is', 'himself', 'over', 'there', 's', "yo

In [208]:
# unique_char_names = pickle.load(open('data/unique_character_names.pkl', 'rb'))
# unique_char_names = set([x.lower() for x in unique_char_names])

# stopwords_set.update(unique_char_names)

In [57]:
regex = '|'.join(unique_char_names)
# synopsis_df.Name.str.replace(regex, '', case=False, regex=True)

In [81]:
%%timeit
# adding flag will double the time, but don't need IGNORECASE for names
re.sub(regex, '', synopsis_df.synopsis[2])

49.2 ms ± 301 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [99]:
%%timeit
synopsis_df.iloc[:1].apply(remove_char_names, axis=1)

1.54 ms ± 11.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [111]:
specific_animes.apply(remove_char_names, axis=1)

KurosakiIchigo, KuchikiRukia
KurosakiIchigo, KuchikiRukia
KurosakiIchigo, KuchikiRukia
KurosakiIchigo, KuchikiRukia
KurosakiIchigo, HitsugayaToushirou
KurosakiIchigo, KuchikiRukia
KurosakiIchigo, KuchikiRukia
KurosakiIchigo, KuchikiRukia
nan
nan


Unnamed: 0,MAL_ID,Name,Score,Genres,synopsis,combined,Characters
237,269,Bleach,7.8,"Action, Adventure, Comedy, Super Power, Supern...",Ichigo Kurosaki is an ordinary high schooler—u...,"Action, Adventure, Comedy, Super Power, Supern...","KurosakiIchigo, KuchikiRukia"
679,762,Bleach: Memories in the Rain,7.15,"Action, Adventure, Supernatural, Drama, Shounen","Kurosaki Ichigo, the temporary Shinigami(Death...","Action, Adventure, Supernatural, Drama, Shoune...","KurosakiIchigo, KuchikiRukia"
731,834,Bleach: The Sealed Sword Frenzy,6.98,"Action, Adventure, Comedy, Super Power, Supern...",Shinigami named Baishin who was sealed by Soul...,"Action, Adventure, Comedy, Super Power, Supern...","KurosakiIchigo, KuchikiRukia"
1468,1686,Bleach Movie 1: Memories of Nobody,7.45,"Action, Adventure, Comedy, Super Power, Supern...","hen a life ends, its soul departs to its final...","Action, Adventure, Comedy, Super Power, Supern...","KurosakiIchigo, KuchikiRukia"
2431,2889,Bleach Movie 2: The DiamondDust Rebellion - Mo...,7.45,"Action, Adventure, Comedy, Super Power, Supern...",ssigned to protect a royal procession transpor...,"Action, Adventure, Comedy, Super Power, Supern...","KurosakiIchigo, HitsugayaToushirou"
3342,4835,Bleach Movie 3: Fade to Black - Kimi no Na wo ...,7.52,"Action, Adventure, Comedy, Super Power, Supern...",fter a mysterious pair attack Rukia Kuchiki an...,"Action, Adventure, Comedy, Super Power, Supern...","KurosakiIchigo, KuchikiRukia"
4599,8247,Bleach Movie 4: Jigoku-hen,7.61,"Action, Adventure, Comedy, Super Power, Supern...","""Hell"" is the place where a person is sent to ...","Action, Adventure, Comedy, Super Power, Supern...","KurosakiIchigo, KuchikiRukia"
11677,35118,Bleach KaraBuri!: Gotei Juusan Yatai Daisakusen!,6.51,"Comedy, Shounen",animated adaptation of several chapters of Col...,"Comedy, Shounen animated adaptation of several...","KurosakiIchigo, KuchikiRukia"
13891,38810,Bleach: Gotei 13 Omake,6.45,"Action, Adventure",unknown,"Action, Adventure No synopsis information has ...",
15348,41467,Bleach: Sennen Kessen-hen,Unknown,"Action, Adventure, Comedy, Super Power, Supern...",unknown,"Action, Adventure, Comedy, Super Power, Supern...",


In [82]:
%%timeit
synopsis_df.loc[:1, 'synopsis'].str.replace(regex, '', regex=True)

91.8 ms ± 188 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [69]:
# need about 40 mins to finish replacing with re.sub
48.9 * 1e-3 * 48492 / 60

39.52098

# Recommendation system modelling

In [81]:
# get_char_names(33840)

In [227]:
text = 'Grim Reaper'
text.lower() in stopwords_set

True

In [340]:
print(sorted(stopwords_set))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'some',

In [457]:
capital_stopwords = [x.capitalize() for x in stopwords_set]
stopwords_list = list(stopwords_set)
stopwords_list.extend(capital_stopwords)
cap_stopwords_set = set(stopwords_list)
print(sorted(cap_stopwords_set))

['A', 'About', 'Above', 'After', 'Again', 'Against', 'Ain', 'All', 'Am', 'An', 'And', 'Any', 'Are', 'Aren', "Aren't", 'As', 'At', 'Be', 'Because', 'Been', 'Before', 'Being', 'Below', 'Between', 'Both', 'But', 'By', 'Can', 'Couldn', "Couldn't", 'D', 'Did', 'Didn', "Didn't", 'Do', 'Does', 'Doesn', "Doesn't", 'Doing', 'Don', "Don't", 'Down', 'During', 'Each', 'Few', 'For', 'From', 'Further', 'Had', 'Hadn', "Hadn't", 'Has', 'Hasn', "Hasn't", 'Have', 'Haven', "Haven't", 'Having', 'He', 'Her', 'Here', 'Hers', 'Herself', 'Him', 'Himself', 'His', 'How', 'I', 'If', 'In', 'Into', 'Is', 'Isn', "Isn't", 'It', "It's", 'Its', 'Itself', 'Just', 'Ll', 'M', 'Ma', 'Me', 'Mightn', "Mightn't", 'More', 'Most', 'Mustn', "Mustn't", 'My', 'Myself', 'Needn', "Needn't", 'No', 'Nor', 'Not', 'Now', 'O', 'Of', 'Off', 'On', 'Once', 'Only', 'Or', 'Other', 'Our', 'Ours', 'Ourselves', 'Out', 'Over', 'Own', 'Re', 'S', 'Same', 'Shan', "Shan't", 'She', "She's", 'Should', "Should've", 'Shouldn', "Shouldn't", 'So', 'Some',

In [520]:
# vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1), lowercase=False, stop_words=stopwords_set)
vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1), lowercase=False, stop_words=cap_stopwords_set)
# tfid_matrix = vectorizer.fit_transform(characters_df['combined'])
tfid_matrix = vectorizer.fit_transform(characters_df['genre_and_char'])
tfid_matrix.shape

(5115, 22558)

In [490]:
# vectorizer = CountVectorizer(min_df=1, lowercase=False, stop_words=cap_stopwords_set)
# tfid_matrix = vectorizer.fit_transform(characters_df['combined'])
# tfid_matrix.shape

In [521]:
cos = cosine_similarity(tfid_matrix, tfid_matrix)
cos.shape

(5115, 5115)

In [19]:
def id_to_idx(anime_id):
    return characters_df.loc[characters_df.MAL_ID == anime_id].index[0]

def idx_to_id(anime_idx):
    return characters_df.loc[anime_idx, 'MAL_ID']

In [492]:
# anime_id = 269  # bleach
anime_id = 269
def check_anime(anime_id, return_words=False, verbose=1):
    anime_idx = id_to_idx(anime_id)
    words = sorted(vectorizer.inverse_transform(tfid_matrix[anime_idx])[0])
    display(characters_df.loc[[anime_idx]])
    if verbose:
        print("Synopsis:", characters_df.loc[anime_idx, 'synopsis'])
        print(words)
    if return_words:
        return words
check_anime(anime_id)

Unnamed: 0,MAL_ID,Name,Type,Year,Score,Ranked,Popularity,Genres,synopsis,combined,Characters,char_name_list
207,269,Bleach,TV,2012,7.8,818.0,33,"Action, Adventure, Comedy, Super Power, Supern...",Ichigo Kurosaki is an ordinary high schooler—u...,"Action, Adventure, Comedy, Super Power, Supern...","KurosakiIchigo, KuchikiRukia, AbaraiRenji, Ino...","[Kurosaki, Ichigo, Kuchiki, Rukia, Abarai, Ren..."


Synopsis: Ichigo Kurosaki is an ordinary high schooler—until his family is attacked by a Hollow, a corrupt spirit that seeks to devour human souls. It is then that he meets a Soul Reaper named Rukia Kuchiki, who gets injured while protecting Ichigo's family from the assailant. To save his family, Ichigo accepts Rukia's offer of taking her powers and becomes a Soul Reaper as a result. However, as Rukia is unable to regain her powers, Ichigo is given the daunting task of hunting down the Hollows that plague their town. However, he is not alone in his fight, as he is later joined by his friends—classmates Orihime Inoue, Yasutora Sado, and Uryuu Ishida—who each have their own unique abilities. As Ichigo and his comrades get used to their new duties and support each other on and off the battlefield, the young Soul Reaper soon learns that the Hollows are not the only real threat to the human world.
['AbaraiRenji', 'Action', 'Adventure', 'CiferUlquiorra', 'Comedy', 'HitsugayaToushirou', 'Holl

In [47]:
characters_df.loc[anime_idx]

MAL_ID                                                       20
Name                                                     Naruto
Year                                                       2007
Score                                                      7.91
Ranked                                                    660.0
Popularity                                                    8
Genres        Action, Adventure, Comedy, Super Power, Martia...
synopsis      oments prior to Naruto Uzumaki's birth, a huge...
combined      Action, Adventure, Comedy, Super Power, Martia...
Characters           UzumakiNaruto, HatakeKakashi, UchihaSasuke
Name: 10, dtype: object

In [460]:
all_anime_types = set(characters_df.Type.unique())

@logger.catch
def get_recommendation(anime_name, top_k=10, anime_types='TV', popular=False):
    df = characters_df.copy()
    # get the index of the anime row
    try:
        anime_idx = get_anime_rows(df, anime_name).index[0]
    except:
        raise Exception('Anime not found!')
    print(f"Recommending Animes for: {df.loc[anime_idx, 'Name']} ...")
    # generate a list of tuples of (index, cosine_scores) for the anime
    cosine_scores = list(enumerate(cos[anime_idx]))
    # sort by the highest similarity
    cosine_scores = sorted(cosine_scores, key=lambda x: x[1], reverse=True)
    # get the indices for the sorted animes
    anime_indices = [x[0] for x in cosine_scores]
    # get the df rows for the sorted animes
    df = df.loc[anime_indices]
    if popular:
        df = df[df.Popularity < 2500]
    if anime_types == 'all':
        pass
    elif isinstance(anime_types, (str, tuple, list)):
        anime_types = set([anime_types]) if isinstance(anime_types, str) else set(anime_types)
        assert anime_types.issubset(all_anime_types), 'Anime type does not exist'
        df = df[df.Type.isin(anime_types)]
    else:
        raise Exception('Type of anime_types variable is not valid.')
    # return only the top k recommended animes
    return df.iloc[0:top_k + 1]

- !! MAYBE can try to sort the recommended animes by Score

In [505]:
check_anime(356)

Unnamed: 0,MAL_ID,Name,Type,Year,Score,Ranked,Popularity,Genres,synopsis,combined,Characters,char_name_list
265,356,Fate/stay night,TV,2006,7.34,2152.0,119,"Action, Supernatural, Magic, Romance, Fantasy","fter a mysterious inferno kills his family, Sh...","Action, Supernatural, Magic, Romance, Fantasy ...","Saber, ToosakaRin, Archer, EmiyaShirou, EmiyaK...","[Saber, Toosaka, Rin, Archer, Emiya, Shirou, E..."


Synopsis: fter a mysterious inferno kills his family, Shirou is saved and adopted by Kiritsugu Emiya, who teaches him the ways of magic and justice. One night, years after Kiritsugu's death, Shirou is cleaning at school, when he finds himself caught in the middle of a deadly encounter between two superhumans known as Servants. During his attempt to escape, the boy is caught by one of the Servants and receives a life-threatening injury. Miraculously, he survives, but the same Servant returns to finish what he started. In desperation, Shirou summons a Servant of his own, a knight named Saber. The two must now participate in the Fifth Holy Grail War, a battle royale of seven Servants and the mages who summoned them, with the grand prize being none other than the omnipotent Holy Grail itself. Fate/stay night follows Shirou as he struggles to find the fine line between a hero and a killer, his ideals clashing with the harsh reality around him. Will the boy become a hero like his foster fath

In [518]:
check_anime(11783)

Unnamed: 0,MAL_ID,Name,Type,Year,Score,Ranked,Popularity,Genres,synopsis,combined,Characters,char_name_list
2679,11783,Dog Days',TV,2012,7.09,3248.0,1544,"Action, Adventure, Magic, Fantasy",Cinque returns to the land of Flonyard in Dog ...,"Action, Adventure, Magic, Fantasy FiriannoBisc...","FiriannoBiscottiMillhiore, MartinozziÉclair, G...","[Firianno, Biscotti, Millhiore, MartinozziÉcla..."


Synopsis: Cinque returns to the land of Flonyard in Dog Days' to resume his duties as the hero of the Biscotti Republic and it's as though he never left! The difference this time is that he's brought two friends with him who become heroes in their own right: childhood best friend Rebecca Anderson, who becomes the hero for the Principality of Pastillage at the urging of their leader Princess Couvert Eschenbach Pastillage, and his cousin Nanami Takatsuki, who becomes the hero for the Galette Lion Dominion. As with the first season though, with so many secrets to be uncovered and mysteries to be solved, will there even be time for the athletic events that these three hyper and athletic teenagers love to participate in so much?
['Action', 'Adventure', 'AndersonRebecca', 'Days', 'Dog', 'Dominion', 'ElmarRicotta', 'EschenbachPastillageCouvert', 'Fantasy', 'FiriannoBiscottiMillhiore', 'Flonyard', 'Galette', 'GalettedesRoisLeonmitchelli', 'IzumiCinque', 'Lion', 'Magic', 'MartinozziÉclair', 'Pa

In [503]:
def compare_anime(anime_id, anime_id_recommended):
    words_1 = check_anime(anime_id, return_words=True, verbose=0)
    words_rec = check_anime(anime_id_recommended, return_words=True, verbose=0)
    total = 0
    for item in words_1:
        if item in words_rec:
            total += 1
    return f"Total same words found = {total}"

In [513]:
compare_anime(356, 10087)

Unnamed: 0,MAL_ID,Name,Type,Year,Score,Ranked,Popularity,Genres,synopsis,combined,Characters,char_name_list
265,356,Fate/stay night,TV,2006,7.34,2152.0,119,"Action, Supernatural, Magic, Romance, Fantasy","fter a mysterious inferno kills his family, Sh...","Action, Supernatural, Magic, Romance, Fantasy ...","Saber, ToosakaRin, Archer, EmiyaShirou, EmiyaK...","[Saber, Toosaka, Rin, Archer, Emiya, Shirou, E..."


Unnamed: 0,MAL_ID,Name,Type,Year,Score,Ranked,Popularity,Genres,synopsis,combined,Characters,char_name_list
2453,10087,Fate/Zero,TV,2011,8.34,192.0,59,"Action, Supernatural, Magic, Fantasy","h the promise of granting any wish, the omnipo...","Action, Supernatural, Magic, Fantasy Saber, Em...","Saber, EmiyaKiritsugu, Gilgamesh, KotomineKire...","[Saber, Emiya, Kiritsugu, Gilgamesh, Kotomine,..."


'Total same words found = 37'

In [525]:
get_recommendation('subarashi', top_k=30, anime_types=('TV'))
# get_recommendation('fate', top_k=30, anime_types=('TV'), popular=1)

Recommending Animes for: Kono Subarashii Sekai ni Shukufuku wo! ...


Unnamed: 0,MAL_ID,Name,Type,Year,Score,Ranked,Popularity,Genres,synopsis,combined,Characters,char_name_list,genre_and_char
3738,30831,Kono Subarashii Sekai ni Shukufuku wo!,TV,2016,8.15,363.0,34,"Adventure, Comedy, Fantasy, Magic, Parody, Sup...",fter dying a laughable and pathetic death on h...,"Adventure, Comedy, Fantasy, Magic, Parody, Sup...","Megumin, SatouKazuma, Aqua, DustinessFordLalat...","[Megumin, Satou, Kazuma, Aqua, Dustiness, Ford...","Adventure, Comedy, Fantasy, Magic, Parody, Sup..."
3996,32937,Kono Subarashii Sekai ni Shukufuku wo! 2,TV,2017,8.31,225.0,69,"Adventure, Comedy, Parody, Supernatural, Magic...","hen Kazuma Satou died, he was given two choice...","Adventure, Comedy, Parody, Supernatural, Magic...","Megumin, SatouKazuma, Aqua, DustinessFordLalat...","[Megumin, Satou, Kazuma, Aqua, Dustiness, Ford...","Adventure, Comedy, Parody, Supernatural, Magic..."
4758,38472,Isekai Quartet,TV,2019,7.45,1736.0,510,"Comedy, Fantasy, Parody",It is a normal day; everyone from deranged mil...,"Comedy, Fantasy, Parody Emilia, SatouKazuma, N...","Emilia, SatouKazuma, NatsukiSubaru, Degurechaf...","[Emilia, Satou, Kazuma, Natsuki, Subaru, Degur...","Comedy, Fantasy, Parody Emilia, SatouKazuma, N..."
4912,39988,Isekai Quartet 2,TV,2020,7.38,2002.0,951,"Comedy, Parody, Fantasy",Despite completing all the tasks given to them...,"Comedy, Parody, Fantasy Emilia, SatouKazuma, N...","Emilia, SatouKazuma, NatsukiSubaru, Degurechaf...","[Emilia, Satou, Kazuma, Natsuki, Subaru, Degur...","Comedy, Parody, Fantasy Emilia, SatouKazuma, N..."
2577,10790,Kore wa Zombie Desu ka? of the Dead,TV,2012,7.56,1392.0,432,"Action, Harem, Comedy, Supernatural, Magic, Ecchi",kawa Ayumu was revived as a zombie by the cute...,"Action, Harem, Comedy, Supernatural, Magic, Ec...","HellscytheEucliwood, AikawaAyumu, Seraphim, Yo...","[Hellscythe, Eucliwood, Aikawa, Ayumu, Seraphi...","Action, Harem, Comedy, Supernatural, Magic, Ec..."
515,738,MÄR,TV,2007,7.28,2455.0,2427,"Action, Adventure, Comedy, Fantasy, Shounen","Dreaming of a magical world every night, the y...","Action, Adventure, Comedy, Fantasy, Shounen To...","ToramizuGinta, Dorothy, Alviss, Nanashi, Snow,...","[Toramizu, Ginta, Dorothy, Alviss, Nanashi, Sn...","Action, Adventure, Comedy, Fantasy, Shounen To..."
516,740,Bishoujo Senshi Sailor Moon R,TV,1994,7.69,1051.0,1213,"Demons, Magic, Romance, Shoujo",Usagi Tsukino and her friends have been living...,"Demons, Magic, Romance, Shoujo TsukinoUsagi, H...","TsukinoUsagi, HinoRei, KinoMakoto, AinoMinako,...","[Tsukino, Usagi, Hino, Rei, Kino, Makoto, Aino...","Demons, Magic, Romance, Shoujo TsukinoUsagi, H..."
395,530,Bishoujo Senshi Sailor Moon,TV,1993,7.68,1070.0,514,"Demons, Magic, Romance, Shoujo",Usagi Tsukino is an average student and crybab...,"Demons, Magic, Romance, Shoujo TsukinoUsagi, H...","TsukinoUsagi, HinoRei, KinoMakoto, AinoMinako,...","[Tsukino, Usagi, Hino, Rei, Kino, Makoto, Aino...","Demons, Magic, Romance, Shoujo TsukinoUsagi, H..."
1969,6166,Asobi ni Iku yo!,TV,2010,6.65,5140.0,917,"Comedy, Ecchi, Harem, Romance, SciFi","Kio is just another boring, nice guy with a bo...","Comedy, Ecchi, Harem, Romance, SciFi Eris, Fut...","Eris, FutabaAoi, KinjouManami, KakazuKio, Chai...","[Eris, Futaba, Aoi, Kinjou, Manami, Kakazu, Ki...","Comedy, Ecchi, Harem, Romance, SciFi Eris, Fut..."
3503,25731,Cross Ange: Tenshi to Ryuu no Rondo,TV,2015,7.41,1857.0,958,"Action, Mecha, SciFi",gelise Ikaruga Misurugi is the first princess ...,"Action, Mecha, SciFi IkarugaMisurugiAngelise, ...","IkarugaMisurugiAngelise, SchlievogtHildegard, ...","[Ikaruga, Misurugi, Angelise, Schlievogt, Hild...","Action, Mecha, SciFi IkarugaMisurugiAngelise, ..."


In [152]:
get_recommendation('bleach').iloc[-2, -2]

'Action, Adventure, Comedy, Super Power, Supernatural, Shounen KurosakiIchigo, KuchikiRukia, AbaraiRenji "Hell" is the place where a person is sent to who committed violent crimes when they were alive. Shinigami are forbidden to go there. One day, prisoners revolt and make their escape to Karakura—the real world—where Ichigo and his friends live in. Ichigo and his friends are defeated one after the other by prisoners with overwhelming power. A mysterious man appears who comes to their rescue. With Kokutou leading the way, Ichigo, Rukia, Uryuu, and Renji marche into Hell to save the world. Note: Episode 299 of Bleach serves as a prologue to this movie. (Source: AniDB, edited)'

In [121]:
def get_link_by_text(soup, anime_id, text):
    a_tags = soup.find_all("a", text=text)
    links = list(filter(lambda x: str(anime_id) in x["href"], a_tags))
    return links[0]["href"]

In [81]:
anime_id = 1
data = requests.get(f"https://myanimelist.net/anime/{anime_id}")

anime_info = data.text
soup = BeautifulSoup(anime_info, "html.parser")

In [73]:
# Running once will only remove the first found <script> element,
# so it's unnecessary
# soup.script.decompose()

In [1]:
link_review = get_link_by_text(soup, anime_id, "Reviews")
link_recomendations = get_link_by_text(soup, anime_id, "Recommendations")
link_stats = get_link_by_text(soup, anime_id, "Stats")
link_staff = get_link_by_text(soup, anime_id, "Characters & Staff")
link_pictures = get_link_by_text(soup, anime_id, "Pictures")

In [80]:
link_review

'https://myanimelist.net/anime/1/Cowboy_Bebop/reviews?p=24'

In [78]:
link = link_review
page = 23
actual_link = f"{link}?p={page}"
data = requests.get(actual_link)
soup = BeautifulSoup(data.text, "html.parser")
reviews = soup.find_all("a", text="Overall Rating")

In [83]:
list(filter(lambda x: str(anime_id) in x["href"], soup.find_all("a", text=text)))

[<a href="https://myanimelist.net/anime/1/Cowboy_Bebop/reviews">Reviews</a>]