## ANIME RECOMMENDER SYSTEM

## MY ANIME LIST WEBSCRAPPING

In [1]:
# import library
import requests 
from bs4 import BeautifulSoup 

import pandas as pd
import numpy as np
import re

from fuzzywuzzy import fuzz, process
from rapidfuzz import fuzz, process
import difflib

import ipywidgets as widgets
from IPython.display import display

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

In [2]:
# scrapping anime lists such as ranking, title, rating, etc. on the first page
site_url = 'https://myanimelist.net/topanime.php'

response = requests.get(site_url)

if response.status_code == 200:
    doc = BeautifulSoup(response.text, 'html.parser')
    row_content = doc.find_all('tr', class_='ranking-list')
    top_anime = []

    for row in row_content:
        ranking = {
            'Rank': row.find('td', class_="rank ac").find('span').text,
            'Title': row.find('div', class_="di-ib clearfix").find('a').text,
            'Rating': row.find('td', class_="score ac fs14").find('span').text,
            'Image_URL': row.find('td', class_='title al va-t word-break').find('img')['data-src'],
        }

        episode_info = row.find('div', class_="information di-ib mt4").text.strip().split('\n')
        ranking['Episodes'] = episode_info[0].strip() if episode_info else None
        ranking['Dates'] = episode_info[1].strip() if len(episode_info) > 1 else None

        members_info = episode_info[-1].strip().replace('members', '').replace(',', '').strip()
        ranking['Members'] = int(members_info) if members_info.isdigit() else None

        top_anime.append(ranking)

    df_top_anime = pd.DataFrame(top_anime)
    print(df_top_anime.to_string(index=False))

else:
    print("Failed to retrieve the page. Status code:", response.status_code)

In [3]:
# scrapping anime lists such as ranking, title, rating, etc. on the next page
site_url = 'https://myanimelist.net'

def get_top_anime_page(page_number):
    top_anime_url = site_url + '/topanime.php?limit=' + str(page_number * 50)
    response = requests.get(top_anime_url)
    doc = BeautifulSoup(response.text, 'html.parser')
    row_content = doc.find_all('tr', {'class': 'ranking-list'})
    return row_content

def parse_episodes(listt):
    result = []
    for i in listt[:2]:
        r = i.strip()
        result.append(r)
    return result

def parse_members_info(row):
    episode_info = row.find('div', class_="information di-ib mt4").text.strip().split('\n')
    members_info = episode_info[-1].strip().replace('members', '').replace(',', '').strip()
    return int(members_info) if members_info.isdigit() else None

def parse_anime_data(row):
    episode = parse_episodes(row.find('div', class_='information di-ib mt4').text.strip().split('\n'))
    ranking = {
        'Rank': row.find('td', class_='rank ac').find('span').text,
        'Title': row.find('div', class_='di-ib clearfix').find('a').text,
        'Rating': row.find('td', class_='score ac fs14').find('span').text,
        'Image_URL': row.find('td', class_='title al va-t word-break').find('img')['data-src'],
        'Episodes': episode[0],
        'Dates': episode[1],
        'Members': parse_members_info(row)
    }
    return ranking

def get_all_top_anime():
    top_anime = []
    for page_number in range(1, 200):
        row_content = get_top_anime_page(page_number)
        for row in row_content:
            ranking = parse_anime_data(row)
            top_anime.append(ranking)
    return top_anime

all_top_anime = get_all_top_anime()

df_51_10000 = pd.DataFrame(all_top_anime)
print(df_51_10000.to_string())

In [4]:
# combine the scrapping results of the first page and the next page
df_1_50 = pd.DataFrame(top_anime)
df_51_10000 = pd.DataFrame(all_top_anime)

df = pd.concat([df_1_50, df_51_10000], ignore_index=True)
print(df.to_string())

In [5]:
# save to csv
df.to_csv('Anime_Recommender_System_Scrapping.csv', index=False)

## DATA PRE-PROCESSING

In [2]:
# merge the scrapped dataframe with the dataframe from kaggle only to complete additional information such as genre, studios, procedure, and synopsis
anime_movies_scrapping = pd.read_csv('Anime_Recommender_System_Scrapping.csv')
anime_movies = pd.read_csv('Anime_Recommender_System.csv')

df = anime_movies_scrapping[['Rank', 'Title', 'Rating', 'Image_URL', 'Episodes', 'Dates', 'Members']].merge(
     anime_movies[['title', 'genres', 'studios', 'producers', 'synopsis']],
     left_on='Title', right_on='title', how='left')

In [3]:
# removed the title column from the kaggle dataframe, and organized the column names
df.drop(columns='title', inplace=True)

df.rename(columns={'genres': 'Genres',
                   'studios': 'Studios',
                   'producers': 'Producers',
                   'synopsis': 'Synopsis'},
                   inplace=True)
df.head()

Unnamed: 0,Rank,Title,Rating,Image_URL,Episodes,Dates,Members,Genres,Studios,Producers,Synopsis
0,1,Sousou no Frieren,9.12,https://cdn.myanimelist.net/r/50x70/images/ani...,TV (28 eps),Sep 2023 -,342432,"['Adventure', 'Drama', 'Fantasy']",['Madhouse'],"['TOHO animation', 'Shogakukan']","The demon king has been defeated, and the vict..."
1,2,Fullmetal Alchemist: Brotherhood,9.09,https://cdn.myanimelist.net/r/50x70/images/ani...,TV (64 eps),Apr 2009 - Jul 2010,3262313,"['Action', 'Adventure', 'Drama', 'Fantasy']",['Bones'],"['Aniplex', 'Square Enix', 'Mainichi Broadcast...",After a horrific alchemy experiment goes wrong...
2,3,Steins;Gate,9.07,https://cdn.myanimelist.net/r/50x70/images/ani...,TV (24 eps),Apr 2011 - Sep 2011,2505276,"['Drama', 'Sci-Fi', 'Suspense']",['White Fox'],"['Frontier Works', 'Media Factory', 'Kadokawa ...",Eccentric scientist Rintarou Okabe has a never...
3,4,Gintama°,9.06,https://cdn.myanimelist.net/r/50x70/images/ani...,TV (51 eps),Apr 2015 - Mar 2016,614712,"['Action', 'Comedy', 'Sci-Fi']",['Bandai Namco Pictures'],"['TV Tokyo', 'Aniplex', 'Dentsu']","Gintoki, Shinpachi, and Kagura return as the f..."
4,5,Shingeki no Kyojin Season 3 Part 2,9.05,https://cdn.myanimelist.net/r/50x70/images/ani...,TV (10 eps),Apr 2019 - Jul 2019,2194804,"['Action', 'Drama']",['Wit Studio'],"['Production I.G', 'Dentsu', 'Mainichi Broadca...",Seeking to restore humanity's diminishing hope...


In [4]:
# dataset information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7750 entries, 0 to 7749
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Rank       7750 non-null   int64  
 1   Title      7750 non-null   object 
 2   Rating     7750 non-null   float64
 3   Image_URL  7750 non-null   object 
 4   Episodes   7750 non-null   object 
 5   Dates      7750 non-null   object 
 6   Members    7750 non-null   int64  
 7   Genres     7620 non-null   object 
 8   Studios    7620 non-null   object 
 9   Producers  7620 non-null   object 
 10  Synopsis   7523 non-null   object 
dtypes: float64(1), int64(2), object(8)
memory usage: 666.1+ KB


In [5]:
# remove values with white space to NaN and check for missing values
df.replace(r'^\s*$', np.nan, regex=True, inplace=True)
df.isnull().sum()

Rank           0
Title          0
Rating         0
Image_URL      0
Episodes       0
Dates          0
Members        0
Genres       130
Studios      130
Producers    130
Synopsis     227
dtype: int64

In [6]:
# remove missing values
df.dropna(inplace=True)

In [7]:
# remove all punctuation in the title and synopsis fields
def remove_punctuation(df, column_name):
    df[column_name] = df[column_name].str.replace(r"[^\w\s]", " ", regex=True)
    df[column_name] = df[column_name].str.replace(r"\s+", " ", regex=True)

remove_punctuation(df, 'Title')
remove_punctuation(df, 'Synopsis')

In [8]:
# remove_square_brackets_and_quotation_marks in the genres, studios, and producers fields
def remove_square_brackets_and_quotation_marks(df, column_name):
    df[column_name] = df[column_name].str.strip("[]").str.strip("'")
    df[column_name] = df[column_name].apply(lambda x: ' '.join(re.findall(r'\b\w+\b', x)))

remove_square_brackets_and_quotation_marks(df, 'Genres')
remove_square_brackets_and_quotation_marks(df, 'Studios')
remove_square_brackets_and_quotation_marks(df, 'Producers')

In [9]:
# separates the value in the episodes column into 2 columns, namely type and episodes
def process_episodes(episodes):
    match = re.match(r'(\w+) \((\d+) eps\)', episodes)
    if match:
        return match.group(1), int(match.group(2))
    else:
        return None, None

df['Type'], df['Episodes'] = zip(*df['Episodes'].map(process_episodes))
df['Episodes'] = df['Episodes'].replace(r'[\(\)eps]+', '', regex=True)
df['Episodes'] = df['Episodes'].fillna('Unknown')

In [10]:
# separates the values in the dates column into start date and end date
def process_dates(date_range):
    if '-' in date_range:
        start_date, end_date = map(str.strip, date_range.split('-'))
        end_date = 'Present' if end_date == '' else end_date
    else:
        start_date, end_date = date_range, 'Present'
    return start_date, end_date

df[['StartDate', 'EndDate']] = df['Dates'].apply(process_dates).apply(pd.Series)
df = df.drop(columns='Dates')

In [11]:
# create a status column based on whether the anime has finished airing or is still running
def determine_status(end_date):
    return 'Currently Airing' if end_date == 'Present' else 'Finished Airing'

df['Status'] = df['EndDate'].apply(determine_status)

In [12]:
# sort column names
col_order = ['Rank', 'Title', 'Rating', 'Type', 'Episodes', 'StartDate', 'EndDate', 'Status', 'Genres', 'Studios', 'Producers', 'Synopsis', 'Members']
df = df[col_order]

In [13]:
df.head()

Unnamed: 0,Rank,Title,Rating,Type,Episodes,StartDate,EndDate,Status,Genres,Studios,Producers,Synopsis,Members
0,1,Sousou no Frieren,9.12,TV,28.0,Sep 2023,Present,Currently Airing,Adventure Drama Fantasy,Madhouse,TOHO animation Shogakukan,The demon king has been defeated and the victo...,342432
1,2,Fullmetal Alchemist Brotherhood,9.09,TV,64.0,Apr 2009,Jul 2010,Finished Airing,Action Adventure Drama Fantasy,Bones,Aniplex Square Enix Mainichi Broadcasting Syst...,After a horrific alchemy experiment goes wrong...,3262313
2,3,Steins Gate,9.07,TV,24.0,Apr 2011,Sep 2011,Finished Airing,Drama Sci Fi Suspense,White Fox,Frontier Works Media Factory Kadokawa Shoten M...,Eccentric scientist Rintarou Okabe has a never...,2505276
3,4,Gintama,9.06,TV,51.0,Apr 2015,Mar 2016,Finished Airing,Action Comedy Sci Fi,Bandai Namco Pictures,TV Tokyo Aniplex Dentsu,Gintoki Shinpachi and Kagura return as the fun...,614712
4,5,Shingeki no Kyojin Season 3 Part 2,9.05,TV,10.0,Apr 2019,Jul 2019,Finished Airing,Action Drama,Wit Studio,Production I G Dentsu Mainichi Broadcasting Sy...,Seeking to restore humanity s diminishing hope...,2194804


In [14]:
# remove values with white space to NaN and check for missing values
df.replace(r'^\s*$', np.nan, regex=True, inplace=True)
df.isnull().sum()

Rank            0
Title           0
Rating          0
Type           31
Episodes        0
StartDate       0
EndDate         0
Status          0
Genres        197
Studios       303
Producers    2154
Synopsis        0
Members         0
dtype: int64

In [15]:
# remove missing values
df.dropna(inplace=True)
df.reset_index(inplace=True)

In [16]:
# dataset information after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5049 entries, 0 to 5048
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   index      5049 non-null   int64  
 1   Rank       5049 non-null   int64  
 2   Title      5049 non-null   object 
 3   Rating     5049 non-null   float64
 4   Type       5049 non-null   object 
 5   Episodes   5049 non-null   object 
 6   StartDate  5049 non-null   object 
 7   EndDate    5049 non-null   object 
 8   Status     5049 non-null   object 
 9   Genres     5049 non-null   object 
 10  Studios    5049 non-null   object 
 11  Producers  5049 non-null   object 
 12  Synopsis   5049 non-null   object 
 13  Members    5049 non-null   int64  
dtypes: float64(1), int64(3), object(10)
memory usage: 552.4+ KB


## CONTENT-BASED FILTERING

In [17]:
# combine the three features into one text, convert the text into TF-IDF vectors, and calculate the cosine similarity between the TF-IDF vectors
df['Combined'] = df['Genres'] + ' ' + df['Studios'] + ' ' + df['Producers']

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Combined'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
df['Content_Based_Score'] = cosine_sim.diagonal()

## POPULARITY-BASED FILTERING

In [18]:
# scaling the rank, rating, and member columns and creating a popularity score
scaler = MinMaxScaler()
df[['Rank_Normalized', 'Rating_Normalized', 'Members_Normalized']] = scaler.fit_transform(df[['Rank', 'Rating', 'Members']])

weights = {'Rank_Normalized': 0.2, 'Rating_Normalized': 0.5, 'Members_Normalized': 0.3}
df['Popularity_Score'] = df.apply(lambda row: sum(row[col] * weights[col] for col in weights), axis=1)

## ANIME RECOMMENDER SYSTEM

In [19]:
# create a weightscore to determine how much score weight for each algorithm
content_based_weight = 0.7
popularity_based_weight = 0.3

df['Final_Score'] = content_based_weight * df['Content_Based_Score'] + popularity_based_weight * df['Popularity_Score']
df_sorted_final = df.sort_values(by='Final_Score', ascending=False)

In [None]:
# create a search bar box to determine anime recommendations based on user input
def search(title, df_sorted_final, cosine_sim):
    try:
        idx = df_sorted_final[df_sorted_final['Title'].str.contains(title, case=False)].index[0]
    except IndexError:
        print(f"No matching anime found for the input '{title}'.")
        return None

    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_similar_indices = [i[0] for i in sim_scores[1:6]]

    recommended_anime = df_sorted_final[['Title', 'Final_Score']].iloc[top_similar_indices]
    return recommended_anime

anime_input = widgets.Text(
    value='Jujutsu Kaisen',
    description='Anime Title :',
    disabled=False
)

anime_list = widgets.Output()

def on_type(change):
    with anime_list:
        anime_list.clear_output()
        title = change.new
        if len(title) > 2:
            recommended_anime = search(title, df_sorted_final, cosine_sim)
            if recommended_anime is not None:
                display(recommended_anime)

anime_input.observe(on_type, names='value')
display(anime_input, anime_list)

Text(value='Jujutsu Kaisen', description='Anime Title :')

Output()