## Anime Recommendation Engine

### Data Preprocessing

In [30]:
import pandas as pd
from pathlib import Path
import os

In [31]:
# Read the raw combined dataset from ../data/raw, resolved against the current working directory.
raw_dataset_path = Path(os.getcwd()).parent.joinpath('data', 'raw', 'combined_dataset_raw.csv')
df = pd.read_csv(raw_dataset_path)

In [32]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

In [33]:
def process_duration(row):
    """Return the total runtime of one title in minutes.

    The free-text ``Duration`` value (for example ``'24 min. per ep.'``,
    ``'1 hr. 10 min.'`` or ``'45 sec.'``) is parsed into minutes and then
    multiplied by ``Episodes``.

    Durations expressed in seconds used to fall into the "minutes" branch and
    were counted as whole minutes; they are now converted to fractions of a
    minute.

    Args:
        row: Mapping or Series exposing ``'Duration'`` (str) and
            ``'Episodes'`` (number).

    Returns:
        ``0`` when the duration is ``'Unknown'``; otherwise the parsed
        minutes multiplied by ``row['Episodes']``.

    Raises:
        ValueError: If no ``hr``/``min``/``sec`` unit is present and the first
            token is not an integer.
    """
    duration = row['Duration']
    if duration == 'Unknown':
        return 0

    tokens = duration.split()
    minutes = 0
    found_unit = False
    # Walk over adjacent "<number> <unit>" pairs such as ('1', 'hr.') or ('10', 'min.').
    for amount, unit in zip(tokens, tokens[1:]):
        if not amount.isdigit():
            continue
        if unit.startswith('hr'):
            minutes += int(amount) * 60
        elif unit.startswith('min'):
            minutes += int(amount)
        elif unit.startswith('sec'):
            minutes += int(amount) / 60
        else:
            continue
        found_unit = True

    if not found_unit:
        # No recognised unit: keep the previous behaviour of reading the
        # leading token as a number of minutes.
        minutes = int(duration.split(' ')[0])

    return minutes * row['Episodes']

# Evaluate process_duration once per row and keep the result next to the raw text column.
df['Duration_mins'] = df.apply(process_duration, axis='columns')

In [34]:
# Compare the raw duration text with the parsed total minutes for the first few rows.
duration_preview_columns = ['Duration', 'Duration_mins', 'Episodes', 'Type']
df[duration_preview_columns].head()

Unnamed: 0,Duration,Duration_mins,Episodes,Type
0,24 min. per ep.,288.0,12.0,TV
1,23 min. per ep.,299.0,13.0,TV
2,1 hr. 10 min.,70.0,1.0,OVA
3,24 min. per ep.,1536.0,64.0,TV
4,30 min.,30.0,1.0,Special


In [36]:
# Spot-check the parsed duration on titles with exactly 110 episodes.
df.loc[df['Episodes'] == 110, ['Duration', 'Duration_mins', 'Episodes', 'Type']].head()

Unnamed: 0,Duration,Duration_mins,Episodes,Type
6741,26 min. per ep.,2860.0,110.0,OVA
14368,26 min. per ep.,2860.0,110.0,OVA
37959,26 min. per ep.,2860.0,110.0,OVA
41345,26 min. per ep.,2860.0,110.0,OVA
42158,26 min. per ep.,2860.0,110.0,OVA


In [37]:
# Swap the raw text column for its numeric counterpart, keeping the name 'Duration'.
df.drop(columns=['Duration'], inplace=True)
df.rename(columns={'Duration_mins': 'Duration'}, inplace=True)

In [44]:
# Combine Score-1 to Score-10 for a combined user_score column which ranges between 1 and 10
# eg. if Score-1 = 8, Score-2 = 9, Score-3 = 10, then user_score = 1*(8/(8+9+10)) + 2*(9/(8+9+10)) + 3*(10/(8+9+10))
def process_user_score(row):
    """Collapse the ``Score-1`` .. ``Score-10`` columns into one weighted mean.

    Each ``Score-<n>`` value acts as the weight of the score ``n``; the result
    is ``sum(n * weight) / sum(weight)``.

    Args:
        row: Mapping or Series exposing the keys ``'Score-1'`` .. ``'Score-10'``.

    Returns:
        The weighted mean score, or ``0`` when all weights are zero.
    """
    weighted_sum = 0
    weight_sum = 0
    for star in range(1, 11):
        weight = row[f'Score-{star}']
        if weight == 0:
            # Zero weights contribute nothing to either sum.
            continue
        weighted_sum += star * weight
        weight_sum += weight
    return weighted_sum / weight_sum if weight_sum != 0 else 0
# Evaluate process_user_score once per row and store the result as 'user_score'.
df['user_score'] = df.apply(process_user_score, axis='columns')

In [45]:
# The per-score columns are folded into 'user_score', so remove them.
score_columns = [f'Score-{i}' for i in range(1, 11)]
df.drop(columns=score_columns, inplace=True)

In [48]:
# Remove the per-status count columns.
df.drop(columns=['Watching', 'Completed', 'On-Hold', 'Dropped', 'Plan to Watch'], inplace=True)

In [50]:
# Remove the 'Licensors' column.
df.drop(columns=['Licensors'], inplace=True)

In [57]:
# Remove the 'Premiered' and 'Aired' columns.
df.drop(columns=['Premiered', 'Aired'], inplace=True)

In [58]:
# List the columns that remain after the drops above.
df.columns

Index(['user_id', 'anime_id', 'rating', 'watching_status', 'episodes_watched',
       'Name', 'Score', 'Genres', 'English name', 'Japanese name', 'Type',
       'Episodes', 'Producers', 'Studios', 'Source', 'Rating', 'Ranked',
       'Popularity', 'Members', 'Favorites', 'Duration', 'user_score'],
      dtype='object')

In [62]:
# Remove the 'Members' and 'Favorites' columns.
df.drop(columns=['Members', 'Favorites'], inplace=True)

In [77]:
# Remove the 'Producers' column.
df.drop(columns=['Producers'], inplace=True)

In [63]:
# Re-list the columns after the 'Members' / 'Favorites' drop.
# NOTE(review): the saved output still lists 'Producers' because this cell (In [63]) was last
# executed before the 'Producers' drop (In [77]); re-run the notebook top to bottom to refresh it.
df.columns

Index(['user_id', 'anime_id', 'rating', 'watching_status', 'episodes_watched',
       'Name', 'Score', 'Genres', 'English name', 'Japanese name', 'Type',
       'Episodes', 'Producers', 'Studios', 'Source', 'Rating', 'Ranked',
       'Popularity', 'Duration', 'user_score'],
      dtype='object')

In [78]:
# Persist the cleaned, still-combined frame under ../data/processed (row index omitted).
combined_processed_path = Path(os.getcwd()).parent.joinpath('data', 'processed', 'combined_dataset_processed.csv')
df.to_csv(combined_processed_path, index=False)

In [65]:
# List the columns once more before the frame is split.
# NOTE(review): the saved output still lists 'Producers' because this cell (In [65]) was last
# executed before the 'Producers' drop (In [77]); re-run the notebook top to bottom to refresh it.
df.columns

Index(['user_id', 'anime_id', 'rating', 'watching_status', 'episodes_watched',
       'Name', 'Score', 'Genres', 'English name', 'Japanese name', 'Type',
       'Episodes', 'Producers', 'Studios', 'Source', 'Rating', 'Ranked',
       'Popularity', 'Duration', 'user_score'],
      dtype='object')

In [79]:
# Split the combined frame: one frame keeps the per-user interaction columns (plus the
# 'anime_id' key), the other keeps everything except the user-specific columns.
interaction_columns = ['user_id', 'anime_id', 'rating', 'watching_status', 'episodes_watched']
user_only_columns = [column for column in interaction_columns if column != 'anime_id']
user_recommendation_df = df[interaction_columns]
anime_data_df = df.drop(columns=user_only_columns)

In [80]:
# Keep the first occurrence of each fully identical row.
# presumably this leaves one row per anime_id — it does only if a title's metadata is identical
# on every interaction row; TODO confirm.
anime_data_df = anime_data_df.drop_duplicates(keep='first')

In [81]:
# Persist the de-duplicated anime metadata under ../data/processed (row index omitted).
anime_mapping_path = Path(os.getcwd()).parent.joinpath('data', 'processed', 'anime_mapping.csv')
anime_data_df.to_csv(anime_mapping_path, index=False)

In [82]:
# Persist the user/anime interaction rows under ../data/processed (row index omitted).
user_recommendations_path = Path(os.getcwd()).parent.joinpath('data', 'processed', 'user_recommendations.csv')
user_recommendation_df.to_csv(user_recommendations_path, index=False)

In [83]:
# Preview the first five rows of the anime metadata frame.
anime_data_df.head(5)

Unnamed: 0,anime_id,Name,Score,Genres,English name,Japanese name,Type,Episodes,Studios,Source,Rating,Ranked,Popularity,Duration,user_score
0,27831,Durarara!!x2 Ten,8.01,"Action, Mystery, Supernatural",Durarara!! x2 Ten,デュラララ!!×２ 転,TV,12.0,Shuka,Light novel,R - 17+ (violence & profanity),529.0,448.0,288.0,8.012107
1,29785,Jitsu wa Watashi wa,6.91,"Comedy, Supernatural, Romance, Vampire, Fantas...","Actually, I am...",実は私は,TV,13.0,"TMS Entertainment, 3xCube",Manga,PG-13 - Teens 13 or older,4004.0,761.0,299.0,6.882443
2,10417,Gyo,5.45,Horror,GYO:Tokyo Fish Attack!,ギョ,OVA,1.0,ufotable,Manga,R+ - Mild Nudity,9806.0,2337.0,70.0,5.445792
3,5114,Fullmetal Alchemist: Brotherhood,9.19,"Action, Military, Adventure, Comedy, Drama, Ma...",Fullmetal Alchemist:Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,TV,64.0,Bones,Manga,R - 17+ (violence & profanity),1.0,3.0,1536.0,9.071444
4,1015,Full Metal Panic! The Second Raid: Wari to Him...,7.69,Comedy,Unknown,わりとヒマな戦隊長の一日,Special,1.0,Kyoto Animation,Manga,PG-13 - Teens 13 or older,1051.0,1884.0,30.0,7.700794


In [85]:
# Anime-level columns that get joined back onto every interaction row.
common_features = [
    'anime_id',
    'Score',
    'Genres',
    'Type',
    'Studios',
    'Source',
    'Ranked',
    'Popularity',
    'Duration',
    'user_score',
]
anime_df_slice = anime_data_df.loc[:, common_features]

In [87]:
# Attach the anime features to each interaction row; the left join keeps every interaction.
# assumes anime_df_slice holds one row per anime_id — duplicates would multiply interaction rows; TODO confirm.
dataset_processed = pd.merge(
    user_recommendation_df,
    anime_df_slice,
    how='left',
    on='anime_id',
)

In [88]:
# Preview the first five rows of the merged dataset.
dataset_processed.head(5)

Unnamed: 0,user_id,anime_id,rating,watching_status,episodes_watched,Score,Genres,Type,Studios,Source,Ranked,Popularity,Duration,user_score
0,247449,27831,0,completed,12.0,8.01,"Action, Mystery, Supernatural",TV,Shuka,Light novel,529.0,448.0,288.0,8.012107
1,329360,29785,8,completed,13.0,6.91,"Comedy, Supernatural, Romance, Vampire, Fantas...",TV,"TMS Entertainment, 3xCube",Manga,4004.0,761.0,299.0,6.882443
2,26819,10417,7,completed,1.0,5.45,Horror,OVA,ufotable,Manga,9806.0,2337.0,70.0,5.445792
3,228283,5114,7,completed,64.0,9.19,"Action, Military, Adventure, Comedy, Drama, Ma...",TV,Bones,Manga,1.0,3.0,1536.0,9.071444
4,322675,1015,8,completed,1.0,7.69,Comedy,Special,Kyoto Animation,Manga,1051.0,1884.0,30.0,7.700794


In [90]:
# Persist the merged dataset under ../data/processed (row index omitted).
dataset_processed_path = Path(os.getcwd()).parent.joinpath('data', 'processed', 'dataset_processed.csv')
dataset_processed.to_csv(dataset_processed_path, index=False)

### Load File Data

In [3]:
import pandas as pd
from pathlib import Path
import os

In [4]:
# Reload the merged dataset written by the preprocessing section.
dataset_processed_path = Path(os.getcwd()).parent.joinpath('data', 'processed', 'dataset_processed.csv')
dataset_processed = pd.read_csv(dataset_processed_path)

In [6]:
from sentence_transformers import SentenceTransformer, util

In [7]:
# Load the pretrained sentence-embedding checkpoint by name (the log below shows it being downloaded).
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 455kB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 92.0kB/s]
README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 7.91MB/s]
config.json: 100%|██████████| 612/612 [00:00<00:00, 1.03MB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 145kB/s]
data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 195kB/s]
pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:03<00:00, 30.3MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 27.1kB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 58.6kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 757kB/s]
tokenizer_config.json: 100%|██████████| 350/350 [00:00<00:00, 124kB/s]
train_script.py: 100%|██████████| 13.2k/13.2k [00:00<00:00, 7.94MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 568kB/s]
modules.json: 100%|██████████| 349/349 [00:00<00:00, 178kB/s]


In [8]:
# The genre strings are the text that gets embedded.
sentences = dataset_processed.loc[:, 'Genres']

In [9]:
# `sentences` has one entry per user/anime interaction row, so the same genre string
# appears many times (the saved run of this cell ended in KeyboardInterrupt).
# Encode each distinct string once, then fan the vectors back out so that
# `embeddings` still has one row per entry of `sentences`, in the same order.
genre_codes, unique_genres = pd.factorize(pd.Series(sentences).astype(str))
unique_embeddings = model.encode(list(unique_genres))
embeddings = unique_embeddings[genre_codes]

KeyboardInterrupt: 

In [None]:
# Locations of the persisted artefacts, all under ../models relative to the working directory.
MODEL_BASE_PATH = Path(os.getcwd()).parent.joinpath('models')
EMBEDDINGS_PATH = MODEL_BASE_PATH.joinpath('embeddings.joblib')
MODEL_PATH = MODEL_BASE_PATH.joinpath('model.joblib')

In [None]:
import joblib

# joblib.dump opens the target file directly and fails if the parent directory
# is missing, so create ../models first (no-op when it already exists).
MODEL_BASE_PATH.mkdir(parents=True, exist_ok=True)
joblib.dump(embeddings, EMBEDDINGS_PATH)
joblib.dump(model, MODEL_PATH)

### Model Running

In [1]:
import pickle
from pathlib import Path

import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from tensorflow import keras

In [2]:
class AnimeRecommenderModel(tfrs.Model):
    """Retrieval model that pairs a user model with an anime model.

    Attributes are plain references to the objects passed in:
    ``user_model`` embeds ``features['user_id']``, ``anime_model`` embeds
    ``features['Name']`` and ``task`` turns both embeddings into a loss.
    """

    def __init__(
        self,
        user_model: keras.Model,
        anime_model: keras.Model,
        task: tfrs.tasks.Retrieval
    ):
        """Store the two embedding models and the retrieval task.

        Args:
            user_model: Callable model applied to the ``'user_id'`` feature.
            anime_model: Callable model applied to the ``'Name'`` feature.
            task: Retrieval task called with (user embeddings, anime embeddings).
        """
        super().__init__()
        self.user_model = user_model
        self.anime_model = anime_model
        self.task = task

    def compute_loss(self, features, training=False) -> tf.Tensor:
        """Return the task's result for one batch of features.

        Args:
            features: Mapping that provides ``'user_id'`` and ``'Name'``.
            training: Accepted for interface compatibility; not used here.

        Returns:
            Whatever ``self.task`` returns for the two embedding batches.
        """
        return self.task(
            self.user_model(features['user_id']),
            self.anime_model(features['Name']),
        )

    

In [19]:
class ILPipeline:
    """Wraps a saved recommender for further training and top-k lookups.

    On construction it loads the saved user and anime models, rebuilds an
    ``AnimeRecommenderModel`` around them with a fresh retrieval task, restores
    the model weights and loads the saved index used for recommendations.
    """

    def __init__(
        self,
        user_model_path: Path,
        anime_model_path: Path,
        task_path: Path,
        model_path: Path,
        index_path: Path,
        user_dataset_path: Path = None,
        anime_dataset_path: Path = None,
        epochs: int = 100,
    ):
        """Load every saved artefact needed by the pipeline.

        Args:
            user_model_path: Location passed to ``tf.saved_model.load`` for the user model.
            anime_model_path: Location passed to ``tf.saved_model.load`` for the anime model.
            task_path: Not read. Kept so existing positional callers keep
                working; a new ``tfrs.tasks.Retrieval()`` is created instead
                of loading a stored task.
            model_path: Location passed to ``load_weights`` on the rebuilt model.
            index_path: Location passed to ``tf.saved_model.load`` for the index.
            user_dataset_path: Optional CSV read with pandas; required by
                ``train_incremental``.
            anime_dataset_path: Optional CSV read with pandas.
            epochs: Number of epochs used by ``train_incremental``. Defaults
                to 100, the value that was previously hard-coded.
        """
        self._user_model = tf.saved_model.load(user_model_path)
        self._anime_model = tf.saved_model.load(anime_model_path)
        self._task = tfrs.tasks.Retrieval()
        self._model = AnimeRecommenderModel(self._user_model, self._anime_model, self._task)
        self._model.load_weights(model_path)
        self._index = tf.saved_model.load(index_path)
        self._user_dataset = pd.read_csv(user_dataset_path) if user_dataset_path else None
        self._anime_dataset = pd.read_csv(anime_dataset_path) if anime_dataset_path else None
        self._epochs = epochs

    def train_incremental(self):
        """Fit the model on the user dataset for the configured number of epochs.

        Raises:
            AssertionError: If no ``user_dataset_path`` was given at construction.
        """
        assert self._user_dataset is not None, \
            "train_incremental requires user_dataset_path to be provided at construction"
        # NOTE(review): the model is never compiled in this class and the dataset is
        # passed as loaded by pandas, while compute_loss reads features['user_id'] and
        # features['Name'] — confirm that fit() accepts this input as-is.
        self._model.fit(self._user_dataset, epochs=self._epochs)

    def get_top_k_recommendations(self, user_id, k = 5):
        """Return up to ``k`` recommended titles for ``user_id``.

        Args:
            user_id: Identifier passed to the loaded index as a one-element batch.
            k: Maximum number of titles to return.

        Returns:
            List of decoded title strings, at most ``k`` long and never more
            than the number of candidates the index returns for one query.
        """
        _, animes = self._index([user_id])
        animes = animes.numpy()[0]
        return [a.decode() for a in animes[:k]]

In [20]:
# Build the pipeline from the artefacts stored under ../models.
# No dataset paths are given, so only recommendation lookups are usable.
# task_path is still passed, although the constructor does not read it.
ilp = ILPipeline(
    user_model_path="../models/user_model/",
    anime_model_path="../models/anime_model/",
    task_path="../models/task.pkl",
    model_path="../models/recommender_model_weights/",
    index_path="../models/recommender_index/",
)



In [36]:
# Ask the loaded index for up to ten recommendations for user 4590.
ilp.get_top_k_recommendations(user_id=4590, k=10)

['Nurarihyon no Mago: Sennen Makyou',
 'Naruto: Dai Katsugeki!! Yuki Hime Shinobu Houjou Dattebayo! - Konoha no Sato no Dai Undoukai',
 'Sidonia no Kishi',
 'Donten ni Warau',
 'Guilty Crown',
 'Koroshiya-san: The Hired Gun',
 "I''s",
 'Kumo no Mukou, Yakusoku no Basho',
 '5-toubun no Hanayome',
 'Tatakau Shisho: The Book of Bantorra']

In [38]:
import pandas as pd

# Load the stored recommendation list; note that this rebinds the name `df`.
recommendation_list_path = '../recommendation_list_3k.csv'
df = pd.read_csv(recommendation_list_path)

In [43]:
# Look up the stored rows for user 64912 (the saved output shows no matching rows).
df.loc[df['user_id'] == 64912]

Unnamed: 0.1,Unnamed: 0,user_id,rating,Name
