# CS247 Anime Recommendation Engine

This notebook serves as the project for CS247 Advanced Data Mining.

Group members:
* Yiming Shi 905525611
* Penghai Wei 105726519
* Yongqian Li 004997466
* Yanxun Li 005712570

In this notebook we build a recommender system using the dataset **Anime Recommendation Database 2020**.

The notebook contains the following sections:
* Input Processing
* Baseline Construction
* Side Feature Embedding
* Utilizing Side Feature

# Input Processing

In this section, we extract the basic input from "animelist.csv". We clean the data, remove duplicate rows, normalize the score values, and present an overview of the rating matrix.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Loading input
# Root directory of the Kaggle dataset; the CSV files read in this notebook
# are resolved relative to it.
INPUT_DIR = '/kaggle/input/anime-recommendation-database-2020'
# Shell escape: list the directory to confirm which files are available.
!ls {INPUT_DIR}

In [3]:
# Load the (user, anime, rating) triples.
# Only the first 5,000,000 rows and the three columns listed below are read.
rating_df = pd.read_csv(
    INPUT_DIR + '/animelist.csv',
    usecols=["user_id", "anime_id", "rating"],
    nrows=5000000,
    low_memory=False,
)

# Keep only users who have at least 400 ratings among the loaded rows.
n_ratings = rating_df['user_id'].value_counts()
active_users = n_ratings[n_ratings >= 400].index
rating_df = rating_df.loc[rating_df['user_id'].isin(active_users)].copy()
len(rating_df)

In [4]:
# Min-max scale the ratings into the range [0.0, 1.0].
# The arithmetic is done on the whole column at once instead of calling a
# Python lambda per row, which matters for a frame with millions of rows.
min_rating = rating_df['rating'].min()
max_rating = rating_df['rating'].max()
if max_rating == min_rating:
    # A constant column would make the denominator zero.
    raise ValueError('Cannot min-max scale ratings: all ratings are equal')
rating_df['rating'] = (
    (rating_df['rating'] - min_rating) / (max_rating - min_rating)
).astype(np.float64)

# Mean of the scaled ratings, printed as a quick sanity check.
AvgRating = rating_df['rating'].mean()
print('Avg', AvgRating)

In [5]:
# Removing Duplicated Rows
# A row counts as a duplicate only when every column matches an earlier row.
duplicates = rating_df.duplicated()
n_duplicates = duplicates.sum()

if n_duplicates > 0:
    print('> {} duplicates'.format(n_duplicates))
    # .copy() detaches the filtered frame from the original, so the column
    # assignments made in later cells do not raise SettingWithCopyWarning.
    rating_df = rating_df[~duplicates].copy()

# Re-check: this should now report 0 duplicates.
print('> {} duplicates'.format(rating_df.duplicated().sum()))

In [6]:
# Quick review of our rating matrix:
# the 20 users with the most ratings crossed with the 20 most-rated animes.
g = rating_df.groupby('user_id')['rating'].count()
top_users = g.dropna().sort_values(ascending=False).head(20)
# The inner join keeps only the rows that belong to one of the top users.
top_r = rating_df.join(top_users, on='user_id', how='inner', rsuffix='_r')

g = rating_df.groupby('anime_id')['rating'].count()
top_animes = g.dropna().sort_values(ascending=False).head(20)
# A second inner join narrows those rows down to the top animes.
top_r = top_r.join(top_animes, on='anime_id', how='inner', rsuffix='_r')

# Rows are users, columns are animes, cells hold the summed (scaled) rating.
pd.crosstab(index=top_r.user_id, columns=top_r.anime_id, values=top_r.rating, aggfunc=np.sum)

In [7]:
# Generate Training and testing data.

# Raw ids are replaced by contiguous integer codes (numbered in order of first
# appearance) so they can index the embedding tables of the models below.
# Both directions of each mapping are kept.
user_ids = rating_df["user_id"].unique().tolist()
user_encoded2user = dict(enumerate(user_ids))
user2user_encoded = {raw_id: code for code, raw_id in user_encoded2user.items()}
rating_df["user"] = rating_df["user_id"].map(user2user_encoded)
n_users = len(user2user_encoded)

anime_ids = rating_df["anime_id"].unique().tolist()
anime_encoded2anime = dict(enumerate(anime_ids))
anime2anime_encoded = {raw_id: code for code, raw_id in anime_encoded2anime.items()}
rating_df["anime"] = rating_df["anime_id"].map(anime2anime_encoded)
n_animes = len(anime2anime_encoded)

print("Num of users: {}, Num of animes: {}".format(n_users, n_animes))
lowest_rating = min(rating_df['rating'])
highest_rating = max(rating_df['rating'])
print("Min rating: {}, Max rating: {}".format(lowest_rating, highest_rating))

In [8]:
# Shuffle all rows once with a fixed seed so the split below is reproducible.
rating_df = rating_df.sample(frac=1, random_state=73)

# Features are the encoded (user, anime) pairs; the target is the scaled rating.
feature_columns = ['user', 'anime']
X = rating_df[feature_columns].values
y = rating_df["rating"]

In [9]:
# Split: the last `test_set_size` rows of the shuffled data are held out,
# everything before them is used for training.
test_set_size = 1000 #1k for test set
train_indices = test_indicies = rating_df.shape[0] - test_set_size

X_train, X_test = X[:train_indices], X[test_indicies:]
y_train, y_test = y[:train_indices], y[test_indicies:]

print('> Train set ratings: {}'.format(len(y_train)))
print('> Test set ratings: {}'.format(len(y_test)))

# Baseline Construction

In this baseline, we use the naive collaborative filtering method. The general idea behind this is that we are predicting what users will like based on their similarity to other users. We simply use the rating matrix generated in the last section, and perform matrix factorization on it.


In [34]:
# One array per model input: column 0 is the encoded user, column 1 the encoded anime.
X_train_array = [X_train[:, col] for col in range(2)]
X_test_array = [X_test[:, col] for col in range(2)]

In [10]:
# Setup TPU
import tensorflow as tf

# Switch between TPU and non-TPU execution; the model and callback cells
# below read this flag.
TPU_INIT = False

if TPU_INIT:
    # Connect to the TPU cluster and create the distribution strategy whose
    # scope the models are later built in.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Not using a TPU: show the GPU status via a shell escape instead.
    !nvidia-smi
    
print(tf.__version__)

## Baseline Model

In [36]:
import keras
from keras import layers 
import tensorflow as tf
from keras.models import Model
from tensorflow.keras.optimizers import Adam
from keras.layers import Add, Activation, Lambda, BatchNormalization, Concatenate, Dropout, Input, Embedding, Dot, Reshape, Dense, Flatten

def BaselineNet():
    """Build and compile the collaborative-filtering baseline model.

    Each encoded user id and encoded anime id is mapped to a 128-dimensional
    learned embedding. The two embeddings are combined with an L2-normalised
    dot product, and the resulting scalar goes through a single Dense unit,
    batch normalisation and a sigmoid, so the prediction lies in (0, 1).

    Reads the notebook-level `n_users` and `n_animes` for the embedding
    table sizes.

    Returns:
        A compiled Keras `Model` taking `[user, anime]` inputs, trained with
        binary cross-entropy and reporting MAE and MSE.
    """
    latent_dim = 128

    user_in = Input(name='user', shape=[1])
    user_vec = Embedding(
        name='user_embedding',
        input_dim=n_users,
        output_dim=latent_dim,
    )(user_in)

    anime_in = Input(name='anime', shape=[1])
    anime_vec = Embedding(
        name='anime_embedding',
        input_dim=n_animes,
        output_dim=latent_dim,
    )(anime_in)

    # normalize=True L2-normalises both vectors before taking the dot product.
    score = Dot(name='dot_product', normalize=True, axes=2)([user_vec, anime_vec])
    score = Flatten()(score)
    score = Dense(1, kernel_initializer='he_normal')(score)
    score = BatchNormalization()(score)
    prediction = Activation("sigmoid")(score)

    net = Model(inputs=[user_in, anime_in], outputs=prediction)
    net.compile(optimizer='Adam', loss='binary_crossentropy', metrics=["mae", "mse"])

    return net

# Build the baseline; on TPU the model has to be created inside the strategy scope.
if not TPU_INIT:
    model = BaselineNet()
else:
    with tpu_strategy.scope():
        model = BaselineNet()

model.summary()

In [37]:
# Callbacks
from tensorflow.keras.callbacks import Callback, ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping, ReduceLROnPlateau

# Learning-rate schedule parameters read by `lrfn` below.
start_lr = 0.00001   # learning rate at epoch 0
min_lr = 0.00001     # value the decay phase approaches
max_lr = 0.00005     # peak learning rate reached after the ramp-up
batch_size = 10000

if TPU_INIT:
    # Scale the peak learning rate and the batch size by the number of replicas.
    max_lr = max_lr * tpu_strategy.num_replicas_in_sync
    batch_size = batch_size * tpu_strategy.num_replicas_in_sync

rampup_epochs = 5    # epochs spent ramping linearly from start_lr towards max_lr
sustain_epochs = 0   # epochs held at max_lr after the ramp-up
exp_decay = .8       # per-epoch multiplicative decay factor afterwards

def lrfn(epoch):
    """Return the learning rate to use for the given (0-based) epoch.

    The schedule has three phases, driven by the notebook-level constants
    `start_lr`, `max_lr`, `min_lr`, `rampup_epochs`, `sustain_epochs` and
    `exp_decay`:

    1. linear ramp-up starting at `start_lr` during the first `rampup_epochs`;
    2. constant `max_lr` for the next `sustain_epochs`;
    3. exponential decay from `max_lr` towards `min_lr` afterwards.
    """
    if epoch < rampup_epochs:
        slope = (max_lr - start_lr)/rampup_epochs
        return slope * epoch + start_lr

    epochs_into_decay = epoch-rampup_epochs-sustain_epochs
    if epochs_into_decay < 0:
        # Still inside the sustain window.
        return max_lr

    return (max_lr - min_lr) * exp_decay**epochs_into_decay + min_lr


# Apply the `lrfn` schedule at the start of every epoch.
lr_callback = LearningRateScheduler(lrfn, verbose=0)

# Weights file that receives the best checkpoint.
checkpoint_filepath = './weights.h5'

# Save the weights only when the validation loss improves.
model_checkpoints = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

# Defined for convenience but not active: it is commented out in the list below.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
    restore_best_weights=True,
)

my_callbacks = [
    model_checkpoints,
    lr_callback,
    #early_stopping,
]

In [38]:
# Model training
# The held-out split doubles as validation data, which drives the checkpointing.
fit_options = dict(
    batch_size=batch_size,
    epochs=20,
    verbose=1,
    validation_data=(X_test_array, y_test),
    callbacks=my_callbacks,
)
history = model.fit(x=X_train_array, y=y_train, **fit_options)

# Restore the checkpointed weights (lowest validation loss seen during training).
model.load_weights(checkpoint_filepath)

In [39]:
#Training results
import matplotlib.pyplot as plt
%matplotlib inline

# Plot the loss for every recorded epoch, including the final ones, so the
# end of training (where over-fitting would show up) is visible.
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
# "test" is the held-out split passed to fit() as validation data.
plt.legend(["train", "test"], loc="upper left")
plt.show()

# Side Feature Embedding

Our dataset also contains each anime's name, synopsis, and genres. We want to use these side features, since they carry information that might be useful. We use a pretrained BERT model to embed them and TensorBoard to visualize the embeddings.

## Loading Synopsis Data

In [17]:
# Load the per-anime metadata table that carries the side features.
synopsis_csv_path = INPUT_DIR + '/anime_with_synopsis.csv'
anime_synopsis_df = pd.read_csv(synopsis_csv_path, low_memory=False)
len(anime_synopsis_df)

We tried BERT models of different sizes from HuggingFace. We stored the embedding results in pickle files so that we don't need to rerun this section.

Here are the links to the models we tried.
* https://huggingface.co/bert-base-uncased
* https://huggingface.co/distilbert-base-uncased
* https://huggingface.co/prajjwal1/bert-tiny

In [None]:
# loading pretrained bert
from transformers import AutoTokenizer, AutoModel #for embeddings

# Tokenizer and weights of the pretrained "prajjwal1/bert-tiny" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny",)
# output_hidden_states=True exposes the per-layer hidden states that
# get_embeddings() reads via `.hidden_states`.
bert_model = AutoModel.from_pretrained("prajjwal1/bert-tiny",output_hidden_states=True)

In [None]:
#create embeddings
def get_embeddings(text,token_length):
    """Embed `text` with the pretrained BERT model using mean pooling.

    The text is tokenized and padded/truncated to `token_length`, run through
    `bert_model` as a batch of one, and the last hidden layer is averaged
    over the token axis. The average runs over every position, padding
    included.

    Args:
        text: string to embed.
        token_length: value passed to the tokenizer as `max_length`.
            NOTE(review): confirm that very small values (e.g. 1) still leave
            room for content tokens next to the tokenizer's special tokens.

    Returns:
        NumPy array with a leading batch axis of size 1; callers take `[0]`
        to get the flat vector.
    """
    # Nothing else in the notebook imports torch, so import it here;
    # otherwise this function raises NameError on a fresh kernel.
    import torch

    tokens = tokenizer(text, max_length=token_length, padding='max_length', truncation=True)
    input_ids = torch.tensor(tokens.input_ids).unsqueeze(0)
    attention_mask = torch.tensor(tokens.attention_mask).unsqueeze(0)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = bert_model(input_ids, attention_mask=attention_mask).hidden_states[-1]

    return torch.mean(output, axis=1).detach().numpy()

# Smoke test: show the shape of a single embedding.
get_embeddings("Action", 1).shape

In [None]:
# processing genres
# get all unique genres
from tqdm import tqdm
import pickle

# genre name -> embedding; each distinct genre is embedded only once.
genres_embeddings_map = {}

for _, j in tqdm(anime_synopsis_df.iterrows(), total=anime_synopsis_df.shape[0]):
    # "Genres" is a comma-separated list of genre names.
    for g in j["Genres"].split(', '):
        if g not in genres_embeddings_map:
            genres_embeddings_map[g] = get_embeddings(g, 1)

# The context manager closes the file even if pickling fails.
with open("genre_embedding_map_small.pkl", "wb") as f:
    pickle.dump(genres_embeddings_map, f)

In [None]:
# translate animies into embedding
from tqdm import tqdm
import pickle

# Replace missing values with empty strings so every text field can be tokenized.
anime_synopsis_df = anime_synopsis_df.fillna('')

# MAL_ID -> concatenation of the name embedding and the synopsis embedding.
anime_embedding_map = {}

for _, j in tqdm(anime_synopsis_df.iterrows(), total=anime_synopsis_df.shape[0]):
    name_embedding = get_embeddings(j["Name"], 10)
    sypnopsis_embedding = get_embeddings(j["sypnopsis"], 50)
    # Each embedding carries a leading batch axis of size 1; [0] drops it.
    anime_embedding_map[j["MAL_ID"]] = np.concatenate(
        [name_embedding[0], sypnopsis_embedding[0]]
    )

# The context manager closes the file even if pickling fails.
with open("anime_embedding_map_small.pkl", "wb") as f:
    pickle.dump(anime_embedding_map, f)

## Visualization Using Tensorboard

To visualize the embedding result, we convert the data into TSV files, which TensorBoard accepts.

We could use https://projector.tensorflow.org/ to visualize the embedding result.

In [None]:
# visualize 
import csv

# Labels and vectors come from the same dict, so their order matches.
g_list = list(genres_embeddings_map.keys())
g_embedding_list = list(genres_embeddings_map.values())
# Each stored embedding has a leading axis of size 1; stacking them along
# axis 0 gives one row per genre.
embeddings = np.concatenate(g_embedding_list, axis=0)

# Vectors file: one tab-separated row per genre.
np.savetxt('g_embedding.tsv', embeddings, delimiter='\t')

# Metadata file: one genre label per line, in the same order as the vectors.
# Written as plain text so no CSV quoting or "\r\n" terminator ends up in a label.
with open('g.tsv', 'w') as f_output:
    f_output.write('\n'.join(g_list) + '\n')

## Loading Embedding

Since we stored the embedding in pickle files, we don't need to recalculate the embeddings. We could load from the pickle files.

In [11]:
# loading embeding pkl file
import pickle

anime_embedding_map = {}
# Alternative file built with distilbert-base-uncased:
#   '../input/embedding-small/anime_embedding_map.pkl'
embedding_pickle_path = '../input/embedding-small/anime_embedding_map_small.pkl'  # prajjwal1/bert-tiny
with open(embedding_pickle_path, 'rb') as f:
    # Only load pickle files you produced yourself: unpickling can execute arbitrary code.
    anime_embedding_map = pickle.load(f)

print(len(anime_embedding_map))

# Use Side Feature Embedding

## Using Anime Embedding
First, we modify the training and testing data to use anime embeddings instead of anime ids.

In [12]:
# change anime to anime embedding
# X_train[:, 1] holds the *encoded* anime index, while `anime_embedding_map`
# is keyed by the raw MAL id. Each index is therefore translated back through
# `anime_encoded2anime` before the lookup.
user_train = np.array(X_train[:, 0])
anime_train_list = X_train[:, 1].tolist()

# encoded anime index -> feature vector, so each distinct anime is resolved once.
anime_train_features = {}
for i in set(anime_train_list):
    mal_id = anime_encoded2anime[i]
    if mal_id in anime_embedding_map:
        anime_train_features[i] = anime_embedding_map[mal_id]
    else:
        # No stored embedding: fall back to a random 256-d vector seeded by
        # the MAL id, so the same anime always gets the same vector.
        anime_train_features[i] = np.random.default_rng(mal_id).random(256)

anime_train = np.array([anime_train_features[i] for i in anime_train_list])

In [13]:
# change anime to anime embedding
# X_test[:, 1] holds the *encoded* anime index, while `anime_embedding_map`
# is keyed by the raw MAL id. Each index is therefore translated back through
# `anime_encoded2anime` before the lookup.
user_test = np.expand_dims(np.array(X_test[:, 0]), axis = 1)
anime_test_list = X_test[:, 1].tolist()

# encoded anime index -> feature vector, so each distinct anime is resolved once.
anime_test_features = {}
for i in set(anime_test_list):
    mal_id = anime_encoded2anime[i]
    if mal_id in anime_embedding_map:
        anime_test_features[i] = anime_embedding_map[mal_id]
    else:
        # No stored embedding: fall back to a random 256-d vector seeded by
        # the MAL id, so the same anime always gets the same vector.
        anime_test_features[i] = np.random.default_rng(mal_id).random(256)

anime_test = np.array([anime_test_features[i] for i in anime_test_list])

### Model

In [14]:
import keras
from keras import layers 
import tensorflow as tf
from keras.models import Model
from tensorflow.keras.optimizers import Adam
from keras.layers import Add, Activation, Lambda, BatchNormalization, Concatenate, Dropout, Input, Embedding, Dot, Reshape, Dense, Flatten

# Embedding layers
def Anime_Embedding_Net():
    """Build and compile the model that uses precomputed anime features.

    The user branch is a learned 128-dimensional embedding of the encoded
    user id (flattened to a vector). The anime branch takes a 256-dimensional
    feature vector and projects it through Dense layers of 512 (relu),
    256 (relu) and 128 (softmax) units. Both branches are combined with an
    L2-normalised dot product, followed by a single Dense unit, batch
    normalisation and a sigmoid.

    Reads the notebook-level `n_users` for the user embedding table size.

    Returns:
        A compiled Keras `Model` taking `[user, anime]` inputs, trained with
        binary cross-entropy and reporting MAE and MSE.
    """
    latent_dim = 128

    user_in = Input(name='user', shape=[1])
    user_vec = Embedding(
        name='user_embedding',
        input_dim=n_users,
        output_dim=latent_dim,
    )(user_in)
    user_vec = Flatten()(user_vec)

    anime_in = Input(name='anime', shape=[256])
    anime_vec = anime_in
    # Projection stack for the anime features: (units, activation) per layer.
    for units, activation in ((512, 'relu'), (256, 'relu'), (128, 'softmax')):
        anime_vec = Dense(units, activation=activation)(anime_vec)

    # normalize=True L2-normalises both vectors before taking the dot product.
    score = Dot(name='dot_product', normalize=True, axes=1)([user_vec, anime_vec])
    score = Dense(1, kernel_initializer='he_normal')(score)
    score = BatchNormalization()(score)
    prediction = Activation("sigmoid")(score)

    net = Model(inputs=[user_in, anime_in], outputs=prediction)
    net.compile(optimizer='Adam', loss='binary_crossentropy', metrics=["mae", "mse"])

    return net

print(TPU_INIT)
# On TPU the model has to be created inside the strategy scope.
if not TPU_INIT:
    model = Anime_Embedding_Net()
else:
    with tpu_strategy.scope():
        model = Anime_Embedding_Net()

model.summary()

In [15]:
# Callbacks
from tensorflow.keras.callbacks import Callback, ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping, ReduceLROnPlateau

# Learning-rate schedule parameters read by `lrfn` below.
start_lr = 0.0001    # learning rate at epoch 0
min_lr = 0.00001     # value the decay phase approaches
max_lr = 0.001       # peak learning rate reached after the ramp-up
batch_size = 4096

if TPU_INIT:
    # Scale the peak learning rate and the batch size by the number of replicas.
    max_lr = max_lr * tpu_strategy.num_replicas_in_sync
    batch_size = batch_size * tpu_strategy.num_replicas_in_sync

rampup_epochs = 5    # epochs spent ramping linearly from start_lr towards max_lr
sustain_epochs = 0   # epochs held at max_lr after the ramp-up
exp_decay = .8       # per-epoch multiplicative decay factor afterwards

def lrfn(epoch):
    """Return the learning rate to use for the given (0-based) epoch.

    Uses the notebook-level constants `start_lr`, `max_lr`, `min_lr`,
    `rampup_epochs`, `sustain_epochs` and `exp_decay`:

    * during the first `rampup_epochs` the rate rises linearly from `start_lr`;
    * for the next `sustain_epochs` it stays at `max_lr`;
    * afterwards it decays exponentially from `max_lr` towards `min_lr`.
    """
    in_rampup = epoch < rampup_epochs
    if in_rampup:
        per_epoch_increase = (max_lr - start_lr)/rampup_epochs
        return per_epoch_increase * epoch + start_lr

    in_sustain = epoch < rampup_epochs + sustain_epochs
    if in_sustain:
        return max_lr

    decay_steps = epoch-rampup_epochs-sustain_epochs
    return (max_lr - min_lr) * exp_decay**decay_steps + min_lr


# Apply the `lrfn` schedule at the start of every epoch.
lr_callback = LearningRateScheduler(lrfn, verbose=0)

# Weights file that receives the best checkpoint.
checkpoint_filepath = './weights.h5'

# Save the weights only when the validation loss improves.
model_checkpoints = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

# Defined for convenience but not active: it is commented out in the list below.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
    restore_best_weights=True,
)

my_callbacks = [
    model_checkpoints,
    lr_callback,
    #early_stopping,
]

In [16]:
# Model training
# Inputs are the encoded user ids plus the per-row anime feature vectors; the
# held-out split doubles as validation data, which drives the checkpointing.
fit_options = dict(
    batch_size=batch_size,
    epochs=20,
    verbose=1,
    validation_data=((user_test, anime_test), y_test),
    callbacks=my_callbacks,
)
history = model.fit(x=[user_train, anime_train], y=y_train, **fit_options)

# Restore the checkpointed weights (lowest validation loss seen during training).
model.load_weights(checkpoint_filepath)

In [17]:
#Training results
import matplotlib.pyplot as plt
%matplotlib inline

# Plot the loss for every recorded epoch, including the final ones, so the
# end of training (where over-fitting would show up) is visible.
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
# "test" is the held-out split passed to fit() as validation data.
plt.legend(["train", "test"], loc="upper left")
plt.show()

Although the model overfits slightly, the actual loss is smaller than the baseline's.