In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

import glob
import keras
from keras import layers
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Add, Activation, Lambda, BatchNormalization, Concatenate, Dropout, Input, Embedding, Dot, Reshape, Dense, Flatten

import tensorflow as tf
# Inspirations:
# https://www.kaggle.com/code/chaitanya99/recommendation-system-cf-anime

In [None]:
# Show the GPU devices TensorFlow can see; an empty list means no GPU is visible.
tf.config.experimental.list_physical_devices('GPU')

In [None]:
# Root directory of the dataset.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
CUR_INPUT = "/mnt/d/datasets/anime2020/"
# Glob pattern matching the rating parquet shards; built from CUR_INPUT so the
# two paths cannot drift apart.
INPUT_DIR = f"{CUR_INPUT}animelist_400+/*.parquet"

In [None]:
# glob returns matches in arbitrary order; sort so the same shard is picked on
# every run, and fail with a clear message when nothing matches.
rating_files = sorted(glob.glob(INPUT_DIR))
if not rating_files:
    raise FileNotFoundError(f"No parquet files match {INPUT_DIR!r}")

# NOTE(review): only the first shard is loaded — confirm that is intentional.
rating_df = pd.read_parquet(
    rating_files[0],
    columns=["user", "anime", "rating"]
)

rating_df

In [None]:
# Embedding tables are indexed by id, so they need (largest id + 1) rows.
# Counting distinct ids only gives the same number when the ids present are
# exactly 0..n-1; with gaps the table would be too small for the largest ids.
# Assumes ids are non-negative integer codes — TODO confirm against preprocessing.
n_users = int(rating_df["user"].max()) + 1
n_animes = int(rating_df["anime"].max()) + 1

In [None]:
# Per-column memory footprint in bytes, leaving the index out of the report.
rating_df.memory_usage(index=False, deep=True)

In [None]:
# Mean of the rating column, as a quick sanity check on its scale.
print('Avg', rating_df['rating'].mean())

In [None]:
# Features: one (user, anime) id pair per row. Target: the matching rating.
X = rating_df[['user', 'anime']].to_numpy()
y = rating_df["rating"]

In [None]:
# Split
test_set_size = 10000 #10k for test set
split_seed = 73  # fixed so the split is identical on every run

n_ratings = rating_df.shape[0]
if n_ratings <= test_set_size:
    raise ValueError(
        "Need more than {} ratings to hold out a test set, got {}".format(
            test_set_size, n_ratings
        )
    )
train_indices = n_ratings - test_set_size

# Hold out a random sample rather than the tail of the frame: if the rows are
# stored in any systematic order, the last 10k rows are not representative of
# the rest of the data.
shuffled_rows = np.random.default_rng(split_seed).permutation(n_ratings)
train_rows = shuffled_rows[:train_indices]
test_rows = shuffled_rows[train_indices:]

X_train, X_test, y_train, y_test = (
    X[train_rows],
    X[test_rows],
    y.iloc[train_rows],
    y.iloc[test_rows],
)

print('> Train set ratings: {}'.format(len(y_train)))
print('> Test set ratings: {}'.format(len(y_test)))

In [None]:
# The model has two inputs, so pass the user column and the anime column
# as two separate arrays (column 0 = user, column 1 = anime).
X_train_array = [X_train[:, col] for col in (0, 1)]
X_test_array = [X_test[:, col] for col in (0, 1)]

In [None]:
# del rating_df

In [None]:
# Embedding layers

def RecommenderNet(embedding_size=64, num_users=None, num_animes=None):
    """Build and compile the dot-product collaborative-filtering model.

    Each user id and each anime id is looked up in its own embedding table.
    The two vectors are combined with an L2-normalised dot product (cosine
    similarity), and a Dense(1) + BatchNormalization + sigmoid head turns that
    score into the prediction.

    Args:
        embedding_size: Width of both embedding tables. Defaults to 64.
        num_users: Number of rows in the user embedding table. Defaults to
            the notebook-level ``n_users``.
        num_animes: Number of rows in the anime embedding table. Defaults to
            the notebook-level ``n_animes``.

    Returns:
        A compiled Keras ``Model`` with inputs ``[user, anime]`` and a single
        sigmoid output. The embedding layers are named ``user_embedding`` and
        ``anime_embedding``.
    """
    # Fall back to the notebook globals so existing `RecommenderNet()` calls
    # keep working unchanged.
    if num_users is None:
        num_users = n_users
    if num_animes is None:
        num_animes = n_animes

    user = Input(name = 'user', shape = [1])
    user_embedding = Embedding(name = 'user_embedding',
                               input_dim = num_users,
                               output_dim = embedding_size)(user)

    anime = Input(name = 'anime', shape = [1])
    anime_embedding = Embedding(name = 'anime_embedding',
                                input_dim = num_animes,
                                output_dim = embedding_size)(anime)

    # normalize=True L2-normalises both vectors first, so this is their cosine similarity.
    x = Dot(name = 'dot_product', normalize = True, axes = 2)([user_embedding, anime_embedding])
    x = Flatten()(x)

    x = Dense(1, kernel_initializer='he_normal')(x)
    x = BatchNormalization()(x)
    x = Activation("sigmoid")(x)

    model = Model(inputs=[user, anime], outputs=x)
    # Sigmoid output with binary cross-entropy: presumably the ratings are
    # pre-scaled to [0, 1] — TODO confirm against the preprocessing step.
    model.compile(loss='binary_crossentropy', metrics=["mae", "mse"], optimizer='Adam')

    return model

# Build the compiled model and print its layer-by-layer summary.
model = RecommenderNet()
model.summary()

In [None]:
# Callbacks
from tensorflow.keras.callbacks import Callback, ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping, ReduceLROnPlateau

# Learning-rate schedule settings: linear ramp-up, optional plateau, then
# exponential decay back towards min_lr.
start_lr = 1e-5
min_lr = 1e-5
max_lr = 5e-5
batch_size = 1000

rampup_epochs = 5
sustain_epochs = 0
exp_decay = 0.8

def lrfn(epoch):
    """Return the learning rate to use for the given epoch index.

    Epochs below ``rampup_epochs`` climb linearly from ``start_lr`` towards
    ``max_lr``; the next ``sustain_epochs`` epochs hold ``max_lr``; every
    later epoch decays exponentially (factor ``exp_decay`` per epoch) from
    ``max_lr`` towards ``min_lr``.
    """
    if epoch < rampup_epochs:
        slope = (max_lr - start_lr) / rampup_epochs
        return slope * epoch + start_lr

    if epoch < rampup_epochs + sustain_epochs:
        return max_lr

    epochs_into_decay = epoch - rampup_epochs - sustain_epochs
    return (max_lr - min_lr) * exp_decay ** epochs_into_decay + min_lr


# Let lrfn decide the learning rate for each epoch.
lr_callback = LearningRateScheduler(lrfn, verbose=0)

checkpoint_filepath = './weights.h5'

# Write weights to disk only when the validation loss reaches a new minimum.
model_checkpoints = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

# Stop once val_loss has not improved for 3 epochs and restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
    restore_best_weights=True,
)

my_callbacks = [model_checkpoints, lr_callback, early_stopping]

In [None]:
# Train for at most 20 epochs on the (user, anime) id pairs.
# The held-out rows are passed as validation data, so they produce the
# val_loss that the checkpoint and early-stopping callbacks monitor.
# `history` keeps the per-epoch metrics for plotting.
history = model.fit(
    x=X_train_array,
    y=y_train,
    batch_size=batch_size,
    epochs=20,
    verbose=1,
    validation_data=(X_test_array, y_test),
    callbacks=my_callbacks
)

In [None]:
# model.load_weights(checkpoint_filepath)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Training vs. validation loss per epoch.
# NOTE(review): the [0:-2] slice leaves the last two epochs out of the plot —
# confirm that is intended.
fig, ax = plt.subplots()
for metric in ("loss", "val_loss"):
    ax.plot(history.history[metric][0:-2])
ax.set_title("model loss")
ax.set_ylabel("loss")
ax.set_xlabel("epoch")
ax.legend(["train", "test"], loc="upper left")
plt.show()

In [None]:
def extract_weights(name, model):
    """Return the row-wise L2-normalised weights of one layer of ``model``.

    Args:
        name: Name of the layer to read, as accepted by ``model.get_layer``.
        model: Object exposing ``get_layer(name)``; the returned layer must
            expose ``get_weights()`` whose first entry is a 2-D array.

    Returns:
        A new array of the same shape in which every row has unit L2 norm.
        Rows that are entirely zero are returned as zeros rather than NaN.
    """
    weights = model.get_layer(name).get_weights()[0]
    norms = np.linalg.norm(weights, axis=1, keepdims=True)
    # A zero row has norm 0; divide it by 1 instead so it stays zero rather
    # than becoming NaN.
    norms[norms == 0] = 1
    return weights / norms

# Pull the trained, row-normalised embedding tables out of the model; the layer
# names match the `name=` given to the Embedding layers in RecommenderNet.
anime_embeddings = extract_weights('anime_embedding', model)
user_embeddings = extract_weights('user_embedding', model)

In [None]:
import os
import pickle

# Persist both embedding matrices as pickles inside the dataset directory.
for pickle_name, matrix in (
    ("anime_embeddings.pickle", anime_embeddings),
    ("user_embeddings.pickle", user_embeddings),
):
    with open(os.path.join(CUR_INPUT, pickle_name), "wb") as f:
        pickle.dump(matrix, f)

In [None]:
# Also write the anime embeddings as tab-separated text, one embedding per line.
np.savetxt(os.path.join(CUR_INPUT,"anime_embeddings.tsv"), anime_embeddings, delimiter="\t")

In [None]:
# Load the anime metadata and expose the MAL id under the name `anime_id`.
df_anime = (
    pd.read_csv(os.path.join(CUR_INPUT, "anime.csv"), low_memory=True)
    .rename(columns={"MAL_ID": "anime_id"})
)

In [None]:
# Load the pickled anime-id lookup (presumably raw anime id -> encoded index;
# it is used below to map `anime_id` — verify against the preprocessing step).
# NOTE(review): pickle.load can execute arbitrary code; only load files you
# produced yourself.
with open(os.path.join(CUR_INPUT, "animelist_400+", "anime2anime_encoded.pickle"), "rb") as input_file:
    anime2anime_encoded = pickle.load(input_file)

In [None]:
# Add the encoded index for every anime, drop titles that have no encoding,
# and store the index as an integer column.
df_anime = (
    df_anime
    .assign(anime_id_mapped=df_anime["anime_id"].map(anime2anime_encoded, na_action="ignore"))
    .dropna(subset=["anime_id_mapped"])
    .astype({"anime_id_mapped": int})
)

In [None]:
# Write the anime names, one per line with no header, ordered by encoded index.
anime_names = df_anime.sort_values("anime_id_mapped")[["Name"]]
anime_names.to_csv(
    os.path.join(CUR_INPUT,"anime.tsv"),
    sep="\t",
    index=False,
    header=False,
)


In [None]:
# Display-only check: the de-duplicated frame is shown but not assigned, so
# `df_anime` itself is left unchanged.
df_anime.drop_duplicates(subset=["anime_id"])