# Collaborative Filtering using Neural Networks

Inspirations
- https://www.kaggle.com/code/chaitanya99/recommendation-system-cf-anime
- The fast.ai collaborative filtering (CF) library

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

import glob
import keras
import os
import pickle

from models import EmbeddingDotBias

import tensorflow as tf
# Show which GPUs TensorFlow can see, so a missing or misconfigured device is
# obvious before training starts. Uses the stable API instead of the
# `experimental` alias.
print(tf.config.list_physical_devices('GPU'))

# Apply seaborn's default theme to the plots drawn in this notebook.
sns.set_theme()

In [None]:
# Directory holding the anime metadata table (anime.csv).
CUR_INPUT = "/mnt/d/datasets/anime2020/"
# Directory holding the ratings parquet file(s) and the anime-id encoding pickle.
INPUT_DIR = "/mnt/d/datasets/anime2020/animelist_400+/"
# Directory that receives the exported embeddings and the anime-name TSV.
OUTPUT_DIR = "/mnt/d/datasets/anime2020/animelist_400+_outputs/"

In [None]:
# Load the anime metadata, discard the per-score histogram columns
# (Score-1 ... Score-10) and expose the MAL id under the name `anime_id`.
df_anime = (
    pd.read_csv(os.path.join(CUR_INPUT, "anime.csv"), low_memory=True)
    .drop(columns=[f"Score-{x}" for x in range(1, 11)])
    .rename(columns={"MAL_ID": "anime_id"})
)

df_anime

In [None]:
# Load the (user, anime, rating) triples from the first parquet file in INPUT_DIR.
# glob returns files in arbitrary order, so sort to make the choice deterministic,
# and fail with an explicit message (instead of a bare IndexError) when none exist.
parquet_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.parquet")))
if not parquet_files:
    raise FileNotFoundError(f"No .parquet files found in {INPUT_DIR!r}")

rating_df = pd.read_parquet(
    parquet_files[0],
    columns=["user", "anime", "rating"]
)

rating_df

In [None]:
# Number of distinct users and distinct anime present in the ratings table.
# NOTE(review): presumably ids are already encoded as 0..n-1 so these counts can
# size the embedding tables — confirm against the preprocessing step.
n_users, n_animes = (rating_df[column].nunique() for column in ("user", "anime"))

In [None]:
# Model inputs: one (user, anime) id pair per rating row, as a 2-column array.
X = rating_df[['user', 'anime']].to_numpy()
# Targets: the rating column, kept as a pandas Series.
y = rating_df["rating"]

In [None]:
# Split: the trailing 1% of rows becomes the test set; no shuffling happens here,
# so the split follows the row order of rating_df.
test_set_size = int(len(rating_df) * 0.01) # 1% for test
# Despite its name this is a single boundary position: rows before it are train.
train_indices = len(rating_df) - test_set_size

X_train, X_test = X[:train_indices], X[train_indices:]
y_train, y_test = y[:train_indices], y[train_indices:]

print('> Train set ratings: {}'.format(len(y_train)))
print('> Test set ratings: {}'.format(len(y_test)))

In [None]:
# The model takes two separate inputs, so split each matrix into
# [user_ids, anime_ids] column vectors.
X_train_array = [X_train[:, column] for column in (0, 1)]
X_test_array = [X_test[:, column] for column in (0, 1)]

In [None]:
# Build the model; EmbeddingDotBias returns the model together with a config
# dict describing how it was built.
model, model_config = EmbeddingDotBias(
    n_users=n_users,
    n_animes=n_animes,
    n_factors=64,
    learning_rate=0.0005,
    batch_norm=True,
    use_bias=True,
    # NOTE(review): binary cross-entropy presumably expects ratings scaled to [0, 1] — confirm.
    loss="binary_crossentropy",
)

# Suffix that encodes the hyper-parameters; reused in the names of the files written later.
model_sufix = (
    f'{model_config["model_name"]}'
    f'_lr{model_config["learning_rate"]}'
    f'_{model_config["loss"]}'
    f'_fc{model_config["n_factors"]}'
    f'_bn{model_config["batch_norm"]}'
    f'_bias{model_config["use_bias"]}'
)
print(model_sufix)

model.summary()

In [None]:
# Callbacks
from tensorflow.keras.callbacks import Callback, ModelCheckpoint, TensorBoard, EarlyStopping, ReduceLROnPlateau

# Stop training once val_loss has not improved for 3 epochs and roll the model
# back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
    restore_best_weights=True,
)

# Keep only the weights of the epoch with the lowest val_loss on disk.
checkpoint_filepath = f'./weights_{model_config["n_factors"]}_.h5'
model_checkpoints = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

my_callbacks = [model_checkpoints, early_stopping]

In [None]:
# Train the model and keep the per-epoch metrics in `history`.
# NOTE(review): the held-out test split is also used as validation data, so it
# drives early stopping and checkpoint selection.
history = model.fit(
    # data
    x=X_train_array,
    y=y_train,
    validation_data=(X_test_array, y_test),
    # schedule
    epochs=15,
    batch_size=1024,
    # reporting / hooks
    callbacks=my_callbacks,
    verbose=1,
)

In [None]:
# Collect the per-epoch train/validation loss, save it as CSV, then plot both
# curves on one chart and save the figure next to the CSV.
plot_df = (
    pd.DataFrame(history.history)
    .assign(epochs=lambda frame: frame.index)
    .loc[:, ["epochs", "loss", "val_loss"]]
)
plot_df.to_csv(f"loss_{model_sufix}.csv", index=False)

# Long format: one row per (epoch, curve) so seaborn can colour by curve.
plot_df = plot_df.melt(id_vars=["epochs"], value_name="loss_value")

axs = sns.lineplot(data=plot_df, x="epochs", y="loss_value", hue="variable")
axs.figure.savefig(f"loss_{model_sufix}.png")

In [None]:
# Load the weights written by the ModelCheckpoint callback (lowest val_loss) back into the model.
model.load_weights(checkpoint_filepath)

In [None]:
def extract_weights(name, model):
    """Return the L2-normalised weight matrix of a named layer.

    Args:
        name: Name of the layer to look up via ``model.get_layer``.
        model: Object exposing ``get_layer(name)``; the returned layer must
            expose ``get_weights()`` whose first element is a 2-D array.

    Returns:
        A new array with the same shape as the layer's first weight tensor in
        which every row is scaled to unit Euclidean length. All-zero rows are
        returned unchanged (as zeros) instead of turning into NaN.
    """
    weights = model.get_layer(name).get_weights()[0]
    norms = np.linalg.norm(weights, axis=1, keepdims=True)
    # A zero row has norm 0; dividing by it would produce NaN/inf and poison any
    # later similarity computation. Dividing by 1 leaves the row at zero.
    norms[norms == 0] = 1
    return weights / norms

# Pull the normalised embedding matrices for anime and for users out of the model.
anime_embeddings, user_embeddings = (
    extract_weights(layer_name, model)
    for layer_name in ('anime_embedding', 'user_embedding')
)

In [None]:


# Create the export directory if needed: this is the first write to OUTPUT_DIR,
# and failing here would happen only after the whole training run.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Persist both normalised embedding matrices, tagged with the model suffix.
with open(os.path.join(OUTPUT_DIR, f"anime_embeddings_{model_sufix}.pickle"), "wb") as f:
    pickle.dump(anime_embeddings, f)

with open(os.path.join(OUTPUT_DIR, f"user_embeddings_{model_sufix}.pickle"), "wb") as f:
    pickle.dump(user_embeddings, f)

In [None]:
# Load the mapping that is applied to `anime_id` values to obtain the encoded anime ids.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles that were produced by a trusted preprocessing step.
with open(os.path.join(INPUT_DIR, "anime2anime_encoded.pickle"), "rb") as input_file:
    anime2anime_encoded = pickle.load(input_file)

In [None]:
# Attach the encoded anime id to every metadata row, keep only anime that have
# an encoding, and store the id as an integer.
# Built as one chain so the final column is never assigned onto the result of
# `dropna`, which is the chained-assignment pattern behind SettingWithCopyWarning.
df_anime = (
    df_anime
    .assign(anime_id_mapped=df_anime["anime_id"].map(anime2anime_encoded, na_action="ignore"))
    .dropna(subset=["anime_id_mapped"])
    .astype({"anime_id_mapped": int})
)

In [None]:
# Write the anime names, one per line and ordered by encoded id, without header
# or index. NOTE(review): presumably a metadata file for an embedding viewer — confirm.
(
    df_anime
    .sort_values("anime_id_mapped")[["Name"]]
    .to_csv(os.path.join(OUTPUT_DIR, "anime.tsv"), sep="\t", index=False, header=False)
)


# Loss by number of anime each user has rated

In [None]:
# Re-create the train/test partition as DataFrames, cut at the same row position
# that was used for the arrays.
train_rating_df, test_rating_df = (
    rating_df.iloc[:train_indices],
    rating_df.iloc[train_indices:],
)

In [None]:
# Number of training ratings per user, as a two-column frame: user, count.
ratings_by_user = (
    train_rating_df["user"]
    .value_counts()
    .rename_axis("user")
    .reset_index(name="count")
)

# Left-join so that every test row is kept, in its original order. An inner join
# would drop rows of users that never appear in the training split, and the
# predictions assigned later (one per original test row) would no longer line up.
test_rating_df = test_rating_df.merge(ratings_by_user, on="user", how="left")
# Users unseen during training have no count after the join: treat them as 0 ratings.
test_rating_df["count"] = test_rating_df["count"].fillna(0).astype(int)

In [None]:
# Summary statistics of ratings_by_user (per-user rating counts in the training split).
ratings_by_user.describe()

In [None]:
# Score every held-out (user, anime) pair with the trained model.
scores = model.predict(X_test_array)

In [None]:
# Attach the predictions as a column. The assignment is positional, so
# test_rating_df must have exactly one row per X_test_array row, in the same order.
test_rating_df.loc[:,"predict"] = scores.flatten()

In [None]:
# Inspect the test rows (notebook display).
test_rating_df

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Bucket the test rows into quartiles of their user's training-set rating count.
# NOTE(review): pd.qcut raises when quantile edges coincide (heavily tied counts);
# duplicates="drop" would be needed in that case.
test_rating_df['category'] = pd.qcut(test_rating_df['count'], q=4)

In [None]:
# Mean squared error of the predictions within each rating-count quartile.
# `observed=True` is passed explicitly because `category` is categorical and the
# implicit default for categorical groupers is deprecated in recent pandas.
grouped = test_rating_df.groupby('category', observed=True)
# Apply only over the two columns the metric needs; applying over the whole
# group frame (grouping column included) is deprecated in recent pandas.
mse_by_category = grouped[['rating', 'predict']].apply(
    lambda x: mean_squared_error(x['rating'], x['predict'])
)
print(mse_by_category)

# Loss by number of ratings each anime has received

In [None]:
# TODO: compute the loss

# Most similar animes