# Collaborative Filtering using Neural Networks

Inspirations
- https://www.kaggle.com/code/chaitanya99/recommendation-system-cf-anime
- fast.ai collaborative filtering library (`fastai.collab`)

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

import glob
import keras
import os
import pickle

from imblearn.over_sampling import RandomOverSampler

from models import EmbeddingDotBias


import tensorflow as tf
# Show which GPUs TensorFlow detected (an empty list means none were found).
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print(gpu_devices)

# Apply seaborn's default theme to the plots produced below.
sns.set_theme()

In [None]:
# Root folder of the dataset; anime.csv is read from here.
CUR_INPUT = "/mnt/d/datasets/anime2020/"
# Folder holding the sampled ratings parquet file(s) and the anime id encoding pickle.
INPUT_DIR = "/mnt/d/datasets/anime2020/animelist_sample/"
# Destination for the exported embedding pickles and the anime.tsv name list.
OUTPUT_DIR = "/mnt/d/datasets/anime2020/animelist_sample_outputs/"

# Extra tags appended to the model suffix used in output file names
# (for example the oversampling setting).
other_sufix = []

In [None]:
# Load the anime metadata, discard the per-score vote columns
# ("Score-1" .. "Score-10") and expose the MyAnimeList id as "anime_id".
score_columns = [f"Score-{score}" for score in range(1, 11)]
df_anime = (
    pd.read_csv(os.path.join(CUR_INPUT, "anime.csv"), low_memory=True)
    .drop(columns=score_columns)
    .rename(columns={"MAL_ID": "anime_id"})
)

df_anime

In [None]:
# Locate the ratings sample. Sorting makes the chosen file deterministic
# (glob returns matches in filesystem-dependent order), and the explicit
# error replaces the bare IndexError raised when nothing matches.
parquet_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.parquet")))
if not parquet_files:
    raise FileNotFoundError(f"No .parquet file found in {INPUT_DIR!r}")

# Only the first matching file is read. Pass
# columns=["user", "anime", "rating"] to load just the columns used for training.
rating_df = pd.read_parquet(parquet_files[0])

rating_df

In [None]:
# Number of distinct users and anime; these size the model built by EmbeddingDotBias.
# NOTE(review): presumably the ids are already encoded as 0..n-1 so that the
# distinct count covers every index — confirm against the preprocessing step.
n_users, n_animes = (rating_df[column].nunique() for column in ("user", "anime"))

In [None]:
# Hold out the last 1% of the rows as the test set; everything before is training data.
# NOTE(review): the split is positional, not random — presumably rating_df is
# already shuffled; confirm.
test_set_size = int(len(rating_df) * 0.01)
train_indices = len(rating_df) - test_set_size  # index of the first test row

train_rating_df, test_rating_df = (
    rating_df.iloc[:train_indices],
    rating_df.iloc[train_indices:],
)

In [None]:
# Users with fewer than `oversample` training ratings get their rows
# oversampled. Set to 0 (or None) to train on the split as-is.
oversample = 80
if oversample:
    # Users below the threshold, and their rows in the training split.
    counts = train_rating_df.groupby('user')['rating'].count()
    users_to_oversample = counts[counts < oversample].index

    df_filtered = train_rating_df.loc[train_rating_df['user'].isin(users_to_oversample)]

    if len(users_to_oversample) < 2:
        # Nothing to balance with zero or one sparse user (the resampler is
        # expected to reject empty or single-class input), so skip it.
        frames = [train_rating_df]
    else:
        # The user id is the class label: 'not majority' resamples every
        # user except the most frequent one among the filtered rows.
        ros = RandomOverSampler(sampling_strategy='not majority')
        df_oversampled, _ = ros.fit_resample(df_filtered, df_filtered['user'])
        # NOTE(review): df_oversampled presumably still contains the original
        # df_filtered rows, which are also part of train_rating_df — confirm
        # that counting them twice is intended.
        frames = [train_rating_df, df_oversampled]
        del df_oversampled

    # Combine, shuffle and renumber the rows.
    train_rating_final_df = pd.concat(frames).sample(frac=1).reset_index(drop=True)
    del frames

    # Record the setting so it shows up in the output file names.
    other_sufix.append(f"oversample{oversample}")
else:
    train_rating_final_df = train_rating_df

In [None]:
# Targets and (user, anime) id pairs for the complete ratings table.
# NOTE(review): neither name is referenced again in the cells visible here.
y = rating_df["rating"]
X = rating_df[["user", "anime"]].to_numpy()

In [None]:
# Features are the (user, anime) id pairs; targets are the ratings.
X_train = train_rating_final_df[["user", "anime"]].values
y_train = train_rating_final_df["rating"]

X_test = test_rating_df[["user", "anime"]].values
y_test = test_rating_df["rating"]

print(f'> Train set ratings: {len(y_train)}')
print(f'> Test set ratings: {len(y_test)}')

In [None]:
# The model is fed two separate inputs: column 0 (user ids) and column 1 (anime ids).
X_train_array = [X_train[:, column] for column in (0, 1)]
X_test_array = [X_test[:, column] for column in (0, 1)]

In [None]:
# Build the model; the factory also returns the configuration dict that is
# used below to name the output files.
model, model_config = EmbeddingDotBias(
    n_users=n_users,
    n_animes=n_animes,
    n_factors=64,
    learning_rate=0.0005,
    batch_norm=True,
    use_bias=True,
    loss="binary_crossentropy",
)

# Suffix identifying this run: model name, hyper-parameters, then any extra tags.
model_sufix = "_".join(
    [
        f'{model_config["model_name"]}',
        f'lr{model_config["learning_rate"]}',
        f'{model_config["loss"]}',
        f'fc{model_config["n_factors"]}',
        f'bn{model_config["batch_norm"]}',
        f'bias{model_config["use_bias"]}',
        *(f"{extra}" for extra in other_sufix),
    ]
)

print(model_sufix)

model.summary()

In [None]:
# Callbacks
# NOTE(review): Callback, TensorBoard and ReduceLROnPlateau are imported but
# not used in the cells visible here.
from tensorflow.keras.callbacks import Callback, ModelCheckpoint, TensorBoard, EarlyStopping, ReduceLROnPlateau

# Keep only the weights of the epoch with the lowest validation loss.
# NOTE(review): newer Keras releases reportedly require weight-only
# checkpoints to end in ".weights.h5" — confirm against the installed version.
checkpoint_filepath = f'./weights_{model_sufix}_.h5'
model_checkpoints = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True
)

# Stop once val_loss has not improved for 3 epochs and roll the model back
# to its best weights.
early_stopping = EarlyStopping(
    patience = 3,
    monitor='val_loss', 
    mode='min',
    restore_best_weights=True
)

# Callbacks handed to model.fit.
my_callbacks = [
    model_checkpoints,
    early_stopping,   
]

In [None]:
# Train for at most 15 epochs; the callbacks may stop earlier and keep the
# best checkpoint on disk.
# NOTE(review): validation_data is the held-out test split, so early stopping
# and checkpoint selection are tuned on the same rows that are evaluated
# later — confirm this is acceptable.
history = model.fit(
    x=X_train_array,
    y=y_train,
    batch_size=1024,
    epochs=15,
    verbose=1,
    validation_data=(X_test_array, y_test),
    callbacks=my_callbacks
)

In [None]:
# One row per epoch with the training and validation loss, saved as CSV.
plot_df = pd.DataFrame(history.history)
plot_df["epochs"] = plot_df.index
plot_df = plot_df[["epochs", "loss", "val_loss"]]
plot_df.to_csv(f"loss_{model_sufix}.csv", index=False)

# Long format: one row per (epoch, curve) so both curves share one axis.
plot_df = pd.melt(plot_df, ['epochs'], value_name="loss_value")

# Pass the frame by keyword: the positional slot of `data` differs between
# seaborn releases, whereas `data=` is accepted by all of them.
axs = sns.lineplot(data=plot_df, x="epochs", y="loss_value", hue="variable")
axs.figure.savefig(f"loss_{model_sufix}.png")

In [None]:
# Reload the weights written by the ModelCheckpoint callback
# (the epoch with the lowest val_loss).
model.load_weights(checkpoint_filepath)

# Save Weights

In [None]:
def extract_weights(name, model):
    """Return the row-wise L2-normalised weight matrix of layer ``name``.

    Args:
        name: Layer name, as accepted by ``model.get_layer``.
        model: Object exposing ``get_layer(name)``; the first entry of the
            layer's ``get_weights()`` is taken as the weight matrix.

    Returns:
        An array of the same shape in which every non-zero row has unit
        Euclidean norm. All-zero rows are returned as zeros instead of NaN.
    """
    weights = model.get_layer(name).get_weights()[0]
    norms = np.linalg.norm(weights, axis=1, keepdims=True)
    # A zero row divided by its zero norm would become NaN, and NaN entries
    # end up at the top of a flipped argsort ranking. Divide those rows by 1.
    norms[norms == 0] = 1
    return weights / norms

# Normalised embedding matrices of the anime and user embedding layers.
anime_embeddings, user_embeddings = (
    extract_weights(layer_name, model)
    for layer_name in ('anime_embedding', 'user_embedding')
)

In [None]:
# Create the output folder if needed, so the export cannot fail on a missing
# directory after training has finished.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Each normalised embedding matrix goes to its own pickle, tagged with the run suffix.
with open(os.path.join(OUTPUT_DIR, f"anime_embeddings_{model_sufix}.pickle"), "wb") as f:
    pickle.dump(anime_embeddings, f)

with open(os.path.join(OUTPUT_DIR, f"user_embeddings_{model_sufix}.pickle"), "wb") as f:
    pickle.dump(user_embeddings, f)

In [None]:
# Mapping applied to the MyAnimeList ids below.
# NOTE(review): presumably it yields the encoded ids used in rating_df["anime"]
# — confirm. The pickle is trusted local data; never unpickle files from an
# untrusted source.
encoding_path = os.path.join(INPUT_DIR, "anime2anime_encoded.pickle")
with open(encoding_path, "rb") as encoding_file:
    anime2anime_encoded = pickle.load(encoding_file)

In [None]:
# Translate each MyAnimeList id to its encoded id; ids missing from the mapping become NaN.
df_anime["anime_id_mapped"] = df_anime["anime_id"].map(anime2anime_encoded, na_action="ignore")
# Keep only anime that have an encoded id. The explicit copy makes the result
# an independent frame, so the assignment below cannot trigger pandas'
# SettingWithCopyWarning.
df_anime = df_anime.dropna(subset=["anime_id_mapped"]).copy()
# With the NaNs gone the column can be stored as integers again.
df_anime["anime_id_mapped"] = df_anime["anime_id_mapped"].astype(int)

In [None]:
# Write the anime titles, one per line and ordered by encoded id, without header or index.
# NOTE(review): presumably used as a label file aligned with the embedding
# rows — that only holds if every encoded id is present in df_anime; confirm.
(
    df_anime
    .sort_values("anime_id_mapped")
    .loc[:, ["Name"]]
    .to_csv(os.path.join(OUTPUT_DIR, "anime.tsv"), sep="\t", index=False, header=False)
)


# Error (MAE) by number of anime a user has rated

In [None]:
# Number of training ratings per user and per anime, as two-column frames
# (the id, then its count), most frequent first.
nb_ratings_by_user = (
    train_rating_df["user"]
    .value_counts()
    .rename_axis("user")
    .reset_index(name="count_user_ratings")
)
nb_ratings_by_anime = (
    train_rating_df["anime"]
    .value_counts()
    .rename_axis("anime")
    .reset_index(name="count_anime_ratings")
)

# Attach both counts to every test row; ids absent from the training split get NaN.
test_rating_df = (
    test_rating_df
    .merge(nb_ratings_by_user, on="user", how="left")
    .merge(nb_ratings_by_anime, on="anime", how="left")
)

In [None]:
# Summary statistics of the per-user rating counts in the training split.
nb_ratings_by_user.describe()

In [None]:
# Summary statistics of the per-anime rating counts in the training split.
nb_ratings_by_anime.describe()

In [None]:
# Predict a score for every (user, anime) pair of the test split.
scores = model.predict(X_test_array)

In [None]:
# Store the predictions as a flat column next to the true ratings.
# NOTE(review): relies on test_rating_df having the same row order and length
# as X_test_array — presumably the left merges above preserved both; confirm.
test_rating_df.loc[:,"predict"] = scores.flatten()

In [None]:
# Add "<column>_10" copies of the true and predicted ratings multiplied by 10.
# NOTE(review): assumes ratings are stored scaled to [0, 1] so that x10 gives
# the 0-10 score — confirm.
for source_column in ("rating", "predict"):
    test_rating_df[f"{source_column}_10"] = test_rating_df[source_column] * 10

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error 

In [None]:
# Bucket edges for the rating counts. pd.cut builds right-closed intervals,
# so counts outside the outer edges (and NaN counts) get no category.
user_count_bins = [0, 10, 20, 40, 80, 200, 400, 1000, 3000]
anime_count_bins = [10, 20, 40, 80, 200, 400, 1000, 5000, 10000, 183360]

test_rating_df['category_count_user_ratings'] = pd.cut(
    test_rating_df["count_user_ratings"], bins=user_count_bins
)
test_rating_df['category_count_anime_ratings'] = pd.cut(
    test_rating_df["count_anime_ratings"], bins=anime_count_bins
)

# Alternative kept for reference: equal-frequency buckets via
# pd.qcut(..., q=30) for users and pd.qcut(..., q=20) for anime.

In [None]:
# Mean absolute error on the 0-10 scale for each bucket of per-user rating counts.
grouped = test_rating_df.groupby('category_count_user_ratings')
mae_by_category = grouped.apply(
    lambda bucket: mean_absolute_error(bucket['rating_10'], bucket['predict_10'])
)
print(mae_by_category)

# Error (MAE) by number of ratings an anime has received

In [None]:
# Mean absolute error on the 0-10 scale for each bucket of per-anime rating counts.
grouped = test_rating_df.groupby('category_count_anime_ratings')
mae_by_category = grouped.apply(
    lambda bucket: mean_absolute_error(bucket['rating_10'], bucket['predict_10'])
)
print(mae_by_category)

# Most similar anime

In [None]:
# Display the anime metadata table.
df_anime

In [None]:
# Encoded id of every anime, stored as a 32-bit integer column.
# The cast raises if any anime_id is missing from the mapping.
encoded_ids = df_anime["anime_id"].map(anime2anime_encoded)
df_anime["encoded_id"] = encoded_ids.astype("int32")

In [None]:
# Use the encoded id as the row label. Not idempotent: the column moves into
# the index, so running this cell a second time raises a KeyError.
df_anime = df_anime.set_index("encoded_id")

In [None]:
# All titles that begin with "Haikyuu".
df_anime.loc[df_anime["Name"].str.startswith("Haikyuu"), "Name"].values

In [None]:
# MyAnimeList id(s) of the anime titled exactly "Haikyuu!!".
df_anime.loc[df_anime["Name"] == "Haikyuu!!", "anime_id"].values

In [None]:
# Query anime "Haikyuu!!": find its MyAnimeList id by exact title, then its encoded id.
mal_id = df_anime.loc[df_anime["Name"] == "Haikyuu!!", "anime_id"].to_numpy()[0]
encoded_id = anime2anime_encoded[mal_id]

# The embeddings were L2-normalised by extract_weights, so each dot product
# is a cosine similarity (higher means more similar, despite the name).
anime_weight = anime_embeddings[encoded_id]
distances = anime_embeddings @ anime_weight

# Encoded ids of the 10 highest-scoring rows, best first (the query itself included).
closest_ids = np.argsort(distances)[::-1][:10]

In [None]:
# closest_ids holds encoded ids (row numbers of anime_embeddings) and
# df_anime is indexed by encoded_id, so select by label. Positional .iloc
# would return unrelated rows: the frame is in anime.csv order, which need
# not match the encoded ids. Raises KeyError if an id has no metadata row.
df_anime.loc[closest_ids, :]