# Embedding Ratings - Skip Gram Approach

In [None]:
# TensorFlow Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model

# Other
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.stats import pearsonr
from scipy.sparse import csr_matrix, lil_matrix, vstack, load_npz
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfTransformer

import warnings
# Silence every Python warning for the rest of the session. This keeps cell
# output short, but it also hides deprecation and numerical warnings.
warnings.filterwarnings('ignore')
# Make tf.function-wrapped code run eagerly (step-by-step Python execution);
# convenient for debugging, typically slower than graph execution.
tf.config.run_functions_eagerly(True)
#tf.data.experimental.enable_debug_mode()

## Classes and Methods

In [None]:
class EmbeddingRatings:
    """Rating predictor: dense encoder/decoder network plus cosine kNN.

    A dense ``input_dim -> hidden_dim -> input_dim`` network (ReLU on both
    layers) is trained with mean squared error.  The hidden layer's output is
    used as an embedding, and ratings for new users are the per-item mean of
    the ratings stored for their ``k`` nearest training rows in that
    embedding space.

    ``__init__`` sets ``input_dim``, ``autoencoder``, ``encoder`` and ``knn``;
    ``fit`` adds ``training_ratings`` and ``embeddings``.
    """

    def __init__(self, hidden_dim: int, k: int, input_dim: int):
        """Build and compile the network and create the (unfitted) kNN index.

        Args:
            hidden_dim: Width of the hidden (embedding) layer.
            k: Number of neighbors retrieved per query row.
            input_dim: Number of input features; also the output width.
        """
        self.input_dim = input_dim

        # Both models are built on the same input tensor and hidden layer, so
        # training the full network also trains the encoder's weights.
        inputs = keras.Input(shape=(self.input_dim,))
        hidden = layers.Dense(hidden_dim, activation='relu')(inputs)
        outputs = layers.Dense(input_dim, activation='relu')(hidden)

        self.autoencoder = keras.Model(inputs=inputs, outputs=outputs)
        self.encoder = keras.Model(inputs=inputs, outputs=hidden)
        self.autoencoder.compile(optimizer='adam', loss='mean_squared_error')

        self.knn = NearestNeighbors(n_neighbors=k, metric='cosine')

    def fit(self, training_ratings, generator, epochs):
        """Train the network, then embed and index ``training_ratings``.

        Args:
            training_ratings: Rating matrix that is embedded with the encoder
                and later sliced by row in ``predict`` (``getnnz`` is called
                on those slices, so a SciPy sparse matrix is expected).
            generator: Batch source handed to ``autoencoder.fit``.
            epochs: Number of training epochs.
        """
        self.training_ratings = training_ratings

        print('Fitting autoencoder...')
        # No batch_size here: the generator decides how large each batch is.
        self.autoencoder.fit(generator, epochs=epochs, shuffle=True)

        print('Creating embeddings...')
        self.embeddings = self.encoder.predict(training_ratings)
        self.knn.fit(self.embeddings)

    def predict(self, user_ratings):
        """Predict ratings for each row of ``user_ratings``.

        Args:
            user_ratings: Matrix of known ratings for the users to score; it
                is embedded with the encoder and its shape fixes the shape of
                the result.

        Returns:
            CSR matrix shaped like ``user_ratings`` holding, per user and
            item, the mean of the neighbors' stored ratings for that item
            (zero where no neighbor has a stored rating).
        """
        print('Embedding test users...')
        query_embeddings = self.encoder.predict(user_ratings)

        print('Performing nearest-neighbor search in embedding space...')
        neighbor_table = self.knn.kneighbors(query_embeddings, return_distance=False)

        print('Aggregating neighbor ratings...')
        n_users = neighbor_table.shape[0]
        pred = lil_matrix(user_ratings.shape)
        for row, neighbor_rows in tqdm(enumerate(neighbor_table), total=n_users):
            pred[row] = self._mean_neighbor_ratings(neighbor_rows)

        return pred.tocsr()

    def _mean_neighbor_ratings(self, neighbor_rows):
        """Return the per-item mean rating over the given training rows."""
        block = self.training_ratings[neighbor_rows]
        # Divide by the number of stored ratings per item; flooring the count
        # at 1 leaves items nobody rated at 0 instead of dividing by zero.
        rating_counts = np.maximum(block.getnnz(axis=0), 1)
        return block.sum(axis=0) / rating_counts

In [None]:
class SkipGramDataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` serving dense mini-batches from two sparse matrices.

    Row ``i`` of ``contexts`` is the model input and row ``i`` of ``targets``
    the expected output.  Only one batch at a time is densified with
    ``toarray()``, so the full matrices can stay sparse in memory.
    """

    def __init__(self, contexts, targets, batch_size, **kwargs):
        """Store the data and batch size.

        Args:
            contexts: Sparse matrix of inputs, one sample per row.
            targets: Sparse matrix of outputs, row-aligned with ``contexts``.
            batch_size: Number of rows per batch (the last batch may be
                smaller).
            **kwargs: Forwarded to the Keras base class (Keras 3 accepts
                ``workers``, ``use_multiprocessing``, ``max_queue_size``).
        """
        # Keras 3's Sequence/PyDataset initialises its worker and queue
        # settings in its constructor and warns when a subclass skips it.
        super().__init__(**kwargs)
        self.contexts = contexts
        self.targets = targets
        self.batch_size = batch_size
        self.indices = np.arange(contexts.shape[0])

    def __len__(self):
        """Return the number of batches per epoch, counting a final partial batch."""
        return int(np.ceil(self.contexts.shape[0] / self.batch_size))

    def __getitem__(self, idx):
        """Return batch ``idx`` as a ``(contexts, targets)`` pair of dense arrays."""
        start = idx * self.batch_size
        batch_indices = self.indices[start:start + self.batch_size]
        context_batch = self.contexts[batch_indices].toarray()
        targets_batch = self.targets[batch_indices].toarray()
        return context_batch, targets_batch

In [None]:
def seen_unseen_split(ratings, fraction_seen=0.8, random_state=None):
    """Randomly split each user's ratings into a "seen" and an "unseen" part.

    For every row, the column indices of its non-zero entries are shuffled;
    the first ``int(fraction_seen * n_rated)`` go to ``seen`` and the rest to
    ``unseen``.  Because of the truncation, rows with few ratings may
    contribute nothing to ``seen``.

    Args:
        ratings: Sparse rating matrix supporting row and fancy column
            indexing (e.g. CSR), one user per row.
        fraction_seen: Share of each user's ratings kept in ``seen``;
            must lie in ``[0, 1]``.
        random_state: ``None`` (default) draws from NumPy's global random
            state, exactly as before this parameter existed.  An int seed, a
            ``numpy.random.RandomState`` or a ``numpy.random.Generator``
            makes the split reproducible without touching global state.

    Returns:
        Tuple ``(seen, unseen)`` of CSR matrices with the shape of
        ``ratings``.

    Raises:
        ValueError: If ``fraction_seen`` is outside ``[0, 1]``.
    """
    if not 0.0 <= fraction_seen <= 1.0:
        raise ValueError(f"fraction_seen must be in [0, 1], got {fraction_seen}")

    if random_state is None:
        rng = np.random  # module-level shuffle uses the global state
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # LIL is cheap to fill row by row; convert to CSR once at the end.
    seen = lil_matrix(ratings.shape)
    unseen = lil_matrix(ratings.shape)

    for user_id in tqdm(range(ratings.shape[0])):
        rated_items_indices = ratings[user_id].nonzero()[1]
        rng.shuffle(rated_items_indices)
        num_seen_items = int(fraction_seen * len(rated_items_indices))

        seen_items = rated_items_indices[:num_seen_items]
        unseen_items = rated_items_indices[num_seen_items:]

        # Skip empty selections instead of relying on empty fancy assignment.
        if seen_items.size:
            seen[user_id, seen_items] = ratings[user_id, seen_items]
        if unseen_items.size:
            unseen[user_id, unseen_items] = ratings[user_id, unseen_items]

    return seen.tocsr(), unseen.tocsr()

## Data Preparation

In [None]:
# Train/test matrices stored as SciPy sparse .npz files.
# NOTE(review): presumably one session/user per row and one item per column —
# confirm against the preprocessing that wrote these files.
sessions_train = load_npz("data/sessions_train.npz")
sessions_test = load_npz("data/sessions_test.npz")

# Cached context/target pairs generated from training sessions.
# They are fed to the network row by row, so both need the same number of
# rows and as many columns as sessions_train (the model's input_dim).
session_train_contexts = load_npz("data/session_train_contexts.npz")
session_train_targets = load_npz("data/session_train_targets.npz")

In [None]:
# Hold out part of every test row (default fraction_seen=0.8): `seen` is what
# the model gets as input, `unseen` is what its predictions are scored on.
seen, unseen = seen_unseen_split(sessions_test)

In [None]:
# Display the shapes of the training matrix and the cached context matrix.
sessions_train.shape, session_train_contexts.shape

## Usage

In [None]:
# Reset Keras' global state so re-running the cells below starts from a clean
# slate instead of stacking up models from earlier runs.
K.clear_session()

In [None]:
# 128-unit embedding layer, 250 neighbors per query, and one input feature per
# column of the training matrix.
er = EmbeddingRatings(hidden_dim=128, k=250, input_dim=sessions_train.shape[1])
er.autoencoder.summary()

In [None]:
# Train on the cached context -> target pairs in batches of 256 for one epoch;
# fit() then embeds sessions_train and fits the nearest-neighbor index on it.
generator = SkipGramDataGenerator(session_train_contexts, session_train_targets, batch_size=256)
er.fit(sessions_train, generator, epochs=1)

In [None]:
# Shape of the training embeddings produced by the encoder in fit().
er.embeddings.shape

In [None]:
# Predict ratings for the test users from their `seen` ratings only.
pred = er.predict(seen)

In [None]:
# Look up the prediction at every non-zero (row, col) position of `unseen`.
# NOTE(review): pairing `output` with `unseen.data` assumes nonzero() lists the
# entries in the same order as `.data` and that `unseen` stores no explicit
# zeros — confirm if the split logic changes.
output = np.asarray(pred[unseen.nonzero()]).flatten() # Predictions lined up with unseen
#plt.scatter(unseen.data, output)
# One box per distinct true rating, showing the spread of its predictions.
sns.boxenplot(x=unseen.data, y=output)
plt.xlabel('True Ratings')
plt.xticks(rotation=45)
plt.ylabel('Predicted Ratings')
plt.show()

In [None]:
# Error and linear association between held-out ratings and their predictions.
RMSE = root_mean_squared_error(unseen.data, output)
correlation_coefficient, _ = pearsonr(unseen.data, output)
# NOTE: the value printed as "R2" is the squared Pearson correlation, which is
# not the same as the coefficient of determination 1 - SS_res / SS_tot.
print(f"RMSE: {RMSE}\nR2: {correlation_coefficient ** 2}")

In [None]:
# Density of the prediction matrix: stored entries over all cells.
frac_nonzero_ratings = pred.nnz / np.prod(pred.shape)
# Share of held-out ratings that received a non-zero prediction.
frac_ratings_predicted = np.count_nonzero(output) / len(output)
print(f"Percent Nonzero: {frac_nonzero_ratings * 100}\nRecall (% unseen ratings predicted): {frac_ratings_predicted * 100}")