In [None]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
import tempfile
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils
import apache_beam as beam

In [None]:
# List the names of all dataset builders known to TensorFlow Datasets.
tfds.list_builders()

In [None]:
# Load the training split of the small MovieLens ratings dataset and prefetch
# 1024 elements. Other dataset name noted here: 'movielens/25m-ratings'.
ds = tfds.load(
    'movielens/latest-small-ratings',
    split='train',
    shuffle_files=True,
).prefetch(1024)

In [None]:
# Convert the first 100 dataset elements into a pandas DataFrame for inspection.
df = tfds.as_dataframe(ds.take(100))

In [None]:
# Preview the first rows of the sampled records.
df.head()

In [None]:
# Extract the three fields used downstream from every dataset element as
# NumPy values. The records are materialised in a list rather than a one-shot
# generator: a generator is exhausted after its first consumer, so re-running
# the Beam cell that reads `raw_data` would silently process no records.
raw_data = [
    {k: elem[k].numpy() for k in ['movie_id', 'user_id', 'user_rating']}
    for elem in ds
]

In [None]:
def preprocessing_fn(inputs):
    """Build the tf.Transform preprocessing for the rating records.

    Args:
        inputs: Mapping that provides the "movie_id", "user_id" and
            "user_rating" features.

    Returns:
        A dict with the same three keys. The two id features are passed
        through ``tft.compute_and_apply_vocabulary`` (vocabulary files named
        "movies_vocabulary" and "users_vocabulary"); the rating is passed
        through ``tft.scale_to_0_1``.
    """
    movie_index = tft.compute_and_apply_vocabulary(
        inputs["movie_id"], vocab_filename="movies_vocabulary"
    )
    user_index = tft.compute_and_apply_vocabulary(
        inputs["user_id"], vocab_filename="users_vocabulary"
    )
    scaled_rating = tft.scale_to_0_1(inputs["user_rating"])

    return {
        "movie_id": movie_index,
        "user_id": user_index,
        "user_rating": scaled_rating,
    }
   
# Schema of the raw records: both ids are scalar strings and the rating is a
# scalar float32.
raw_data_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(
        dict(
            movie_id=tf.io.FixedLenFeature([], tf.string),
            user_id=tf.io.FixedLenFeature([], tf.string),
            user_rating=tf.io.FixedLenFeature([], tf.float32),
        )
    )
)

def make_map_map(k1, k2):
    """Create a function that flattens one ``(key, records)`` group.

    Args:
        k1: Not used by the returned function; kept so existing calls still work.
        k2: Field read from every record to fill the second tuple position.

    Returns:
        A generator function taking ``v``, where ``v[0]`` is the group key and
        ``v[1]`` is an iterable of records. It yields one
        ``(v[0], record[k2], record['user_rating'])`` tuple per record.
    """
    def map_map(v):
        # Stay lazy: nothing is read from `v` until the result is iterated.
        yield from (
            (v[0], record[k2], record['user_rating']) for record in v[1]
        )
    return map_map


# Analyse and transform the raw records with tf.Transform, then flatten the
# per-user groups into (user index, movie index, rating) tuples.
# NOTE: the directory created by tempfile.mkdtemp() is not removed by this cell.
with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_data, transform_fn = (
        (raw_data, raw_data_metadata)
        | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)
    )
    # The transformed dataset is a (data, metadata) pair; only the data is used.
    transformed_data, _ = transformed_data
    # Group the records by integer user id, then emit one tuple per record.
    grouped_by_user_id = (
        transformed_data
        | beam.GroupBy(lambda v: int(v['user_id']))
        | beam.ParDo(make_map_map('user_id', 'movie_id'))
    )

# Preview a few rows only; echoing the full result floods the cell output.
grouped_by_user_id[:5]


In [None]:
# Turn the collected (user, movie, rating) tuples into one NumPy array.
grouped_by_user_id = np.array(grouped_by_user_id)

# Display the first two columns cast to integers.
grouped_by_user_id[:, [0, 1]].astype(int)

In [None]:
# Columns 0 and 1 hold the user / movie indices, column 2 the rating.
# tf.SparseTensor expects int64 indices, so ask for that width explicitly
# instead of `int`, which can be 32-bit depending on platform / NumPy version.
indices = grouped_by_user_id[:, [0, 1]].astype(np.int64)
values = grouped_by_user_id[:, 2].flatten()

# These indices are used as row numbers into the embedding tables (see the
# tf.gather calls in loss_fn), and the tables are sized from `users` and
# `movies`. Each size must therefore be the largest index plus one; using the
# maximum itself leaves the last user / movie out of range.
users = int(indices[:, 0].max()) + 1
movies = int(indices[:, 1].max()) + 1

user_ratings_matrix = tf.SparseTensor(
    indices=indices,
    values=values,
    dense_shape=(users, movies),
)

def loss_fn(user_ratings_matrix, user_embeddings, movies_embeddings):
    """Mean squared error between stored ratings and embedding dot products.

    Args:
        user_ratings_matrix: Sparse tensor whose ``indices`` rows are
            (user row, movie row) pairs and whose ``values`` are the ratings.
        user_embeddings: Table whose rows are selected by the first index column.
        movies_embeddings: Table whose rows are selected by the second index
            column.

    Returns:
        ``tf.losses.mean_squared_error`` of the stored values against the
        per-pair dot product of the gathered user and movie rows.
    """
    user_rows = user_ratings_matrix.indices[:, 0]
    movie_rows = user_ratings_matrix.indices[:, 1]

    user_vectors = tf.gather(user_embeddings, user_rows)
    # Gather from the argument. The previous code read the notebook-level
    # `movie_embeddings` variable here and ignored the table passed in.
    movie_vectors = tf.gather(movies_embeddings, movie_rows)

    predicted = tf.reduce_sum(user_vectors * movie_vectors, axis=1)
    return tf.losses.mean_squared_error(user_ratings_matrix.values, predicted)

# One trainable 15-wide row per user and per movie, initialised with HeNormal.
user_embeddings = tf.Variable(
    tf.keras.initializers.HeNormal()(shape=(users, 15))
)
movie_embeddings = tf.Variable(
    tf.keras.initializers.HeNormal()(shape=(movies, 15))
)

# Sanity check: shapes of both tables ...
print(user_embeddings.shape)
print(movie_embeddings.shape)

# ... and of the rows gathered for every stored rating.
print(
    tf.gather(user_embeddings, user_ratings_matrix.indices[:, 0]).shape
)
print(
    tf.gather(movie_embeddings, user_ratings_matrix.indices[:, 1]).shape
)

# Loss with the freshly initialised tables (shown as the cell output).
loss_fn(user_ratings_matrix, user_embeddings, movie_embeddings)

In [None]:
# Manual gradient descent on both embedding tables; gradient values are
# multiplied by 50 before being subtracted.
epochs = 10000
for epoch in range(epochs):
    # Record the loss computation so it can be differentiated.
    with tf.GradientTape() as tape:
        ls = loss_fn(user_ratings_matrix, user_embeddings, movie_embeddings)

    gradients = tape.gradient(ls, [user_embeddings, movie_embeddings])

    # Rebuild every gradient with its values scaled by 50.
    g = [
        tf.IndexedSlices(
            values=grad.values * 50,
            indices=grad.indices,
            dense_shape=grad.dense_shape,
        )
        for grad in gradients
    ]

    user_embeddings.assign_sub(g[0])
    movie_embeddings.assign_sub(g[1])

    # Print the loss once every 1000 epochs.
    if epoch % 1000 == 0:
        print(ls.numpy())

In [None]:
def predict(user_id):
    """Score every movie for a single user.

    Args:
        user_id: Row of ``user_embeddings`` to use.

    Returns:
        A one-row tensor holding the dot product of that user's embedding
        with each row of ``movie_embeddings``.
    """
    # -1 lets TensorFlow infer the embedding width, so this no longer breaks
    # if the tables are created with a width other than 15.
    user_vector = tf.reshape(user_embeddings[user_id], shape=(1, -1))
    return tf.matmul(user_vector, tf.transpose(movie_embeddings))

# Scores of all movies for the user stored at row 24 of user_embeddings.
predict(24)