In [1]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
import tempfile
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils
import apache_beam as beam

In [2]:
tfds.list_builders()


['abstract_reasoning',
 'accentdb',
 'aeslc',
 'aflw2k3d',
 'ag_news_subset',
 'ai2_arc',
 'ai2_arc_with_ir',
 'amazon_us_reviews',
 'anli',
 'arc',
 'bair_robot_pushing_small',
 'bccd',
 'beans',
 'big_patent',
 'bigearthnet',
 'billsum',
 'binarized_mnist',
 'binary_alpha_digits',
 'blimp',
 'bool_q',
 'c4',
 'caltech101',
 'caltech_birds2010',
 'caltech_birds2011',
 'cars196',
 'cassava',
 'cats_vs_dogs',
 'celeb_a',
 'celeb_a_hq',
 'cfq',
 'cherry_blossoms',
 'chexpert',
 'cifar10',
 'cifar100',
 'cifar10_1',
 'cifar10_corrupted',
 'citrus_leaves',
 'cityscapes',
 'civil_comments',
 'clevr',
 'clic',
 'clinc_oos',
 'cmaterdb',
 'cnn_dailymail',
 'coco',
 'coco_captions',
 'coil100',
 'colorectal_histology',
 'colorectal_histology_large',
 'common_voice',
 'coqa',
 'cos_e',
 'cosmos_qa',
 'covid19',
 'covid19sum',
 'crema_d',
 'curated_breast_imaging_ddsm',
 'cycle_gan',
 'd4rl_adroit_door',
 'd4rl_adroit_hammer',
 'd4rl_adroit_pen',
 'd4rl_adroit_relocate',
 'd4rl_mujoco_ant',
 'd4

In [3]:
# movielens/25m-ratings
ds = tfds.load('movielens/latest-small-ratings', split='train', shuffle_files=True).prefetch(1024)

2022-01-23 17:18:07.221366: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-01-23 17:18:07.347208: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-01-23 17:18:07.348188: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-01-23 17:18:07.350475: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate

In [4]:
df = tfds.as_dataframe(ds.take(100))

2022-01-23 17:18:09.655002: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [5]:
df.head()

Unnamed: 0,movie_genres,movie_id,movie_title,timestamp,user_id,user_rating
0,"[7, 8, 13, 15]",b'4874',b'K-PAX (2001)',1446749868,b'105',5.0
1,"[7, 18]",b'527',"b""Schindler's List (1993)""",1305696664,b'17',4.5
2,"[5, 9]",b'7943',"b'Killers, The (1946)'",1166068511,b'309',4.0
3,"[10, 13, 16]",b'1644',b'I Know What You Did Last Summer (1997)',1518640852,b'111',0.5
4,"[1, 2, 3, 4, 12, 14]",b'8360',b'Shrek 2 (2004)',1127221149,b'182',3.0


In [6]:
raw_data = ({
        k : elem[k].numpy()
        for k in ['movie_id', 'user_id', 'user_rating']
    } for elem in ds    
)

In [7]:
def preprocessing_fn(inputs):
    return {
        "movie_id": tft.compute_and_apply_vocabulary(inputs["movie_id"], vocab_filename="movies_vocabulary"),
        "user_id": tft.compute_and_apply_vocabulary(inputs["user_id"], vocab_filename="users_vocabulary"),
        "user_rating": tft.scale_to_0_1(inputs ["user_rating"])
    }
   
raw_data_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec({
        'movie_id': tf.io.FixedLenFeature([], tf.string),
        'user_id': tf.io.FixedLenFeature([], tf.string),
        'user_rating': tf.io.FixedLenFeature([], tf.float32),
    }))

def make_map_map(k1, k2):
    def map_map(v):
        for x in v[1]:
            yield (v[0], x[k2], x['user_rating'])
    return map_map


with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_data, transform_fn = ( (raw_data, raw_data_metadata) | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
    transformed_data, _     = transformed_data
    grouped_by_user_id      = (transformed_data | beam.GroupBy(lambda v: int(v['user_id'])))
    grouped_by_user_id      = (grouped_by_user_id) | beam.ParDo(make_map_map('user_id', 'movie_id'))

grouped_by_user_id








Instructions for updating:
Use ref() instead.


Instructions for updating:
Use ref() instead.




2022-01-23 17:18:53.594221: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: /tmp/tmpsguinmr9/tftransform_tmp/59fd6b7a41c44ae897f84d25a13c1540/assets


INFO:tensorflow:Assets written to: /tmp/tmpsguinmr9/tftransform_tmp/59fd6b7a41c44ae897f84d25a13c1540/assets


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:Assets written to: /tmp/tmpsguinmr9/tftransform_tmp/ed44da1fa5f649d99c216418bed8e3bd/assets


INFO:tensorflow:Assets written to: /tmp/tmpsguinmr9/tftransform_tmp/ed44da1fa5f649d99c216418bed8e3bd/assets


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


[(24, 788, 1.0),
 (24, 1677, 0.7777778),
 (24, 8925, 1.0),
 (24, 1567, 0.7777778),
 (24, 272, 0.8888889),
 (24, 1108, 0.5555556),
 (24, 186, 0.8888889),
 (24, 674, 0.7777778),
 (24, 130, 0.7777778),
 (24, 66, 0.6666667),
 (24, 361, 0.7777778),
 (24, 202, 0.6666667),
 (24, 807, 0.7777778),
 (24, 688, 1.0),
 (24, 6224, 0.8888889),
 (24, 6646, 0.8888889),
 (24, 3508, 0.8888889),
 (24, 62, 0.6666667),
 (24, 9249, 0.8888889),
 (24, 9126, 1.0),
 (24, 1145, 0.5555556),
 (24, 9591, 0.8888889),
 (24, 439, 0.7777778),
 (24, 260, 0.6666667),
 (24, 1367, 0.6666667),
 (24, 1754, 0.6666667),
 (24, 430, 0.7777778),
 (24, 93, 0.7777778),
 (24, 3764, 0.6666667),
 (24, 2834, 0.8888889),
 (24, 259, 0.7777778),
 (24, 1375, 0.8888889),
 (24, 3370, 0.8888889),
 (24, 444, 0.6666667),
 (24, 4201, 0.7777778),
 (24, 7411, 0.8888889),
 (24, 1020, 0.6666667),
 (24, 2283, 0.8888889),
 (24, 670, 0.6666667),
 (24, 168, 0.7777778),
 (24, 944, 0.7777778),
 (24, 789, 0.6666667),
 (24, 1047, 1.0),
 (24, 4190, 1.0),
 (24

In [8]:
grouped_by_user_id = np.array(grouped_by_user_id)
grouped_by_user_id

grouped_by_user_id[:, [0, 1]].astype(int)

array([[  24,  788],
       [  24, 1677],
       [  24, 8925],
       ...,
       [ 511,  633],
       [ 511,  212],
       [ 511,  476]])

In [36]:
indices = grouped_by_user_id[:, [0, 1]].astype(int)
values = grouped_by_user_id[:, 2].flatten()

users = np.max(indices[:, 0].flatten())
movies = np.max(indices[:, 1].flatten())

user_ratings_matrix = tf.SparseTensor(
    indices = indices, 
    values = values, 
    dense_shape = (users, movies)
)

def loss_fn(user_ratings_matrix, user_embeddings, movies_embeddings):

    user_embeddings = tf.gather(user_embeddings, user_ratings_matrix.indices[:, 0])
    movies_embeddings = tf.gather(movie_embeddings, user_ratings_matrix.indices[:, 1])

    return tf.losses.mean_squared_error(
        user_ratings_matrix.values,
        tf.reduce_sum(user_embeddings * movies_embeddings, axis = 1)
    )

user_embeddings = tf.Variable(tf.keras.initializers.HeNormal()(shape=(users, 15)))
movie_embeddings = tf.Variable(tf.keras.initializers.HeNormal()(shape=(movies, 15)))

print(user_embeddings.shape)
print(movie_embeddings.shape)

print(tf.gather(user_embeddings, user_ratings_matrix.indices[:, 0]).shape)
print(tf.gather(movie_embeddings, user_ratings_matrix.indices[:, 1]).shape)

loss_fn(user_ratings_matrix, user_embeddings, movie_embeddings)

(609, 15)
(9723, 15)
(100836, 15)
(100836, 15)


<tf.Tensor: shape=(), dtype=float32, numpy=0.49858186>

In [62]:
epochs = 10000
for epoch in range(epochs):
    with tf.GradientTape() as tape:
        ls = loss_fn(user_ratings_matrix, user_embeddings, movie_embeddings)
        
    gradients = tape.gradient(ls, [user_embeddings, movie_embeddings])

    g = [tf.IndexedSlices(
            indices=g.indices,
            values=g.values * 50,
            dense_shape=g.dense_shape
    ) for g in gradients]

    user_embeddings.assign_sub(g[0])
    movie_embeddings.assign_sub(g[1])

    if (epoch % 1000 == 0): print(ls.numpy())

0.4985145
0.015025715
0.011500259
0.010258061
0.00961527
0.009210411
0.008924202
0.008707679
0.008536415
0.0083966255


In [73]:
def predict(user_id):
    return tf.matmul(tf.reshape(user_embeddings[user_id], shape=(1,15)), tf.transpose(movie_embeddings))

predict(24)


<tf.Tensor: shape=(1, 9723), dtype=float32, numpy=
array([[0.79733914, 0.8699984 , 0.9116251 , ..., 0.7316714 , 0.6028092 ,
        0.57951206]], dtype=float32)>