#Sistema de recomendación TF sin ratings

Este ejemplo es muy similar al del archivo modelado en TF del repositorio pero en este no tendremos solo tendremos en cuenta datos de calificación positiva es decir cuya calificación sea superior a 3 estrellas. Por este motivo obviaremos mucha documentación de los pasos realizados. Leemos los datasets de tfrecords previamente desarrollados

In [1]:
import tensorflow as tf

filenames = ['traindata.tfrecord']
raw_dataset = tf.data.TFRecordDataset(filenames)
raw_dataset_test = tf.data.TFRecordDataset(['testdata.tfrecord'])
raw_dataset

<TFRecordDatasetV2 shapes: (), types: tf.string>

In [2]:
# Create a description of the features.
feature_description = {
    'review_id': tf.io.FixedLenFeature([], tf.string, default_value=''),
    'product_id': tf.io.FixedLenFeature([], tf.string, default_value=''),
    'reviewer_id': tf.io.FixedLenFeature([], tf.string, default_value=''),
    'stars': tf.io.FixedLenFeature([], tf.int64, default_value=0),
    'review_body': tf.io.FixedLenFeature([], tf.string, default_value=''),
    'review_title': tf.io.FixedLenFeature([], tf.string, default_value=''),
    'language': tf.io.FixedLenFeature([], tf.string, default_value=''),
    'product_category': tf.io.FixedLenFeature([], tf.string, default_value=''),
}

def _parse_function(example_proto):
  # Parse the input `tf.train.Example` proto using the dictionary above.
  return tf.io.parse_single_example(example_proto, feature_description)

# mapper en paralelo para optimizar tiempo de ejecución
parsed_dataset = raw_dataset.map(_parse_function)
parsed_test_dataset = raw_dataset_test.map(_parse_function)

parsed_dataset

<MapDataset shapes: {language: (), product_category: (), product_id: (), review_body: (), review_id: (), review_title: (), reviewer_id: (), stars: ()}, types: {language: tf.string, product_category: tf.string, product_id: tf.string, review_body: tf.string, review_id: tf.string, review_title: tf.string, reviewer_id: tf.string, stars: tf.int64}>

Filtramos los datos de los datos que si deberían recomendarse


In [3]:
import pprint
def dataset_filter_stars(ds):
  return ds.filter(lambda x: x['stars'] > 4)


parsed_dataset = parsed_dataset.apply(dataset_filter_stars)

parsed_dataset = parsed_dataset.map(lambda x: {
    'reviewer_id':x['reviewer_id'], 
    'product_id':x['product_id']
})

parsed_test_dataset = parsed_test_dataset.map(lambda x: {
    'reviewer_id':x['reviewer_id'], 
    'product_id':x['product_id']
})

In [4]:
parsed_test_dataset

<MapDataset shapes: {reviewer_id: (), product_id: ()}, types: {reviewer_id: tf.string, product_id: tf.string}>

In [5]:
train = parsed_dataset
test = parsed_test_dataset

In [6]:
import numpy as np

products = parsed_dataset.batch(1000).map(lambda x: x["product_id"])
user_ids = parsed_dataset.batch(1000).map(lambda x: x["reviewer_id"])

unique_products = np.unique(np.concatenate(list(products)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

unique_products[:10]

array([b'product_de_0000060', b'product_de_0000070',
       b'product_de_0000122', b'product_de_0000192',
       b'product_de_0000260', b'product_de_0000295',
       b'product_de_0000322', b'product_de_0000349',
       b'product_de_0000418', b'product_de_0000527'], dtype=object)

In [7]:
embedding_dimension = 32 #hay que probar valores que mejor se ajusten

In [8]:
user_model = tf.keras.Sequential([
  tf.keras.layers.experimental.preprocessing.StringLookup(
      vocabulary=unique_user_ids, mask_token=None),
  # We add an additional embedding to account for unknown tokens.
  tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])

In [9]:
product_model = tf.keras.Sequential([
  tf.keras.layers.experimental.preprocessing.StringLookup(
      vocabulary=unique_products, mask_token=None),
  tf.keras.layers.Embedding(len(unique_products) + 1, embedding_dimension)
])

In [10]:
#!pip install -q tensorflow-recommenders
#!pip install -q --upgrade tensorflow-datasets
#!pip install -q scann

In [11]:
#import tensorflow_recommenders as tfrs

In [15]:
tmp = parsed_test_dataset.map(lambda x: x['product_id'])

In [16]:
import tensorflow_recommenders as tfrs

metrics = tfrs.metrics.FactorizedTopK(
  candidates= tmp.batch(128).map(product_model)
)

In [17]:
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [18]:
import os
import pprint
import tempfile

from typing import Dict, Text


class MovielensModel(tfrs.Model):

  def __init__(self, user_model, movie_model):
    super().__init__()
    self.movie_model: tf.keras.Model = movie_model
    self.user_model: tf.keras.Model = user_model
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["reviewer_id"])
    # And pick out the movie features and pass them into the movie model,
    # getting embeddings back.
    positive_movie_embeddings = self.movie_model(features["product_id"])

    # The task computes the loss and the metrics.
    return self.task(user_embeddings, positive_movie_embeddings)

In [19]:
model = MovielensModel(user_model, product_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [20]:
train

<MapDataset shapes: {reviewer_id: (), product_id: ()}, types: {reviewer_id: tf.string, product_id: tf.string}>

In [21]:
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4100).cache()

In [22]:
model.fit(cached_train, epochs=3)

Epoch 1/3
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f46d94aced0>

In [23]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_100_categorical_accuracy': 0.9724000096321106,
 'factorized_top_k/top_10_categorical_accuracy': 0.0013333333190530539,
 'factorized_top_k/top_1_categorical_accuracy': 0.00013333333481568843,
 'factorized_top_k/top_50_categorical_accuracy': 0.007333333138376474,
 'factorized_top_k/top_5_categorical_accuracy': 0.0006666666595265269,
 'loss': 27647.16796875,
 'regularization_loss': 0,
 'total_loss': 27647.16796875}

## Conclusiones
Se observa que este acercamiento no es muy bueno para la recomendación porque hay demasiados datos nulos y no pueden y el algoritmo intenta asignar un producto sin tener en cuenta muchas cosas.