<a href="https://colab.research.google.com/github/bellazeng2016/playground/blob/main/recommendation_sys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q --upgrade tensorflow-datasets

In [2]:
import tensorflow_datasets as tfds

In [4]:
# Fetch MovieLens 100k ratings through TFDS: the data is downloaded, stored as
# `tfrecord` files, and read back as a `tf.data.Dataset`.
# MovieLens has no predefined train/test split, so TFDS exposes everything
# under `train`; we load that split in full and split it manually later.
ratings_dataset, ratings_dataset_info = tfds.load(
    'movielens/100k-ratings',
    split='train',
    with_info=True,
)

[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to ~/tensorflow_datasets/movielens/100k-ratings/0.1.1...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/100000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/movielens/100k-ratings/0.1.1.incomplete1Z02DH/movielens-train.tfrecord*...:   …

[1mDataset movielens downloaded and prepared to ~/tensorflow_datasets/movielens/100k-ratings/0.1.1. Subsequent calls will reuse this data.[0m


In [5]:
import tensorflow as tf

In [6]:
# Fail fast, with a readable message, if TFDS did not hand back a
# tf.data.Dataset: the following cells rely on map/shuffle/take/skip.
assert isinstance(ratings_dataset, tf.data.Dataset), (
    f'expected a tf.data.Dataset, got {type(ratings_dataset).__name__}'
)

In [9]:
# Use the built-in len() rather than calling the __len__ dunder directly;
# the printed text is unchanged.
print(f'ratings_dataset size is {len(ratings_dataset)}')

ratings_dataset size is 100000


In [13]:
# Preview the first five raw examples. Leaving the DataFrame as the cell's
# last expression lets the notebook render it as a rich table instead of the
# line-wrapped plain text produced by print().
tfds.as_dataframe(ratings_dataset.take(5), ratings_dataset_info)

   bucketized_user_age movie_genres movie_id  \
0                 45.0          [7]   b'357'   
1                 25.0      [4, 14]   b'709'   
2                 18.0          [4]   b'412'   
3                 50.0       [5, 7]    b'56'   
4                 50.0     [10, 16]   b'895'   

                                 movie_title  raw_user_age  timestamp  \
0  b"One Flew Over the Cuckoo's Nest (1975)"          46.0  879024327   
1                b'Strictly Ballroom (1992)'          32.0  875654590   
2             b'Very Brady Sequel, A (1996)'          24.0  882075110   
3                     b'Pulp Fiction (1994)'          50.0  883326919   
4                         b'Scream 2 (1997)'          55.0  891409199   

   user_gender user_id  user_occupation_label user_occupation_text  \
0         True  b'138'                      4            b'doctor'   
1         True   b'92'                      5     b'entertainment'   
2         True  b'301'                     17           b'stud

In [16]:
# Feature selection: project every example onto the five features kept below.
ratings_dataset = ratings_dataset.map(
    lambda rating: {
        feature: rating[feature]
        for feature in (
            'user_id',
            'movie_id',
            'movie_title',
            'user_rating',
            'timestamp',
        )
    }
)

In [19]:
# Preview the first five examples after feature selection.
tfds.as_dataframe(
    ratings_dataset.take(5),
    ds_info=ratings_dataset_info,
)

Unnamed: 0,movie_id,movie_title,timestamp,user_id,user_rating
0,b'357',"b""One Flew Over the Cuckoo's Nest (1975)""",879024327,b'138',4.0
1,b'709',b'Strictly Ballroom (1992)',875654590,b'92',2.0
2,b'412',"b'Very Brady Sequel, A (1996)'",882075110,b'301',4.0
3,b'56',b'Pulp Fiction (1994)',883326919,b'60',4.0
4,b'895',b'Scream 2 (1997)',891409199,b'197',3.0


In [24]:
## split dataset randomly (80% for train and 20% for test)
# `trainset_size` is later used as the element count for take()/skip(), which
# is an integer quantity, so convert the 80% share to an int here instead of
# leaving it as a float.
trainset_size = int(0.8 * len(ratings_dataset))

tf.random.set_seed(42)
# Shuffle the elements once with a fixed seed. The buffer holds 100,000
# elements, which matches the dataset size printed above.
# `reshuffle_each_iteration=False` keeps the shuffled order fixed across
# iterations, so the take()/skip() split below yields disjoint train and test
# sets.
ratings_dataset_shuffled = ratings_dataset.shuffle(
    buffer_size=100_000,
    seed=42,
    reshuffle_each_iteration=False,
)

In [27]:
# Peek at the first ten examples of the shuffled dataset.
tfds.as_dataframe(
    ratings_dataset_shuffled.take(count=10)
)

Unnamed: 0,movie_id,movie_title,timestamp,user_id,user_rating
0,b'898',"b'Postman, The (1997)'",885409515,b'681',4.0
1,b'367',b'Clueless (1995)',883388887,b'442',2.0
2,b'484',"b'Maltese Falcon, The (1941)'",891249586,b'932',5.0
3,b'494',b'His Girl Friday (1940)',878044851,b'506',5.0
4,b'58',b'Quiz Show (1994)',880130613,b'18',4.0
5,b'76',"b""Carlito's Way (1993)""",892778202,b'551',4.0
6,b'591',b'Primal Fear (1996)',879959212,b'12',5.0
7,b'95',b'Aladdin (1992)',877131685,b'901',4.0
8,b'160',b'Glengarry Glen Ross (1992)',891034219,b'835',3.0
9,b'313',b'Titanic (1997)',885328727,b'284',3.0


In [29]:
# The first `trainset_size` shuffled examples form the training set; the
# remaining examples form the test set.
ratings_trainset = ratings_dataset_shuffled.take(count=trainset_size)
ratings_testset = ratings_dataset_shuffled.skip(count=trainset_size)

In [30]:
# Use the built-in len() rather than calling the __len__ dunder directly;
# the printed text is unchanged.
print(f'training set size is {len(ratings_trainset)}')

training set size is 80000


In [31]:
# Use the built-in len() rather than calling the __len__ dunder directly;
# the printed text is unchanged.
print(f'test set size is {len(ratings_testset)}')

test set size is 20000


## Preprocess raw features and make embeddings with Keras preprocessing layers

* **Numerical features** (ratings, prices, timestamps, etc.) need to be normalized so that their values lie in a small interval around 0.
* **Categorical features** (ids, usernames/emails/titles) are usually string features and have to be translated into embedding vectors (numerical feature representations).
* **Text features** (descriptions, comments, etc.) first need to be tokenized and then translated into embeddings.


In [33]:
from pprint import pprint
# Pretty-print one training example as a dict of NumPy/Python values.
for rating in ratings_trainset.take(count=1).as_numpy_iterator():
    pprint(rating)

{'movie_id': b'898',
 'movie_title': b'Postman, The (1997)',
 'timestamp': 885409515,
 'user_id': b'681',
 'user_rating': 4.0}


### Normalize numeric features

In [35]:
# Keras Normalization layer used to standardize a numerical feature.
# It is created through the stable `tf.keras.layers` path; the
# `experimental.preprocessing` alias is deprecated and absent from current
# Keras releases.
# `axis=None` requests a single scalar mean/variance over all input values.
timestamp_normalization_layer = tf.keras.layers.Normalization(axis=None)

In [36]:
# Fit the normalization statistics on the training timestamps only.
timestamp_normalization_layer.adapt(
    ratings_trainset.map(lambda rating: rating['timestamp'])
)

In [37]:
# Show raw vs. normalized timestamps for the first five training examples.
for rating in ratings_trainset.take(count=5).as_numpy_iterator():
    print(
        f"raw timestampe: {rating['timestamp']} ->",
        f"normalized timestamp: {timestamp_normalization_layer(rating['timestamp'])}",
    )

raw timestampe: 885409515 -> normalized timestamp: 0.3537561595439911
raw timestampe: 883388887 -> normalized timestamp: -0.02487170137465
raw timestampe: 891249586 -> normalized timestamp: 1.4480509757995605
raw timestampe: 878044851 -> normalized timestamp: -1.0262154340744019
raw timestampe: 880130613 -> normalized timestamp: -0.6353915929794312


### Turning categorical features into embeddings

In [38]:
## 1. Build a mapping (called the 'vocabulary') from raw string ids to integers
# The layer is created through the stable `tf.keras.layers` path; the
# `experimental.preprocessing` alias is deprecated and absent from current
# Keras releases.
# `mask_token=None`: no vocabulary entry is reserved for masking.
user_id_lookup_layer = tf.keras.layers.StringLookup(mask_token=None)

In [39]:
# Learn the user-id vocabulary from the training split.
user_id_lookup_layer.adapt(
    ratings_trainset.map(lambda rating: rating['user_id'])
)

In [40]:
# Show the first ten entries of the vocabulary learned by adapt().
print(f"Vocabulary[:10] -> {user_id_lookup_layer.get_vocabulary()[:10]}")

Vocabulary[:10] -> ['[UNK]', '405', '655', '13', '450', '276', '303', '416', '537', '234']


In [41]:
# Look up a mix of known ids ('13', '655') and made-up ids ('-2', 'xxx') to
# see how strings outside the vocabulary are mapped.
print(
    "Mapped integer for user ids: ['-2', '13', '655', 'xxx']\n",
    user_id_lookup_layer(['-2', '13', '655', 'xxx']),
)

Mapped integer for user ids: ['-2', '13', '655', 'xxx']
 tf.Tensor([0 3 2 0], shape=(4,), dtype=int64)


In [45]:
## 2. Turn these integers into embedding vectors
user_id_embedding_dim = 32
user_id_embedding_layer = tf.keras.layers.Embedding(
    input_dim=user_id_lookup_layer.vocabulary_size(),  # size of the vocab
    output_dim=user_id_embedding_dim,  # dimension of the dense embedding
)

In [46]:
# Chain lookup and embedding: raw string user ids go in, embeddings come out.
user_id_model = tf.keras.Sequential(
    [user_id_lookup_layer, user_id_embedding_layer]
)

In [47]:
# Run the same sample ids through the full model to see their embeddings.
print(
    "Embeddings for user ids: ['-2', '13', '655', 'xxx']\n",
    user_id_model(['-2', '13', '655', 'xxx']),
)



Embeddings for user ids: ['-2', '13', '655', 'xxx']
 tf.Tensor(
[[ 0.01645621 -0.00589932 -0.01471175 -0.00355174 -0.04663396  0.01846724
   0.02401174  0.03724445 -0.02736737 -0.02768031 -0.01896119  0.02223358
  -0.03668128  0.00480639  0.00746088  0.03996835 -0.04905364  0.00212307
   0.01345445 -0.03006717  0.02294225  0.00458346 -0.03924345  0.01767061
   0.01602763 -0.01630496  0.01014177 -0.02893742  0.03527372 -0.00593783
   0.04485276 -0.02624741]
 [ 0.04355587 -0.04048269 -0.04138212  0.01247839 -0.01294935  0.00139042
   0.01233207  0.03024682 -0.03334862 -0.02790955  0.01242272 -0.04128085
   0.04214266  0.04348017  0.01045523 -0.00205957 -0.03556986 -0.01739997
   0.04255753  0.02757342  0.0136765   0.01282351 -0.01459817 -0.00855327
  -0.03894869 -0.0358853  -0.03112409  0.01894793  0.02213276 -0.02511839
   0.00912381 -0.00024097]
 [ 0.00016055 -0.03171784 -0.03682018  0.01463613  0.04559476  0.01670735
  -0.01924447 -0.01310781 -0.0052641  -0.03164054  0.00288255  0.020

In [48]:
## do the same thing for movie id

# 1. Vocabulary: map raw string movie ids to integer indices. The layer is
# created through the stable `tf.keras.layers` path; the
# `experimental.preprocessing` alias is deprecated and absent from current
# Keras releases.
movie_id_lookup_layer = tf.keras.layers.StringLookup(mask_token=None)
# Learn the vocabulary from the training split.
movie_id_lookup_layer.adapt(
    ratings_trainset.map(
        lambda x: x['movie_id']
    )
)
movie_id_embedding_dim = 32

# 2. Embedding table sized by the learned vocabulary.
movie_id_embedding_layer = tf.keras.layers.Embedding(
    input_dim=movie_id_lookup_layer.vocabulary_size(),
    output_dim=movie_id_embedding_dim
)
# 3. Model that takes raw string movie ids in and yields embeddings.
movie_id_model = tf.keras.Sequential(
    [
        movie_id_lookup_layer,
        movie_id_embedding_layer
    ]
)

## Tokenize textual features and translate them into embeddings

In [None]:
# Text vectorization layer for movie titles, created with default settings.
# It uses the stable `tf.keras.layers` path; the `experimental.preprocessing`
# alias is deprecated and absent from current Keras releases.
movie_title_vectorization_layer = tf.keras.layers.TextVectorization()