In [53]:
import pandas as pd
import pprint
from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs

In [2]:
data = pd.read_csv("../data/dataset_TSMC2014_NYC.csv")
data.head()

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,Tue Apr 03 18:02:41 +0000 2012
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012


In [3]:
df = data[['userId', 'venueId']]
df.head()

Unnamed: 0,userId,venueId
0,470,49bbd6c0f964a520f4531fe3
1,979,4a43c0aef964a520c6a61fe3
2,69,4c5cc7b485a1e21e00d35711
3,395,4bc7086715a7ef3bef9878da
4,87,4cf2c5321d18a143951b5cec


In [4]:
df2 = df.groupby(["userId", "venueId"])["userId"].count().reset_index(name="user_checkin_count")
df2

Unnamed: 0,userId,venueId,user_checkin_count
0,1,3fd66200f964a52035e81ee3,1
1,1,3fd66200f964a52048e31ee3,1
2,1,3fd66200f964a52048e81ee3,1
3,1,3fd66200f964a52074e71ee3,1
4,1,3fd66200f964a52075ea1ee3,1
...,...,...,...
91019,1083,4f19b2bbe4b0808f62845026,3
91020,1083,4f2a6d55e4b04e33ba6416f9,1
91021,1083,4f34533f754a657ca23ac8d7,1
91022,1083,4f7eff09e4b02164d9a1a52c,2


In [None]:
df2.loc[(df2["userId"]==1) & (df2["venueId"]=="4f3283f0e4b057434d8fdc81")]

In [None]:
df2.loc[(df2["user_checkin_count"]==257)]

In [5]:
def normalize(df, col_name):
    result = df.copy()
    max_value = df[col_name].max()
    min_value = df[col_name].min()
    result[col_name] = (df[col_name] - min_value) / (max_value - min_value)
    return result

In [6]:
df2 = normalize(df2, "user_checkin_count")
df2

Unnamed: 0,userId,venueId,user_checkin_count
0,1,3fd66200f964a52035e81ee3,0.000000
1,1,3fd66200f964a52048e31ee3,0.000000
2,1,3fd66200f964a52048e81ee3,0.000000
3,1,3fd66200f964a52074e71ee3,0.000000
4,1,3fd66200f964a52075ea1ee3,0.000000
...,...,...,...
91019,1083,4f19b2bbe4b0808f62845026,0.007812
91020,1083,4f2a6d55e4b04e33ba6416f9,0.000000
91021,1083,4f34533f754a657ca23ac8d7,0.000000
91022,1083,4f7eff09e4b02164d9a1a52c,0.003906


In [7]:
users = df2
users['userId'] = users["userId"].astype("str")
users["venueId"] = users["venueId"].astype("str")
users["user_checkin_count"] = users["user_checkin_count"].astype("str")
users.head(5)

Unnamed: 0,userId,venueId,user_checkin_count
0,1,3fd66200f964a52035e81ee3,0.0
1,1,3fd66200f964a52048e31ee3,0.0
2,1,3fd66200f964a52048e81ee3,0.0
3,1,3fd66200f964a52074e71ee3,0.0
4,1,3fd66200f964a52075ea1ee3,0.0


In [31]:
unique_user_ids = users["userId"].unique()
unique_user_ids_df = pd.DataFrame({'userId': unique_user_ids})

unique_user_ids_tensor = tf.data.Dataset.from_tensor_slices(dict(unique_user_ids_df))

In [32]:
unique_venue_ids = data["venueId"].unique()
unique_venue_ids_df = pd.DataFrame({'venueId': unique_venue_ids})

unique_venue_ids_tensor = tf.data.Dataset.from_tensor_slices(dict(unique_venue_ids_df))

In [33]:
ratings = tf.data.Dataset.from_tensor_slices(dict(users))
ratings

<TensorSliceDataset element_spec={'userId': TensorSpec(shape=(), dtype=tf.string, name=None), 'venueId': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_checkin_count': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [34]:
for x in ratings.take(1).as_numpy_iterator():
    pprint.pprint(x)

{'userId': b'1',
 'user_checkin_count': b'0.0',
 'venueId': b'3fd66200f964a52035e81ee3'}


In [43]:
venues_df = data[["venueId"]]
# venues["venueId"] = venues["venueId"].astype("str")

venues_tensor = tf.data.Dataset.from_tensor_slices(dict(venues_df))
for x in venues_tensor.take(1).as_numpy_iterator():
    pprint.pprint(x)

{'venueId': b'49bbd6c0f964a520f4531fe3'}


In [49]:
ratings = ratings.map(lambda x: {
    "userId": x["userId"],
    "venueId": x["venueId"],
    "user_checkin_count": x["user_checkin_count"]
})
venues = venues_tensor.map(lambda x: x["venueId"])

In [36]:
# let's use a random split, putting 75% of the ratings in the train set, and 25% in the test set:
# Assign a seed=42 for consistency of results and reproducibility:
seed = 42
l = len(ratings)

tf.random.set_seed(seed)
shuffled = ratings.shuffle(l, seed=seed, reshuffle_each_iteration=False)

#Save 75% of the data for training and 25% for testing:
train_ = int(0.75 * l)
test_ = int(0.25 * l)

train = shuffled.take(train_)
test = shuffled.skip(train_).take(test_)

In [37]:
embedding_dimension = 32

In [38]:
user_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_user_ids, mask_token=None),
  # We add an additional embedding to account for unknown tokens.
  tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])

In [40]:
venue_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_venue_ids, mask_token=None),
  tf.keras.layers.Embedding(len(unique_venue_ids) + 1, embedding_dimension)
])

In [47]:
type(venues_tensor)

tensorflow.python.data.ops.dataset_ops.TensorSliceDataset

In [50]:
metrics = tfrs.metrics.FactorizedTopK(
  candidates=venues.batch(128).map(venue_model)
)

In [51]:
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [59]:
class VenueRecoModel(tfrs.Model):

    def __init__(self, user_model, venue_model):
        super().__init__()
        self.venue_model: tf.keras.Model = venue_model
        self.user_model: tf.keras.Model = user_model
        self.task: tf.keras.layers.Layer = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["userId"])
        # And pick out the movie features and pass them into the movie model,
        # getting embeddings back.
        positive_venue_embeddings = self.venue_model(features["venueId"])

        # The task computes the loss and the metrics.
        return self.task(user_embeddings, positive_venue_embeddings)

In [60]:
model = VenueRecoModel(user_model, venue_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [63]:
cached_train = train.shuffle(1).batch(8192).cache()
cached_test = test.batch(2048).cache()

In [64]:
model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x290d8b280>

In [65]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0,
 'factorized_top_k/top_5_categorical_accuracy': 0.0,
 'factorized_top_k/top_10_categorical_accuracy': 0.0,
 'factorized_top_k/top_50_categorical_accuracy': 4.3944455683231354e-05,
 'factorized_top_k/top_100_categorical_accuracy': 0.0002636667340993881,
 'loss': 1319.135986328125,
 'regularization_loss': 0,
 'total_loss': 1319.135986328125}

In [67]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# recommends movies out of the entire movies dataset.
# recommends movies out of the entire movies dataset.
index.index(venues.batch(100).map(model.venue_model), venues)



<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x29ad22080>

In [68]:
# Get recommendations.
_, venues = index(tf.constant(["42"]))
print(f"Recommendations for user 42: {venues[0, :5]}")


Recommendations for user 42: [b'4cd5fe292944b1f744a760ec' b'4e32a78cae60f21828d63591'
 b'50097824e4b038aa5007646e' b'4bb3e6e3643cd13a3c27395c'
 b'4bfc2271e05e0f47043fcfa8']
