In [None]:
---
title: Tensorflow2 实现 EmbeddingMLP
tags: 小书匠,tensorflow2,embedding,keras,movielens
grammar_cjkRuby: true
# renderNumberedHeading: true
---

[toc!]

# Tensorflow2 实现 EmbeddingMLP

## 数据预处理

In [20]:
%%bash

filename=ml-latest-small

rm -rf ${filename}
rm -rf ${filename}.zip
wget https://files.grouplens.org/datasets/movielens/${filename}.zip --no-check-certificate
unzip ${filename}.zip

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


--2021-04-01 15:37:43--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’

     0K .......... .......... .......... .......... ..........  5%  119K 8s
    50K .......... .......... .......... .......... .......... 10%  238K 5s
   100K .......... .......... .......... .......... .......... 15% 5.51M 3s
   150K .......... .......... .......... .......... .......... 20%  230K 3s
   200K .......... .......... .......... .......... .......... 26% 3.71M 2s
   250K .......... .......... .......... .......... .......... 31%  258K 2s
   300K .......... .......... .......... .......... .......... 36% 1.98M 2s
   350K .......... .......... .......... ...

In [21]:
import tensorflow as tf

datapaths = {
    "ratings": "ml-latest-small/ratings.csv",
    "movies": "ml-latest-small/movies.csv",
}

In [22]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

ratings = pd.read_csv(datapaths['ratings'])
ratings['click'] = ratings['rating'].apply(lambda x: 1 if x > 3.5 else 0)
ratings.drop(["rating"], inplace=True, axis=1)
scaler = StandardScaler()
ratings[['timestamp']] = scaler.fit_transform(ratings[['timestamp']]) # 时间归一化

movies = pd.read_csv(datapaths['movies'])
genres =  pd.DataFrame(list(movies['genres'].str.split('|').values))
genres.columns = ["genres_{}".format(column_name) for column_name in genres.columns]
movies = pd.concat([movies, genres], axis=1, sort=False)
movies.drop(["genres", "title",], inplace=True, axis=1)

In [23]:
df = pd.merge(ratings, movies, on='movieId')

In [24]:
import itertools

genre_set = set(itertools.chain(*genres.values.tolist()))
genre_set.remove(None)
genre_vocab = list(genre_set)
user_vocab =  list(df['userId'].unique())
item_vocab =  list(df['movieId'].unique())

In [25]:
n_trainset = int(0.8 * df.shape[0])
traindf = df[:n_trainset]
testdf = df[n_trainset:]

In [26]:
traindf.to_csv("train.csv", index=None)
testdf.to_csv("test.csv", index=None)

## 构建数据集

In [27]:
# load sample as tf dataset
def get_dataset(file_path):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=12,
        label_name='click',
        na_value="0",
        num_epochs=1 # 我们在 train 中指定 epochs，因此这里是 1
    )
    return dataset

# split as test dataset and training dataset
train_dataset = get_dataset("train.csv")
test_dataset = get_dataset("test.csv")

In [28]:
EMBEDDING_DIM = 10
# all categorical features
categorical_columns = []

# genre features vocabulary
GENRE_FEATURES = {"genres_{}".format(i): genre_vocab for i in range(2)}
for feature, vocab in GENRE_FEATURES.items():
    cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=vocab)
    emb_col = tf.feature_column.embedding_column(cat_col, EMBEDDING_DIM)
    categorical_columns.append(emb_col)

In [29]:
# movie id embedding feature
movie_col = tf.feature_column.categorical_column_with_vocabulary_list(key='movieId', vocabulary_list=item_vocab)
movie_emb_col = tf.feature_column.embedding_column(movie_col, EMBEDDING_DIM)
categorical_columns.append(movie_emb_col)

In [30]:
# user id embedding feature
user_col = tf.feature_column.categorical_column_with_vocabulary_list(key='userId', vocabulary_list=user_vocab)
user_emb_col = tf.feature_column.embedding_column(user_col, EMBEDDING_DIM)
categorical_columns.append(user_emb_col)

In [31]:
# all numerical features
numerical_columns = [
    tf.feature_column.numeric_column('timestamp'),
]

In [32]:
# embedding + MLP model architecture
model = tf.keras.Sequential([
    tf.keras.layers.DenseFeatures(numerical_columns + categorical_columns),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# compile the model, set loss function, optimizer and evaluation metrics
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', tf.keras.metrics.AUC(curve='ROC'), tf.keras.metrics.AUC(curve='PR')]
)

# train the model
model.fit(train_dataset, epochs=5)

# evaluate the model
test_loss, test_accuracy, test_roc_auc, test_pr_auc = model.evaluate(test_dataset)
print('\n\nTest Loss {}, Test Accuracy {}, Test ROC AUC {}, Test PR AUC {}'.format(test_loss, test_accuracy,
                                                                                   test_roc_auc, test_pr_auc))

# print some predict results
predictions = model.predict(test_dataset)
for prediction, goodRating in zip(predictions[:12], list(test_dataset)[0][1][:12]):
    print("Predicted good rating: {:.2%}".format(prediction[0]),
          " | Actual rating label: ",
          ("Good Rating" if bool(goodRating) else "Bad Rating"))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
   1681/Unknown - 5s 3ms/step - loss: 0.6378 - accuracy: 0.6441 - auc_2: 0.6937 - auc_3: 0.5650- 5s 3ms/step - loss: 0.6376 - accuracy: 0.6442 - auc_2: 0.6939 - auc_3: 0.56

Test Loss 0.6378426959938693, Test Accuracy 0.6440896391868591, Test ROC AUC 0.6937441229820251, Test PR AUC 0.5650314092636108
Predicted good rating: 65.23%  | Actual rating label:  Good Rating
Predicted good rating: 54.12%  | Actual rating label:  Good Rating
Predicted good rating: 72.58%  | Actual rating label:  Good Rating
Predicted good rating: 71.26%  | Actual rating label:  Bad Rating
Predicted good rating: 82.23%  | Actual rating label:  Good Rating
Predicted good rating: 74.91%  | Actual rating label:  Good Rating
Predicted good rating: 84.56%  | Actual rating label:  Good Rating
Predicted good rating: 52.84%  | Actual rating label:  Good Rating
Predicted good rating: 1.19%  | Actual rating label:  Bad Rating
Predicted good rating: 53.50%  | Actual rating l

# References
1. http://localhost:8888/lab/tree/RecommenderSystem/EmbeddingMLP/Tensorflow2%20EmbeddingMLP.ipynb
2. https://github.com/wzhe06/SparrowRecSys/blob/90d20f84aa6184963290ee87b4766a82b1c1280e/TFRecModel/src/com/sparrowrecsys/offline/tensorflow/EmbeddingMLP.py