# Separate movie and user embedding into two layers

Previously, I manually concatenated the one-hot vectors for movies and users so that I could keep a simple DL pipeline architecture. This made it complicated to extract the embeddings for just the movies. I suspect I will have better luck with separate layers that I join after the embeddings.

In [1]:
import pandas as pd
import numpy as np
import re
np.set_printoptions(precision=2, suppress=True, linewidth=3000, threshold=20000)

def load(n = 10):
    """
    Load a random sample of MovieLens ratings joined with movie metadata.

    Reads `data/ml-latest-small/ratings.csv`, drops its `timestamp` column,
    samples `n` rows without replacement, and inner-joins `title` and
    `genres` from `data/ml-latest-small/movies.csv` on `movieId`. A trailing
    parenthesized four-digit year such as "(1999)" is removed from each title.

    Parameters
    ----------
    n : int
        Number of rating rows to sample. `DataFrame.sample` raises
        `ValueError` if `n` exceeds the number of rows in ratings.csv.

    Returns
    -------
    pandas.DataFrame
        Columns: userId, movieId, rating, title, genres.
    """
    df_ratings = pd.read_csv('data/ml-latest-small/ratings.csv')
    df_ratings = df_ratings.drop('timestamp', axis=1)
    df_ratings = df_ratings.sample(n=n).reset_index(drop=True)
    # Merge in the title and genres
    df_movies = pd.read_csv('data/ml-latest-small/movies.csv')
    df = df_ratings.merge(df_movies, on='movieId')
    # Strip only a trailing "(YYYY)" year. The pattern is anchored on the
    # parentheses so titles that legitimately end in a number ("Babylon 5")
    # keep it, and a year followed by stray whitespace is still removed.
    df['title'] = (df['title']
                   .str.replace(r'\s*\(\d{4}\)\s*$', '', regex=True)
                   .str.strip())
    return df

def compress_cats(df, colname):
    """
    Replace `df[colname]`, in place, with dense integer codes 1..n.

    Each distinct value is mapped to its rank among the sorted distinct
    values of the column (smallest -> 1). The frame is modified in place and
    nothing is returned.
    """
    as_category = df[colname].astype('category').cat.as_ordered()
    # cat.codes is 0-based; shift so IDs are encoded 1..n
    df[colname] = as_category.cat.codes + 1

df = load(n=20_000)
# Keep only titles whose genre string is exactly 'Comedy' or 'Drama'.
# Take an explicit copy: compress_cats() rewrites columns in place, and doing
# that on a boolean-mask slice triggers pandas' SettingWithCopyWarning.
df = df[df['genres'].isin(['Comedy', 'Drama'])].copy()
nmovies = df['movieId'].nunique()
compress_cats(df, 'movieId')
compress_cats(df, 'userId')
# NOTE: len(df) is the number of rating rows kept by the filter; the number
# of distinct movies is nmovies.
print(len(df), 'movies that are Comedy or Drama')
df.head(3)

2641 movies that are Comedy or Drama


Unnamed: 0,userId,movieId,rating,title,genres
66,167,718,3.5,Borat: Cultural Learnings of America for Make ...,Comedy
67,430,718,4.5,Borat: Cultural Learnings of America for Make ...,Comedy
68,181,718,4.0,Borat: Cultural Learnings of America for Make ...,Comedy


In [2]:
# Distinct (movieId, title, genres) rows, ordered by movieId
df_movies = (
    df.sort_values('movieId')
      .loc[:, ['movieId', 'title', 'genres']]
      .drop_duplicates()
)

## Training a split then joined network

We are trying to map sparse movie and user ID vectors to dense vectors of, say, `dimensionality` = 8, 10, or 20 dimensions. To do that, we use the first parallel layers of a network, each of which has `dimensionality` neurons. There is one parallel layer for the movie embedding and one for the user ID embedding; their outputs are then joined and run through a pipeline for predicting ratings. Each neuron contributes a single dimension to each dense vector. The movie input X has, say, 10,000 rows, one for each one-hot-encoded movie ID, and nmovies columns: if there are 10 movies, there are 10 possible positions in the one-hot encoding. The first layer is a transformation from nmovies or nusers space to `dimensionality` space. The key is that we want to choke those first layers down to just a few neurons and then have a big layer afterwards that tries to make sense of the new compressed features. We don't care about the prediction at the end; we are just going to take the weights out of the first parallel layers to get the embeddings.

In [5]:
from tensorflow.keras import models, layers, callbacks, optimizers
import tensorflow_addons as tfa
from sklearn.metrics import accuracy_score, r2_score, mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt

def train(df,
          dimensionality = 8,
          otherlayers = (100,),
          batch_size = 10,
          epochs = 20,
          batchnorm = False,
          dropout = 0):
    """
    Fit a dense network that predicts `rating` from one-hot movie and user IDs.

    The one-hot encodings of `movieId` and `userId` are concatenated
    column-wise and fed to a first Dense layer named 'embedding' with
    `dimensionality` units, followed by the hidden layers in `otherlayers`
    and a single linear output unit. The model is trained with MSE loss,
    RMSprop, and MAE as the reported metric; 15% of the rows are held out
    for validation.

    NOTE(review): the surrounding notebook text describes separate movie and
    user embedding layers that are joined afterwards; this function still
    uses a single layer over the concatenated one-hot matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `movieId`, `userId` and `rating` columns.
    dimensionality : int
        Number of units in the first ('embedding') layer.
    otherlayers : iterable of int
        Width of each hidden Dense layer after the embedding layer.
    batch_size, epochs : int
        Passed through to `model.fit`.
    batchnorm : bool
        If true, add BatchNormalization after each hidden Dense layer.
    dropout : float
        Rate of the Dropout layer added after each hidden layer.

    Returns
    -------
    (model, history)
        The fitted Keras model and the History object returned by `fit`.
    """
    # Everybody get one hot!
    X_onehot = pd.concat([pd.get_dummies(df['movieId']),
                          pd.get_dummies(df['userId'])], axis=1)
    # Hand Keras plain float32 arrays rather than a DataFrame whose dummy
    # columns may be bool/uint8 depending on the pandas version.
    X = X_onehot.to_numpy(dtype=np.float32)
    y = df['rating'].to_numpy(dtype=np.float32)

    # Keras takes the validation split from the *last* rows, before any
    # shuffling. The merged frame is grouped by movie, so without this
    # permutation the validation set would consist of whole movies that the
    # network never saw during training.
    order = np.random.permutation(len(X))
    X, y = X[order], y[order]

    model = models.Sequential()
    model.add(layers.Dense(dimensionality, input_dim=X.shape[1], activation='relu',
                           name='embedding'))
    for n_hidden in otherlayers:
        model.add(layers.Dense(n_hidden, activation='relu'))
        if batchnorm:
            model.add(layers.BatchNormalization())
        model.add(layers.Dropout(dropout))

    model.add(layers.Dense(1))
    model.compile(loss='mean_squared_error',
                  optimizer=optimizers.RMSprop(),
                  metrics=['mae'])

    history = model.fit(X, y,
                        shuffle=True,
                        epochs=epochs,
                        validation_split=0.15,
                        batch_size=batch_size,
                        verbose=0,  # progress is reported by the TQDM callback instead
                        callbacks=[tfa.callbacks.TQDMProgressBar(show_epoch_progress=False)]
                        )
    return model, history

def plot_history(history):
    """
    Plot per-epoch training and validation MAE from a Keras History object.

    Reads the 'mae' and 'val_mae' series from `history.history` and draws
    them on one small figure with the y-axis fixed to 0..1.
    """
    plt.figure(figsize=(3.5,3))
    plt.ylabel("Rating (0..5.0) MAE")
    plt.xlabel("Epochs")
    # (key in history.history, legend label), drawn in this order
    curves = (('mae', 'train_mae'), ('val_mae', 'val_mae'))
    for metric_key, legend_label in curves:
        plt.plot(history.history[metric_key], label=legend_label)
    plt.ylim(0.0, 1.00)
    plt.legend(loc='lower right')
    plt.show()

In [None]:
# Train with train()'s default hyperparameters; keep the fitted model and its Keras History
model, history = train(df)

HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=20.0, style=Progr…

In [None]:
# Plot training vs. validation MAE per epoch for the run above
plot_history(history)