# Assignment 7 - Recommender Systems

by Bryan Carr

for University of San Diego's AAI 511 - Neural Networks and Deep Learning

13 August 2022


In this assignment, we will build recommender systems using deep learning. The dataset is a movie ratings dataset: the "ML-100k" Movielens data from Harper and Konstan (2015) at the University of Minnesota. 

In [None]:
# Importing key libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Mount our Google Drive
# The dataset files are read from under /content/drive further down, so this
# has to run first. google.colab is supplied by the Colab runtime.
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Set random states
# One seed (rs) is shared by NumPy and TensorFlow here, and is passed again to
# train_test_split later, so that repeated runs are comparable.
rs = 813
np.random.seed(rs)
tf.random.set_seed(rs)

## 7.1: Importing the Data and Exploratory Data Analysis

First we will import the data, contained in three files: u.data for the ratings, u.item for the items (movies), and u.user for the user demographics. The readme that accompanies the dataset gives us the column names.

In [None]:
# Import the Data
# Column names are taken from the README that accompanies the dataset.
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

movie_cols = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url', 'unknown_genre', 'action',
              'adventure', 'animation', 'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'film-noir',
              'horror', 'musical', 'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western']

demo_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']

# Single place to edit if the dataset lives somewhere else.
DATA_DIR = '/content/drive/My Drive/AAI511/ml-100k'

# u.data is tab-separated. Naming the delimiter explicitly (instead of sep=None)
# keeps pandas on its default C parser and avoids the ParserWarning that
# delimiter sniffing triggers.
ratings_df = pd.read_csv(f'{DATA_DIR}/u.data', sep='\t', header=None, names=ratings_cols)
# u.item and u.user are pipe-separated and are decoded as latin-1.
items_df = pd.read_csv(f'{DATA_DIR}/u.item', sep='|', header=None, names=movie_cols, encoding='latin-1', parse_dates=['release_date', 'video_release_date'])
demo_df = pd.read_csv(f'{DATA_DIR}/u.user', sep='|', header=None, names=demo_cols, encoding='latin-1')

  return func(*args, **kwargs)


In [None]:
ratings_df.head(10)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


In [None]:
ratings_df.shape

(100000, 4)

In [None]:
# We can drop the Timestamp data, as we will not be using it. We only want to link Users to Movies to Ratings
# Reassigning with errors='ignore' (rather than inplace=True) keeps this cell safe to re-run:
# a second run is a no-op instead of raising KeyError.
ratings_df = ratings_df.drop(columns=['timestamp'], errors='ignore')

In [None]:
# Let's have a look at the first movie id
items_df.loc[items_df['movie_id'] == 242]

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,imdb_url,unknown_genre,action,adventure,animation,children,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
241,242,Kolya (1996),1997-01-24,NaT,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
ratings_df.nunique()

# These values match what the dataset documentation leads us to expect -- we're on the right track

user_id      943
movie_id    1682
rating         5
dtype: int64

In [None]:
# Check the distribution of Ratings
ratings_df.rating.value_counts()

4    34174
3    27145
5    21201
2    11370
1     6110
Name: rating, dtype: int64

In [None]:
items_df.head(10)

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,imdb_url,unknown_genre,action,adventure,animation,children,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
0,1,Toy Story (1995),1995-01-01,NaT,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),1995-01-01,NaT,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),1995-01-01,NaT,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),1995-01-01,NaT,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),1995-01-01,NaT,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,1995-01-01,NaT,http://us.imdb.com/Title?Yao+a+yao+yao+dao+wai...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,7,Twelve Monkeys (1995),1995-01-01,NaT,http://us.imdb.com/M/title-exact?Twelve%20Monk...,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,8,Babe (1995),1995-01-01,NaT,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,9,Dead Man Walking (1995),1995-01-01,NaT,http://us.imdb.com/M/title-exact?Dead%20Man%20...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10,Richard III (1995),1996-01-22,NaT,http://us.imdb.com/M/title-exact?Richard%20III...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [None]:
items_df.shape

(1682, 24)

In [None]:
items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   movie_id            1682 non-null   int64         
 1   movie_title         1682 non-null   object        
 2   release_date        1681 non-null   datetime64[ns]
 3   video_release_date  0 non-null      datetime64[ns]
 4   imdb_url            1679 non-null   object        
 5   unknown_genre       1682 non-null   int64         
 6   action              1682 non-null   int64         
 7   adventure           1682 non-null   int64         
 8   animation           1682 non-null   int64         
 9   children            1682 non-null   int64         
 10  comedy              1682 non-null   int64         
 11  crime               1682 non-null   int64         
 12  documentary         1682 non-null   int64         
 13  drama               1682 non-null   int64       

In [None]:
# We can go ahead and drop Video Release Date and IMDB URL -- there are no video release dates (all Null), and we will not be using the URLs.
# Reassigning with errors='ignore' (rather than inplace=True) keeps this cell safe to re-run.

items_df = items_df.drop(columns=['video_release_date', 'imdb_url'], errors='ignore')

items_df.head()

Unnamed: 0,movie_id,movie_title,release_date,unknown_genre,action,adventure,animation,children,comedy,crime,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
0,1,Toy Story (1995),1995-01-01,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),1995-01-01,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),1995-01-01,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),1995-01-01,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),1995-01-01,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [None]:
# Let's look at counts of the Genres, by looping through all Genres in the Items_DF
# Recall that the same movie can have multiple genres -- they do not need to be evenly split
# The genre flags are selected by name (movie_cols[5:] is unknown_genre .. western) rather
# than by column position, so the loop does not depend on which non-genre columns
# have already been dropped from items_df.

for genre in movie_cols[5:]:
  print(items_df[genre].value_counts())
  print("")

0    1680
1       2
Name: unknown_genre, dtype: int64

0    1431
1     251
Name: action, dtype: int64

0    1547
1     135
Name: adventure, dtype: int64

0    1640
1      42
Name: animation, dtype: int64

0    1560
1     122
Name: children, dtype: int64

0    1177
1     505
Name: comedy, dtype: int64

0    1573
1     109
Name: crime, dtype: int64

0    1632
1      50
Name: documentary, dtype: int64

0    957
1    725
Name: drama, dtype: int64

0    1660
1      22
Name: fantasy, dtype: int64

0    1658
1      24
Name: film-noir, dtype: int64

0    1590
1      92
Name: horror, dtype: int64

0    1626
1      56
Name: musical, dtype: int64

0    1621
1      61
Name: mystery, dtype: int64

0    1435
1     247
Name: romance, dtype: int64

0    1581
1     101
Name: sci-fi, dtype: int64

0    1431
1     251
Name: thriller, dtype: int64

0    1611
1      71
Name: war, dtype: int64

0    1655
1      27
Name: western, dtype: int64



In [None]:
demo_df.head(10)

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
5,6,42,M,executive,98101
6,7,57,M,administrator,91344
7,8,36,M,administrator,5201
8,9,29,M,student,1002
9,10,53,M,lawyer,90703


In [None]:
demo_df.nunique()

user_id       943
age            61
gender          2
occupation     21
zip_code      795
dtype: int64

In [None]:
demo_df.describe()

Unnamed: 0,user_id,age
count,943.0,943.0
mean,472.0,34.051962
std,272.364951,12.19274
min,1.0,7.0
25%,236.5,25.0
50%,472.0,31.0
75%,707.5,43.0
max,943.0,73.0


In [None]:
demo_df.dtypes

user_id        int64
age            int64
gender        object
occupation    object
zip_code      object
dtype: object

## 7.2: Simple Recommender Architecture and Data Pre-Processing

We have 3 data files: Ratings, User Demographic Info (demo), and Movie Info (items). We will want to create embeddings from the Demographic info and Movie info, with the same number of dimensions, so that they represent the factorized matrix of user-rating pairs.

We will be passing in the User_ID and Movie_ID pairs. Then looking up the additional data for those entries and passing them in to the embeddings.

But first, let's build a simple recommender, which only works on the IDs, similar to the example given. This will give some practice with building the system.

We'll begin by building the test-train data split. I will stratify based on the User IDs, so that a relatively balanced set of User IDs is guaranteed to be in the testing data.

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split of the ratings table.
rating_inputs = ratings_df.drop(columns=['rating'])  # every column except the target
rating_targets = ratings_df.rating                   # the value we want to predict

# Stratify on user_id so each user's ratings are divided between the training
# and test sets in roughly the same 80/20 proportion.
x_train, x_test, y_train, y_test = train_test_split(
    rating_inputs,
    rating_targets,
    test_size=0.2,
    stratify=ratings_df['user_id'],
    random_state=rs,
)

Next I'll import the Keras libraries to build the model, and assemble the simple model. This simple model will only encode the User and Movie IDs in the Embedding layers, not the additional info.

In [None]:
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras import Model

In [None]:
# Grab the number of Users and number of Movies

num_users = ratings_df.user_id.nunique()
num_movies = ratings_df.movie_id.nunique()

# The Embedding layers built below are sized count + 1 and are indexed by the
# raw IDs, so every ID must fall in 1..count. Fail loudly here rather than with
# an out-of-range embedding lookup during training.
assert ratings_df.user_id.between(1, num_users).all(), "user_id values must lie in 1..num_users"
assert ratings_df.movie_id.between(1, num_movies).all(), "movie_id values must lie in 1..num_movies"

(num_users, num_movies)

(943, 1682)

In [None]:
# Build the model
# Baseline recommender: one learned 32-dim vector per user ID and per movie ID,
# concatenated and passed through a small fully connected network that outputs
# a single predicted rating.

# User Embedding Layer
# Each input sample is a single integer ID. The table has num_users+1 rows so
# that the raw IDs passed in at fit time can be used directly as row indices.
input_user_layer = Input(shape=(1,))
embed_user_layer = Embedding(num_users+1, 32, name="UserEmbedding")(input_user_layer)
user_output = Flatten()(embed_user_layer)

# Movie Embedding Layer (same construction as the user side)
input_movie_layer = Input(shape=(1,))
embed_movie_layer = Embedding(num_movies+1, 32, name="MovieEmbedding")(input_movie_layer)
movie_output = Flatten()(embed_movie_layer)

# Concatenate to combine the user vector and the movie vector into one feature vector
concat = Concatenate()([user_output, movie_output])

# Add Fully Connected layers with Dropouts (128 -> 64 -> 32 units)
dense1 = Dense(128, activation='relu', name="Dense1")(concat)
dropout1 = Dropout(0.2)(dense1)

dense2 = Dense(64, activation='relu', name="Dense2")(dropout1)
dropout2 = Dropout(0.2)(dense2)

dense3 = Dense(32, activation='relu')(dropout2)

# Single output unit with no activation: the rating is predicted as a plain regression value
output1 = Dense(1)(dense3)


# Create Model
# Two inputs (user ID, movie ID), one output; trained with Adam on mean squared error.
model1 = Model([input_user_layer, input_movie_layer], output1)
model1.compile('adam', loss='MeanSquaredError')

In [None]:
model1.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 UserEmbedding (Embedding)      (None, 1, 32)        30208       ['input_1[0][0]']                
                                                                                                  
 MovieEmbedding (Embedding)     (None, 1, 32)        53856       ['input_2[0][0]']                
                                                                                              

In [None]:
# Fit/Train the model
# Inputs are passed in the same order as the model's inputs: user IDs, then movie IDs.
# validation_split=0.2 holds back 20% of the training data for the per-epoch
# validation loss; the test split is not touched here.
hist1 = model1.fit([x_train.user_id, x_train.movie_id],
                    y=y_train,
                    batch_size=100,
                    epochs=8,
                    verbose=1,
                    validation_split=0.2)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [None]:
# Make predictions
# Predict a rating for every (user, movie) pair in the held-out test split.

model1_pred = model1.predict([x_test.user_id, x_test.movie_id])

In [None]:
model1_pred

array([[2.8770201],
       [3.5049536],
       [3.828801 ],
       ...,
       [4.734122 ],
       [3.8806143],
       [3.2465236]], dtype=float32)

In [None]:
# Evaluate the model
# Mean squared error between the true test ratings and the predicted ratings.
from sklearn.metrics import mean_squared_error

model1_mse = mean_squared_error(y_test, model1_pred)
model1_mse

0.8636903492651987

In [None]:
# RMSE is on the same scale as the ratings themselves, which makes it easier to interpret than MSE.
model1_rmse = np.sqrt(model1_mse)
model1_rmse

0.9293494225882958

## 7.3: A more advanced model

Let's try building a model with more dimensions to capture users. Lee and Lee (2018) recommend 3 layers of size 256, with batch normalization.

In [None]:
from tensorflow.keras.layers import BatchNormalization

In [None]:
# Build the model
# Larger network: 128-dim embeddings, three 256-unit layers with batch
# normalization, and a 5-way softmax that treats each star rating as a class.

# User Embedding Layer
input_user_layer = Input(shape=(1,))
embed_user_layer = Embedding(num_users+1, 128, name="UserEmbedding")(input_user_layer)
user_output = Flatten()(embed_user_layer)

# Movie Embedding Layer
input_movie_layer = Input(shape=(1,))
embed_movie_layer = Embedding(num_movies+1, 128, name="MovieEmbedding")(input_movie_layer)
movie_output = Flatten()(embed_movie_layer)

# Concatenate to combine
concat = Concatenate()([user_output, movie_output])

# Add Fully Connected layers with Batch Normalization
dense1 = Dense(256, activation='relu', name="Dense1")(concat)
batch1 = BatchNormalization()(dense1)

dense2 = Dense(256, activation='relu', name="Dense2")(batch1)
batch2 = BatchNormalization()(dense2)

dense3 = Dense(256, activation='relu', name="Dense3")(batch2)

# Classification head: one probability per rating class (1 to 5 stars)
dense4 = Dense(5, activation='softmax', name="Output")(dense3)


def rating_crossentropy(y_true, y_pred):
  """Cross-entropy between 1-5 star ratings and the 5-way softmax output.

  Args:
    y_true: ratings with values 1..5, exactly as stored in the ratings data.
    y_pred: softmax probabilities, one column per rating class.

  Returns:
    The per-sample sparse categorical cross-entropy.
  """
  # Class indices must be 0..4, so shift the 1..5 ratings down by one. Doing the
  # shift inside the loss lets the fit cell keep passing the raw ratings and
  # keeps the argmax(p) + 1 decoding used at evaluation time valid.
  return tf.keras.losses.sparse_categorical_crossentropy(y_true - 1, y_pred)


# Create Model
model2 = Model([input_user_layer, input_movie_layer], dense4)
# A softmax output is bounded by 1, so it cannot be regressed onto raw 1-5
# ratings with mean squared error; doing so drives every class probability to
# about 0.2. Train the softmax as a classifier with cross-entropy instead.
model2.compile('adam', loss=rating_crossentropy)

In [None]:
model2.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_11 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 input_12 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 UserEmbedding (Embedding)      (None, 1, 128)       120832      ['input_11[0][0]']               
                                                                                                  
 MovieEmbedding (Embedding)     (None, 1, 128)       215424      ['input_12[0][0]']               
                                                                                            

In [None]:
# Fit/Train the model
# Same inputs and targets as the first model; only the batch size differs (128).
# validation_split=0.2 holds back 20% of the training data for the per-epoch
# validation loss; the test split is not touched here.
hist2 = model2.fit([x_train.user_id, x_train.movie_id],
                    y=y_train,
                    batch_size=128,
                    epochs=8,
                    verbose=1,
                    validation_split=0.2)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [None]:
# Make predictions
# For each test (user, movie) pair this returns five values, one per output
# unit of the softmax layer.

model2_pred = model2.predict([x_test.user_id, x_test.movie_id])


In [None]:
model2_pred

array([[0.20014481, 0.20002088, 0.1997903 , 0.20001742, 0.20002654],
       [0.20019971, 0.20003082, 0.19970787, 0.20002568, 0.20003586],
       [0.19975717, 0.19996445, 0.20034544, 0.19996592, 0.19996704],
       ...,
       [0.20005777, 0.19999713, 0.1999476 , 0.1999923 , 0.20000517],
       [0.20010307, 0.20001042, 0.19985439, 0.2000077 , 0.20002449],
       [0.19975282, 0.19994183, 0.20041011, 0.19994201, 0.19995318]],
      dtype=float32)

In [None]:
y_test

77924    4
64385    4
52706    4
85330    4
56236    4
        ..
48509    3
94068    3
27473    5
69132    4
33870    3
Name: rating, Length: 20000, dtype: int64

In [None]:
# Decode each row of class probabilities into a star rating: the position of the
# largest probability is 0-4, so add 1 to get back onto the 1-5 scale.
model2_labels = np.argmax(model2_pred, axis=1) + 1
model2_mse = mean_squared_error(y_test, model2_labels)
model2_mse

4.2898

In [None]:
# RMSE of the decoded class predictions, for comparison with the first model.
model2_rmse = np.sqrt(model2_mse)
model2_rmse

2.071183236703117

The model above is performing rather poorly. I will try one more similar model, with Dropout layers instead of Normalization -- I believe the normalization may be flattening out any correlations that develop. I will also try 64 for the embedding dimensions, as 128 may be overfitting as well. The Softmax/Classification approach also does not appear to be serving the analysis well (all predictions are very close to 0.2), so I will switch back to a straight ReLU regression-style prediction.

## 7.4: 3-Layer Model with Dropouts

In [None]:
# Build the model
# Third variant: 64-dim embeddings, three 256-unit layers each followed by
# Dropout, and a single regression output.

# User Embedding Layer
input_user_layer = Input(shape=(1,))
embed_user_layer = Embedding(num_users+1, 64, name="UserEmbedding")(input_user_layer)
user_output = Flatten()(embed_user_layer)

# Movie Embedding Layer
input_movie_layer = Input(shape=(1,))
embed_movie_layer = Embedding(num_movies+1, 64, name="MovieEmbedding")(input_movie_layer)
movie_output = Flatten()(embed_movie_layer)

# Concatenate to combine
concat = Concatenate()([user_output, movie_output])

# Add Fully Connected layers with Dropouts
dense1 = Dense(256, activation='relu', name="Dense1")(concat)
dropout1 = Dropout(0.2)(dense1)

dense2 = Dense(256, activation='relu', name="Dense2")(dropout1)
dropout2 = Dropout(0.2)(dense2)

dense3 = Dense(256, activation='relu', name="Dense3")(dropout2)
dropout3 = Dropout(0.2)(dense3)

# Single-unit regression output; the ReLU keeps predictions non-negative.
# NOTE(review): the first model uses a linear output here. A ReLU output stops
# passing gradient whenever its pre-activation is negative -- confirm the ReLU is intended.
output1 = Dense(1, activation='relu', name='output')(dropout3)




# Create Model
model3 = Model([input_user_layer, input_movie_layer], output1)
model3.compile('adam', loss='MeanSquaredError')

In [None]:
model3.summary()

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_15 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 input_16 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 UserEmbedding (Embedding)      (None, 1, 64)        60416       ['input_15[0][0]']               
                                                                                                  
 MovieEmbedding (Embedding)     (None, 1, 64)        107712      ['input_16[0][0]']               
                                                                                            

In [None]:
# Fit/Train the model
# Same training setup as the second model (batch size 128, 8 epochs).
# validation_split=0.2 holds back 20% of the training data for the per-epoch
# validation loss; the test split is not touched here.
hist3 = model3.fit([x_train.user_id, x_train.movie_id],
                    y=y_train,
                    batch_size=128,
                    epochs=8,
                    verbose=1,
                    validation_split=0.2)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [None]:
# Predict a rating for every (user, movie) pair in the held-out test split.
model3_pred = model3.predict([x_test.user_id, x_test.movie_id])

In [None]:
model3_pred

array([[2.8457189],
       [3.959443 ],
       [4.1913147],
       ...,
       [4.768856 ],
       [4.4737062],
       [3.4334526]], dtype=float32)

In [None]:
# Mean squared error between the true test ratings and the third model's predictions.
model3_mse = mean_squared_error(y_test, model3_pred)
model3_mse

0.9573634706813454

In [None]:
# RMSE, for direct comparison with the other two models.
model3_rmse = np.sqrt(model3_mse)
model3_rmse

0.9784495238290759

## 7.5: Discussion and Conclusions

We've tried 3 approaches:

1. A simple model with 32-dim embeddings and 3 layers of decreasing size (128, 64, 32 nodes). This had our best score, with RMSE=0.929.

2. A larger model with a classification approach. This had a poor RMSE of 2.07.

3. A similarly large model, with a regression approach. This had performance closer to, but slightly worse than, the initial model, with RMSE = 0.978.

It seems we can say for certain that the classification approach is not very useful for this task. Additionally, the more complex models were not successful. I also noticed that a higher number of epochs quickly led to higher validation losses, across the board, indicating overfitting -- it seems as if these recommender systems may be seriously prone to overfitting.

## 7.x: Advanced Model to Look Up User and Movie Info

I initially wanted to use the User Demographic Info and Movie Genre Info to help build the rating system. However, I'm not sure how to incorporate this (it may be beyond my programming skill). After some thought, I also think this info is more appropriate for generating initial recommendations on a 'cold start' by traditional methods (not Deep Learning). Since we have ratings, it's really mostly ratings that we care about, and the similarity of ratings from user to user is what's key. I am leaving the remainder of the code below for posterity's sake.

========================================

We will now build a more advanced model, that uses the additional parameters provided. My intent is to pre-process the User and Movie information into a more useable form, then use custom Layers to look them up based on ID variables.

### 7.3.1: User Demographic Pre-Processing

We will need to One-Hot-Encode the Gender and Occupation info. OHE is probably best for the Zip Code as well, but I don't imagine the Zip Code will have much correlation, and there are many more zip code values than other values. Therefore OHE would dilute the importance of the other values. I will try dropping them instead.

We should also min-max scale the Ages, to put them on the same scale as the OHE variables.

In [None]:
# Review the User
demo_df.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [None]:
demo_df.nunique()

user_id       943
age            61
gender          2
occupation     21
zip_code      795
dtype: int64

In [None]:
# Zip code is dropped rather than encoded (see the discussion above).
# Reassigning with errors='ignore' (rather than inplace=True) keeps this cell safe to re-run.
demo_df = demo_df.drop(columns=['zip_code'], errors='ignore')

In [None]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

In [None]:
# Build the User Transformer
def _dense_one_hot(**kwargs):
  """Create a OneHotEncoder that returns dense arrays on old and new scikit-learn.

  Newer scikit-learn releases name the option `sparse_output` and reject the
  original `sparse` keyword; older releases only accept `sparse`.

  Args:
    **kwargs: any other OneHotEncoder options (for example drop='if_binary').

  Returns:
    A OneHotEncoder configured for dense output.
  """
  try:
    return OneHotEncoder(sparse_output=False, **kwargs)
  except TypeError:
    # This scikit-learn version predates `sparse_output`.
    return OneHotEncoder(sparse=False, **kwargs)


scaler = MinMaxScaler()
# drop='if_binary' keeps a single 0/1 column for two-valued features such as gender.
binary_ohe = _dense_one_hot(drop='if_binary')
ohe = _dense_one_hot()

# user_id is passed through untouched so rows can still be matched to users;
# age is scaled to [0, 1]; gender and occupation are one-hot encoded.
user_transformer = ColumnTransformer([
    ("passthrough_id", "passthrough", ['user_id']),
    ('Scaler', scaler, ['age']),
    ('binary', binary_ohe, ['gender']),
    ('OneHot', ohe, ['occupation'])]
)

In [None]:
# Fit the transformer on the demographic table and apply it in one step.
# The result is a NumPy array (not a DataFrame) whose first column is the
# passed-through user_id.
user_df_transf = user_transformer.fit_transform(demo_df)

user_df_transf

array([[1.00000000e+00, 2.57575758e-01, 1.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [2.00000000e+00, 6.96969697e-01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.00000000e+00, 2.42424242e-01, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       ...,
       [9.41000000e+02, 1.96969697e-01, 1.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [9.42000000e+02, 6.21212121e-01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [9.43000000e+02, 2.27272727e-01, 1.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

Next we need to build a similar column transformer for the Movie info.

In [None]:
items_df

Unnamed: 0,movie_id,movie_title,release_date,unknown_genre,action,adventure,animation,children,comedy,crime,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
0,1,Toy Story (1995),1995-01-01,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),1995-01-01,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),1995-01-01,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),1995-01-01,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),1995-01-01,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),1998-02-06,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),1998-02-06,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),1998-01-01,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),1994-01-01,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Build the Movie Info transformer
# movie_id is passed through so rows can still be matched to movies.
# movie_cols[5:] are the genre flag columns (unknown_genre .. western), which go
# through the binary one-hot encoder. Everything else (title, release date) is
# discarded by remainder='drop'.

movie_transformer = ColumnTransformer([
    ('passthrough_id', "passthrough", ['movie_id']),
    ('binary', binary_ohe, movie_cols[5:])
    ],
    remainder='drop'
)


In [None]:
# Fit and apply the movie transformer. The result is a NumPy array whose first
# column is the passed-through movie_id, followed by the encoded genre flags.
movie_df_transf = movie_transformer.fit_transform(items_df)
movie_df_transf

array([[1.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.000e+00, 0.000e+00, 1.000e+00, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       [3.000e+00, 0.000e+00, 0.000e+00, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [1.680e+03, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.681e+03, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.682e+03, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

In [None]:
movie_df_transf.shape

(1682, 20)

### 7.3.2: Build Custom Layers

I now will build custom layers that look up the row of the given ID, and return its values.

First I will devise a way to pull the information I want from the transformed arrays.