In [2]:
# %pip install scikit_learn
import pandas as pd
import os
import glob
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error
from tensorflow import keras
import tensorflow as tf
import random
from datetime import datetime
from tensorflow.keras.models import load_model

%load_ext tensorboard
%matplotlib inline
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))
print("Tensorflow version:", tf.__version__)

physical_devices = tf.config.experimental.list_physical_devices('GPU')
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    
# Attempt to make runs more reproducible
seed_value=20212042
print("Using seed value: %d" % seed_value)
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value) # tensorflow 2.x

# Set paths
base_dir = "/workspace/C683_Assignment"
data_dir = os.path.join(base_dir, "prize-data")
movie_titles = os.path.join(data_dir, "movie_titles.csv")
model_filepath_tmpl = os.path.join(base_dir, "model-weights", "weights-%s-%d-%.04f.h5")
all_data_filepath  = os.path.join(data_dir, "all_data.csv")
train_data_filepath  = os.path.join(data_dir, "train_data.pkl")
test_data_filepath = os.path.join(data_dir, "test_data.pkl")

# Utility classes / functions
def get_genres(titles):
    """Return, in iteration order, every name in `titles` that begins with "genre.".

    Iterating a DataFrame yields its column labels, so passing the titles
    frame returns the list of its genre indicator columns.
    """
    genre_columns = []
    for name in titles:
        if name.startswith("genre."):
            genre_columns.append(name)
    return genre_columns

class TrainTestSequence(keras.utils.Sequence):
    """Keras ``Sequence`` serving rating batches joined with movie genre features.

    Each batch is a row slice of ``data_set`` merged with ``titles``
    (``data_set.movie`` == ``titles.id``) and is returned as
    ``([user_ord, movie_ord, genre_matrix], rating)``.

    Args:
        data_set: DataFrame providing the ``movie``, ``user_ord``,
            ``movie_ord`` and ``rating`` columns.
        titles: DataFrame providing an ``id`` column plus the ``genre.*``
            columns selected by ``get_genres``.
        batch_size: number of ``data_set`` rows sliced per batch.
        shuffle: when true, the rows are shuffled every time
            ``on_epoch_end`` runs (including once at construction); when
            false, the original row order is kept.
    """

    def __init__(self, data_set, titles, batch_size, shuffle):
        self.data_set = data_set
        self.titles = titles
        self.genre_cols = get_genres(titles)
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Apply the initial shuffle (if requested) before the first batch is read.
        self.on_epoch_end()

    def on_epoch_end(self):
        """Reshuffle the rows, but only when shuffling was requested."""
        # `shuffle` below is the module-level sklearn.utils.shuffle function;
        # the constructor flag of the same name lives in self.shuffle.
        if self.shuffle:
            self.data_set = shuffle(self.data_set)

    def __len__(self):
        """Number of full batches; a trailing partial batch is not served."""
        return self.data_set.shape[0] // self.batch_size

    def __getitem__(self, idx):
        """Build batch ``idx`` as ``([user_ord, movie_ord, genres], rating)`` arrays."""
        start = idx * self.batch_size
        batch = self.data_set[start:start + self.batch_size]
        # Use the titles frame given to this instance, not a notebook global.
        # The inner join drops ratings whose movie has no row in titles, so a
        # batch may contain fewer than batch_size samples.
        batch = batch.merge(self.titles, how="inner", left_on="movie", right_on="id")
        inputs = [
            np.array(batch["user_ord"]),
            np.array(batch["movie_ord"]),
            np.array(batch[self.genre_cols]),
        ]
        return inputs, np.array(batch["rating"])
    
def make_generator(data_set, titles, batch_size, shuffle, worker_count, use_multiprocessing):
    """Create a TrainTestSequence, wrap it in a started OrderedEnqueuer and return the enqueuer.

    The enqueuer is started with `worker_count` workers and a queue holding up
    to 20 batches per worker. Call `.get()` on the returned object to obtain
    the batch generator.
    """
    enqueuer = keras.utils.OrderedEnqueuer(
        TrainTestSequence(data_set, titles, batch_size, shuffle),
        use_multiprocessing=use_multiprocessing,
    )
    queue_capacity = worker_count*20
    enqueuer.start(workers=worker_count, max_queue_size=queue_capacity)
    return enqueuer

Num GPUs Available: 1
Tensorflow version: 2.4.0
Using seed value: 20212042


In [4]:
# Constants
BATCH_SIZE = 4096
USER_COUNT = 480189     # size of the user embedding table
MOVIE_COUNT = 17770     # size of the movie embedding table
EMBEDDING_SIZE = 64

# Prepare train/test if needed: build the split from the raw CSV only when the
# cached pickles are missing, otherwise reload the cached split.
if not os.path.isfile(train_data_filepath) or not os.path.isfile(test_data_filepath):
    print("Loading Netflix Prize data set")
    # np.int64 replaces the deprecated np.int alias, and "rating" replaces a
    # duplicated "movie" key that left the rating column without a dtype.
    all_data = pd.read_csv(all_data_filepath,
                           names=[ "movie", "user", "rating", "date" ],
                           dtype={ "movie": np.int64, "user": np.int64, "rating": np.int64 },
                           parse_dates=["date"])
    print("Creating ordinal encodings for movie and user")
    ord_enc = OrdinalEncoder(dtype=np.int64)
    # fit_transform refits the encoder on each call and returns an (n, 1)
    # array; ravel() flattens it to the 1-D shape a single column expects.
    all_data["movie_ord"] = ord_enc.fit_transform(all_data[["movie"]]).ravel()
    all_data["user_ord"] = ord_enc.fit_transform(all_data[["user"]]).ravel()
    display(all_data.head())
    print("Splitting into train and test data")
    # Pass the notebook seed explicitly so the split does not depend on how
    # much of the global NumPy random state was consumed beforehand.
    train_data, test_data = train_test_split(all_data, test_size=0.15, random_state=seed_value)
    print("Writing train data to a file")
    train_data.to_pickle(train_data_filepath)
    print("Writing test data to a file")
    test_data.to_pickle(test_data_filepath)
else:
    print("Loading Prize train data")
    train_data = pd.read_pickle(train_data_filepath)
    display(train_data.head())
    print("Loading Prize test data")
    test_data = pd.read_pickle(test_data_filepath)
    display(test_data.head())

# Number of full batches per pass; a trailing partial batch is not counted.
train_batch_count = train_data.shape[0] // BATCH_SIZE
test_batch_count = test_data.shape[0] // BATCH_SIZE

print("Loading movie titles")
titles = pd.read_csv(movie_titles)
display(titles.head())

Loading Prize train data


Unnamed: 0,movie,user,rating,date,movie_ord,user_ord
36317616,6450,673882,4,2004-09-20,6449,122759
47281712,8524,772991,2,2004-10-18,8523,140628
5113855,1035,1848170,5,2003-04-21,1034,335353
26456292,4931,2297858,1,2005-07-29,4930,416403
18640695,3579,1054225,2,2004-12-13,3578,191482


Loading Prize test data


Unnamed: 0,movie,user,rating,date,movie_ord,user_ord
22665343,4302,972089,3,2005-09-25,4301,176723
76499487,13853,114048,3,2004-07-01,13852,20682
90558129,16128,1027246,4,2005-04-20,16127,186665
62417389,11323,186884,3,2003-10-22,11322,33801
28603643,5293,2551541,5,2000-10-03,5292,462493


Loading movie titles


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,year,title,id_moviedb,genre_ids,vote_average,media_type,popularity,...,genre.Romance,genre.Fantasy,genre.Family,genre.Crime,genre.Drama,genre.Music,genre.History,genre.TV Movie,genre.War,genre.Western
0,0,0,17,2005,7 Seconds,9721,"[28, 80, 53]",5,movie,12.895,...,0,0,0,1,0,0,0,0,0,0
1,1,1,85,2005,Elfen Lied,550300,"[28, 16, 18, 27, 10749, 878]",7,movie,2.681,...,1,0,0,0,1,0,0,0,0,0
2,2,2,91,2005,WWE: Royal Rumble 2005,58649,"[28, 18]",7,movie,11.739,...,0,0,0,0,1,0,0,0,0,0
3,3,3,149,2005,The Edward R. Murrow Collection,0,,0,unknown,0.0,...,0,0,0,0,0,0,0,0,0,0
4,4,4,151,2005,Sleepover Nightmare,13568,[27],3,movie,6.8,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Define model
# Two embedding branches (user, movie) plus a dense branch over the genre
# indicator columns, concatenated and reduced to one predicted rating.
user_input = keras.layers.Input(shape=(1,))
user_embedding = keras.layers.Embedding(USER_COUNT, EMBEDDING_SIZE)(user_input)
user_embedding = keras.layers.Flatten()(user_embedding)

movie_input = keras.layers.Input(shape=(1,))
movie_embedding = keras.layers.Embedding(MOVIE_COUNT, EMBEDDING_SIZE)(movie_input)
movie_embedding = keras.layers.Flatten()(movie_embedding)

genre_count = len(get_genres(titles))
genre_input = keras.layers.Input(shape=(genre_count,))
# Dense layers default to a linear activation; without a non-linearity the
# stacked layers collapse into a single linear map, so use ReLU on hidden layers.
genre_dense = keras.layers.Dense(32, activation="relu")(genre_input)

concat = keras.layers.Concatenate()([user_embedding, movie_embedding, genre_dense])
dense = keras.layers.Dense(64, activation="relu")(concat)
dropout = keras.layers.Dropout(0.2)(dense)
# The target is one rating per sample, so finish with a single linear unit
# instead of exposing the 64-wide dropout tensor as the model output.
rating_output = keras.layers.Dense(1)(dropout)
model = keras.models.Model([user_input, movie_input, genre_input], rating_output)
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# Train model
model.compile(optimizer='adam', loss='mse', metrics=[keras.metrics.RootMeanSquaredError()])
train_enqueuer = make_generator(train_data, titles, BATCH_SIZE, True, 6, True)
try:
    model.fit(train_enqueuer.get(),
              epochs=1, steps_per_epoch=train_batch_count,
              verbose=1)
finally:
    # Always shut the worker processes down, including on interruption.
    train_enqueuer.stop()

# Evaluate model
# evaluate() scores each prediction against the label delivered in the same
# batch, so the result does not depend on the generator preserving the row
# order or row count of test_data. It returns [loss, RMSE] for the metrics
# compiled above; the RMSE metric accumulates over all evaluated batches.
test_enqueuer = make_generator(test_data, titles, BATCH_SIZE, False, 6, True)
try:
    _, rmse = model.evaluate(test_enqueuer.get(), steps=test_batch_count, verbose=1)
finally:
    test_enqueuer.stop()
print("RMSE: %.04f. Saving model" % rmse)
model_filepath = model_filepath_tmpl % (datetime.now().strftime("%d_%m_%Y_%H_%M"), 0, rmse)
# Make sure the target directory exists before writing the HDF5 file.
os.makedirs(os.path.dirname(model_filepath), exist_ok=True)
model.save(model_filepath)

# Start tensorboard
# TODO

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_9 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 1, 64)        30732096    input_8[0][0]                    
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 1, 64)        1137280     input_9[0][0]                    
______________________________________________________________________________________________

Process Keras_worker_ForkPoolWorker-5:
Process Keras_worker_ForkPoolWorker-1:
Process Keras_worker_ForkPoolWorker-3:
Process Keras_worker_ForkPoolWorker-2:
Traceback (most recent call last):
Process Keras_worker_ForkPoolWorker-6:
Process Keras_worker_ForkPoolWorker-4:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multip

KeyboardInterrupt: 

  File "/usr/lib/python3.8/multiprocessing/queues.py", line 355, in get
    with self._rlock:
  File "/usr/lib/python3.8/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/usr/lib/python3.8/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/usr/lib/python3.8/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 356, in get
    res = self._reader.recv_bytes()
KeyboardInterrupt
KeyboardInterrupt
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
    buf = self._recv(4)
