In [46]:
%load_ext tensorboard

In [47]:
import numpy as np
import streamlit as st
import pandas as pd
# from imdb import IMDb
import pprint
import datetime


from tensorflow.keras.models import load_model
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

from sklearn.utils import shuffle

from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from keras.layers import Dropout, BatchNormalization, Activation
from keras.regularizers import l2
from keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow_datasets as tfds

In [48]:
data = pd.read_csv('../../../data/movielens/ratings.csv')
links = pd.read_csv('../../../data/movielens/links.csv')
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [49]:
data.shape

(20000263, 4)

In [50]:
data = data.merge(links, how='left', on='movieId')

In [51]:
data['imdbId'] = data['imdbId'].astype('int')
data['imdbId'] = data['imdbId'].apply(lambda x: f'tt{x:07d}')
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId
0,1,2,3.5,1112486027,tt0113497,8844.0
1,1,29,3.5,1112484676,tt0112682,902.0
2,1,32,3.5,1112484819,tt0114746,63.0
3,1,47,3.5,1112484727,tt0114369,807.0
4,1,50,3.5,1112484580,tt0114814,629.0


In [52]:
print(f"Number of unique users: {data['userId'].nunique()}")
print(f"Number of unique movies: {data['movieId'].nunique()}")

Number of unique users: 138493
Number of unique movies: 26744


In [53]:
movie_id_map = {movie_id:idx for idx, movie_id in enumerate(data['movieId'].unique())}
user_id_map = {user_id:idx for idx, user_id in enumerate(data['userId'].unique())}

# data['movieId'] = data['movie_id'].map(movie_id_map)
# data['userId'] = data['user_id'].map(user_id_map)



In [54]:
N = data['userId'].max()+1
M = data['movieId'].max()+1

data = shuffle(data)
cutoff = int(0.8*len(data))
df_train = data.iloc[:cutoff]
df_test = data.iloc[cutoff:]

In [55]:
K = 5   #Determining the number of latent factors for embeddings
mu = df_train['rating'].mean()
epochs=100 
reg = 0.1 #Setting regularization strenght

# keras model
u = Input(shape=(1,)) #Defining input layer shape for users
m = Input(shape=(1,)) #Defining input layer shape for movies
u_embedding = Embedding(N, K, embeddings_regularizer=l2(reg))(u) # (N, 1, K)
m_embedding = Embedding(M, K, embeddings_regularizer=l2(reg))(m) # (N, 1, K)
u_embedding = Flatten()(u_embedding) # (N, K)
m_embedding = Flatten()(m_embedding) # (N, K)
x = Concatenate()([u_embedding, m_embedding]) # (N, 2K)

# the neural network
x = Dense(5000)(x)
# x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dropout(0.5)(x)
x = Dense(2500)(x)
x = Dropout(0.5)(x)
x = Dense(1250)(x)
x = Dropout(0.5)(x)
x = Dense(500)(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dense(1)(x)

early_stopping = EarlyStopping(monitor='val_mse', patience=3)

model = Model(inputs=[u, m], outputs=x)
model.compile(
  loss='mse',
  optimizer='adam',
  # optimizer=Adam(lr=0.01),
# optimizer=SGD(lr=0.08, momentum=0.9),
  metrics=['mse'],
)

In [56]:
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [57]:
r = model.fit(
  x=[df_train.userId.values, df_train.movieId.values],
  y=df_train.rating.values - mu,
  epochs=epochs,
  callbacks = [early_stopping, tensorboard_callback],
  batch_size=128,
  validation_data=(
    [df_test.userId.values, df_test.movieId.values],
    df_test.rating.values - mu
  )
)

Epoch 1/100
 25667/125002 [=====>........................] - ETA: 2:38:12 - loss: 1.2799 - mse: 0.9768

KeyboardInterrupt: 

In [None]:
# plot losses
plt.plot(r.history['loss'], label="train loss")
plt.plot(r.history['val_loss'], label="test loss")
plt.legend()
plt.show()

In [None]:
# plot mse
plt.plot(r.history['mse'], label="train mse")
plt.plot(r.history['val_mse'], label="test mse")
plt.legend()
plt.show()

In [None]:
%tensorboard --logdir logs/fit

In [None]:
model.save('../models/reco_v2.h5')