In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.optimizers import Adam, SGD
from keras.losses import MeanSquaredError
from keras.metrics import RootMeanSquaredError
from keras.layers import Input, Dense, Dropout, concatenate
from keras.models import Model
from keras import regularizers
from keras import backend as K
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split

In [2]:
# Use random seed
tf.random.set_seed(100)

In [3]:
""" import dataset """

df1 = pd.read_csv('user_5cluster_rate.csv')
df1

Unnamed: 0,user_id,movie_id,rating,cluster
0,1,61,4,0
1,1,270,5,0
2,1,40,3,0
3,1,211,3,0
4,1,141,3,0
...,...,...,...,...
99995,943,559,4,1
99996,943,121,3,1
99997,943,402,2,1
99998,943,720,1,1


In [4]:
""" number of clusters """
num_cluster = df1['cluster'].unique().max() + 1

In [5]:
items = pd.read_csv('item_genres01.csv', usecols=['movie_id', 'Action', 
                'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 
                'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'], engine='python')
items

Unnamed: 0,movie_id,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1678,1679,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1679,1680,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1680,1681,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
'''merge rates with recipes nutrition dataset'''
df2 = pd.merge(df1,items)
df2

Unnamed: 0,user_id,movie_id,rating,cluster,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,61,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,13,61,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,18,61,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,58,61,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,59,61,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,863,1678,1,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99996,863,1679,3,2,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
99997,863,1680,2,2,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
99998,896,1681,3,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
def show_error(history, skip):
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    plt.plot(np.arange(skip, len(loss), 1), loss[skip:])
    plt.plot(np.arange(skip, len(loss), 1), val_loss[skip:])
    plt.title('Cluster {}'.format(k))
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()

In [8]:
""" Functions """

def dataPreprocessor(rating_df, num_users, num_items, init_value=0):
    """
        INPUT: 
            data: pandas DataFrame. columns=['userID', 'itemID', 'rating' ...]
            num_row: int. number of users
            num_col: int. number of items
            
        OUTPUT:
            matrix: 2D numpy array. 
    """
    matrix = np.full((num_items, num_users), init_value)
    for (_, rating, userID, itemID) in rating_df.itertuples():
      matrix[itemID, userID] = rating

    return matrix

In [9]:
def masked_mse(y_true, y_pred):
        # masked function
        mask_true = K.cast(K.not_equal(y_true, 0), K.floatx())
        # masked squared error
        masked_squared_error = K.square(mask_true * (y_true - y_pred))
        masked_mse = K.sum(masked_squared_error, axis=-1) / K.maximum(K.sum(mask_true, axis=-1), 1)
        return masked_mse

In [10]:
def masked_mse_clip(y_true, y_pred):
        # masked function
        mask_true = K.cast(K.not_equal(y_true, 0), K.floatx())
        y_pred = K.clip(y_pred, 1, 5)
        # masked squared error
        masked_squared_error = K.square(mask_true * (y_true - y_pred))
        masked_mse = K.sum(masked_squared_error, axis=-1) / K.maximum(K.sum(mask_true, axis=-1), 1)
        return masked_mse

In [11]:
def masked_rmse(y_true, y_pred):
        # masked function
        mask_true = K.cast(K.not_equal(y_true, 0), K.floatx())
        # masked squared error
        masked_squared_error = K.square(mask_true * (y_true - y_pred))
        masked_mse = K.sqrt(K.sum(masked_squared_error, axis=-1) / K.maximum(K.sum(mask_true, axis=-1), 1))
        return masked_mse

In [12]:
def masked_rmse_clip(y_true, y_pred):
        # masked function
        mask_true = K.cast(K.not_equal(y_true, 0), K.floatx())
        y_pred = K.clip(y_pred, 1, 5)
        # masked squared error
        masked_squared_error = K.square(mask_true * (y_true - y_pred))
        masked_mse = K.sqrt(K.sum(masked_squared_error, axis=-1) / K.maximum(K.sum(mask_true, axis=-1), 1))
        return masked_mse

In [13]:
# Creating a custom function for MAE 
def masked_mae_clip(y_true, y_pred):
    mask_true = K.cast(K.not_equal(y_true, 0), K.floatx())
    y_pred = K.clip(y_pred, 1, 5)
    masked_abs_error = K.abs(mask_true * (y_true - y_pred))
    masked_mae = K.sum(masked_abs_error, axis=-1) / K.maximum(K.sum(mask_true, axis=-1), 1)
    return masked_mae

In [14]:
""" execute DAE """

#loop over clusters

result = []

for k in range(num_cluster):
    #select rates of cluster k
    df3 = df2.loc[df2['cluster'] == k]
    df3['users_id'] = pd.factorize(df3.user_id)[0]
    df3['movies_id'] = pd.factorize(df3.movie_id)[0]
    
    #remove unnessory columns
    df = df3.drop(['user_id', 'movie_id', 'cluster'], axis=1)

    #+1 is the real size, as they are zero based
    num_users = df['users_id'].unique().max() + 1
    num_movies = df['movies_id'].unique().max() + 1
        
    side_info = df.drop(['rating', 'users_id'], axis=1).groupby(['movies_id']).first()
    side_info_np = side_info.to_numpy()
    
    new_df = df.drop(['Action', 'Adventure', 'Animation', 'Children\'s', 
                      'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 
                      'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'],
                     axis=1).sort_values(by=['movies_id'])

    #divide ds int trin and test sets
    train_df, test_df = train_test_split(new_df, stratify=df['users_id'], test_size=0.2, random_state=100)  
    
    # Creating a sparse pivot table with users in rows and items in columns
    users_items_matrix_train_zero = dataPreprocessor(train_df, num_users, num_movies, 0)
    users_items_matrix_test = dataPreprocessor(test_df, num_users, num_movies, 0)
    

    #convert int array to float
    users_items_matrix_train_zero_float = users_items_matrix_train_zero.astype(float)
    users_items_matrix_test_float = users_items_matrix_test.astype(float)
    
    #build the model
    input_layer = x = Input(shape=(users_items_matrix_train_zero.shape[1],), name='UserRating')
    x = Dropout(rate = 0.5)(x)
    item = Input(shape=(side_info_np.shape[1],), name='ItemInfo')
    con = concatenate([x, item], name='Concatenate')
    h = Dense(128, activation='sigmoid', name='LatentSpace', kernel_regularizer=regularizers.l2(0.0001))(con)
    output_layer = Dense(users_items_matrix_train_zero.shape[1], activation='linear', name='UserScorePred', 
                         kernel_regularizer=regularizers.l2(0.0001))(h)
    Rec = Model([input_layer, item], output_layer)
    
    Rec.compile(optimizer = Adam(learning_rate=0.0001), loss=masked_mse, metrics=[masked_mae_clip, masked_rmse_clip,
                                                                                    masked_mse_clip])
    Rec.summary()

    #Run the model
    es = EarlyStopping(monitor='val_masked_rmse_clip', mode='min', verbose=0, patience=20)
    hist_Rec = Rec.fit(x=[users_items_matrix_train_zero_float, side_info_np], y=users_items_matrix_train_zero_float, 
                                epochs=500, batch_size=128, verbose = 0,  
                                validation_data=[[users_items_matrix_train_zero_float, side_info_np],
                                                 users_items_matrix_test_float], callbacks=es)

    #Evaluate the model
    test_result_deep = Rec.evaluate([users_items_matrix_train_zero_float, side_info_np], users_items_matrix_test_float)
    #show_error(hist_Rec, 20)
    #store the RMSE of each cluster
    result.append(test_result_deep[2])

1570
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 UserRating (InputLayer)        [(None, 234)]        0           []                               
                                                                                                  
 dropout (Dropout)              (None, 234)          0           ['UserRating[0][0]']             
                                                                                                  
 ItemInfo (InputLayer)          [(None, 18)]         0           []                               
                                                                                                  
 Concatenate (Concatenate)      (None, 252)          0           ['dropout[0][0]',                
                                                                  'ItemInfo[0][0]']      

 Layer (type)                   Output Shape         Param #     Connected to                     
 UserRating (InputLayer)        [(None, 135)]        0           []                               
                                                                                                  
 dropout_4 (Dropout)            (None, 135)          0           ['UserRating[0][0]']             
                                                                                                  
 ItemInfo (InputLayer)          [(None, 18)]         0           []                               
                                                                                                  
 Concatenate (Concatenate)      (None, 153)          0           ['dropout_4[0][0]',              
                                                                  'ItemInfo[0][0]']               
                                                                                                  
 LatentSpa

In [15]:
# find the average of RMSE
average = sum(result) / len(result)

# Printing average of the list
print("Average of RMSE =", round(average, 4))

Average of RMSE = 0.558
